In [57]:
!pip install ta

import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from ta.momentum import RSIIndicator
from ta.volatility import BollingerBands
from ta.trend import MACD
from ta.volume import VolumeWeightedAveragePrice

# Define the ticker and date range
TICKER = 'INFY.NS'
START_DATE = '2019-01-01' # ~5 years of data
END_DATE = pd.to_datetime('today').strftime('%Y-%m-%d')

# Download data
df = yf.download(TICKER, start=START_DATE, end=END_DATE, auto_adjust=True)
df = df.dropna() # Drop any rows with missing data (e.g., trading holidays)



[*********************100%***********************]  1 of 1 completed


In [58]:
df = pd.DataFrame(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1714 entries, 2019-01-01 to 2025-12-05
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   (Close, INFY.NS)   1714 non-null   float64
 1   (High, INFY.NS)    1714 non-null   float64
 2   (Low, INFY.NS)     1714 non-null   float64
 3   (Open, INFY.NS)    1714 non-null   float64
 4   (Volume, INFY.NS)  1714 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 80.3 KB


In [59]:
df.describe()

Price,Close,High,Low,Open,Volume
Ticker,INFY.NS,INFY.NS,INFY.NS,INFY.NS,INFY.NS
count,1714.0,1714.0,1714.0,1714.0,1714.0
mean,1239.770444,1251.82722,1227.625813,1239.802055,7772874.0
std,390.737199,393.770422,387.589976,390.94648,5512769.0
min,452.361237,479.471143,437.581863,437.581863,0.0
25%,880.642426,894.371042,874.730755,884.550897,4850810.0
50%,1355.065979,1367.43773,1343.132176,1356.850924,6593246.0
75%,1519.653931,1536.463522,1504.456896,1519.395585,8857209.0
max,1942.221191,1948.777171,1920.756459,1938.093361,90432110.0


In [60]:
df.dtypes

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Price,Ticker,Unnamed: 2_level_1
Close,INFY.NS,float64
High,INFY.NS,float64
Low,INFY.NS,float64
Open,INFY.NS,float64
Volume,INFY.NS,int64


In [61]:
# Check the Null Values
df.isnull().sum()
df.columns = df.columns.get_level_values(0)

In [62]:
df = df.reset_index()

#df.columns = df.columns.get_level_values(-1)  # flatten fully (if needed)
df.columns.name = None                        # remove index name
df = df.reset_index(drop=True)                # keep Date already flattened



In [63]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [64]:
df.head()

Unnamed: 0,date,close,high,low,open,volume
0,2019-01-01,553.448547,554.488786,544.377697,550.036585,2943390
1,2019-01-02,556.777283,560.896638,550.951947,554.239111,7416655
2,2019-01-03,556.860474,563.393151,551.825679,559.232197,6827249
3,2019-01-04,550.119751,560.813434,541.756243,559.024203,7889310
4,2019-01-07,558.982605,560.563737,550.494248,553.406916,8046340


In [65]:
df['close_price'] = df['close'] # Working column for closing price

In [66]:
# Target Variable (y): 21-day (approx. 1 month) Future Return Percentage
df['future_close'] = df['close_price'].shift(-21)
df['target_return'] = ((df['future_close'] - df['close_price']) / df['close_price']) * 100

In [67]:
df.dtypes

Unnamed: 0,0
date,datetime64[ns]
close,float64
high,float64
low,float64
open,float64
volume,int64
close_price,float64
future_close,float64
target_return,float64


In [68]:
# Ensure the DataFrame is an independent copy for stability
df = df.copy()

print(f"Total data points after target calculation: {len(df)}")
# print(df.tail(3))

Total data points after target calculation: 1714


In [69]:
# --- 2. FEATURE ENGINEERING (Technical Indicators and Lagged Features) ---

print("\n--- 2. Feature Engineering ---")
# 2A. Technical Indicators (Non-Collinear Selection)
df['rsi'] = RSIIndicator(close=df['close_price'], window=14).rsi()
macd = MACD(close=df['close_price'])
df['macd'] = macd.macd()

# Correct calculation for Bollinger Bands Percentage (BBP)
bb = BollingerBands(close=df['close_price'], window=20, window_dev=2)
df['bbp'] = (df['close_price'] - bb.bollinger_lband()) / (bb.bollinger_hband() - bb.bollinger_lband())

# Correct the column name for Volume
df['vwap_ratio'] = df['volume'] / df['volume'].rolling(window=20).mean()


--- 2. Feature Engineering ---


In [70]:
df

Unnamed: 0,date,close,high,low,open,volume,close_price,future_close,target_return,rsi,macd,bbp,vwap_ratio
0,2019-01-01,553.448547,554.488786,544.377697,550.036585,2943390,553.448547,607.408081,9.749693,,,,
1,2019-01-02,556.777283,560.896638,550.951947,554.239111,7416655,556.777283,627.197571,12.647838,,,,
2,2019-01-03,556.860474,563.393151,551.825679,559.232197,6827249,556.860474,633.473267,13.757987,,,,
3,2019-01-04,550.119751,560.813434,541.756243,559.024203,7889310,550.119751,632.511108,14.976986,,,,
4,2019-01-07,558.982605,560.563737,550.494248,553.406916,8046340,558.982605,631.632324,12.996776,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1709,2025-12-01,1564.000000,1574.000000,1556.000000,1562.500000,5341800,1564.000000,,,61.360886,20.711765,0.804365,0.567659
1710,2025-12-02,1561.000000,1579.900024,1556.099976,1567.000000,6335969,1561.000000,,,60.408329,20.646627,0.759441,0.681840
1711,2025-12-03,1578.699951,1586.000000,1554.099976,1555.099976,12895312,1578.699951,,,63.962881,21.772265,0.869326,1.297677
1712,2025-12-04,1597.599976,1599.000000,1568.000000,1568.000000,12609619,1597.599976,,,67.335252,23.913753,0.985226,1.239827


In [71]:
# 2B. Lagged Features (Previous day data)
lag_days = 1
base_features = ['open', 'high', 'low', 'close_price', 'volume']
for feature in base_features:
    df[f'{feature}_Lag{lag_days}'] = df[feature].shift(lag_days)

# Final drop of NaNs created by indicators/lags
df.dropna(inplace=True)
print(f"Total data points after feature creation: {len(df)}")

Total data points after feature creation: 1668


In [72]:
df

Unnamed: 0,date,close,high,low,open,volume,close_price,future_close,target_return,rsi,macd,bbp,vwap_ratio,open_Lag1,high_Lag1,low_Lag1,close_price_Lag1,volume_Lag1
25,2019-02-05,631.632324,635.899858,625.063736,625.063736,4694366,631.632324,596.069885,-5.630244,76.878934,16.365032,0.798439,0.505636,633.431530,638.033741,626.904769,632.511108,3945391.0
26,2019-02-06,638.703003,641.757217,631.213943,632.218071,5880482,638.703003,595.149414,-6.819067,79.373942,16.843693,0.866324,0.678132,625.063736,635.899858,625.063736,631.632324,4694366.0
27,2019-02-07,639.288757,643.849139,636.778461,638.493820,3961797,639.288757,591.551270,-7.467281,79.570621,17.073489,0.865922,0.472145,632.218071,641.757217,631.213943,638.703003,5880482.0
28,2019-02-08,636.694824,646.192091,631.674232,635.941715,5915169,636.694824,592.806519,-6.893146,76.109560,16.852035,0.836336,0.732735,638.493820,643.849139,636.778461,639.288757,3961797.0
29,2019-02-11,638.284790,643.263556,635.105091,641.715560,5500216,638.284790,592.722778,-7.138195,76.776368,16.613320,0.859198,0.745076,635.941715,646.192091,631.674232,636.694824,5915169.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1688,2025-10-31,1482.300049,1497.099976,1480.699951,1490.000000,4819814,1482.300049,1564.000000,5.511701,52.071321,8.715278,0.604116,0.667491,1507.000000,1508.199951,1489.099976,1493.800049,5608867.0
1689,2025-11-03,1485.500000,1491.400024,1474.199951,1482.300049,5470600,1485.500000,1561.000000,5.082464,52.861341,8.241201,0.613356,0.765071,1490.000000,1497.099976,1480.699951,1482.300049,4819814.0
1690,2025-11-04,1467.900024,1481.900024,1462.900024,1479.699951,8691330,1467.900024,1578.699951,7.548193,48.159398,6.371870,0.436669,1.181934,1482.300049,1491.400024,1474.199951,1485.500000,5470600.0
1691,2025-11-05,1467.900024,1467.900024,1467.900024,1467.900024,0,1467.900024,1597.599976,8.835748,48.159398,4.834680,0.416692,0.000000,1479.699951,1481.900024,1462.900024,1467.900024,8691330.0


In [97]:
# Extract datetime features
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['day_of_week'] = df['date'].dt.dayofweek
df['day_of_year'] = df['date'].dt.dayofyear
df['week_of_year'] = df['date'].dt.isocalendar().week.astype(int)
df['quarter'] = df['date'].dt.quarter

# --- 3. DATA SPLITTING (Time-Series Split 60/20/20) ---

# 1. Split off the Test set (e.g., last 20% of data)
# Result: df_full_train (80%), df_test (20%)
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    shuffle=False  # <--- CRITICAL: Keeps data in date order
)

# 2. Split the Full Train into Train and Val
# We need Val to be 20% of the ORIGINAL total.
# Since df_full_train is 80% of total, taking 0.25 (1/4) of it gives us 20%.
# Result: df_train (60%), df_val (20%)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    shuffle=False  # <--- CRITICAL
)

print(f"Train: {len(df_train)} (60%)")
print(f"Val:   {len(df_val)}   (20%)")
print(f"Test:  {len(df_test)}  (20%)")

Train: 1000 (60%)
Val:   334   (20%)
Test:  334  (20%)


In [76]:
df_train

Unnamed: 0,date,close,high,low,open,volume,close_price,future_close,target_return,rsi,macd,bbp,vwap_ratio,open_Lag1,high_Lag1,low_Lag1,close_price_Lag1,volume_Lag1
0,2019-02-05,631.632324,635.899858,625.063736,625.063736,4694366,631.632324,596.069885,-5.630244,76.878934,16.365032,0.798439,0.505636,633.431530,638.033741,626.904769,632.511108,3945391.0
1,2019-02-06,638.703003,641.757217,631.213943,632.218071,5880482,638.703003,595.149414,-6.819067,79.373942,16.843693,0.866324,0.678132,625.063736,635.899858,625.063736,631.632324,4694366.0
2,2019-02-07,639.288757,643.849139,636.778461,638.493820,3961797,639.288757,591.551270,-7.467281,79.570621,17.073489,0.865922,0.472145,632.218071,641.757217,631.213943,638.703003,5880482.0
3,2019-02-08,636.694824,646.192091,631.674232,635.941715,5915169,636.694824,592.806519,-6.893146,76.109560,16.852035,0.836336,0.732735,638.493820,643.849139,636.778461,639.288757,3961797.0
4,2019-02-11,638.284790,643.263556,635.105091,641.715560,5500216,638.284790,592.722778,-7.138195,76.776368,16.613320,0.859198,0.745076,635.941715,646.192091,631.674232,636.694824,5915169.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2023-02-14,1461.213135,1463.735235,1447.364534,1453.371651,5043736,1461.213135,1287.738403,-11.871966,57.162471,16.949020,0.786103,0.847383,1464.652428,1464.652428,1436.221480,1437.551270,4001199.0
996,2023-02-15,1464.698364,1466.486718,1445.392788,1452.729807,3465747,1464.698364,1302.962891,-11.042238,57.803282,17.183932,0.788582,0.594540,1453.371651,1463.735235,1447.364534,1461.213135,5043736.0
997,2023-02-16,1469.054688,1480.656259,1465.431967,1467.403813,4290289,1469.054688,1287.325806,-12.370464,58.636195,17.519664,0.797413,0.745923,1452.729807,1466.486718,1445.392788,1464.698364,3465747.0
998,2023-02-17,1452.546387,1462.818169,1444.934185,1461.901041,2857734,1452.546387,1275.449219,-12.192187,54.264929,16.266147,0.626399,0.511564,1467.403813,1480.656259,1465.431967,1469.054688,4290289.0


In [98]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)


EXCLUDE_COLUMNS = ['future_close', 'target_return', 'close', 'close_price', 'date'] # Exclude target components, redundant columns, and the 'date' column

# Re-define FEATURES to include the new datetime features and exclude the original 'date' column
FEATURES = [col for col in df.columns if col not in EXCLUDE_COLUMNS and df[col].dtype != 'datetime64[ns]']
TARGET = 'target_return'

X_train, y_train = df_train[FEATURES], df_train[TARGET]
X_val, y_val = df_val[FEATURES], df_val[TARGET]
X_test, y_test = df_test[FEATURES], df_test[TARGET]

print("Updated FEATURES list:")
print(FEATURES)

Updated FEATURES list:
['high', 'low', 'open', 'volume', 'rsi', 'macd', 'bbp', 'vwap_ratio', 'open_Lag1', 'high_Lag1', 'low_Lag1', 'close_price_Lag1', 'volume_Lag1', 'year', 'month', 'day', 'day_of_week', 'day_of_year', 'week_of_year', 'quarter']


In [87]:
X_train.dtypes

Unnamed: 0,0
date,datetime64[ns]
high,float64
low,float64
open,float64
volume,int64
rsi,float64
macd,float64
bbp,float64
vwap_ratio,float64
open_Lag1,float64


In [89]:
# Feature Correlation Matrix (checking for collinearity)
feature_corr = X_train[FEATURES].corr().abs()
# We look for features highly correlated with each other (e.g., > 0.9)
high_corr_features = set()
for i in range(len(feature_corr.columns)):
    for j in range(i):
        if feature_corr.iloc[i, j] > 0.8:
            colname = feature_corr.columns[i]
            high_corr_features.add(colname)

# High correlation is expected between certain lagged price features (e.g., Close_Lag1 and Open_Lag1).
# For tree-based models, high collinearity is less of a concern than in linear models, but we note it.
print("\n--- Highly Correlated Features (Collinearity > 0.9) ---")
print(high_corr_features)

# Target Variable Correlation
target_corr = df_train[FEATURES + [TARGET]].corr()[TARGET].abs().sort_values(ascending=False)
print("\n--- Feature Correlation with Target_Return ---")
print(target_corr.head(20))


--- Highly Correlated Features (Collinearity > 0.9) ---
{'low', 'open_Lag1', 'low_Lag1', 'close_price_Lag1', 'vwap_ratio', 'high_Lag1', 'macd', 'high', 'open', 'bbp'}

--- Feature Correlation with Target_Return ---
target_return       1.000000
low_Lag1            0.191807
open_Lag1           0.191729
low                 0.191253
open                0.191228
close_price_Lag1    0.191052
high_Lag1           0.189781
high                0.189476
macd                0.151846
volume              0.134448
volume_Lag1         0.134159
date                0.087039
rsi                 0.082173
vwap_ratio          0.031758
bbp                 0.022799
Name: target_return, dtype: float64


In [92]:
from sklearn.feature_selection import mutual_info_regression

# Exclude 'date' column for mutual information calculation
FEATURES_MI = [col for col in FEATURES if col != 'date']

# Calculate mutual information
mi_scores = mutual_info_regression(X_train[FEATURES_MI], y_train, random_state=11)
mi_series = pd.Series(mi_scores, name="MI Scores", index=X_train[FEATURES_MI].columns).sort_values(ascending=False)

print("\n--- Feature Importance (Mutual Information) ---")
print(mi_series.head(100))


--- Feature Importance (Mutual Information) ---
high                0.486940
low                 0.464649
open                0.448811
high_Lag1           0.435774
close_price_Lag1    0.433984
low_Lag1            0.425463
open_Lag1           0.391209
macd                0.228296
rsi                 0.132715
volume_Lag1         0.063359
bbp                 0.059806
vwap_ratio          0.045924
volume              0.005203
Name: MI Scores, dtype: float64


In [101]:
# Function to evaluate model
def evaluate_model(model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
    return rmse_train, rmse_val

# Tuning Max Depth
depths = range(3, 15)
dt_scores = []
best_dt_rmse_val = float('inf')
best_dt_depth = 0

for depth in depths:
    model = DecisionTreeRegressor(max_depth=depth, random_state=11)
    rmse_train, rmse_val = evaluate_model(model, X_train, y_train, X_val, y_val)
    dt_scores.append({'Depth': depth, 'Train RMSE': rmse_train, 'Validation RMSE': rmse_val})
    if rmse_val < best_dt_rmse_val:
        best_dt_rmse_val = rmse_val
        best_dt_depth = depth

dt_results = pd.DataFrame(dt_scores)
print("\n--- Decision Tree Max Depth Tuning Results ---")
print(dt_results)
print(f"\nBest Decision Tree Depth: {best_dt_depth} with Val RMSE: {best_dt_rmse_val:.4f}")

# Final DT Model
dt_model = DecisionTreeRegressor(max_depth=best_dt_depth, random_state=11)
dt_model.fit(X_train, y_train)


--- Decision Tree Max Depth Tuning Results ---
    Depth  Train RMSE  Validation RMSE
0       3    7.159214         7.194248
1       4    6.512044         7.011178
2       5    5.622075         8.035024
3       6    4.391336         8.929396
4       7    3.590768         8.534552
5       8    2.844240         9.460540
6       9    2.102482         9.281323
7      10    1.491408         9.331537
8      11    0.981726         9.279558
9      12    0.628176         9.461667
10     13    0.398189         9.519650
11     14    0.254890         9.371340

Best Decision Tree Depth: 4 with Val RMSE: 7.0112


In [104]:
X_train.head(100)

Unnamed: 0,high,low,open,volume,rsi,macd,bbp,vwap_ratio,open_Lag1,high_Lag1,low_Lag1,close_price_Lag1,volume_Lag1,year,month,day,day_of_week,day_of_year,week_of_year,quarter
0,635.899858,625.063736,625.063736,4694366,76.878934,16.365032,0.798439,0.505636,633.431530,638.033741,626.904769,632.511108,3945391.0,2019,2,5,1,36,6,1
1,641.757217,631.213943,632.218071,5880482,79.373942,16.843693,0.866324,0.678132,625.063736,635.899858,625.063736,631.632324,4694366.0,2019,2,6,2,37,6,1
2,643.849139,636.778461,638.493820,3961797,79.570621,17.073489,0.865922,0.472145,632.218071,641.757217,631.213943,638.703003,5880482.0,2019,2,7,3,38,6,1
3,646.192091,631.674232,635.941715,5915169,76.109560,16.852035,0.836336,0.732735,638.493820,643.849139,636.778461,639.288757,3961797.0,2019,2,8,4,39,6,1
4,643.263556,635.105091,641.715560,5500216,76.776368,16.613320,0.859198,0.745076,635.941715,646.192091,631.674232,636.694824,5915169.0,2019,2,11,0,42,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,625.831982,619.340280,623.710517,3224968,47.467413,3.228049,0.249694,0.504232,624.134784,627.529127,618.194684,621.164734,5919494.0,2019,7,1,0,182,27,3
96,629.353678,619.552471,622.861976,4837606,53.969688,3.089175,0.489396,0.773962,623.710517,625.831982,619.340280,620.570740,3224968.0,2019,7,2,1,183,27,3
97,630.668931,619.467589,628.802032,5958379,47.621829,2.356228,0.214489,0.999447,622.861976,629.353678,619.552471,627.953491,4837606.0,2019,7,3,2,184,27,3
98,624.898554,618.279544,618.618999,3274064,49.461340,1.924364,0.260088,0.569682,628.802032,630.668931,619.467589,620.570740,5958379.0,2019,7,4,3,185,27,3


In [102]:
# --- 5. Random Forest Regressor Tuning ---
print("\n--- Random Forest Regressor Tuning (Max Depth) ---")

rf_scores = []
best_rf_rmse_val = float('inf')
best_rf_depth = 0

# Using a fixed n_estimators for initial depth tuning
for depth in depths: # Using the same depth range as Decision Tree for consistency
    model = RandomForestRegressor(max_depth=depth, n_estimators=100, random_state=11, n_jobs=-1)
    rmse_train, rmse_val = evaluate_model(model, X_train, y_train, X_val, y_val)
    rf_scores.append({'Depth': depth, 'Train RMSE': rmse_train, 'Validation RMSE': rmse_val})
    if rmse_val < best_rf_rmse_val:
        best_rf_rmse_val = rmse_val
        best_rf_depth = depth

rf_results = pd.DataFrame(rf_scores)
print(rf_results)
print(f"\nBest Random Forest Depth: {best_rf_depth} with Val RMSE: {best_rf_rmse_val:.4f}")

# Final RF Model
rf_model = RandomForestRegressor(max_depth=best_rf_depth, n_estimators=100, random_state=11, n_jobs=-1)
rf_model.fit(X_train, y_train)


--- Random Forest Regressor Tuning (Max Depth) ---
    Depth  Train RMSE  Validation RMSE
0       3    6.609327         7.318750
1       4    5.545689         7.486077
2       5    4.490900         7.647414
3       6    3.473070         7.963001
4       7    2.709849         8.103260
5       8    2.169057         8.116532
6       9    1.767163         8.169568
7      10    1.513181         8.253623
8      11    1.363579         8.246711
9      12    1.275131         8.288241
10     13    1.229752         8.302314
11     14    1.205373         8.302895

Best Random Forest Depth: 3 with Val RMSE: 7.3187
