In [75]:
!pip install ta

import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from ta.momentum import RSIIndicator
from ta.volatility import BollingerBands
from ta.trend import MACD
from ta.volume import VolumeWeightedAveragePrice

# Define the ticker and date range
TICKER = 'INFY.NS'
START_DATE = '2019-01-01' # ~5 years of data
END_DATE = pd.to_datetime('today').strftime('%Y-%m-%d')

# Download data
df = yf.download(TICKER, start=START_DATE, end=END_DATE, auto_adjust=True)
df = df.dropna() # Drop any rows with missing data (e.g., trading holidays)



[*********************100%***********************]  1 of 1 completed


In [76]:
df = pd.DataFrame(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1714 entries, 2019-01-01 to 2025-12-05
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   (Close, INFY.NS)   1714 non-null   float64
 1   (High, INFY.NS)    1714 non-null   float64
 2   (Low, INFY.NS)     1714 non-null   float64
 3   (Open, INFY.NS)    1714 non-null   float64
 4   (Volume, INFY.NS)  1714 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 80.3 KB


In [77]:
df.describe()

Price,Close,High,Low,Open,Volume
Ticker,INFY.NS,INFY.NS,INFY.NS,INFY.NS,INFY.NS
count,1714.0,1714.0,1714.0,1714.0,1714.0
mean,1239.77044,1251.827217,1227.62581,1239.802052,7772874.0
std,390.737201,393.770424,387.589977,390.946482,5512769.0
min,452.361298,479.471236,437.581948,437.581948,0.0
25%,880.642471,894.371042,874.730801,884.550943,4850810.0
50%,1355.065979,1367.43773,1343.132297,1356.850986,6593246.0
75%,1519.653931,1536.463491,1504.456986,1519.395615,8857209.0
max,1942.221191,1948.777171,1920.756459,1938.093361,90432110.0


In [78]:
df.dtypes

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Price,Ticker,Unnamed: 2_level_1
Close,INFY.NS,float64
High,INFY.NS,float64
Low,INFY.NS,float64
Open,INFY.NS,float64
Volume,INFY.NS,int64


In [80]:
# Check the Null Values
df.isnull().sum()
df.columns = df.columns.get_level_values(0)

In [83]:
df = df.reset_index()

#df.columns = df.columns.get_level_values(-1)  # flatten fully (if needed)
df.columns.name = None                        # remove index name
df = df.reset_index(drop=True)                # keep Date already flattened



In [85]:
df = df.drop(columns=['index'])

In [87]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [88]:
df.head()

Unnamed: 0,date,close,high,low,open,volume
0,2019-01-01,553.448425,554.488664,544.377577,550.036464,2943390
1,2019-01-02,556.777222,560.896576,550.951886,554.23905,7416655
2,2019-01-03,556.860535,563.393213,551.825739,559.232258,6827249
3,2019-01-04,550.119629,560.813309,541.756123,559.024079,7889310
4,2019-01-07,558.982483,560.563615,550.494128,553.406795,8046340


In [89]:
df['close_price'] = df['close'] # Working column for closing price

In [90]:
# Target Variable (y): 21-day (approx. 1 month) Future Return Percentage
df['future_close'] = df['close_price'].shift(-21)
df['target_return'] = ((df['future_close'] - df['close_price']) / df['close_price']) * 100

In [93]:
df.dtypes

Unnamed: 0,0
date,datetime64[ns]
close,float64
high,float64
low,float64
open,float64
volume,int64
close_price,float64
future_close,float64
target_return,float64


In [94]:
# Ensure the DataFrame is an independent copy for stability
df = df.copy()

print(f"Total data points after target calculation: {len(df)}")
# print(df.tail(3))

Total data points after target calculation: 1693


In [95]:
# --- 2. FEATURE ENGINEERING (Technical Indicators and Lagged Features) ---

print("\n--- 2. Feature Engineering ---")
# 2A. Technical Indicators (Non-Collinear Selection)
df['rsi'] = RSIIndicator(close=df['close_price'], window=14).rsi()
macd = MACD(close=df['close_price'])
df['macd'] = macd.macd()

# Correct calculation for Bollinger Bands Percentage (BBP)
bb = BollingerBands(close=df['close_price'], window=20, window_dev=2)
df['bbp'] = (df['close_price'] - bb.bollinger_lband()) / (bb.bollinger_hband() - bb.bollinger_lband())

# Correct the column name for Volume
df['vwap_ratio'] = df['volume'] / df['volume'].rolling(window=20).mean()


--- 2. Feature Engineering ---


In [99]:
df

Unnamed: 0,date,close,high,low,open,volume,close_price,future_close,target_return,rsi,macd,bbp,vwap_ratio
0,2019-01-01,553.448425,554.488664,544.377577,550.036464,2943390,553.448425,607.408020,9.749706,,,,
1,2019-01-02,556.777222,560.896576,550.951886,554.239050,7416655,556.777222,627.197510,12.647839,,,,
2,2019-01-03,556.860535,563.393213,551.825739,559.232258,6827249,556.860535,633.473328,13.757986,,,,
3,2019-01-04,550.119629,560.813309,541.756123,559.024079,7889310,550.119629,632.510986,14.976989,,,,
4,2019-01-07,558.982483,560.563615,550.494128,553.406795,8046340,558.982483,631.632385,12.996812,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1688,2025-10-31,1482.300049,1497.099976,1480.699951,1490.000000,4819814,1482.300049,1564.000000,5.511701,52.071321,8.715278,0.604116,0.667491
1689,2025-11-03,1485.500000,1491.400024,1474.199951,1482.300049,5470600,1485.500000,1561.000000,5.082464,52.861341,8.241201,0.613356,0.765071
1690,2025-11-04,1467.900024,1481.900024,1462.900024,1479.699951,8691330,1467.900024,1578.699951,7.548193,48.159398,6.371870,0.436669,1.181934
1691,2025-11-05,1467.900024,1467.900024,1467.900024,1467.900024,0,1467.900024,1597.599976,8.835748,48.159398,4.834680,0.416692,0.000000


In [51]:
# 2B. Lagged Features (Previous day data)
lag_days = 1
base_features = ['Open', 'High', 'Low', 'Close_Price', 'Volume']
for feature in base_features:
    df[f'{feature}_Lag{lag_days}'] = df[feature].shift(lag_days)

# Final drop of NaNs created by indicators/lags
df.dropna(inplace=True)
print(f"Total data points after feature creation: {len(df)}")

Total data points after feature creation: 1668


In [52]:
df.index.name = 'Date'


In [53]:
df

Unnamed: 0_level_0,Price,Close,High,Low,Open,Volume,Close_Price,Future_Close,Target_Return,RSI,MACD,BBP,VWAP_Ratio,Open_Lag1,High_Lag1,Low_Lag1,Close_Price_Lag1,Volume_Lag1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
25,2019-02-05,631.632385,635.899920,625.063797,625.063797,4694366,631.632385,596.069885,-5.630253,76.878858,16.365048,0.798439,0.505636,633.431408,638.033618,626.904648,632.510986,3945391.0
26,2019-02-06,638.703064,641.757278,631.214003,632.218132,5880482,638.703064,595.149414,-6.819076,79.373862,16.843713,0.866324,0.678132,625.063797,635.899920,625.063797,631.632385,4694366.0
27,2019-02-07,639.288818,643.849200,636.778522,638.493881,3961797,639.288818,591.551270,-7.467290,79.570541,17.073512,0.865922,0.472145,632.218132,641.757278,631.214003,638.703064,5880482.0
28,2019-02-08,636.694824,646.192091,631.674232,635.941715,5915169,636.694824,592.806458,-6.893156,76.109422,16.852055,0.836335,0.732735,638.493881,643.849200,636.778522,639.288818,3961797.0
29,2019-02-11,638.284668,643.263433,635.104970,641.715437,5500216,638.284668,592.722900,-7.138158,76.776180,16.613327,0.859195,0.745076,635.941715,646.192091,631.674232,636.694824,5915169.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1688,2025-10-31,1482.300049,1497.099976,1480.699951,1490.000000,4819814,1482.300049,1564.000000,5.511701,52.071321,8.715278,0.604116,0.667491,1507.000000,1508.199951,1489.099976,1493.800049,5608867.0
1689,2025-11-03,1485.500000,1491.400024,1474.199951,1482.300049,5470600,1485.500000,1561.000000,5.082464,52.861341,8.241201,0.613356,0.765071,1490.000000,1497.099976,1480.699951,1482.300049,4819814.0
1690,2025-11-04,1467.900024,1481.900024,1462.900024,1479.699951,8691330,1467.900024,1578.699951,7.548193,48.159398,6.371870,0.436669,1.181934,1482.300049,1491.400024,1474.199951,1485.500000,5470600.0
1691,2025-11-05,1467.900024,1467.900024,1467.900024,1467.900024,0,1467.900024,1597.599976,8.835748,48.159398,4.834680,0.416692,0.000000,1479.699951,1481.900024,1462.900024,1467.900024,8691330.0
