In [82]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from ta.momentum import RSIIndicator
from ta.volatility import BollingerBands
from ta.trend import MACD
from ta.volume import VolumeWeightedAveragePrice

# Define the ticker and date range
TICKER = 'INFY.NS'
START_DATE = '2019-01-01' # ~5 years of data
END_DATE = pd.to_datetime('today').strftime('%Y-%m-%d')

# Download data
df = yf.download(TICKER, start=START_DATE, end=END_DATE, auto_adjust=True)
df = df.dropna() # Drop any rows with missing data (e.g., trading holidays)

[*********************100%***********************]  1 of 1 completed


In [83]:
df = pd.DataFrame(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1714 entries, 2019-01-01 to 2025-12-05
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   (Close, INFY.NS)   1714 non-null   float64
 1   (High, INFY.NS)    1714 non-null   float64
 2   (Low, INFY.NS)     1714 non-null   float64
 3   (Open, INFY.NS)    1714 non-null   float64
 4   (Volume, INFY.NS)  1714 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 80.3 KB


In [84]:
df.describe()

Price,Close,High,Low,Open,Volume
Ticker,INFY.NS,INFY.NS,INFY.NS,INFY.NS,INFY.NS
count,1714.0,1714.0,1714.0,1714.0,1714.0
mean,1239.770443,1251.827219,1227.625812,1239.802054,7772874.0
std,390.737201,393.770423,387.589977,390.946481,5512769.0
min,452.361237,479.471236,437.581948,437.581948,0.0
25%,880.642456,894.371135,874.730785,884.550989,4850810.0
50%,1355.065979,1367.43773,1343.132296,1356.851109,6593246.0
75%,1519.653931,1536.463491,1504.456896,1519.395524,8857209.0
max,1942.221191,1948.777171,1920.756459,1938.093361,90432110.0


In [85]:
# Check the Null Values
df.isnull().sum()
df.columns = df.columns.get_level_values(0)

In [86]:
df['Close_Price'] = df['Close'] # Working column for closing price

In [87]:
# Target Variable (y): 21-day (approx. 1 month) Future Return Percentage
df['Future_Close'] = df['Close_Price'].shift(-21)
df['Target_Return'] = ((df['Future_Close'] - df['Close_Price']) / df['Close_Price']) * 100

In [88]:
df.dropna(subset=['Target_Return'], inplace=True)

In [97]:
df = df.rename_axis('Price').reset_index()
df.columns = df.columns.rename(None)
df.columns




Index(['Price', 'Close', 'High', 'Low', 'Open', 'Volume', 'Close_Price',
       'Future_Close', 'Target_Return'],
      dtype='object')

In [62]:
# Ensure the DataFrame is an independent copy for stability
df = df.copy()

print(f"Total data points after target calculation: {len(df)}")
# print(df.tail(3))

Total data points after target calculation: 1693


In [64]:
# --- 2. FEATURE ENGINEERING (Technical Indicators and Lagged Features) ---

print("\n--- 2. Feature Engineering ---")
# 2A. Technical Indicators (Non-Collinear Selection)
df['RSI'] = RSIIndicator(close=df['Close_Price'], window=14).rsi()
macd = MACD(close=df['Close_Price'])
df['MACD'] = macd.macd()

# Correct calculation for Bollinger Bands Percentage (BBP)
bb = BollingerBands(close=df['Close_Price'], window=20, window_dev=2)
df['BBP'] = (df['Close_Price'] - bb.bollinger_lband()) / (bb.bollinger_hband() - bb.bollinger_lband())

# Correct the column name for Volume
df['VWAP_Ratio'] = df['Volume_INFY.NS'] / df['Volume_INFY.NS'].rolling(window=20).mean()


--- 2. Feature Engineering ---


In [66]:
df

Unnamed: 0_level_0,Close_INFY.NS,High_INFY.NS,Low_INFY.NS,Open_INFY.NS,Volume_INFY.NS,Close_Price,Future_Close,Target_Return,RSI,MACD,BBP,VWAP_Ratio
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-01-01,553.448486,554.488725,544.377637,550.036524,2943390,553.448486,607.408020,9.749694,,,,
2019-01-02,556.777222,560.896576,550.951886,554.239050,7416655,556.777222,627.197510,12.647839,,,,
2019-01-03,556.860413,563.393089,551.825618,559.232136,6827249,556.860413,633.473267,13.758000,,,,
2019-01-04,550.119629,560.813309,541.756123,559.024079,7889310,550.119629,632.510986,14.976989,,,,
2019-01-07,558.982483,560.563615,550.494128,553.406795,8046340,558.982483,631.632324,12.996801,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2025-10-31,1482.300049,1497.099976,1480.699951,1490.000000,4819814,1482.300049,1564.000000,5.511701,52.071321,8.715278,0.604116,0.667491
2025-11-03,1485.500000,1491.400024,1474.199951,1482.300049,5470600,1485.500000,1561.000000,5.082464,52.861341,8.241201,0.613356,0.765071
2025-11-04,1467.900024,1481.900024,1462.900024,1479.699951,8691330,1467.900024,1578.699951,7.548193,48.159398,6.371870,0.436669,1.181934
2025-11-05,1467.900024,1467.900024,1467.900024,1467.900024,0,1467.900024,1597.599976,8.835748,48.159398,4.834680,0.416692,0.000000


In [67]:
# 2B. Lagged Features (Previous day data)
lag_days = 1
base_features = ['Open', 'High', 'Low', 'Close_Price', 'Volume']
for feature in base_features:
    df[f'{feature}_Lag{lag_days}'] = df[feature].shift(lag_days)

# Final drop of NaNs created by indicators/lags
df.dropna(inplace=True)
print(f"Total data points after feature creation: {len(df)}")

KeyError: 'Open'