In [17]:
!pip install ta

import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from ta.momentum import RSIIndicator
from ta.volatility import BollingerBands
from ta.trend import MACD
from ta.volume import VolumeWeightedAveragePrice

# Define the ticker and date range
TICKER = 'INFY.NS'
START_DATE = '2019-01-01' # ~5 years of data
END_DATE = pd.to_datetime('today').strftime('%Y-%m-%d')

# Download data
df = yf.download(TICKER, start=START_DATE, end=END_DATE, auto_adjust=True)
df = df.dropna() # Drop any rows with missing data (e.g., trading holidays)



[*********************100%***********************]  1 of 1 completed


In [18]:
df = pd.DataFrame(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1714 entries, 2019-01-01 to 2025-12-05
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   (Close, INFY.NS)   1714 non-null   float64
 1   (High, INFY.NS)    1714 non-null   float64
 2   (Low, INFY.NS)     1714 non-null   float64
 3   (Open, INFY.NS)    1714 non-null   float64
 4   (Volume, INFY.NS)  1714 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 80.3 KB


In [19]:
df.describe()

Price,Close,High,Low,Open,Volume
Ticker,INFY.NS,INFY.NS,INFY.NS,INFY.NS,INFY.NS
count,1714.0,1714.0,1714.0,1714.0,1714.0
mean,1239.770444,1251.82722,1227.625813,1239.802055,7772874.0
std,390.737199,393.770422,387.589976,390.94648,5512769.0
min,452.361237,479.471143,437.581863,437.581863,0.0
25%,880.642426,894.371042,874.730755,884.550897,4850810.0
50%,1355.065979,1367.43773,1343.132176,1356.850924,6593246.0
75%,1519.653931,1536.463522,1504.456896,1519.395585,8857209.0
max,1942.221191,1948.777171,1920.756459,1938.093361,90432110.0


In [20]:
df.dtypes

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Price,Ticker,Unnamed: 2_level_1
Close,INFY.NS,float64
High,INFY.NS,float64
Low,INFY.NS,float64
Open,INFY.NS,float64
Volume,INFY.NS,int64


In [21]:
# Check the Null Values
df.isnull().sum()
df.columns = df.columns.get_level_values(0)

In [22]:
df = df.reset_index()

#df.columns = df.columns.get_level_values(-1)  # flatten fully (if needed)
df.columns.name = None                        # remove index name
df = df.reset_index(drop=True)                # keep Date already flattened



In [24]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [25]:
df.head()

Unnamed: 0,date,close,high,low,open,volume
0,2019-01-01,553.448547,554.488786,544.377697,550.036585,2943390
1,2019-01-02,556.777283,560.896638,550.951947,554.239111,7416655
2,2019-01-03,556.860474,563.393151,551.825679,559.232197,6827249
3,2019-01-04,550.119751,560.813434,541.756243,559.024203,7889310
4,2019-01-07,558.982605,560.563737,550.494248,553.406916,8046340


In [26]:
df['close_price'] = df['close'] # Working column for closing price

In [27]:
# Target Variable (y): 21-day (approx. 1 month) Future Return Percentage
df['future_close'] = df['close_price'].shift(-21)
df['target_return'] = ((df['future_close'] - df['close_price']) / df['close_price']) * 100

In [28]:
df.dtypes

Unnamed: 0,0
date,datetime64[ns]
close,float64
high,float64
low,float64
open,float64
volume,int64
close_price,float64
future_close,float64
target_return,float64


In [29]:
# Ensure the DataFrame is an independent copy for stability
df = df.copy()

print(f"Total data points after target calculation: {len(df)}")
# print(df.tail(3))

Total data points after target calculation: 1714


In [30]:
# --- 2. FEATURE ENGINEERING (Technical Indicators and Lagged Features) ---

print("\n--- 2. Feature Engineering ---")
# 2A. Technical Indicators (Non-Collinear Selection)
df['rsi'] = RSIIndicator(close=df['close_price'], window=14).rsi()
macd = MACD(close=df['close_price'])
df['macd'] = macd.macd()

# Correct calculation for Bollinger Bands Percentage (BBP)
bb = BollingerBands(close=df['close_price'], window=20, window_dev=2)
df['bbp'] = (df['close_price'] - bb.bollinger_lband()) / (bb.bollinger_hband() - bb.bollinger_lband())

# Correct the column name for Volume
df['vwap_ratio'] = df['volume'] / df['volume'].rolling(window=20).mean()


--- 2. Feature Engineering ---


In [31]:
df

Unnamed: 0,date,close,high,low,open,volume,close_price,future_close,target_return,rsi,macd,bbp,vwap_ratio
0,2019-01-01,553.448547,554.488786,544.377697,550.036585,2943390,553.448547,607.408081,9.749693,,,,
1,2019-01-02,556.777283,560.896638,550.951947,554.239111,7416655,556.777283,627.197571,12.647838,,,,
2,2019-01-03,556.860474,563.393151,551.825679,559.232197,6827249,556.860474,633.473267,13.757987,,,,
3,2019-01-04,550.119751,560.813434,541.756243,559.024203,7889310,550.119751,632.511108,14.976986,,,,
4,2019-01-07,558.982605,560.563737,550.494248,553.406916,8046340,558.982605,631.632324,12.996776,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1709,2025-12-01,1564.000000,1574.000000,1556.000000,1562.500000,5341800,1564.000000,,,61.360886,20.711765,0.804365,0.567659
1710,2025-12-02,1561.000000,1579.900024,1556.099976,1567.000000,6335969,1561.000000,,,60.408329,20.646627,0.759441,0.681840
1711,2025-12-03,1578.699951,1586.000000,1554.099976,1555.099976,12895312,1578.699951,,,63.962881,21.772265,0.869326,1.297677
1712,2025-12-04,1597.599976,1599.000000,1568.000000,1568.000000,12609619,1597.599976,,,67.335252,23.913753,0.985226,1.239827


In [32]:
# 2B. Lagged Features (Previous day data)
lag_days = 1
base_features = ['open', 'high', 'low', 'close_price', 'volume']
for feature in base_features:
    df[f'{feature}_Lag{lag_days}'] = df[feature].shift(lag_days)

# Final drop of NaNs created by indicators/lags
df.dropna(inplace=True)
print(f"Total data points after feature creation: {len(df)}")

Total data points after feature creation: 1668


In [33]:
df

Unnamed: 0,date,close,high,low,open,volume,close_price,future_close,target_return,rsi,macd,bbp,vwap_ratio,open_Lag1,high_Lag1,low_Lag1,close_price_Lag1,volume_Lag1
25,2019-02-05,631.632324,635.899858,625.063736,625.063736,4694366,631.632324,596.069885,-5.630244,76.878934,16.365032,0.798439,0.505636,633.431530,638.033741,626.904769,632.511108,3945391.0
26,2019-02-06,638.703003,641.757217,631.213943,632.218071,5880482,638.703003,595.149414,-6.819067,79.373942,16.843693,0.866324,0.678132,625.063736,635.899858,625.063736,631.632324,4694366.0
27,2019-02-07,639.288757,643.849139,636.778461,638.493820,3961797,639.288757,591.551270,-7.467281,79.570621,17.073489,0.865922,0.472145,632.218071,641.757217,631.213943,638.703003,5880482.0
28,2019-02-08,636.694824,646.192091,631.674232,635.941715,5915169,636.694824,592.806519,-6.893146,76.109560,16.852035,0.836336,0.732735,638.493820,643.849139,636.778461,639.288757,3961797.0
29,2019-02-11,638.284790,643.263556,635.105091,641.715560,5500216,638.284790,592.722778,-7.138195,76.776368,16.613320,0.859198,0.745076,635.941715,646.192091,631.674232,636.694824,5915169.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1688,2025-10-31,1482.300049,1497.099976,1480.699951,1490.000000,4819814,1482.300049,1564.000000,5.511701,52.071321,8.715278,0.604116,0.667491,1507.000000,1508.199951,1489.099976,1493.800049,5608867.0
1689,2025-11-03,1485.500000,1491.400024,1474.199951,1482.300049,5470600,1485.500000,1561.000000,5.082464,52.861341,8.241201,0.613356,0.765071,1490.000000,1497.099976,1480.699951,1482.300049,4819814.0
1690,2025-11-04,1467.900024,1481.900024,1462.900024,1479.699951,8691330,1467.900024,1578.699951,7.548193,48.159398,6.371870,0.436669,1.181934,1482.300049,1491.400024,1474.199951,1485.500000,5470600.0
1691,2025-11-05,1467.900024,1467.900024,1467.900024,1467.900024,0,1467.900024,1597.599976,8.835748,48.159398,4.834680,0.416692,0.000000,1479.699951,1481.900024,1462.900024,1467.900024,8691330.0


In [34]:
# --- 3. DATA SPLITTING (Time-Series Split 60/20/20) ---

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=11)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=11)

In [35]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [37]:
df_train

Unnamed: 0,date,close,high,low,open,volume,close_price,future_close,target_return,rsi,macd,bbp,vwap_ratio,open_Lag1,high_Lag1,low_Lag1,close_price_Lag1,volume_Lag1
0,2025-02-07,1848.932007,1868.405624,1835.140097,1864.132191,4126260,1848.932007,1613.839355,-12.715051,52.891700,-2.345845,0.686373,0.665028,1861.898330,1868.697125,1845.386973,1860.587158,4765255.0
1,2022-07-29,1405.907715,1411.350992,1386.221240,1390.575906,7009465,1405.907715,1328.341064,-5.517194,60.227344,9.711857,0.986927,1.268991,1357.553388,1383.000643,1351.611099,1376.604858,6636841.0
2,2023-09-25,1370.155273,1384.747663,1367.738715,1384.747663,4217863,1370.155273,1279.901611,-6.587112,55.012824,21.589295,0.512888,0.836306,1386.652968,1400.641188,1380.425556,1390.603149,5455613.0
3,2023-09-15,1404.963379,1411.283715,1402.546820,1410.911910,8657816,1404.963379,1338.925537,-4.700325,74.498908,28.249982,0.913023,1.837852,1398.828845,1405.846271,1393.391559,1400.641235,6444213.0
4,2021-05-11,1171.398071,1180.509364,1166.512251,1176.900072,6420988,1171.398071,1267.370483,8.192980,42.375169,-5.587004,0.215714,0.743834,1188.300389,1190.985409,1175.271713,1179.233154,7195348.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2023-03-24,1266.231812,1283.015174,1263.342838,1272.972675,7559127,1266.231812,1148.931396,-9.263739,28.507575,-38.748357,0.156524,1.147660,1269.258359,1269.258359,1256.464477,1260.637451,7607718.0
996,2021-11-26,1518.603882,1542.572580,1511.736411,1528.388896,4494181,1518.603882,1675.253662,10.315381,42.676050,3.339726,0.215175,0.926829,1526.099864,1549.619712,1523.361818,1546.208496,4476260.0
997,2022-05-24,1293.594116,1319.223662,1288.432308,1318.370775,5693167,1293.594116,1303.891846,0.796056,33.402809,-54.959992,0.058266,0.752132,1306.251632,1335.786173,1301.718283,1318.191162,5887569.0
998,2025-04-02,1505.593018,1510.934927,1491.072664,1491.072664,6193478,1505.593018,1464.945923,-2.699740,33.068996,-54.685726,0.181739,0.732233,1495.006198,1502.921960,1474.221290,1482.622681,12680733.0
