<a href="https://colab.research.google.com/github/Krishnakanth834/MLProject1/blob/main/stock_price_predict_decisstion_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install ta

import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from ta.momentum import RSIIndicator
from ta.volatility import BollingerBands
from ta.trend import MACD
from ta.volume import VolumeWeightedAveragePrice

# Define the ticker and date range
TICKER = 'INFY.NS'
START_DATE = '2019-01-01' # ~5 years of data
END_DATE = pd.to_datetime('today').strftime('%Y-%m-%d')

# Download data
df = yf.download(TICKER, start=START_DATE, end=END_DATE, auto_adjust=True)
df = df.dropna() # Drop any rows with missing data (e.g., trading holidays)

Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ta
  Building wheel for ta (setup.py) ... [?25l[?25hdone
  Created wheel for ta: filename=ta-0.11.0-py3-none-any.whl size=29412 sha256=ba2ba17aa6353a48ac74f3c156d0656a7124e5f9448b47a88170817dc5928388
  Stored in directory: /root/.cache/pip/wheels/5c/a1/5f/c6b85a7d9452057be4ce68a8e45d77ba34234a6d46581777c6
Successfully built ta
Installing collected packages: ta
Successfully installed ta-0.11.0


[*********************100%***********************]  1 of 1 completed


In [2]:
df = pd.DataFrame(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1719 entries, 2019-01-01 to 2025-12-12
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   (Close, INFY.NS)   1719 non-null   float64
 1   (High, INFY.NS)    1719 non-null   float64
 2   (Low, INFY.NS)     1719 non-null   float64
 3   (Open, INFY.NS)    1719 non-null   float64
 4   (Volume, INFY.NS)  1719 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 80.6 KB


In [3]:
df.describe()

Price,Close,High,Low,Open,Volume
Ticker,INFY.NS,INFY.NS,INFY.NS,INFY.NS,INFY.NS
count,1719.0,1719.0,1719.0,1719.0,1719.0
mean,1240.812877,1252.871408,1228.667329,1240.864699,7767378.0
std,390.645801,393.672635,387.506133,390.873109,5506031.0
min,452.361267,479.471111,437.581834,437.581834,0.0
25%,883.636414,896.657335,876.668718,886.663018,4850874.0
50%,1356.632568,1368.714637,1343.546005,1357.621208,6592367.0
75%,1520.702026,1537.228352,1505.18931,1520.064301,8849072.0
max,1942.221191,1948.777171,1920.756459,1938.093361,90432110.0


In [4]:
# Check the Null Values
df.isnull().sum()
df.columns = df.columns.get_level_values(0)

In [5]:
df = df.reset_index()

#df.columns = df.columns.get_level_values(-1)  # flatten fully (if needed)
df.columns.name = None                        # remove index name
df = df.reset_index(drop=True)                # keep Date already flattened

In [6]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [7]:
df.head()

Unnamed: 0,date,close,high,low,open,volume
0,2019-01-01,553.448486,554.488725,544.377637,550.036524,2943390
1,2019-01-02,556.777161,560.896515,550.951826,554.238989,7416655
2,2019-01-03,556.860474,563.393151,551.825679,559.232197,6827249
3,2019-01-04,550.119629,560.813309,541.756123,559.024079,7889310
4,2019-01-07,558.982544,560.563676,550.494188,553.406856,8046340


In [8]:
df['close_price'] = df['close'] # Working column for closing price

In [9]:
# Target Variable (y): 21-day (approx. 1 month) Future Return Percentage
df['future_close'] = df['close_price'].shift(-21)
df['target_return'] = ((df['future_close'] - df['close_price']) / df['close_price']) * 100

In [10]:
df.dtypes

Unnamed: 0,0
date,datetime64[ns]
close,float64
high,float64
low,float64
open,float64
volume,int64
close_price,float64
future_close,float64
target_return,float64


In [11]:
# Ensure the DataFrame is an independent copy for stability
df = df.copy()

print(f"Total data points after target calculation: {len(df)}")
# print(df.tail(3))

Total data points after target calculation: 1719


In [14]:
# FEATURE ENGINEERING (Technical Indicators and Lagged Features) ---

# Technical Indicators (Non-Collinear Selection)
df['rsi'] = RSIIndicator(close=df['close_price'], window=14).rsi()
macd = MACD(close=df['close_price'])
df['macd'] = macd.macd()

# Correct calculation for Bollinger Bands Percentage (BBP)
bb = BollingerBands(close=df['close_price'], window=20, window_dev=2)
df['bbp'] = (df['close_price'] - bb.bollinger_lband()) / (bb.bollinger_hband() - bb.bollinger_lband())

# Correct the column name for Volume
df['vwap_ratio'] = df['volume'] / df['volume'].rolling(window=20).mean()

In [15]:
# Lagged Features (Previous day data)
lag_days = 1
base_features = ['open', 'high', 'low', 'close_price', 'volume']
for feature in base_features:
    df[f'{feature}_Lag{lag_days}'] = df[feature].shift(lag_days)

# Final drop of NaNs created by indicators/lags
df.dropna(inplace=True)
print(f"Total data points after feature creation: {len(df)}")

Total data points after feature creation: 1673


In [18]:
df.head(5)

Unnamed: 0,date,close,high,low,open,volume,close_price,future_close,target_return,rsi,macd,bbp,vwap_ratio,open_Lag1,high_Lag1,low_Lag1,close_price_Lag1,volume_Lag1
25,2019-02-05,631.632324,635.899858,625.063736,625.063736,4694366,631.632324,596.069824,-5.630253,76.878815,16.36504,0.798439,0.505636,633.431408,638.033618,626.904648,632.510986,3945391.0
26,2019-02-06,638.703003,641.757217,631.213943,632.218071,5880482,638.703003,595.149475,-6.819058,79.373826,16.843701,0.866324,0.678132,625.063736,635.899858,625.063736,631.632324,4694366.0
27,2019-02-07,639.288879,643.849262,636.778583,638.493942,3961797,639.288879,591.551331,-7.467289,79.570546,17.073506,0.865924,0.472145,632.218071,641.757217,631.213943,638.703003,5880482.0
28,2019-02-08,636.694763,646.192029,631.674171,635.941654,5915169,636.694763,592.806396,-6.893157,76.109275,16.852044,0.836334,0.732735,638.493942,643.849262,636.778583,639.288879,3961797.0
29,2019-02-11,638.284668,643.263433,635.10497,641.715437,5500216,638.284668,592.722839,-7.138167,76.776059,16.613318,0.859196,0.745076,635.941654,646.192029,631.674171,636.694763,5915169.0


In [19]:
# Extract datetime features
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['day_of_week'] = df['date'].dt.dayofweek
df['day_of_year'] = df['date'].dt.dayofyear
df['week_of_year'] = df['date'].dt.isocalendar().week.astype(int)
df['quarter'] = df['date'].dt.quarter

# --- 3. DATA SPLITTING (Time-Series Split 60/20/20) ---

# 1. Split off the Test set (e.g., last 20% of data)
# Result: df_full_train (80%), df_test (20%)
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    shuffle=False  # <--- CRITICAL: Keeps data in date order
)

# 2. Split the Full Train into Train and Val
# We need Val to be 20% of the ORIGINAL total.
# Since df_full_train is 80% of total, taking 0.25 (1/4) of it gives us 20%.
# Result: df_train (60%), df_val (20%)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    shuffle=False  # <--- CRITICAL
)

print(f"Train: {len(df_train)} (60%)")
print(f"Val:   {len(df_val)}   (20%)")
print(f"Test:  {len(df_test)}  (20%)")

Train: 1003 (60%)
Val:   335   (20%)
Test:  335  (20%)


In [20]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)


EXCLUDE_COLUMNS = ['future_close', 'target_return', 'close', 'close_price', 'date'] # Exclude target components, redundant columns, and the 'date' column

# Re-define FEATURES to include the new datetime features and exclude the original 'date' column
FEATURES = [col for col in df.columns if col not in EXCLUDE_COLUMNS and df[col].dtype != 'datetime64[ns]']
TARGET = 'target_return'

X_train, y_train = df_train[FEATURES], df_train[TARGET]
X_val, y_val = df_val[FEATURES], df_val[TARGET]
X_test, y_test = df_test[FEATURES], df_test[TARGET]

print("Updated FEATURES list:")
print(FEATURES)

Updated FEATURES list:
['high', 'low', 'open', 'volume', 'rsi', 'macd', 'bbp', 'vwap_ratio', 'open_Lag1', 'high_Lag1', 'low_Lag1', 'close_price_Lag1', 'volume_Lag1', 'year', 'month', 'day', 'day_of_week', 'day_of_year', 'week_of_year', 'quarter']
