In [60]:
!pip install ta

import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from ta.momentum import RSIIndicator
from ta.volatility import BollingerBands
from ta.trend import MACD
from ta.volume import VolumeWeightedAveragePrice

# Define the ticker and date range
TICKER = 'INFY.NS'
START_DATE = '2019-01-01' # ~5 years of data
END_DATE = pd.to_datetime('today').strftime('%Y-%m-%d')

# Download data
df = yf.download(TICKER, start=START_DATE, end=END_DATE, auto_adjust=True)
df = df.dropna() # Drop any rows with missing data (e.g., trading holidays)



[*********************100%***********************]  1 of 1 completed


In [61]:
df = pd.DataFrame(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1714 entries, 2019-01-01 to 2025-12-05
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   (Close, INFY.NS)   1714 non-null   float64
 1   (High, INFY.NS)    1714 non-null   float64
 2   (Low, INFY.NS)     1714 non-null   float64
 3   (Open, INFY.NS)    1714 non-null   float64
 4   (Volume, INFY.NS)  1714 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 80.3 KB


In [62]:
df.describe()

Price,Close,High,Low,Open,Volume
Ticker,INFY.NS,INFY.NS,INFY.NS,INFY.NS,INFY.NS
count,1714.0,1714.0,1714.0,1714.0,1714.0
mean,1239.770445,1251.827222,1227.625815,1239.802057,7772874.0
std,390.737197,393.77042,387.589973,390.946478,5512769.0
min,452.361237,479.471174,437.581891,437.581891,0.0
25%,880.642487,894.371058,874.730846,884.551004,4850810.0
50%,1355.06604,1367.437792,1343.132417,1356.850862,6593246.0
75%,1519.653961,1536.463491,1504.456896,1519.395615,8857209.0
max,1942.221191,1948.777171,1920.756459,1938.093361,90432110.0


In [63]:
df.dtypes

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Price,Ticker,Unnamed: 2_level_1
Close,INFY.NS,float64
High,INFY.NS,float64
Low,INFY.NS,float64
Open,INFY.NS,float64
Volume,INFY.NS,int64


In [64]:
# Check the Null Values
df.isnull().sum()
df.columns = df.columns.get_level_values(0)

In [65]:
df = df.reset_index()

#df.columns = df.columns.get_level_values(-1)  # flatten fully (if needed)
df.columns.name = None                        # remove index name
df = df.reset_index(drop=True)                # keep Date already flattened



In [66]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [67]:
df.head()

Unnamed: 0,date,close,high,low,open,volume
0,2019-01-01,553.448486,554.488725,544.377637,550.036524,2943390
1,2019-01-02,556.777161,560.896515,550.951826,554.238989,7416655
2,2019-01-03,556.860413,563.393089,551.825618,559.232136,6827249
3,2019-01-04,550.11969,560.813372,541.756183,559.024141,7889310
4,2019-01-07,558.982605,560.563737,550.494248,553.406916,8046340


In [68]:
df['close_price'] = df['close'] # Working column for closing price

In [69]:
# Target Variable (y): 21-day (approx. 1 month) Future Return Percentage
df['future_close'] = df['close_price'].shift(-21)
df['target_return'] = ((df['future_close'] - df['close_price']) / df['close_price']) * 100

In [70]:
df.dtypes

Unnamed: 0,0
date,datetime64[ns]
close,float64
high,float64
low,float64
open,float64
volume,int64
close_price,float64
future_close,float64
target_return,float64


In [71]:
# Ensure the DataFrame is an independent copy for stability
df = df.copy()

print(f"Total data points after target calculation: {len(df)}")
# print(df.tail(3))

Total data points after target calculation: 1714


In [72]:
# --- 2. FEATURE ENGINEERING (Technical Indicators and Lagged Features) ---

print("\n--- 2. Feature Engineering ---")
# 2A. Technical Indicators (Non-Collinear Selection)
df['rsi'] = RSIIndicator(close=df['close_price'], window=14).rsi()
macd = MACD(close=df['close_price'])
df['macd'] = macd.macd()

# Correct calculation for Bollinger Bands Percentage (BBP)
bb = BollingerBands(close=df['close_price'], window=20, window_dev=2)
df['bbp'] = (df['close_price'] - bb.bollinger_lband()) / (bb.bollinger_hband() - bb.bollinger_lband())

# Correct the column name for Volume
df['vwap_ratio'] = df['volume'] / df['volume'].rolling(window=20).mean()


--- 2. Feature Engineering ---


In [73]:
df

Unnamed: 0,date,close,high,low,open,volume,close_price,future_close,target_return,rsi,macd,bbp,vwap_ratio
0,2019-01-01,553.448486,554.488725,544.377637,550.036524,2943390,553.448486,607.408020,9.749694,,,,
1,2019-01-02,556.777161,560.896515,550.951826,554.238989,7416655,556.777161,627.197571,12.647863,,,,
2,2019-01-03,556.860413,563.393089,551.825618,559.232136,6827249,556.860413,633.473267,13.758000,,,,
3,2019-01-04,550.119690,560.813372,541.756183,559.024141,7889310,550.119690,632.510986,14.976976,,,,
4,2019-01-07,558.982605,560.563737,550.494248,553.406916,8046340,558.982605,631.632324,12.996776,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1709,2025-12-01,1564.000000,1574.000000,1556.000000,1562.500000,5341800,1564.000000,,,61.360886,20.711765,0.804365,0.567659
1710,2025-12-02,1561.000000,1579.900024,1556.099976,1567.000000,6335969,1561.000000,,,60.408329,20.646627,0.759441,0.681840
1711,2025-12-03,1578.699951,1586.000000,1554.099976,1555.099976,12895312,1578.699951,,,63.962881,21.772265,0.869326,1.297677
1712,2025-12-04,1597.599976,1599.000000,1568.000000,1568.000000,12609619,1597.599976,,,67.335252,23.913753,0.985226,1.239827


In [74]:
# 2B. Lagged Features (Previous day data)
lag_days = 1
base_features = ['open', 'high', 'low', 'close_price', 'volume']
for feature in base_features:
    df[f'{feature}_Lag{lag_days}'] = df[feature].shift(lag_days)

# Final drop of NaNs created by indicators/lags
df.dropna(inplace=True)
print(f"Total data points after feature creation: {len(df)}")

Total data points after feature creation: 1668


In [75]:
df

Unnamed: 0,date,close,high,low,open,volume,close_price,future_close,target_return,rsi,macd,bbp,vwap_ratio,open_Lag1,high_Lag1,low_Lag1,close_price_Lag1,volume_Lag1
25,2019-02-05,631.632324,635.899858,625.063736,625.063736,4694366,631.632324,596.069824,-5.630253,76.878719,16.365033,0.798439,0.505636,633.431408,638.033618,626.904648,632.510986,3945391.0
26,2019-02-06,638.703125,641.757340,631.214063,632.218192,5880482,638.703125,595.149414,-6.819085,79.373770,16.843705,0.866326,0.678132,625.063736,635.899858,625.063736,631.632324,4694366.0
27,2019-02-07,639.288818,643.849200,636.778522,638.493881,3961797,639.288818,591.551453,-7.467261,79.570428,17.073504,0.865923,0.472145,632.218192,641.757340,631.214063,638.703125,5880482.0
28,2019-02-08,636.694763,646.192029,631.674171,635.941654,5915169,636.694763,592.806458,-6.893147,76.109248,16.852043,0.836334,0.732735,638.493881,643.849200,636.778522,639.288818,3961797.0
29,2019-02-11,638.284668,643.263433,635.104970,641.715437,5500216,638.284668,592.722717,-7.138187,76.776032,16.613318,0.859195,0.745076,635.941654,646.192029,631.674171,636.694763,5915169.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1688,2025-10-31,1482.300049,1497.099976,1480.699951,1490.000000,4819814,1482.300049,1564.000000,5.511701,52.071321,8.715278,0.604116,0.667491,1507.000000,1508.199951,1489.099976,1493.800049,5608867.0
1689,2025-11-03,1485.500000,1491.400024,1474.199951,1482.300049,5470600,1485.500000,1561.000000,5.082464,52.861341,8.241201,0.613356,0.765071,1490.000000,1497.099976,1480.699951,1482.300049,4819814.0
1690,2025-11-04,1467.900024,1481.900024,1462.900024,1479.699951,8691330,1467.900024,1578.699951,7.548193,48.159398,6.371870,0.436669,1.181934,1482.300049,1491.400024,1474.199951,1485.500000,5470600.0
1691,2025-11-05,1467.900024,1467.900024,1467.900024,1467.900024,0,1467.900024,1597.599976,8.835748,48.159398,4.834680,0.416692,0.000000,1479.699951,1481.900024,1462.900024,1467.900024,8691330.0


In [76]:
# Extract datetime features
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['day_of_week'] = df['date'].dt.dayofweek
df['day_of_year'] = df['date'].dt.dayofyear
df['week_of_year'] = df['date'].dt.isocalendar().week.astype(int)
df['quarter'] = df['date'].dt.quarter

# --- 3. DATA SPLITTING (Time-Series Split 60/20/20) ---

# 1. Split off the Test set (e.g., last 20% of data)
# Result: df_full_train (80%), df_test (20%)
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    shuffle=False  # <--- CRITICAL: Keeps data in date order
)

# 2. Split the Full Train into Train and Val
# We need Val to be 20% of the ORIGINAL total.
# Since df_full_train is 80% of total, taking 0.25 (1/4) of it gives us 20%.
# Result: df_train (60%), df_val (20%)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    shuffle=False  # <--- CRITICAL
)

print(f"Train: {len(df_train)} (60%)")
print(f"Val:   {len(df_val)}   (20%)")
print(f"Test:  {len(df_test)}  (20%)")

Train: 1000 (60%)
Val:   334   (20%)
Test:  334  (20%)


In [77]:
df_train

Unnamed: 0,date,close,high,low,open,volume,close_price,future_close,target_return,rsi,...,low_Lag1,close_price_Lag1,volume_Lag1,year,month,day,day_of_week,day_of_year,week_of_year,quarter
25,2019-02-05,631.632324,635.899858,625.063736,625.063736,4694366,631.632324,596.069824,-5.630253,76.878719,...,626.904648,632.510986,3945391.0,2019,2,5,1,36,6,1
26,2019-02-06,638.703125,641.757340,631.214063,632.218192,5880482,638.703125,595.149414,-6.819085,79.373770,...,625.063736,631.632324,4694366.0,2019,2,6,2,37,6,1
27,2019-02-07,639.288818,643.849200,636.778522,638.493881,3961797,639.288818,591.551453,-7.467261,79.570428,...,631.214063,638.703125,5880482.0,2019,2,7,3,38,6,1
28,2019-02-08,636.694763,646.192029,631.674171,635.941654,5915169,636.694763,592.806458,-6.893147,76.109248,...,636.778522,639.288818,3961797.0,2019,2,8,4,39,6,1
29,2019-02-11,638.284668,643.263433,635.104970,641.715437,5500216,638.284668,592.722717,-7.138187,76.776032,...,631.674171,636.694763,5915169.0,2019,2,11,0,42,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,2023-02-14,1461.213135,1463.735235,1447.364534,1453.371651,5043736,1461.213135,1287.738525,-11.871958,57.162467,...,1436.221480,1437.551270,4001199.0,2023,2,14,1,45,7,1
1021,2023-02-15,1464.698364,1466.486718,1445.392788,1452.729807,3465747,1464.698364,1302.963013,-11.042229,57.803278,...,1447.364534,1461.213135,5043736.0,2023,2,15,2,46,7,1
1022,2023-02-16,1469.054810,1480.656382,1465.432089,1467.403935,4290289,1469.054810,1287.325684,-12.370480,58.636215,...,1445.392788,1464.698364,3465747.0,2023,2,16,3,47,7,1
1023,2023-02-17,1452.546387,1462.818169,1444.934185,1461.901041,2857734,1452.546387,1275.448975,-12.192204,54.264918,...,1465.432089,1469.054810,4290289.0,2023,2,17,4,48,7,1


In [78]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)


EXCLUDE_COLUMNS = ['future_close', 'target_return', 'close', 'close_price', 'date'] # Exclude target components, redundant columns, and the 'date' column

# Re-define FEATURES to include the new datetime features and exclude the original 'date' column
FEATURES = [col for col in df.columns if col not in EXCLUDE_COLUMNS and df[col].dtype != 'datetime64[ns]']
TARGET = 'target_return'

X_train, y_train = df_train[FEATURES], df_train[TARGET]
X_val, y_val = df_val[FEATURES], df_val[TARGET]
X_test, y_test = df_test[FEATURES], df_test[TARGET]

print("Updated FEATURES list:")
print(FEATURES)

Updated FEATURES list:
['high', 'low', 'open', 'volume', 'rsi', 'macd', 'bbp', 'vwap_ratio', 'open_Lag1', 'high_Lag1', 'low_Lag1', 'close_price_Lag1', 'volume_Lag1', 'year', 'month', 'day', 'day_of_week', 'day_of_year', 'week_of_year', 'quarter']


In [79]:
X_train.dtypes

Unnamed: 0,0
high,float64
low,float64
open,float64
volume,int64
rsi,float64
macd,float64
bbp,float64
vwap_ratio,float64
open_Lag1,float64
high_Lag1,float64


In [80]:
# Feature Correlation Matrix (checking for collinearity)
feature_corr = X_train[FEATURES].corr().abs()
# We look for features highly correlated with each other (e.g., > 0.9)
high_corr_features = set()
for i in range(len(feature_corr.columns)):
    for j in range(i):
        if feature_corr.iloc[i, j] > 0.8:
            colname = feature_corr.columns[i]
            high_corr_features.add(colname)

# High correlation is expected between certain lagged price features (e.g., Close_Lag1 and Open_Lag1).
# For tree-based models, high collinearity is less of a concern than in linear models, but we note it.
print("\n--- Highly Correlated Features (Collinearity > 0.9) ---")
print(high_corr_features)

# Target Variable Correlation
target_corr = df_train[FEATURES + [TARGET]].corr()[TARGET].abs().sort_values(ascending=False)
print("\n--- Feature Correlation with Target_Return ---")
print(target_corr.head(20))


--- Highly Correlated Features (Collinearity > 0.9) ---
{'low', 'open_Lag1', 'vwap_ratio', 'day_of_year', 'high_Lag1', 'year', 'week_of_year', 'low_Lag1', 'open', 'quarter', 'close_price_Lag1', 'bbp', 'macd'}

--- Feature Correlation with Target_Return ---
target_return       1.000000
low_Lag1            0.191807
open_Lag1           0.191729
low                 0.191253
open                0.191228
close_price_Lag1    0.191052
high_Lag1           0.189781
high                0.189476
month               0.184818
day_of_year         0.183687
week_of_year        0.181396
quarter             0.171617
macd                0.151846
volume              0.134448
volume_Lag1         0.134159
year                0.131858
rsi                 0.082173
vwap_ratio          0.031757
bbp                 0.022798
day_of_week         0.012546
Name: target_return, dtype: float64


In [81]:
from sklearn.feature_selection import mutual_info_regression

# Exclude 'date' column for mutual information calculation
FEATURES_MI = [col for col in FEATURES if col != 'date']

# Calculate mutual information
mi_scores = mutual_info_regression(X_train[FEATURES_MI], y_train, random_state=11)
mi_series = pd.Series(mi_scores, name="MI Scores", index=X_train[FEATURES_MI].columns).sort_values(ascending=False)

print("\n--- Feature Importance (Mutual Information) ---")
print(mi_series.head(100))


--- Feature Importance (Mutual Information) ---
high                0.487045
low                 0.464689
open                0.448559
high_Lag1           0.435840
close_price_Lag1    0.434044
low_Lag1            0.425452
open_Lag1           0.391181
day_of_year         0.384069
week_of_year        0.355937
macd                0.228312
year                0.171934
month               0.168747
rsi                 0.132672
quarter             0.075297
volume_Lag1         0.063339
bbp                 0.059928
vwap_ratio          0.045886
day                 0.019139
volume              0.005164
day_of_week         0.000000
Name: MI Scores, dtype: float64


In [82]:
# Function to evaluate model
def evaluate_model(model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
    return rmse_train, rmse_val

# Tuning Max Depth
depths = range(3, 15)
dt_scores = []
best_dt_rmse_val = float('inf')
best_dt_depth = 0

for depth in depths:
    model = DecisionTreeRegressor(max_depth=depth, random_state=11)
    rmse_train, rmse_val = evaluate_model(model, X_train, y_train, X_val, y_val)
    dt_scores.append({'Depth': depth, 'Train RMSE': rmse_train, 'Validation RMSE': rmse_val})
    if rmse_val < best_dt_rmse_val:
        best_dt_rmse_val = rmse_val
        best_dt_depth = depth

dt_results = pd.DataFrame(dt_scores)
print("\n--- Decision Tree Max Depth Tuning Results ---")
print(dt_results)
print(f"\nBest Decision Tree Depth: {best_dt_depth} with Val RMSE: {best_dt_rmse_val:.4f}")

# Final DT Model
dt_model = DecisionTreeRegressor(max_depth=best_dt_depth, random_state=11)
dt_model.fit(X_train, y_train)


--- Decision Tree Max Depth Tuning Results ---
    Depth  Train RMSE  Validation RMSE
0       3    7.159215         7.194247
1       4    6.512045         7.011177
2       5    5.622076         8.035023
3       6    4.391336         8.929396
4       7    3.590768         8.534551
5       8    2.844240         8.772904
6       9    2.102481         9.251001
7      10    1.491408         9.270269
8      11    0.981726         9.176458
9      12    0.628176         9.584476
10     13    0.398190         9.492522
11     14    0.254890         9.458201

Best Decision Tree Depth: 4 with Val RMSE: 7.0112


In [83]:
X_train.head(100)

Unnamed: 0,high,low,open,volume,rsi,macd,bbp,vwap_ratio,open_Lag1,high_Lag1,low_Lag1,close_price_Lag1,volume_Lag1,year,month,day,day_of_week,day_of_year,week_of_year,quarter
0,635.899858,625.063736,625.063736,4694366,76.878719,16.365033,0.798439,0.505636,633.431408,638.033618,626.904648,632.510986,3945391.0,2019,2,5,1,36,6,1
1,641.757340,631.214063,632.218192,5880482,79.373770,16.843705,0.866326,0.678132,625.063736,635.899858,625.063736,631.632324,4694366.0,2019,2,6,2,37,6,1
2,643.849200,636.778522,638.493881,3961797,79.570428,17.073504,0.865923,0.472145,632.218192,641.757340,631.214063,638.703125,5880482.0,2019,2,7,3,38,6,1
3,646.192029,631.674171,635.941654,5915169,76.109248,16.852043,0.836334,0.732735,638.493881,643.849200,636.778522,639.288818,3961797.0,2019,2,8,4,39,6,1
4,643.263433,635.104970,641.715437,5500216,76.776032,16.613318,0.859195,0.745076,635.941654,646.192029,631.674171,636.694763,5915169.0,2019,2,11,0,42,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,625.831920,619.340219,623.710456,3224968,47.467392,3.228053,0.249694,0.504232,624.134784,627.529127,618.194684,621.164734,5919494.0,2019,7,1,0,182,27,3
96,629.353678,619.552471,622.861976,4837606,53.969675,3.089179,0.489398,0.773962,623.710456,625.831920,619.340219,620.570679,3224968.0,2019,7,2,1,183,27,3
97,630.668869,619.467528,628.801970,5958379,47.621812,2.356229,0.214488,0.999447,622.861976,629.353678,619.552471,627.953491,4837606.0,2019,7,3,2,184,27,3
98,624.898737,618.279725,618.619180,3274064,49.461515,1.924381,0.260096,0.569682,628.801970,630.668869,619.467528,620.570679,5958379.0,2019,7,4,3,185,27,3


In [84]:
# --- 5. Random Forest Regressor Tuning ---
print("\n--- Random Forest Regressor Tuning (Max Depth) ---")

rf_scores = []
best_rf_rmse_val = float('inf')
best_rf_depth = 0

# Using a fixed n_estimators for initial depth tuning
for depth in depths: # Using the same depth range as Decision Tree for consistency
    model = RandomForestRegressor(max_depth=depth, n_estimators=100, random_state=11, n_jobs=-1)
    rmse_train, rmse_val = evaluate_model(model, X_train, y_train, X_val, y_val)
    rf_scores.append({'Depth': depth, 'Train RMSE': rmse_train, 'Validation RMSE': rmse_val})
    if rmse_val < best_rf_rmse_val:
        best_rf_rmse_val = rmse_val
        best_rf_depth = depth

rf_results = pd.DataFrame(rf_scores)
print(rf_results)
print(f"\nBest Random Forest Depth: {best_rf_depth} with Val RMSE: {best_rf_rmse_val:.4f}")

# Final RF Model
rf_model = RandomForestRegressor(max_depth=best_rf_depth, n_estimators=100, random_state=11, n_jobs=-1)
rf_model.fit(X_train, y_train)


--- Random Forest Regressor Tuning (Max Depth) ---
    Depth  Train RMSE  Validation RMSE
0       3    6.609777         7.318516
1       4    5.546154         7.478836
2       5    4.491006         7.658712
3       6    3.474049         7.953780
4       7    2.707671         8.096205
5       8    2.164967         8.156131
6       9    1.769614         8.249241
7      10    1.514460         8.275657
8      11    1.377308         8.305004
9      12    1.275715         8.292226
10     13    1.226834         8.322660
11     14    1.199044         8.277023

Best Random Forest Depth: 3 with Val RMSE: 7.3185


In [85]:
# XGBoost Regressor
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    max_depth=best_dt_depth,
    learning_rate=0.1,
    random_state=11,
    n_jobs=-1
)
xgb_rmse_train, xgb_rmse_val = evaluate_model(xgb_model, X_train, y_train, X_val, y_val)

# Task
Tune the `max_depth` and `n_estimators` hyperparameters for the XGBoost Regressor model. After tuning, train the final XGBoost model using the best hyperparameters found and summarize the parameter tuning results, including the best hyperparameters and the performance metrics (RMSE) on the training and validation sets.

## Tune XGBoost Max Depth

### Subtask:
Tune the `max_depth` parameter for the XGBoost Regressor model. I'll iterate through a range of depths, evaluate the model's performance using the validation set, and identify the optimal `max_depth` that minimizes the validation RMSE.


**Reasoning**:
The user wants to tune the `max_depth` parameter for the XGBoost Regressor model. I will define a range of depths, iterate through them, train an XGBoost model using the existing `evaluate_model` function, evaluate its performance on the validation set, and record the results to find the optimal depth.



In [86]:
print("\n--- XGBoost Regressor Tuning (Max Depth) ---")

# Define the range of max_depth values to test
depths = range(3, 15)
xgb_depth_scores = []
best_xgb_rmse_val = float('inf')
best_xgb_depth = 0

for depth in depths:
    # Initialize XGBoost Regressor with current max_depth
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=100, # Fixed for depth tuning, will be tuned later
        max_depth=depth,
        learning_rate=0.1,
        random_state=11,
        n_jobs=-1
    )

    # Evaluate the model using the pre-defined evaluate_model function
    rmse_train, rmse_val = evaluate_model(model, X_train, y_train, X_val, y_val)

    # Store the results
    xgb_depth_scores.append({'Depth': depth, 'Train RMSE': rmse_train, 'Validation RMSE': rmse_val})

    # Check for best validation RMSE
    if rmse_val < best_xgb_rmse_val:
        best_xgb_rmse_val = rmse_val
        best_xgb_depth = depth

# Convert results to DataFrame
xgb_depth_results = pd.DataFrame(xgb_depth_scores)
print(xgb_depth_results)
print(f"\nBest XGBoost Depth: {best_xgb_depth} with Val RMSE: {best_xgb_rmse_val:.4f}")

# Final XGBoost Model with best depth
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    max_depth=best_xgb_depth,
    learning_rate=0.1,
    random_state=11,
    n_jobs=-1
)
xgb_model.fit(X_train, y_train)


--- XGBoost Regressor Tuning (Max Depth) ---
    Depth  Train RMSE  Validation RMSE
0       3    3.330377         8.176971
1       4    2.105447         8.459257
2       5    1.357826         8.496655
3       6    0.776527         8.669136
4       7    0.446585         8.310200
5       8    0.214362         8.025857
6       9    0.088504         8.051101
7      10    0.042865         8.182465
8      11    0.029477         8.184342
9      12    0.015725         8.421354
10     13    0.012395         8.457646
11     14    0.011330         8.416860

Best XGBoost Depth: 8 with Val RMSE: 8.0259


**Reasoning**:
Now that the optimal `max_depth` for XGBoost has been determined, the next step is to tune the `n_estimators` (number of boosting rounds) hyperparameter. I will iterate through a range of `n_estimators` values, keeping the `max_depth` fixed at the previously found best value, and evaluate the model's performance on the validation set to find the optimal `n_estimators`.



In [87]:
print("--- XGBoost Regressor Tuning (N_Estimators) ---")

# Define the range of n_estimators values to test
n_estimators_range = range(50, 301, 50)
xgb_n_estimators_scores = []
best_xgb_rmse_val_n_est = float('inf')
best_xgb_n_estimators = 0

for n_est in n_estimators_range:
    # Initialize XGBoost Regressor with best_xgb_depth and current n_estimators
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=n_est,
        max_depth=best_xgb_depth, # Use the best depth found previously
        learning_rate=0.1,
        random_state=11,
        n_jobs=-1
    )

    # Evaluate the model using the pre-defined evaluate_model function
    rmse_train, rmse_val = evaluate_model(model, X_train, y_train, X_val, y_val)

    # Store the results
    xgb_n_estimators_scores.append({'N_Estimators': n_est, 'Train RMSE': rmse_train, 'Validation RMSE': rmse_val})

    # Check for best validation RMSE
    if rmse_val < best_xgb_rmse_val_n_est:
        best_xgb_rmse_val_n_est = rmse_val
        best_xgb_n_estimators = n_est

# Convert results to DataFrame
xgb_n_estimators_results = pd.DataFrame(xgb_n_estimators_scores)
print(xgb_n_estimators_results)
print(f"\nBest XGBoost N_Estimators: {best_xgb_n_estimators} with Val RMSE: {best_xgb_rmse_val_n_est:.4f}")

# Final XGBoost Model with best depth and best n_estimators
xgb_final_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=best_xgb_n_estimators,
    max_depth=best_xgb_depth,
    learning_rate=0.1,
    random_state=11,
    n_jobs=-1
)
xgb_final_model.fit(X_train, y_train)


--- XGBoost Regressor Tuning (N_Estimators) ---
   N_Estimators  Train RMSE  Validation RMSE
0            50    0.591309         7.989898
1           100    0.214362         8.025857
2           150    0.079750         8.030962
3           200    0.034276         8.032778
4           250    0.017723         8.032911
5           300    0.008341         8.033316

Best XGBoost N_Estimators: 50 with Val RMSE: 7.9899


**Reasoning**:
The previous code block failed due to a `SyntaxError` caused by an unterminated string literal in the `print` statement. I will correct the `print` statement to properly close the string.



In [88]:
print("\n--- XGBoost Regressor Tuning (N_Estimators) ---")

# Define the range of n_estimators values to test
n_estimators_range = range(50, 301, 50)
xgb_n_estimators_scores = []
best_xgb_rmse_val_n_est = float('inf')
best_xgb_n_estimators = 0

for n_est in n_estimators_range:
    # Initialize XGBoost Regressor with best_xgb_depth and current n_estimators
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=n_est,
        max_depth=best_xgb_depth, # Use the best depth found previously
        learning_rate=0.1,
        random_state=11,
        n_jobs=-1
    )

    # Evaluate the model using the pre-defined evaluate_model function
    rmse_train, rmse_val = evaluate_model(model, X_train, y_train, X_val, y_val)

    # Store the results
    xgb_n_estimators_scores.append({'N_Estimators': n_est, 'Train RMSE': rmse_train, 'Validation RMSE': rmse_val})

    # Check for best validation RMSE
    if rmse_val < best_xgb_rmse_val_n_est:
        best_xgb_rmse_val_n_est = rmse_val
        best_xgb_n_estimators = n_est

# Convert results to DataFrame
xgb_n_estimators_results = pd.DataFrame(xgb_n_estimators_scores)
print(xgb_n_estimators_results)
print(f"\nBest XGBoost N_Estimators: {best_xgb_n_estimators} with Val RMSE: {best_xgb_rmse_val_n_est:.4f}")

# Final XGBoost Model with best depth and best n_estimators
xgb_final_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=best_xgb_n_estimators,
    max_depth=best_xgb_depth,
    learning_rate=0.1,
    random_state=11,
    n_jobs=-1
)
xgb_final_model.fit(X_train, y_train)



--- XGBoost Regressor Tuning (N_Estimators) ---
   N_Estimators  Train RMSE  Validation RMSE
0            50    0.591309         7.989898
1           100    0.214362         8.025857
2           150    0.079750         8.030962
3           200    0.034276         8.032778
4           250    0.017723         8.032911
5           300    0.008341         8.033316

Best XGBoost N_Estimators: 50 with Val RMSE: 7.9899


**Reasoning**:
The subtask is to summarize the parameter tuning results, including the best hyperparameters and the performance metrics (RMSE) on the training and validation sets. I will print the tuning results for both `max_depth` and `n_estimators`, state the best hyperparameters found, and then calculate and print the RMSE for the final XGBoost model on the training and validation sets.



In [89]:
print("\n--- XGBoost Tuning Summary ---")
print(f"Best XGBoost Depth: {best_xgb_depth} with Val RMSE: {best_xgb_rmse_val:.4f}")
print(f"Best XGBoost N_Estimators: {best_xgb_n_estimators} with Val RMSE: {best_xgb_rmse_val_n_est:.4f}")

y_train_pred_xgb = xgb_final_model.predict(X_train)
y_val_pred_xgb = xgb_final_model.predict(X_val)

rmse_train_xgb = np.sqrt(mean_squared_error(y_train, y_train_pred_xgb))
rmse_val_xgb = np.sqrt(mean_squared_error(y_val, y_val_pred_xgb))

print(f"\nFinal XGBoost Model (Depth={best_xgb_depth}, N_Estimators={best_xgb_n_estimators}):")
print(f"  Train RMSE: {rmse_train_xgb:.4f}")
print(f"  Validation RMSE: {rmse_val_xgb:.4f}")



--- XGBoost Tuning Summary ---
Best XGBoost Depth: 8 with Val RMSE: 8.0259
Best XGBoost N_Estimators: 50 with Val RMSE: 7.9899

Final XGBoost Model (Depth=8, N_Estimators=50):
  Train RMSE: 0.5913
  Validation RMSE: 7.9899


## Summary:

### Q&A
The best hyperparameters found for the XGBoost Regressor model are `max_depth = 8` and `n_estimators = 50`.
The performance metrics for the final model with these hyperparameters are:
*   Train RMSE: 0.5913
*   Validation RMSE: 7.9899

### Data Analysis Key Findings
*   Tuning of `max_depth` revealed that a value of 8 resulted in the lowest validation RMSE of 8.0259, with a training RMSE of 0.2144.
*   After setting `max_depth` to 8, tuning `n_estimators` showed that a value of 50 yielded the lowest validation RMSE of 7.9899, with a training RMSE of 0.5913.
*   The final XGBoost model, configured with `max_depth = 8` and `n_estimators = 50`, achieved a training RMSE of 0.5913 and a validation RMSE of 7.9899.

### Insights or Next Steps
*   The significant difference between the training RMSE (0.5913) and validation RMSE (7.9899) for the final model suggests potential overfitting. Further investigation into regularization techniques or cross-validation could be beneficial.
*   While `n_estimators = 50` resulted in the lowest validation RMSE, it's worth exploring values slightly above and below 50 in a finer grid search to ensure the absolute optimal `n_estimators` is identified, as the current search used steps of 50.


# Task
The tuning of `max_depth` and `n_estimators` for the XGBoost Regressor model has been completed.

The best hyperparameters found for the XGBoost Regressor model are:
*   `max_depth`: 8
*   `n_estimators`: 50

The performance metrics for the final XGBoost model with these hyperparameters are:
*   Train RMSE: 0.5913
*   Validation RMSE: 7.9899

The tuning process revealed that a `max_depth` of 8 resulted in the lowest validation RMSE during its tuning phase (8.0259). Subsequently, with `max_depth` fixed at 8, an `n_estimators` value of 50 yielded the lowest validation RMSE (7.9899) during its tuning.

A noticeable difference between the training RMSE (0.5913) and validation RMSE (7.9899) suggests potential overfitting, which could be addressed in future steps by implementing additional regularization techniques or a more extensive cross-validation approach. Also, a finer grid search around the optimal `n_estimators` (e.g., between 1 and 100 with smaller steps) might help pinpoint an even more precise optimal value.

This concludes the task of tuning the `max_depth` and `n_estimators` hyperparameters for the XGBoost Regressor model and summarizing the results.

## Final XGBoost Model Training

### Subtask:
Train the final XGBoost Regressor model using the optimal max_depth (previously found) and the best n_estimators identified during this tuning process.


### Final XGBoost Model Training

The final XGBoost Regressor model has been trained in the previous tuning step using `best_xgb_depth = 8` and `best_xgb_n_estimators = 50`. The model is available as `xgb_final_model`.

Now, we will proceed to summarize the parameter tuning results for the XGBoost model.

## Final Task

### Subtask:
Summarize the parameter tuning results for `n_estimators` of the XGBoost model, including the best hyperparameter found and the performance metrics (RMSE) on the training and validation sets for the final model.


## Summary:

### Q&A
*   **What are the best hyperparameters found for the XGBoost Regressor model?**
    The best hyperparameters found are `max_depth`: 8 and `n_estimators`: 50.

*   **What are the performance metrics (RMSE) on the training and validation sets for the final model?**
    The final model achieved a Train RMSE of 0.5913 and a Validation RMSE of 7.9899.

### Data Analysis Key Findings
*   The optimal `n_estimators` for the XGBoost model, when `max_depth` was fixed at 8, was determined to be 50.
*   The final XGBoost model, configured with `max_depth=8` and `n_estimators=50`, exhibited a Training RMSE of 0.5913 and a Validation RMSE of 7.9899.
*   A significant difference was observed between the training RMSE (0.5913) and validation RMSE (7.9899), suggesting a potential issue of overfitting.

### Insights or Next Steps
*   To mitigate the observed overfitting, consider implementing additional regularization techniques or employing a more extensive cross-validation approach in future model development.
*   Perform a finer-grained grid search for `n_estimators` within a narrower range (e.g., between 1 and 100 with smaller step sizes) to potentially identify an even more precise optimal value.
