In [2]:
import os
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# Optional: Configure plot style
sns.set(style="whitegrid")


In [3]:
# Load your engineered dataset
df = pd.read_csv("nifty100_feature_engineered_important_features.csv")
df['Date'] = pd.to_datetime(df['Date'])

# Drop rows with missing values (if any)
df.dropna(inplace=True)

# Show shape and preview
print(f"📊 Dataset shape: {df.shape}")
df.head()


📊 Dataset shape: (390903, 27)


Unnamed: 0,Date,Symbol,Close_Price_Shift,Close Price,Average Price,High Price,Low Price,Upper_BB,Open Price,RollingStd_5,...,EMA_10,Price_vs_EMA50,Lower_BB,Stochastic,Price_vs_SMA10,MACD,CCI,Lag_50,Target,Target_pct
0,2006-04-05,ABB.NS_data,101.668297,104.346001,103.587803,104.814684,101.602726,102.50669,101.938671,4.053557,...,98.980221,0.179043,93.144938,95.731341,0.062505,3.777288,250.471936,73.807823,-2.677704,-2.566178
1,2006-04-07,ABB.NS_data,104.362404,101.668297,102.148996,106.453452,98.325239,102.733646,104.224753,3.055947,...,99.468962,0.142123,93.748994,62.077967,0.030781,3.752768,159.826082,74.93364,2.694107,2.649899
2,2006-04-10,ABB.NS_data,102.240204,104.362404,102.959627,104.880245,99.636233,103.783378,101.733839,1.740922,...,100.358679,0.164515,93.46456,83.428586,0.051396,3.905704,161.618862,81.372314,-2.1222,-2.033491
3,2006-04-12,ABB.NS_data,96.111259,102.240204,103.238204,106.453436,101.020973,104.174768,104.91301,1.404097,...,100.700774,0.134569,93.387484,66.610292,0.02515,3.811725,144.648283,82.368675,-6.128944,-5.994652
4,2006-04-13,ABB.NS_data,97.782791,96.111259,97.040983,102.258229,92.75346,104.16978,102.127129,3.376616,...,99.866317,0.063779,93.133712,24.509502,-0.036187,3.205737,-59.106666,83.384705,1.671532,1.739163


In [4]:
# Features and target
FEATURES = [col for col in df.columns if col not in ['Date', 'Symbol', 'Target', 'Target_pct']]
TARGET = 'Target_pct'

# Time-based split
df.sort_values('Date', inplace=True)
train_df = df[df['Date'] < '2024-01-01']
test_df = df[df['Date'] >= '2024-01-01']

# Train/test split
X_train = train_df[FEATURES]
y_train = train_df[TARGET]
X_test = test_df[FEATURES]
y_test = test_df[TARGET]

print(f"✅ Training on {X_train.shape[0]} rows, Testing on {X_test.shape[0]} rows")


✅ Training on 363259 rows, Testing on 27644 rows


In [5]:
# Initialize LightGBM
from lightgbm import LGBMRegressor

lgb_model = LGBMRegressor(n_estimators=100, random_state=42)

print("⏳ Training LightGBM model...")
lgb_model.fit(X_train, y_train)
print("✅ LightGBM model trained.")

# Save model
joblib.dump(lgb_model, 'lgb_model.joblib')


⏳ Training LightGBM model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015088 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5865
[LightGBM] [Info] Number of data points in the train set: 363259, number of used features: 23
[LightGBM] [Info] Start training from score 0.092647
✅ LightGBM model trained.


['lgb_model.joblib']

In [7]:
pip install catboost

Note: you may need to restart the kernel to use updated packages.Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-win_amd64.whl.metadata (1.5 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting plotly (from catboost)
  Downloading plotly-6.0.1-py3-none-any.whl.metadata (6.7 kB)
Collecting narwhals>=1.15.1 (from plotly->catboost)
  Downloading narwhals-1.38.2-py3-none-any.whl.metadata (9.4 kB)
Downloading catboost-1.2.8-cp312-cp312-win_amd64.whl (102.4 MB)
   ---------------------------------------- 0.0/102.4 MB ? eta -:--:--
   - -------------------------------------- 4.7/102.4 MB 25.9 MB/s eta 0:00:04
   --- ------------------------------------ 10.0/102.4 MB 24.8 MB/s eta 0:00:04
   ---- ----------------------------------- 12.6/102.4 MB 20.7 MB/s eta 0:00:05
   ------ --------------------------------- 16.8/102.4 MB 20.3 MB/s eta 0:00:05
   -------- ------------------------------- 20.7/102.4 MB 20.4 MB/s eta 0:0


[notice] A new release of pip is available: 25.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
from catboost import CatBoostRegressor

cat_model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    verbose=100,
    random_seed=42
)

cat_model.fit(X_train, y_train)
cat_preds = cat_model.predict(X_test)

# Save the CatBoost model
joblib.dump(cat_model, "saved_models/catboost_model.joblib")
print("💾 CatBoost model saved successfully.")


ModuleNotFoundError: No module named 'catboost'