In [19]:
!pip install pandas numpy scikit-learn lightgbm ta joblib catboost optuna yfinance






[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
import pandas as pd
import numpy as np
import glob
import joblib

from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

import ta



In [21]:
!py --version

Python 3.13.7


In [22]:
path = "C:/Users/saini/OneDrive/Desktop/New_Projects/financial-advisor-ai/data"

files = glob.glob(f"{path}/*_data.csv")
print("Detected ML files:", files)

dfs = []

for file in files:
    df_temp = pd.read_csv(file, parse_dates=["date"])
    ticker = file.split("/")[-1].replace("_data.csv", "")
    print(f"Loaded {ticker} with shape:", df_temp.shape)
    df_temp["ticker"] = ticker
    dfs.append(df_temp)

df = pd.concat(dfs, ignore_index=True)
df = df.sort_values(["ticker", "date"]).reset_index(drop=True)
# print("Combined data shape:", df.shape)
# df.head()


Detected ML files: ['C:/Users/saini/OneDrive/Desktop/New_Projects/financial-advisor-ai/data\\BSESN_data.csv', 'C:/Users/saini/OneDrive/Desktop/New_Projects/financial-advisor-ai/data\\NSEBANK_data.csv', 'C:/Users/saini/OneDrive/Desktop/New_Projects/financial-advisor-ai/data\\NSEI_data.csv']
Loaded data\BSESN with shape: (2629, 19)
Loaded data\NSEBANK with shape: (2354, 19)
Loaded data\NSEI with shape: (2630, 19)


In [23]:
df["ticker_id"] = df["ticker"].astype("category").cat.codes


In [24]:
# ----- Lag Features -----
for lag in [1, 2, 3, 5]:
    df[f"close_lag_{lag}"] = df["close"].shift(lag)
    df[f"volume_lag_{lag}"] = df["volume"].shift(lag)

# ----- Rolling Stats -----
windows = [3, 5, 7, 10, 14]
for w in windows:
    df[f"roll_mean_{w}"] = df["close"].rolling(w).mean()
    df[f"roll_std_{w}"] = df["close"].rolling(w).std()
    df[f"roll_vol_{w}"] = df["volume"].rolling(w).mean()

# ----- Momentum -----
df["roc_5"] = df["close"].pct_change(5)
df["roc_10"] = df["close"].pct_change(10)
df["momentum"] = df["close"] - df["close"].shift(10)

# ----- ADX -----
df["adx_14"] = ta.trend.ADXIndicator(
    high=df["high"], low=df["low"], close=df["close"], window=14
).adx()

# ----- Weighted Moving Average -----
df["wma_20"] = ta.trend.WMAIndicator(df["close"], window=20).wma()

# Final cleanup
df = df.dropna().reset_index(drop=True)

df.head()


Unnamed: 0,date,close,high,low,open,volume,return_1d,return_7d,rsi_14,ema_20,...,roll_std_10,roll_vol_10,roll_mean_14,roll_std_14,roll_vol_14,roc_5,roc_10,momentum,adx_14,wma_20
0,2015-04-16,28666.039062,28876.230469,28497.699219,28876.230469,8400,-0.004641,0.005669,52.057945,28566.496124,...,325.28416,9770.0,28373.22154,516.859447,10735.714286,-0.001453,0.024671,690.179688,0.0,28481.820964
1,2015-04-17,28442.099609,28696.189453,28403.759766,28682.970703,13900,-0.007812,-0.002612,47.543091,28554.648837,...,240.249613,10050.0,28396.812221,511.520761,10485.714286,-0.01534,0.017334,484.609375,0.0,28487.410965
2,2015-04-20,27886.210938,28539.460938,27802.369141,28525.650391,19000,-0.019545,-0.028617,38.595014,28490.988085,...,325.281238,11140.0,28427.428711,461.346594,10935.714286,-0.03439,-0.013232,-373.929688,0.0,28441.460528
3,2015-04-21,27676.039062,27976.929688,27598.210938,27860.509766,20700,-0.007537,-0.041861,35.847895,28413.373892,...,445.122284,12380.0,28442.957171,428.735979,11600.0,-0.047114,-0.029063,-828.421875,0.0,28378.998047
4,2015-04-22,27890.130859,27947.259766,27385.480469,27756.679688,11700,0.007736,-0.034255,40.494279,28363.541222,...,492.014312,12360.0,28436.833705,436.463114,11642.857143,-0.031582,-0.021968,-626.458984,0.0,28340.704455


In [25]:
base_features = [
    "open", "high", "low", "close", "volume",
    "rsi_14", "ema_20", "ema_50",
    "macd", "macd_signal", "macd_hist",
    "bb_high", "bb_mid", "bb_low",
    "return_1d", "return_7d",
    "ticker_id"
]

extra_features = [
    "close_lag_1","close_lag_2","close_lag_3","close_lag_5",
    "volume_lag_1","volume_lag_2","volume_lag_3","volume_lag_5",
    "roll_mean_3","roll_mean_5","roll_mean_7","roll_mean_10","roll_mean_14",
    "roll_std_3","roll_std_5","roll_std_7","roll_std_10","roll_std_14",
    "roll_vol_3","roll_vol_5","roll_vol_7","roll_vol_10","roll_vol_14",
    "roc_5","roc_10","momentum","adx_14","wma_20"
]

feature_cols = base_features + extra_features
target_col = "target_direction"

len(feature_cols)


45

In [26]:
train_size = int(len(df) * 0.8)

train_df = df.iloc[:train_size]
valid_df = df.iloc[train_size:]

X_train = train_df[feature_cols]
y_train = train_df[target_col]

X_valid = valid_df[feature_cols]
y_valid = valid_df[target_col]

X_train.shape, X_valid.shape


((6075, 45), (1519, 45))

In [27]:
import optuna
from catboost import CatBoostClassifier

def objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 300, 800),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "random_seed": 42,
        "loss_function": "Logloss",
        "verbose": False
    }

    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train)

    preds = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, preds)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=25)

print("Best Params:", study.best_params)


[I 2025-11-23 12:25:35,482] A new study created in memory with name: no-name-bafad568-5e9e-4117-b576-a0b985365b04
[I 2025-11-23 12:26:05,682] Trial 0 finished with value: 0.5559652624575411 and parameters: {'iterations': 406, 'depth': 10, 'learning_rate': 0.04739315002072324, 'l2_leaf_reg': 8.31520521458462}. Best is trial 0 with value: 0.5559652624575411.
[I 2025-11-23 12:26:09,162] Trial 1 finished with value: 0.5645375914836992 and parameters: {'iterations': 625, 'depth': 6, 'learning_rate': 0.06107550366466079, 'l2_leaf_reg': 6.36356860757198}. Best is trial 1 with value: 0.5645375914836992.
[I 2025-11-23 12:26:16,864] Trial 2 finished with value: 0.5534002171096404 and parameters: {'iterations': 432, 'depth': 8, 'learning_rate': 0.04269470604604354, 'l2_leaf_reg': 5.006461622743483}. Best is trial 1 with value: 0.5645375914836992.
[I 2025-11-23 12:27:07,970] Trial 3 finished with value: 0.5507826452358441 and parameters: {'iterations': 694, 'depth': 10, 'learning_rate': 0.05123677

Best Params: {'iterations': 358, 'depth': 4, 'learning_rate': 0.017929661695989953, 'l2_leaf_reg': 8.982509829711734}


In [28]:
best_params = study.best_params
best_params["loss_function"] = "Logloss"
best_params["verbose"] = False

from catboost import CatBoostClassifier

model = CatBoostClassifier(**best_params)
model.fit(X_train, y_train)

print("Final model trained with Optuna parameters!")


Final model trained with Optuna parameters!


In [29]:
y_pred = model.predict(X_valid)
y_proba = model.predict_proba(X_valid)[:, 1]

print("Accuracy:", accuracy_score(y_valid, y_pred))
print("ROC-AUC:", roc_auc_score(y_valid, y_proba))
print(classification_report(y_valid, y_pred))


Accuracy: 0.5569453587886768
ROC-AUC: 0.5693140035718037
              precision    recall  f1-score   support

           0       0.51      0.36      0.42       684
           1       0.58      0.72      0.64       835

    accuracy                           0.56      1519
   macro avg       0.54      0.54      0.53      1519
weighted avg       0.55      0.56      0.54      1519



In [30]:
joblib.dump(model, "C:/Users/saini/OneDrive/Desktop/New_Projects/financial-advisor-ai/ml/upgraded_catboost_model.pkl")
print("Model saved as upgraded_catboost_model.pkl")


Model saved as upgraded_catboost_model.pkl


In [31]:
def predict_next_day(input_row):
    data = input_row[feature_cols].values.reshape(1, -1)
    direction = int(model.predict(data)[0])
    confidence = float(model.predict_proba(data)[0][1])
    
    return {
        "direction": direction,
        "confidence": confidence
    }

# Test on last record
predict_next_day(df.iloc[-1])


{'direction': 1, 'confidence': 0.527952961627416}