In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

import sys
from pathlib import Path
PROJECT_ROOT = Path.cwd().parent
sys.path.append(str(PROJECT_ROOT))

DATA_DIR    = PROJECT_ROOT / "data"
RESULTS_DIR = PROJECT_ROOT / "results"
MODELS_DIR  = PROJECT_ROOT / "models"
PLOTS_DIR   = PROJECT_ROOT / "plots"

for d in [RESULTS_DIR, MODELS_DIR, PLOTS_DIR]:
    d.mkdir(exist_ok=True)


In [2]:
features_df = pd.read_csv(
    DATA_DIR / "features" / "nifty_features_5min.csv",
    parse_dates=["timestamp"]
)

baseline_trades = pd.read_csv(
    RESULTS_DIR / "baseline_trades.csv",
    parse_dates=["entry_time", "exit_time"]
)

regime_df = pd.read_csv(
    DATA_DIR / "nifty_with_regimes.csv",
    parse_dates=["timestamp"]
)


In [3]:
baseline_trades["target"] = (baseline_trades["pnl"] > 0).astype(int)


In [4]:
ml_df = pd.merge_asof(
    baseline_trades.sort_values("entry_time"),
    regime_df.sort_values("timestamp"),
    left_on="entry_time",
    right_on="timestamp",
    direction="backward"
)



In [5]:
ml_df["hour"] = ml_df["entry_time"].dt.hour
ml_df["day_of_week"] = ml_df["entry_time"].dt.dayofweek


In [6]:
for lag in [1, 2, 3]:
    ml_df[f"spot_return_lag_{lag}"] = ml_df["spot_returns"].shift(lag)


In [7]:
ml_df["signal_strength"] = abs(ml_df["ema_5"] - ml_df["ema_15"])

In [8]:
feature_cols = [
    # technical
    "ema_5", "ema_15", "signal_strength",

    # volatility & derivatives
    "avg_iv", "iv_spread",
    "pcr_oi", "pcr_volume",
    "ce_delta", "ce_gamma", "ce_vega",
    "futures_basis",

    # returns + lags
    "spot_returns",
    "spot_return_lag_1",
    "spot_return_lag_2",
    "spot_return_lag_3",

    # regime
    "regime",

    # time
    "hour",
    "day_of_week"
]

ml_df = ml_df.dropna(subset=feature_cols + ["target"])

X = ml_df[feature_cols]
y = ml_df["target"]

print(X.shape)


(610, 18)


#### XGBoost

In [9]:
tscv = TimeSeriesSplit(n_splits=5)
cv_scores = []

for train_idx, val_idx in tscv.split(X):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric="logloss"
    )

    model.fit(X_tr, y_tr)
    preds = model.predict(X_val)

    cv_scores.append(accuracy_score(y_val, preds))

print("XGBoost Time-Series CV scores:", cv_scores)
print("Mean CV accuracy:", np.mean(cv_scores))


XGBoost Time-Series CV scores: [0.5247524752475248, 0.7524752475247525, 0.7227722772277227, 0.6237623762376238, 0.6534653465346535]
Mean CV accuracy: 0.6554455445544555


In [10]:
split_idx = int(len(X) * 0.7)

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

xgb_final = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="logloss"
)

xgb_final.fit(X_train, y_train)

xgb_probs = xgb_final.predict_proba(X_test)[:, 1]
xgb_preds = (xgb_probs > 0.5).astype(int)

print("XGBoost Test Accuracy:", accuracy_score(y_test, xgb_preds))



XGBoost Test Accuracy: 0.6284153005464481


#### LSTM

In [12]:
# LSTM SEQUENCE PREPARATION

SEQ_LEN = 10

lstm_features = [
    "ema_5", "ema_15",
    "avg_iv", "iv_spread",
    "pcr_oi", "pcr_volume",
    "ce_delta", "ce_gamma", "ce_vega",
    "futures_basis",
    "spot_returns",
    "regime"
]

lstm_df = regime_df[lstm_features + ["spot_returns"]].dropna()

scaler = StandardScaler()
scaled_features = scaler.fit_transform(lstm_df[lstm_features])

X_seq, y_seq = [], []

returns = lstm_df["spot_returns"].values

for i in range(SEQ_LEN, len(scaled_features)):
    X_seq.append(scaled_features[i-SEQ_LEN:i])
    y_seq.append(returns[i] > 0)

X_seq = np.array(X_seq)
y_seq = np.array(y_seq).astype(int)

print(X_seq.shape)


(17327, 10, 13)


In [13]:
split_seq = int(len(X_seq) * 0.7)

X_train_seq, X_test_seq = X_seq[:split_seq], X_seq[split_seq:]
y_train_seq, y_test_seq = y_seq[:split_seq], y_seq[split_seq:]


In [31]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input

tf.keras.backend.clear_session()  # IMPORTANT

lstm_model = Sequential([
    Input(shape=(SEQ_LEN, X_train_seq.shape[2])),
    LSTM(64),
    Dropout(0.3),
    Dense(1, activation="sigmoid")
])

lstm_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

lstm_model.summary()







In [32]:
# Correct LSTM Train/Test Split

split_seq = int(len(X_seq) * 0.7)

X_train_seq = X_seq[:split_seq]
X_test_seq  = X_seq[split_seq:]

y_train_seq = y_seq[:split_seq]
y_test_seq  = y_seq[split_seq:]

print("X_train_seq:", X_train_seq.shape)
print("X_test_seq :", X_test_seq.shape)
print("y_train_seq:", y_train_seq.shape)
print("y_test_seq :", y_test_seq.shape)


X_train_seq: (12128, 10, 13)
X_test_seq : (5199, 10, 13)
y_train_seq: (12128, 2)
y_test_seq : (5199, 2)


In [33]:
# Convert one-hot to binary scalar if needed
if y_train_seq.ndim == 2:
    y_train_seq = y_train_seq[:, 1]

if y_test_seq.ndim == 2:
    y_test_seq = y_test_seq[:, 1]

# Ensure correct shape
y_train_seq = y_train_seq.astype(int).reshape(-1,)
y_test_seq  = y_test_seq.astype(int).reshape(-1,)

print("Fixed y_train_seq shape:", y_train_seq.shape)
print("Fixed y_test_seq shape :", y_test_seq.shape)


Fixed y_train_seq shape: (12128,)
Fixed y_test_seq shape : (5199,)


In [34]:
lstm_model.fit(
    X_train_seq,
    y_train_seq,
    validation_split=0.2,
    epochs=15,
    batch_size=64,
    verbose=1
)

Epoch 1/15
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 54ms/step - accuracy: 0.5038 - loss: 0.6958 - val_accuracy: 0.4798 - val_loss: 0.6953
Epoch 2/15
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.5046 - loss: 0.6943 - val_accuracy: 0.4930 - val_loss: 0.6955
Epoch 3/15
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.5180 - loss: 0.6927 - val_accuracy: 0.4901 - val_loss: 0.6945
Epoch 4/15
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 30ms/step - accuracy: 0.5124 - loss: 0.6933 - val_accuracy: 0.5103 - val_loss: 0.6939
Epoch 5/15
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.5134 - loss: 0.6926 - val_accuracy: 0.4909 - val_loss: 0.6944
Epoch 6/15
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.5081 - loss: 0.6923 - val_accuracy: 0.4732 - val_loss: 0.6961
Epoch 7/15
[1m152/15

<keras.src.callbacks.history.History at 0x1c68518d4b0>

In [35]:
lstm_probs = lstm_model.predict(X_test_seq).reshape(-1)
lstm_preds = (lstm_probs > 0.5).astype(int)

assert len(y_test_seq) == len(lstm_preds)

print("LSTM Test Accuracy:", accuracy_score(y_test_seq, lstm_preds))

[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step
LSTM Test Accuracy: 0.4972110021157915


#### ML-Filtered backtest

In [36]:
ml_test = ml_df.iloc[split_idx:].copy()

ml_test["xgb_prob"] = xgb_probs
ml_test["xgb_pred"] = xgb_preds

ml_test["lstm_prob"] = lstm_probs[:len(ml_test)]
ml_test["lstm_pred"] = (ml_test["lstm_prob"] > 0.5).astype(int)


In [37]:
baseline_pnl = baseline_trades["pnl"].sum()
xgb_pnl = ml_test[ml_test["xgb_pred"] == 1]["pnl"].sum()
lstm_pnl = ml_test[ml_test["lstm_pred"] == 1]["pnl"].sum()

comparison = pd.DataFrame({
    "Strategy": ["Baseline", "XGBoost Filtered", "LSTM Filtered"],
    "Total PnL": [baseline_pnl, xgb_pnl, lstm_pnl]
})

comparison


Unnamed: 0,Strategy,Total PnL
0,Baseline,6731.228837
1,XGBoost Filtered,1025.349636
2,LSTM Filtered,636.82258


In [38]:
comparison.to_csv("../results/ml_strategy_comparison.csv", index=False)
ml_test.to_csv("../results/ml_trades_with_predictions.csv", index=False)


#### Saving Models

In [39]:
xgb_final.save_model(MODELS_DIR/"xgboost_model.json")

lstm_model.save(MODELS_DIR/"lstm_model.h5")

