In [35]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, roc_auc_score

!pip install xgboost

from xgboost import XGBClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input




In [36]:
# Input CSV, addressed relative to the notebook's working directory.
file_path = "../data/nifty_with_regime.csv"

# Parse the "datetime" column as timestamps and promote it to the index in a
# single chained expression.
df = pd.read_csv(file_path, parse_dates=["datetime"]).set_index("datetime")

df.head()


Unnamed: 0_level_0,open,high,low,close,volume,ema_5,ema_15,ema_gap,returns,volatility,candle_range,candle_body,hour,minute,day_of_week,returns_lag_1,returns_lag_3,ema_gap_lag_1,regime
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2015-01-09 11:00:00,8285.75,8288.85,8282.3,8288.0,0,8285.789434,8286.208532,-0.419098,0.000278,0.000489,6.55,2.25,11,0,4,0.000127,6e-05,-1.268458,0
2015-01-09 11:05:00,8287.95,8288.1,8280.2,8281.55,0,8284.376289,8285.626216,-1.249927,-0.000778,0.000485,7.9,6.4,11,5,4,0.000278,0.000338,-0.419098,0
2015-01-09 11:10:00,8281.85,8286.35,8279.9,8282.35,0,8283.70086,8285.216689,-1.515829,9.7e-05,0.000465,6.45,0.5,11,10,4,-0.000778,0.000127,-1.249927,0
2015-01-09 11:15:00,8282.55,8283.2,8276.15,8278.45,0,8281.950573,8284.370853,-2.42028,-0.000471,0.000457,7.05,4.1,11,15,4,9.7e-05,0.000278,-1.515829,-1
2015-01-09 11:20:00,8278.55,8280.7,8266.55,8266.55,0,8276.817049,8282.143246,-5.326197,-0.001437,0.000549,14.15,12.0,11,20,4,-0.000471,-0.000778,-2.42028,-1


In [37]:
# shift(-3) moves the "returns" value found three rows later onto the current
# row; the last three rows therefore have no future value (NaN).
lookahead_return = df["returns"].shift(-3)
df["future_return"] = lookahead_return

# Binary label: 1 when that later return is strictly positive, otherwise 0.
# A comparison against NaN is False, so the NaN tail rows are labelled 0.
df["target"] = lookahead_return.gt(0).astype(int)


In [38]:
# Columns fed to the tabular model, assembled group by group.
features = (
    ["ema_5", "ema_15", "ema_gap"]
    + ["volatility"]
    + ["returns_lag_1", "returns_lag_3"]
    + ["hour", "minute"]
    + ["regime"]
)

# Remove every row that has a missing value in any column. This mutates `df`
# in place, so the cells below see the reduced frame.
df.dropna(axis=0, how="any", inplace=True)

# Feature matrix and label vector taken from the cleaned frame.
X = df.loc[:, features]
y = df.loc[:, "target"]


MODEL A — XGBOOST

In [44]:
# Standardise each feature column. The scaler statistics are learned from all
# rows of X, and the same rows are then transformed.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [46]:
# Walk-forward cross-validation: TimeSeriesSplit yields five folds in which
# the training indices always precede the test indices.
tscv = TimeSeriesSplit(n_splits=5)

xgb_model = XGBClassifier(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

# One record of out-of-sample metrics per fold.
fold_scores = []

for fold, (train_idx, test_idx) in enumerate(tscv.split(X_scaled), start=1):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # fit() refits the estimator each time it is called, so once the loop ends
    # xgb_model is the model trained on the last fold's training window.
    xgb_model.fit(X_train, y_train)

    # Score the fold on rows the model has not seen during this fit.
    test_prob = xgb_model.predict_proba(X_test)[:, 1]
    fold_scores.append({
        "fold": fold,
        "n_train": len(train_idx),
        "n_test": len(test_idx),
        "accuracy": accuracy_score(y_test, xgb_model.predict(X_test)),
        # ROC AUC is undefined when the test window contains a single class.
        "roc_auc": roc_auc_score(y_test, test_prob) if y_test.nunique() > 1 else np.nan,
    })

# Per-fold summary table, displayed as the cell output.
xgb_cv_scores = pd.DataFrame(fold_scores).set_index("fold")
xgb_cv_scores


In [48]:
# Class probabilities for every row; column 1 is stored as "xgb_prob".
# X_scaled also contains the rows xgb_model was fitted on, so these scores are
# not purely out-of-sample.
xgb_proba = xgb_model.predict_proba(X_scaled)
df["xgb_prob"] = xgb_proba[:, 1]


MODEL B — LSTM

In [51]:
# Number of consecutive rows in each LSTM input window.
sequence_length = 10

# Columns that make up one timestep of the LSTM input.
lstm_features = ["close", "returns", "ema_gap", "volatility", "regime"]

# Min-max scale the selected columns. The scaler is fitted on all rows of the
# frame and then applied to those same rows.
scaler_lstm = MinMaxScaler()
scaler_lstm.fit(df[lstm_features])
scaled_data = scaler_lstm.transform(df[lstm_features])


In [53]:
# Every row position `end` that has `sequence_length` rows before it.
window_ends = range(sequence_length, len(scaled_data))
targets = df["target"]

# Each sample is the block of rows [end - sequence_length, end), i.e. the rows
# strictly before `end`, paired with the label stored on row `end`.
X_lstm = np.array([scaled_data[end - sequence_length:end] for end in window_ends])
y_lstm = np.array([targets.iloc[end] for end in window_ends])


In [54]:
# Chronological hold-out: the first 70% of the sequences are used for
# training and the remainder for testing (no shuffling at this stage).
split = int(0.7 * X_lstm.shape[0])

X_train, y_train = X_lstm[:split], y_lstm[:split]
X_test, y_test = X_lstm[split:], y_lstm[split:]


In [55]:
# Window length and feature count come from the training tensor, whose axes
# are (samples, timesteps, features).
_, n_timesteps, n_features = X_train.shape

# Single-layer LSTM binary classifier. The input shape is declared with an
# explicit Input layer rather than `input_shape=` on the LSTM, which Keras
# warns about for Sequential models.
lstm_model = Sequential([
    Input(shape=(n_timesteps, n_features)),
    LSTM(64),
    Dropout(0.3),
    Dense(1, activation="sigmoid")
])

# Sigmoid output paired with binary cross-entropy; accuracy is tracked as a
# reporting metric only.
lstm_model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)


  super().__init__(**kwargs)


In [56]:
# Train for 10 epochs in mini-batches of 64. The held-out split is passed as
# validation data, so its loss and accuracy are reported after every epoch.
# The returned History object is the cell's displayed value.
lstm_model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    verbose=1,
    validation_data=(X_test, y_test),
)


Epoch 1/10
[1m2093/2093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 14ms/step - accuracy: 0.5047 - loss: 0.6940 - val_accuracy: 0.5060 - val_loss: 0.6931
Epoch 2/10
[1m2093/2093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 11ms/step - accuracy: 0.5070 - loss: 0.6932 - val_accuracy: 0.5058 - val_loss: 0.6938
Epoch 3/10
[1m2093/2093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 9ms/step - accuracy: 0.5023 - loss: 0.6932 - val_accuracy: 0.5052 - val_loss: 0.6931
Epoch 4/10
[1m2093/2093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - accuracy: 0.5077 - loss: 0.6931 - val_accuracy: 0.5059 - val_loss: 0.6931
Epoch 5/10
[1m2093/2093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 8ms/step - accuracy: 0.5078 - loss: 0.6931 - val_accuracy: 0.5058 - val_loss: 0.6931
Epoch 6/10
[1m2093/2093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 9ms/step - accuracy: 0.5097 - loss: 0.6930 - val_accuracy: 0.5066 - val_loss: 0.6930
Epoch 7/

<keras.src.callbacks.history.History at 0x1e0261c9700>

In [57]:
# One probability per sequence in X_lstm, flattened from (n, 1) to (n,).
lstm_probs = lstm_model.predict(X_lstm).ravel()

# Align predictions: the leading rows of `df` have no complete look-back
# window and therefore no prediction. The number of rows to trim is derived
# from the lengths so that re-running this cell (when `df` is already trimmed)
# is a no-op instead of a length-mismatch error.
n_unscored = len(df) - len(lstm_probs)
assert n_unscored in (0, sequence_length), (
    f"cannot align {len(lstm_probs)} predictions with {len(df)} rows"
)

# .copy() gives an independent frame, so adding the column does not write
# into a slice of the previous frame (avoids SettingWithCopyWarning).
df = df.iloc[n_unscored:].copy()
df["lstm_prob"] = lstm_probs


[1m5980/5980[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 3ms/step


In [61]:
output_path = "../results/ml_predictions.csv"

# to_csv does not create missing directories; make sure the target folder
# exists so the export also works on a fresh checkout.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)

print(f"ML predictions saved to {output_path}")


ML predictions saved to ../results/ml_predictions.csv
