In [1]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [2]:
# Read the processed feature table from CSV.
# The first CSV column is used as the index and parsed as dates.
dataset = pd.read_csv(
    "../data_processed/dataset.csv",
    parse_dates=True,
    index_col=0,
)
dataset.head()

Unnamed: 0_level_0,price1,price2,beta,spread,spread_mean,spread_std,z_score,spread_change,spread_vol_20,ret1,ret2,adf_p_252,hurst_252,adf_stationary,hurst_mr,regime_score,y
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2020-12-31,47.18148,127.106888,0.297045,113.09185,102.502775,5.571701,1.90051,0.746264,0.898986,0.007347,0.00672,0.216485,0.302993,0,1,1,1
2021-01-04,45.391956,123.652817,0.297045,110.169348,102.545346,5.589174,1.364066,-2.922501,0.906599,-0.037929,-0.027175,0.252551,0.303254,0,1,1,1
2021-01-05,44.892956,124.021393,0.297045,110.68615,102.588229,5.610057,1.443465,0.516802,0.911003,-0.010993,0.002981,0.216342,0.302288,0,1,1,1
2021-01-06,43.464779,122.50428,0.297045,109.593271,102.633436,5.620454,1.238305,-1.092879,0.97046,-0.031813,-0.012233,0.217369,0.30143,0,1,1,1
2021-01-07,42.982983,122.110039,0.297045,109.342145,102.675464,5.630896,1.183947,-0.251126,1.039151,-0.011085,-0.003218,0.215428,0.30062,0,1,1,1


In [3]:
# Pick the label column and the model input columns out of `dataset`.
target = "y"

# Columns of `dataset` fed to the classifier, in the order the model sees them.
feature_cols = [
    "z_score", "spread",
    "adf_p_252", "hurst_252",
    "adf_stationary", "hurst_mr", "regime_score",
]

# Independent copies, so later edits to X / y cannot touch `dataset`.
X = dataset.loc[:, feature_cols].copy()
y = dataset.loc[:, target].copy()

X.shape, y.shape, feature_cols

((1256, 7),
 (1256,),
 ['z_score',
  'spread',
  'adf_p_252',
  'hurst_252',
  'adf_stationary',
  'hurst_mr',
  'regime_score'])

In [4]:
# Train-test split
# Positional, unshuffled split: the first `train_frac` of the rows become the
# training set and the remaining rows are held out as the test set.
train_frac = 0.7

# A positional split is only a time-based split when the rows are in date
# order; fail loudly instead of silently training on rows later than the test set.
assert dataset.index.is_monotonic_increasing, (
    "dataset index must be sorted in ascending date order before splitting"
)

split_idx = int(len(dataset) * train_frac)

# Both sides of the split must contain at least one row.
assert 0 < split_idx < len(dataset), (
    "train_frac leaves an empty train or test set"
)

# NOTE(review): if the label `y` is derived from future values, rows just
# before the boundary may overlap the test window — confirm against the
# labeling code whether a gap between train and test is needed.
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# Show the date range covered by each side of the split.
X_train.index.min(), X_train.index.max(), X_test.index.min(), X_test.index.max()


(Timestamp('2020-12-31 00:00:00'),
 Timestamp('2024-07-01 00:00:00'),
 Timestamp('2024-07-02 00:00:00'),
 Timestamp('2025-12-31 00:00:00'))

In [6]:
# Standardize the features, then fit a logistic regression on the training rows.
# Pipeline.fit returns the pipeline itself, so `model` is the fitted pipeline.
model = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("lr", LogisticRegression(max_iter=2000)),
    ]
).fit(X_train, y_train)

# Score the held-out rows: column 1 of predict_proba, plus the hard labels.
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))

# ROC-AUC is only reported when y_test contains exactly two distinct labels.
if len(set(y_test)) == 2:
    print("ROC-AUC:", roc_auc_score(y_test, y_proba))

Confusion matrix:
 [[141  38]
 [140  58]]

Classification report:
               precision    recall  f1-score   support

           0       0.50      0.79      0.61       179
           1       0.60      0.29      0.39       198

    accuracy                           0.53       377
   macro avg       0.55      0.54      0.50       377
weighted avg       0.56      0.53      0.50       377

ROC-AUC: 0.6137915467524406


In [7]:
# Save the model's predictions (not the model object itself) to CSV:
# for every row of X, store the true label, column 1 of predict_proba,
# and the predicted label.
# NOTE(review): X also contains the rows the model was fitted on, so the
# earlier part of this file is in-sample — confirm that downstream consumers
# only evaluate on the held-out period.
pred = pd.DataFrame(index=X.index).assign(
    y=y,
    proba=model.predict_proba(X)[:, 1],
    y_hat=model.predict(X),
)

pred.to_csv("../data_processed/predictions.csv")
print("Saved predictions to data_processed/predictions.csv")
pred.tail()

Saved predictions to data_processed/predictions.csv


Unnamed: 0_level_0,y,proba,y_hat
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-12-24,0,0.289872,0
2025-12-26,0,0.299539,0
2025-12-29,0,0.29556,0
2025-12-30,0,0.29701,0
2025-12-31,0,0.288983,0
