In [61]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [62]:
# Load features
dataset = pd.read_csv("../data_processed/dataset.csv", index_col=0, parse_dates=True)
dataset.head()

Unnamed: 0_level_0,price1,price2,beta,spread,spread_mean,spread_std,z_score,spread_change,spread_vol_20,ret1,ret2,adf_p_252,hurst_252,adf_stationary,hurst_mr,regime_score,y
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2020-12-31,349.008179,349.54187,0.993045,2.960904,2.314154,0.338353,1.911466,0.194094,0.062249,0.005081,0.005599,0.825598,0.199873,0,1,1,1
2021-01-04,344.256683,344.727783,0.993045,2.865268,2.316166,0.340119,1.61444,-0.095636,0.064976,-0.013614,-0.013773,0.853558,0.198402,0,1,1,0
2021-01-05,346.627716,347.009094,0.993045,2.792036,2.317691,0.341391,1.38945,-0.073232,0.064252,0.006887,0.006618,0.870435,0.196244,0,1,1,0
2021-01-06,348.700104,349.104218,0.993045,2.829184,2.319302,0.342845,1.487206,0.037148,0.065326,0.005979,0.006038,0.856608,0.193426,0,1,1,0
2021-01-07,353.880859,354.272095,0.993045,2.852335,2.321236,0.344475,1.541764,0.023152,0.067004,0.014857,0.014803,0.861694,0.190564,0,1,1,1


In [63]:
# Prepare features and labels
target = "y"

feature_cols = [
                "z_score",
                "spread",
                "adf_p_252",
                "hurst_252",
                "adf_stationary",
                "hurst_mr",
                "regime_score"
]

X = dataset[feature_cols].copy()
y = dataset[target].copy()

X.shape, y.shape, feature_cols

((1256, 7),
 (1256,),
 ['z_score',
  'spread',
  'adf_p_252',
  'hurst_252',
  'adf_stationary',
  'hurst_mr',
  'regime_score'])

In [64]:
# Train-test split
train_frac = 0.7
split_idx = int(len(dataset) * train_frac)

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

X_train.index.min(), X_train.index.max(), X_test.index.min(), X_test.index.max()


(Timestamp('2020-12-31 00:00:00'),
 Timestamp('2024-07-01 00:00:00'),
 Timestamp('2024-07-02 00:00:00'),
 Timestamp('2025-12-31 00:00:00'))

In [65]:
# Train Logistic Regression model with feature scaling
model = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(max_iter=2000, C=0.1))
])

model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))

# ROC-AUC only makes sense if both classes exist in y_test
if len(set(y_test)) == 2:
    print("ROC-AUC:", roc_auc_score(y_test, y_proba))

Confusion matrix:
 [[  8 181]
 [ 13 175]]

Classification report:
               precision    recall  f1-score   support

           0       0.38      0.04      0.08       189
           1       0.49      0.93      0.64       188

    accuracy                           0.49       377
   macro avg       0.44      0.49      0.36       377
weighted avg       0.44      0.49      0.36       377

ROC-AUC: 0.5034335247101205


In [66]:
# Save the trained model
pred = pd.DataFrame(index=X.index)
pred["y"] = y
pred["proba"] = model.predict_proba(X)[:, 1]
pred["y_hat"] = model.predict(X)

pred.to_csv("../data_processed/predictions.csv")
print("Saved predictions to data_processed/predictions.csv")
pred.tail()

Saved predictions to data_processed/predictions.csv


Unnamed: 0_level_0,y,proba,y_hat
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-12-24,0,0.564134,1
2025-12-26,0,0.565482,1
2025-12-29,0,0.56454,1
2025-12-30,0,0.568611,1
2025-12-31,0,0.566363,1
