In [73]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [74]:
# Read the processed feature table; column 0 of the CSV becomes the index
# and pandas is asked to parse that index as dates.
dataset = pd.read_csv(
    "../data_processed/dataset.csv",
    index_col=0,
    parse_dates=True,
)
dataset.head()

Unnamed: 0_level_0,price1,price2,beta,spread,spread_mean,spread_std,z_score,spread_change,spread_vol_20,ret1,ret2,adf_p_252,hurst_252,adf_stationary,hurst_mr,regime_score,y
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2020-12-31,349.008118,349.541931,0.993045,2.961022,2.314153,0.33835,1.911831,0.194185,0.062264,0.005081,0.005599,0.825639,0.199873,0,1,1,1
2021-01-04,344.256836,344.727783,0.993045,2.865113,2.316165,0.340115,1.614003,-0.095909,0.064983,-0.013614,-0.013773,0.853609,0.198403,0,1,1,0
2021-01-05,346.627777,347.009155,0.993045,2.792032,2.317689,0.341387,1.38946,-0.07308,0.064259,0.006887,0.006618,0.870462,0.196245,0,1,1,0
2021-01-06,348.700104,349.104218,0.993045,2.82918,2.319301,0.342841,1.487217,0.037148,0.065335,0.005979,0.006037,0.856627,0.193426,0,1,1,0
2021-01-07,353.88089,354.272034,0.993045,2.85224,2.321235,0.34447,1.541515,0.02306,0.067005,0.014857,0.014803,0.861715,0.190564,0,1,1,1


In [75]:
# Split the loaded table into model inputs (X) and the label column (y).
target = "y"
feature_cols = [
    "z_score", "spread",                           # spread level and its z-score
    "adf_p_252", "hurst_252",                      # columns named for ADF p-value / Hurst (252 suffix)
    "adf_stationary", "hurst_mr", "regime_score",  # flag / score columns
]

# Independent copies so later edits to X or y never write back into `dataset`.
X = dataset.loc[:, feature_cols].copy()
y = dataset.loc[:, target].copy()

X.shape, y.shape, feature_cols

((1256, 7),
 (1256,),
 ['z_score',
  'spread',
  'adf_p_252',
  'hurst_252',
  'adf_stationary',
  'hurst_mr',
  'regime_score'])

In [76]:
# Hold-out split by row order (no shuffling): the leading 70% of rows train
# the model and the remaining rows are kept for evaluation.
train_frac = 0.7
split_idx = int(len(dataset) * train_frac)

head_rows = slice(None, split_idx)   # rows [0, split_idx)
tail_rows = slice(split_idx, None)   # rows [split_idx, end)

X_train, y_train = X.iloc[head_rows], y.iloc[head_rows]
X_test, y_test = X.iloc[tail_rows], y.iloc[tail_rows]

# Show the index range covered by each partition.
X_train.index.min(), X_train.index.max(), X_test.index.min(), X_test.index.max()


(Timestamp('2020-12-31 00:00:00'),
 Timestamp('2024-07-01 00:00:00'),
 Timestamp('2024-07-02 00:00:00'),
 Timestamp('2025-12-31 00:00:00'))

In [77]:
# Standardise the features, then fit a logistic-regression classifier.
# The scaler lives inside the pipeline, so it is fitted on X_train only.
model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(C=0.1, max_iter=2000)),
])
model.fit(X_train, y_train)

# Held-out predictions: hard labels plus the probability in column 1 of
# predict_proba (the second entry of the fitted classes).
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))

# Only report ROC-AUC when y_test contains exactly two distinct labels.
n_test_labels = len(set(y_test))
if n_test_labels == 2:
    print("ROC-AUC:", roc_auc_score(y_test, y_proba))

Confusion matrix:
 [[  8 181]
 [ 13 175]]

Classification report:
               precision    recall  f1-score   support

           0       0.38      0.04      0.08       189
           1       0.49      0.93      0.64       188

    accuracy                           0.49       377
   macro avg       0.44      0.49      0.36       377
weighted avg       0.44      0.49      0.36       377

ROC-AUC: 0.5034335247101205


In [78]:
# Export predictions (not the model itself): score every row of X -- this
# includes the rows the model was trained on -- and write the true label,
# the column-1 probability, and the predicted label to CSV.
pred = pd.DataFrame(index=X.index).assign(
    y=y,
    proba=model.predict_proba(X)[:, 1],
    y_hat=model.predict(X),
)

pred.to_csv("../data_processed/predictions.csv")
print("Saved predictions to data_processed/predictions.csv")
pred.tail()

Saved predictions to data_processed/predictions.csv


Unnamed: 0_level_0,y,proba,y_hat
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-12-24,0,0.564137,1
2025-12-26,0,0.565484,1
2025-12-29,0,0.564542,1
2025-12-30,0,0.568612,1
2025-12-31,0,0.566365,1
