In [None]:
import sys
sys.path.insert(0, '../')

from src.models import train_logistic_regression, scale_features
from src.utils import print_model_evaluation
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [None]:
# Load the labelled feature table for the chosen ticker pair
ticker1, ticker2 = 'KO', 'PEP'

# The first CSV column becomes the index and pandas attempts to parse it as dates
features_csv = f'../data_processed/features_with_labels_{ticker1}_{ticker2}.csv'
data = pd.read_csv(features_csv, index_col=0, parse_dates=True)

print(f"Data shape: {data.shape}")
print(data.head())

In [None]:
# Prepare features and labels
feature_cols = ['spread', 'zscore', 'rolling_mean', 'rolling_std', 'rolling_min', 'rolling_max']

# Drop incomplete rows from the features AND the label in a single pass, then
# slice X and y out of the same frame. This keeps the two aligned row-for-row
# (no index re-lookup, which would duplicate rows if a date appeared twice) and
# guarantees that no NaN label reaches the classifier.
model_data = data[feature_cols + ['label']].dropna()
X = model_data[feature_cols]
y = model_data['label']

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
# This is the class balance of the whole cleaned dataset; no split has happened yet.
print("\nLabel distribution (full dataset, before split):")
print(y.value_counts())

In [None]:
# Train-test split (chronological hold-out)
# The rows are date-indexed and the features are rolling-window statistics, so
# neighbouring rows share most of their information. A shuffled split would put
# test rows between near-identical training rows and inflate the test score.
# shuffle=False keeps the first 80% of rows for training and the last 20% for
# testing, which is only a "past vs. future" split if the rows are date-ordered.
assert X.index.is_monotonic_increasing, "rows must be sorted by date for a chronological split"
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

In [None]:
# Scale both splits with the project helper (train split passed first, test second)
# NOTE(review): scale_features is defined in src.models and not visible here —
# presumably it fits the scaler on the training split only; confirm.
X_train_scaled, X_test_scaled = scale_features(X_train, X_test)

print(
    f"X_train_scaled shape: {X_train_scaled.shape}",
    f"X_test_scaled shape: {X_test_scaled.shape}",
    sep="\n",
)

In [None]:
# Fit the model on the scaled training split via the project helper
model = train_logistic_regression(X_train_scaled, y_train, max_iter=1000)

# Predictions on the held-out split: labels first, then predict_proba output
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)

# model.score() on each split, computed up front so the report below is plain printing
train_score = model.score(X_train_scaled, y_train)
test_score = model.score(X_test_scaled, y_test)

print("Model Training Complete!")
print(f"\nTraining score: {train_score:.4f}")
print(f"Test score: {test_score:.4f}")

In [None]:
# Print evaluation metrics for the held-out test split.
# Arguments are positional: true labels, predicted labels, then the predict_proba output.
# NOTE(review): print_model_evaluation is imported from src.utils and is not visible
# here — confirm which metrics it reports and what probability shape it expects.
print_model_evaluation(y_test, y_pred, y_pred_proba)