In [1]:
import pandas as pd
import yfinance as yf
import numpy as np
from hurst import compute_Hc
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from src import (data_loader, processing, feature_engineering, clustering, modeling,strategies)
from src.clustering import core

# --- Data: download -> clean -> feature engineering --------------------------
# Sample window for the raw download, named so it can be tuned in one place.
# NOTE(review): END_DATE may lie beyond the last bar available at run time, in
# which case the downloaded sample could differ between runs -- confirm how
# data_loader.download_raw_data handles this.
START_DATE = '2000-01-01'
END_DATE = '2025-12-31'

df_raw = data_loader.download_raw_data(start_date=START_DATE, end_date=END_DATE)
df_proc = processing.prepare_market_data(df_raw)
df_features = feature_engineering.add_spx_vix_features(df_proc)

# --- Unsupervised regimes: feature matrix -> scaling -> PCA -> KMeans --------
# Tunable settings for this stage, named instead of being buried in the calls.
PCA_VARIANCE_TARGET = 0.9      # passed to core.pca_analysis as variance_target
N_REGIMES = 2                  # number of KMeans clusters
RANDOM_STATE = 42              # seed handed to core.fit_kmeans
REGIME_COL = "KMeans_Regime"   # label name handed to core.assign_cluster_labels

X, idx_features, feat_cols = core.build_feature_matrix(df_features, core.DEFAULT_FEATURE_COLS)
X_scaled, scaler = core.scale_features(X)
X_pca, n_components, pca = core.pca_analysis(X_scaled, variance_target=PCA_VARIANCE_TARGET)
labels, km = core.fit_kmeans(X_pca, n_clusters=N_REGIMES, random_state=RANDOM_STATE)
df_assigned = core.assign_cluster_labels(df_features, idx_features, labels, REGIME_COL)

# --- Supervised regime prediction and strategy backtest ----------------------
df_sup, X_all, y_all = modeling.build_supervised_dataset(df_assigned, feat_cols)

# Split names are defined once so the key used to build `splits` and the key
# used to pick predictions below cannot drift apart.
SPLIT_RECENT = "2010-20 to 2021-24"
SPLIT_EARLY = "2003-15 to 2016-20"

# Each entry maps a split name to four date strings.
# NOTE(review): the order looks like (train_start, train_end, test_start,
# test_end) -- confirm against modeling.run_splits.
splits = {
    SPLIT_RECENT: ("2010-01-01", "2020-12-31", "2021-01-01", "2024-12-31"),
    SPLIT_EARLY: ("2003-01-01", "2015-12-31", "2016-01-01", "2020-12-31"),
}

results_df, preds_df = modeling.run_splits(
    df_sup=df_sup,
    X_all=X_all,
    y_all=y_all,
    splits=splits,
)

# Build the majority-vote prediction for the split that feeds the strategies.
vote_pred = modeling.build_majority_vote(preds_df, SPLIT_RECENT)

# Lag the vote by one row, presumably so each day's position uses the previous
# day's prediction (no look-ahead) -- confirm. dropna() removes the leading NaN
# introduced by the shift, along with any other missing values.
pred_regime_today = vote_pred.shift(1).dropna()

results = strategies.run_all_strategies(pred_regime_today, df_assigned)




  sp500 = yf.download('^GSPC', start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed
  vix = yf.download('^VIX', start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed



=== Running split: 2010-20 to 2021-24 ===
LogisticRegression → acc=0.955, bal_acc=0.959, f1=0.900, auc=0.995
RandomForest → acc=0.944, bal_acc=0.896, f1=0.859, auc=0.988
GradientBoosting → acc=0.944, bal_acc=0.885, f1=0.854, auc=0.991

=== Running split: 2003-15 to 2016-20 ===
LogisticRegression → acc=0.961, bal_acc=0.953, f1=0.900, auc=0.991
RandomForest → acc=0.940, bal_acc=0.930, f1=0.851, auc=0.987
GradientBoosting → acc=0.948, bal_acc=0.925, f1=0.865, auc=0.981


In [2]:
# Print each strategy's performance summary, one entry per strategy.
# A plain loop is used because print() returns None: calling it inside a list
# comprehension and then printing that list also emits a stray
# "[None, None, ...]" line after the real output.
for name, res in results.items():
    print(f"\n{name}: {res['performance']}")



strat1: {'ann_return': np.float64(0.10478865873929245), 'ann_vol': np.float64(0.1218361593662552), 'sharpe': np.float64(0.8600784798565771), 'sortino': np.float64(1.0846666578231272), 'max_drawdown': np.float64(-0.19457720871093254)}

strat2: {'ann_return': np.float64(0.09968915797309827), 'ann_vol': np.float64(0.09821695891765166), 'sharpe': np.float64(1.014989255131397), 'sortino': np.float64(0.9959741509043689), 'max_drawdown': np.float64(-0.07600394981111369)}

strat3A: {'ann_return': np.float64(0.060852897192576194), 'ann_vol': np.float64(0.05494928273138217), 'sharpe': np.float64(1.1074375163376318), 'sortino': np.float64(0.39137038970986615), 'max_drawdown': np.float64(-0.04920666806377516)}

strat3B: {'ann_return': np.float64(0.11368801759928883), 'ann_vol': np.float64(0.08951698477137027), 'sharpe': np.float64(1.2700161638559686), 'sortino': np.float64(1.3118480131486676), 'max_drawdown': np.float64(-0.09894789402897841)}

strat3: {'ann_return': np.float64(0.17454091479186504

In [3]:
from src import reporting

# Collect the artefacts the reporting step consumes, then pass them to
# reporting.run_full_reporting as keyword arguments.
reporting_inputs = {
    "df_with_features": df_features,
    "df_assigned": df_assigned,
    "idx_features": idx_features,
    "pca": pca,
    "results_df": results_df,
    "preds_df": preds_df,
    "strategy_results": results,
}
reporting.run_full_reporting(**reporting_inputs)


[1/4] Summerising PCA results ...

[2/4] Describing regimes and clustering (KMeans)...

[3/4] Summarising ML results & diagnostics...
    → Confusion matrix & ROC for LogisticRegression – 2010-20 to 2021-24
    → Transition matrices (true vs predicted regimes)

[4/4] Evaluating strategies and plotting equity curves...

All reporting complete. Results written to: C:\Users\leogo\PycharmProjects\FinalProject\results


In [4]:
# TODO: Don't forget to switch the raw-data save from CSV to Parquet.