In [6]:
#import scikitlearn and paths
import pathlib, pandas as pd, numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, f1_score, roc_auc_score
import joblib

BASE = pathlib.Path.cwd().parents[0]
DATA_PROCESSED = BASE / "data" / "processed"
MODELS = BASE / "models"
REPORTS = BASE / "reports"
MODELS.mkdir(exist_ok=True)
REPORTS.mkdir(exist_ok=True)

# Adjust to your symbols
SYMBOLS = ["QQQ", "VFV.TO", "XEQT.TO"]

In [7]:
# feature/target split
def load_dataset(symbol: str) -> pd.DataFrame:
    path = DATA_PROCESSED / f"{symbol}_dataset.parquet"
    df = pd.read_parquet(path)
    # Ensure sorted by date
    df = df.sort_values("date").reset_index(drop=True)
    return df

def make_X_y(df: pd.DataFrame, task: str):
    """
    task: "reg" -> y_reg, "cls" -> y_cls
    Returns X (features), y (target), dates (for plotting/reporting)
    """
    feats = df.drop(columns=["date","y_reg","y_cls"])
    if task == "reg":
        y = df["y_reg"].values
    else:
        y = df["y_cls"].values
    X = feats.values.astype(float)
    dates = pd.to_datetime(df["date"].values)
    return X, y, dates, feats.columns.tolist()

In [8]:
# time series cross validation helper function
def ts_cv_indices(n_samples: int, n_splits: int = 5):
    """
    Wrapper around TimeSeriesSplit to get train/test indices.
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)
    return list(tscv.split(np.arange(n_samples)))

In [9]:
def train_regression_model(symbol: str, algo: str = "ridge", n_splits: int = 5) -> pd.DataFrame:
    """
    Train regression on y_reg with TimeSeriesSplit.
    algo: 'ridge' or 'hgb'
    Saves best (last fold) model to models/{symbol}_{algo}_reg.pkl
    Returns metrics per fold as a DataFrame.
    """
    #tr is training, te is testing
    df = load_dataset(symbol)
    X, y, dates, feat_names = make_X_y(df, task="reg")

    if algo == "ridge":
        model = Pipeline([("scaler", StandardScaler()), ("ridge", Ridge())])
    elif algo == "hgb":
        model = HistGradientBoostingRegressor()
    else:
        raise ValueError("Unknown algo")

    folds = ts_cv_indices(len(X), n_splits=n_splits)
    metrics = []
    yhat_all = np.full_like(y, np.nan, dtype=float)

    for fold_idx, (tr, te) in enumerate(folds, start=1):
        model.fit(X[tr], y[tr])
        yhat = model.predict(X[te])
        yhat_all[te] = yhat

        mae = mean_absolute_error(y[te], yhat)
        mse = mean_squared_error(y[te], yhat)
        rmse = mse**0.5
        r2 = r2_score(y[te], yhat)
        metrics.append({"fold": fold_idx, "MAE": mae, "RMSE": rmse, "R2": r2})

    # Save the model re-fitted on ALL data (common baseline practice)
    model.fit(X, y)
    outp = MODELS / f"{symbol}_{algo}_reg.pkl"
    joblib.dump({"model": model, "features": feat_names}, outp)

    mdf = pd.DataFrame(metrics)
    mdf["symbol"] = symbol
    mdf["algo"] = f"{algo}_reg"
    # also save CV preds for reference
    pd.DataFrame({"date": dates, "y": y, "yhat_cv": yhat_all}).to_csv(REPORTS / f"{symbol}_{algo}_reg_cv.csv", index=False)
    return mdf

def train_classification_model(symbol: str, n_splits: int = 5) -> pd.DataFrame:
    """
    Train classification on y_cls with TimeSeriesSplit using LogisticRegression.
    Saves model to models/{symbol}_logit_cls.pkl
    """
    #tr is training, te is testing
    df = load_dataset(symbol)
    X, y, dates, feat_names = make_X_y(df, task="cls")

    # Some class imbalance protection (not critical here)
    model = Pipeline([
        ("scaler", StandardScaler()),
        ("logit", LogisticRegression(max_iter=200, class_weight="balanced"))
    ])

    folds = ts_cv_indices(len(X), n_splits=n_splits)
    metrics = []
    yhat_all = np.full_like(y, np.nan, dtype=float)  # store probabilities

    for fold_idx, (tr, te) in enumerate(folds, start=1):
        model.fit(X[tr], y[tr])
        proba = model.predict_proba(X[te])[:,1]
        yhat = (proba > 0.5).astype(int)

        acc = accuracy_score(y[te], yhat)
        f1  = f1_score(y[te], yhat)
        try:
            auc = roc_auc_score(y[te], proba)
        except ValueError:
            auc = np.nan

        yhat_all[te] = proba
        metrics.append({"fold": fold_idx, "ACC": acc, "F1": f1, "AUC": auc})

    # Save final model fitted on ALL data
    model.fit(X, y)
    outp = MODELS / f"{symbol}_logit_cls.pkl"
    joblib.dump({"model": model, "features": feat_names}, outp)

    mdf = pd.DataFrame(metrics)
    mdf["symbol"] = symbol
    mdf["algo"] = "logit_cls"
    # save CV probs for reference
    pd.DataFrame({"date": dates, "y": y, "proba_cv": yhat_all}).to_csv(REPORTS / f"{symbol}_logit_cls_cv.csv", index=False)
    return mdf

In [10]:
# trains models for all symbols and puts all results in a metrics_all variable.
results = []

for sym in SYMBOLS:
    # Regression baselines
    m1 = train_regression_model(sym, algo="ridge", n_splits=5)
    m2 = train_regression_model(sym, algo="hgb", n_splits=5)
    # Classification baseline
    m3 = train_classification_model(sym, n_splits=5)
    results.extend([m1, m2, m3])

metrics_all = pd.concat(results, ignore_index=True, sort=False)
metrics_all

ValueError: Input X contains NaN.
Ridge does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# save the metrics to a table
metrics_wide = metrics_all.pivot_table(index=["symbol","algo"],
                                       values=[c for c in metrics_all.columns if c not in ["fold","symbol","algo"]],
                                       aggfunc="mean")
metrics_wide = metrics_wide.round(4)
display(metrics_wide)

metrics_wide.to_csv(REPORTS / "day4_cv_metrics_summary.csv")
print("Saved:", REPORTS / "day4_cv_metrics_summary.csv")

In [None]:
# sanity check of the base models on the QQQ Symbol.
# Example: load and use the saved Ridge regressor for QQQ
bundle = joblib.load(MODELS / "QQQ_ridge_reg.pkl")
mdl = bundle["model"]
feats = bundle["features"]

ds = load_dataset("QQQ")
X, y, dates, feat_names = make_X_y(ds, task="reg")
assert feat_names == feats, "Feature mismatch between training and inference"
yhat_all = mdl.predict(X)  # in-sample; for smoke test only
pd.DataFrame({"date": dates, "y": y, "yhat": yhat_all}).head()
