In [1]:
# Imports, project paths (data, models, reports), and the list of fund symbols

import numpy as np, pandas as pd, pathlib, joblib
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score, accuracy_score, f1_score, roc_auc_score

# Project root is taken to be the parent of the current working directory.
# NOTE(review): this presumably expects the notebook to be launched from a
# sub-folder of the project (e.g. notebooks/) - confirm before running elsewhere.
BASE = pathlib.Path.cwd().parents[0]

# Input datasets are read from here; tuned models and CSV reports are written below.
DATA_PROCESSED = BASE.joinpath("data", "processed")
MODELS = BASE.joinpath("models")
REPORTS = BASE.joinpath("reports")

# Create the output folders on first run; exist_ok keeps re-runs from failing.
for _out_dir in (MODELS, REPORTS):
    _out_dir.mkdir(exist_ok=True)

# Fund tickers that every tuning function is run against.
SYMBOLS = ["QQQ", "VFV.TO", "XEQT.TO"]

In [3]:
# Helper functions
def load_dataset(symbol: str) -> pd.DataFrame:
    """Read the processed parquet dataset for ``symbol`` in chronological order.

    Parameters
    ----------
    symbol : str
        Ticker used to build the file name ``<symbol>_dataset.parquet``
        inside ``DATA_PROCESSED``.

    Returns
    -------
    pd.DataFrame
        The dataset sorted by its ``date`` column, with a fresh 0..n-1 index.
    """
    parquet_path = DATA_PROCESSED / f"{symbol}_dataset.parquet"
    frame = pd.read_parquet(parquet_path)
    # Sort before resetting the index so row position follows time order,
    # which the time-series cross-validation splits rely on.
    frame = frame.sort_values("date")
    return frame.reset_index(drop=True)

def make_X_y(df: pd.DataFrame, task: str):
    """Split a dataset frame into a feature matrix, a target vector and feature names.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``date``, ``y_reg`` and ``y_cls``; every
        other column is treated as a feature.
    task : str
        ``"reg"`` selects ``y_reg`` as the target; any other value selects
        ``y_cls``.

    Returns
    -------
    tuple
        ``(X, y, names)`` where ``X`` is the feature values cast to float,
        ``y`` is the chosen target column's values and ``names`` is the list
        of feature column names in the same order as the columns of ``X``.
    """
    target_col = "y_reg" if task == "reg" else "y_cls"
    # Everything that is not the date or one of the two targets is a feature.
    feature_frame = df.drop(columns=["date", "y_reg", "y_cls"])
    feature_names = feature_frame.columns.tolist()
    X = feature_frame.values.astype(float)
    y = df[target_col].values
    return X, y, feature_names

In [9]:
# Cross validation and scorers for performance metrics
def tscv(n_splits=5):
    """Build the ``TimeSeriesSplit`` cross-validator used by the grid searches.

    Parameters
    ----------
    n_splits : int, default 5
        Number of splits handed to ``TimeSeriesSplit``.

    Returns
    -------
    TimeSeriesSplit
        A new, unfitted splitter instance.
    """
    splitter = TimeSeriesSplit(n_splits=n_splits)
    return splitter

# Regression scorers (primary = negative RMSE)
def rmse(y_true, y_pred):
    """Root mean squared error: the square root of ``mean_squared_error``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Error metrics are registered with greater_is_better=False so the search
# maximises their negation; this is why RMSE and MAE appear as negative
# numbers in cv_results_ and in the summary table.
rmse_scorer = make_scorer(rmse, greater_is_better=False)
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
# R^2 is already "higher is better", so it keeps the default orientation.
r2_scorer = make_scorer(r2_score)

In [5]:
# Establishing grids for the different models.
# Parameter names use the "<pipeline step>__<parameter>" form, so each key's
# prefix must match the step name given to the Pipeline that is searched.

# Ridge: regularisation strength only.
ridge_grid = dict(
    ridge__alpha=[0.01, 0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
)

# HistGradientBoostingRegressor: 4 * 3 * 3 * 3 = 108 candidates per search.
hgb_grid = dict(
    hgb__learning_rate=[0.03, 0.05, 0.08, 0.1],
    hgb__max_depth=[3, 5, None],
    hgb__max_leaf_nodes=[15, 31, 63],
    hgb__min_samples_leaf=[10, 20, 50],
)

# LogisticRegression: only C is actually varied; the rest are pinned.
logit_grid = dict(
    logit__C=[0.1, 0.5, 1.0, 2.0, 5.0],
    logit__penalty=["l2"],    # keep simple/portable
    logit__solver=["lbfgs"],  # supports l2 + probas
    logit__max_iter=[200],
)

In [10]:
# Functions to tune the models using GridSearch Cross Validation
def _tune_with_grid_search(symbol, task, algo, step, pipe, param_grid, scoring, refit, n_splits):
    """Run one time-series grid search and persist its artifacts.

    Shared implementation behind ``tune_ridge``, ``tune_hgb`` and
    ``tune_logit`` so the three stay in lock-step.

    Parameters
    ----------
    symbol : str
        Ticker whose dataset is loaded with ``load_dataset``.
    task : str
        Forwarded to ``make_X_y`` to pick the target column.
    algo : str
        Label stored in the summary row and used in the model file name
        (``<symbol>_<algo>_tuned.pkl``).
    step : str
        Short model tag used in the grid CSV file name
        (``day7_<symbol>_<step>_grid.csv``).
    pipe : Pipeline
        Unfitted estimator to search over.
    param_grid : dict
        Candidate parameters for ``GridSearchCV``.
    scoring : dict
        Metric name -> scorer; insertion order fixes the order of the
        ``mean_test_<metric>`` columns in the returned row.
    refit : str
        Key of ``scoring`` used to select (and refit) the best candidate.
    n_splits : int
        Number of ``TimeSeriesSplit`` folds.

    Returns
    -------
    pd.DataFrame
        One row with ``symbol``, ``algo``, the best parameters and the mean
        CV score of the best candidate for every metric in ``scoring``.
    """
    df = load_dataset(symbol)
    X, y, feat_names = make_X_y(df, task)

    gs = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        scoring=scoring,
        refit=refit,                 # metric that picks the winner
        cv=tscv(n_splits),
        n_jobs=-1, verbose=0, return_train_score=False,
    )
    gs.fit(X, y)

    # Save best model (refit on full data by GridSearchCV) together with the
    # feature order it was trained on, so callers can verify column alignment.
    outp = MODELS / f"{symbol}_{algo}_tuned.pkl"
    joblib.dump(
        {"model": gs.best_estimator_, "features": feat_names, "best_params": gs.best_params_},
        outp,
    )

    # Log the full CV table for later inspection.
    res = pd.DataFrame(gs.cv_results_)
    res.to_csv(REPORTS / f"day7_{symbol}_{step}_grid.csv", index=False)

    # best_index_ is the cv_results_ row of the candidate ranked first on the
    # refit metric; reading it once replaces the repeated rank/idxmin lookups.
    best = dict(symbol=symbol, algo=algo, **gs.best_params_)
    for metric in scoring:
        column = f"mean_test_{metric}"
        # Values are copied as reported by GridSearchCV, so metrics whose
        # scorer was built with greater_is_better=False stay negated here.
        best[column] = res.loc[gs.best_index_, column]
    return pd.DataFrame([best])


def tune_ridge(symbol: str, n_splits=5):
    """Grid-search a scaled Ridge regressor on ``y_reg`` for ``symbol``.

    Selects the best ``ridge_grid`` candidate by the ``rmse`` scorer, writes
    ``<symbol>_ridge_reg_tuned.pkl`` to ``MODELS`` and
    ``day7_<symbol>_ridge_grid.csv`` to ``REPORTS``.

    Returns
    -------
    pd.DataFrame
        One-row summary with the best alpha and ``mean_test_rmse``,
        ``mean_test_mae`` and ``mean_test_r2``.
    """
    pipe = Pipeline([("scaler", StandardScaler()), ("ridge", Ridge())])
    return _tune_with_grid_search(
        symbol, "reg", "ridge_reg", "ridge", pipe, ridge_grid,
        scoring={"rmse": rmse_scorer, "mae": mae_scorer, "r2": r2_scorer},
        refit="rmse",
        n_splits=n_splits,
    )


def tune_hgb(symbol: str, n_splits=5):
    """Grid-search a HistGradientBoostingRegressor on ``y_reg`` for ``symbol``.

    Selects the best ``hgb_grid`` candidate by the ``rmse`` scorer, writes
    ``<symbol>_hgb_reg_tuned.pkl`` to ``MODELS`` and
    ``day7_<symbol>_hgb_grid.csv`` to ``REPORTS``.

    Returns
    -------
    pd.DataFrame
        One-row summary with the best parameters and ``mean_test_rmse``,
        ``mean_test_mae`` and ``mean_test_r2``.
    """
    # No scaler step: the pipeline holds only the boosting model.
    pipe = Pipeline([("hgb", HistGradientBoostingRegressor(random_state=42))])
    return _tune_with_grid_search(
        symbol, "reg", "hgb_reg", "hgb", pipe, hgb_grid,
        scoring={"rmse": rmse_scorer, "mae": mae_scorer, "r2": r2_scorer},
        refit="rmse",
        n_splits=n_splits,
    )


def tune_logit(symbol: str, n_splits=5):
    """Grid-search a scaled LogisticRegression on ``y_cls`` for ``symbol``.

    Selects the best ``logit_grid`` candidate by ROC AUC, writes
    ``<symbol>_logit_cls_tuned.pkl`` to ``MODELS`` and
    ``day7_<symbol>_logit_grid.csv`` to ``REPORTS``.

    Returns
    -------
    pd.DataFrame
        One-row summary with the best parameters and ``mean_test_auc``,
        ``mean_test_f1`` and ``mean_test_acc``.
    """
    pipe = Pipeline([("scaler", StandardScaler()),
                     ("logit", LogisticRegression())])
    return _tune_with_grid_search(
        symbol, "cls", "logit_cls", "logit", pipe, logit_grid,
        # Use built-in scorers by name
        scoring={"auc": "roc_auc", "f1": "f1", "acc": "accuracy"},
        refit="auc",
        n_splits=n_splits,
    )

In [11]:
# Tune every model with grid search for each fund symbol and save a combined summary
# Collect one single-row summary frame per (symbol, model) pair, keeping the
# ridge -> hgb -> logit order within each symbol.
all_rows = []
for sym in SYMBOLS:
    all_rows.extend(tuner(sym) for tuner in (tune_ridge, tune_hgb, tune_logit))

# The rows carry different parameter/metric columns per model, so the
# concatenated table is wide and NaN-filled where a column does not apply.
day7_summary = pd.concat(all_rows, ignore_index=True)
display(day7_summary)

# Persist the combined table next to the per-model grid reports.
out_csv = REPORTS / "day7_best_models_summary.csv"
day7_summary.to_csv(out_csv, index=False)
print("Saved:", out_csv)

Unnamed: 0,symbol,algo,ridge__alpha,mean_test_rmse,mean_test_mae,mean_test_r2,hgb__learning_rate,hgb__max_depth,hgb__max_leaf_nodes,hgb__min_samples_leaf,logit__C,logit__max_iter,logit__penalty,logit__solver,mean_test_auc,mean_test_f1,mean_test_acc
0,QQQ,ridge_reg,100.0,-0.01335,-0.009422,-0.002039,,,,,,,,,,,
1,QQQ,hgb_reg,,-0.013421,-0.009499,-0.015023,0.03,3.0,15.0,10.0,,,,,,,
2,QQQ,logit_cls,,,,,,,,,0.1,200.0,l2,lbfgs,0.502383,0.627772,0.534758
3,VFV.TO,ridge_reg,100.0,-0.010296,-0.007066,-0.024625,,,,,,,,,,,
4,VFV.TO,hgb_reg,,-0.010353,-0.007097,-0.034905,0.03,3.0,15.0,50.0,,,,,,,
5,VFV.TO,logit_cls,,,,,,,,,5.0,200.0,l2,lbfgs,0.50813,0.626793,0.532325
6,XEQT.TO,ridge_reg,100.0,-0.008206,-0.006074,-0.037732,,,,,,,,,,,
7,XEQT.TO,hgb_reg,,-0.008368,-0.006151,-0.086785,0.03,3.0,15.0,20.0,,,,,,,
8,XEQT.TO,logit_cls,,,,,,,,,0.1,200.0,l2,lbfgs,0.510294,0.527618,0.497561


Saved: /Users/itzronald/Desktop/trend-predictor/reports/day7_best_models_summary.csv


In [13]:
# Quick test of tuned ridge regression model with one symbol

# Smoke test: reload the saved ridge bundle for the first symbol and predict.
sym = SYMBOLS[0]
# joblib.load unpickles the file - only load bundles this project wrote itself.
bundle = joblib.load(MODELS / f"{sym}_ridge_reg_tuned.pkl")
mdl = bundle["model"]
feats = bundle["features"]

df = load_dataset(sym)
X, y, feat_names = make_X_y(df, "reg")
# The stored feature order must match the columns we are about to feed in.
assert feat_names == feats, "Feature mismatch"

# These are in-sample predictions (the saved model is the one refit on the full
# dataset), so they only confirm the bundle works, not how well it generalises.
yhat = mdl.predict(X)
pd.DataFrame(dict(y=y, yhat=yhat)).head()


Unnamed: 0,y,yhat
0,0.01328,-0.002836
1,0.052206,-0.002422
2,-0.003359,-0.006269
3,0.011709,-0.003769
4,-0.005559,-0.003456
