In [1]:
import sys
import os

# Resolve the project directory (the parent of the current working directory)
# and register it on the import path exactly once, so the `src.*` imports
# that follow can be resolved.
project_dir = os.path.abspath("..")
if sys.path.count(project_dir) == 0:
    sys.path.append(project_dir)
    
from src.predictionModule.LoadupSamples import LoadupSamples
from src.predictionModule.FilterSamples import FilterSamples
from src.predictionModule.MachineModels import MachineModels

import numpy as np
import pandas as pd
import polars as pl
import datetime
import scipy

import logging
# Run timestamp (e.g. "02sep25_1722") and the base name derived from it; the
# base name is reused for the log file and for other artefacts of this run.
formatted_date = datetime.datetime.now().strftime("%d%b%y_%H%M").lower()
formatted_str = f"notebook-LGBMOnFiltered-optuna-{formatted_date}"

# One formatter shared by the console and the file handler.
formatter = logging.Formatter(fmt="%(asctime)s - %(message)s")

# Configure the root logger so records from every module are captured.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Console handler writing to stdout (shown in the notebook's output cell).
# Assigning the handler list in place leaves exactly this one handler, whether
# or not handlers were installed before (e.g. when the cell is re-run).
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)
logger.handlers[:] = [handler]

# File handler: mirrors the same records into "<formatted_str>.log",
# truncating any existing file of that name (mode="w").
file_handler = logging.FileHandler(f"{formatted_str}.log", mode="w")
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Smoke test of the logging setup.
logger.info("This will print to the notebook's output cell")

2025-09-02 17:22:04,037 - This will print to the notebook's output cell


In [None]:
# Baseline configuration shared by every trial. Keys are grouped by their
# prefix; the Optuna search later overrides a subset of them per trial.
params = {
    # --- sample loading / target definition ---
    "idxAfterPrediction": 5,
    "timesteps": 35,
    "target_option": "last",
    "LoadupSamples_time_scaling_stretch": True,
    "LoadupSamples_time_inc_factor": 61,

    # --- LSTM_* keys ---
    "TreeTime_LSTM_days_to_train": 360,
    "LSTM_units": 32,
    "LSTM_num_layers": 2,
    "LSTM_dropout": 0.0001,
    "LSTM_recurrent_dropout": 0.001157,
    "LSTM_learning_rate": 0.000046,
    "LSTM_optimizer": "adam",
    "LSTM_bidirectional": True,
    "LSTM_batch_size": 4096,  # 2**12
    "LSTM_epochs": 4,
    "LSTM_l1": 0.000036,
    "LSTM_l2": 0.003569,
    "LSTM_inter_dropout": 0.052745,
    "LSTM_input_gaussian_noise": 0.001,
    "LSTM_conv1d": True,
    "LSTM_conv1d_kernel_size": 3,
    "LSTM_loss": "mse",

    # --- FilterSamples_*: quantile and training window ---
    "FilterSamples_q_up": 0.985,
    "FilterSamples_days_to_train_end": 115,

    # --- FilterSamples_cat_*: categorical switches ---
    "FilterSamples_cat_over20": True,
    "FilterSamples_cat_posOneYearReturn": False,
    "FilterSamples_cat_posFiveYearReturn": False,

    # --- FilterSamples_lincomb_* keys ---
    "FilterSamples_lincomb_epochs": 5,
    "FilterSamples_lincomb_show_progress": False,
    "FilterSamples_lincomb_probs_noise_std": 0.05,
    "FilterSamples_lincomb_sharpness": 0.5,
    "FilterSamples_lincomb_subsample_ratio": 0.5,
    "FilterSamples_lincomb_featureratio": 0.5,
    "FilterSamples_lincomb_itermax": 1,
    "FilterSamples_lincomb_init_toprand": 3,
    "FilterSamples_lincomb_batch_size": 4096,  # 2**12

    # --- FilterSamples_taylor_* keys ---
    "FilterSamples_taylor_horizon_days": 50,
    "FilterSamples_taylor_roll_window_days": 10,
    "FilterSamples_taylor_weight_slope": 1.268923,

    # --- LGB_* keys ---
    "TreeTime_LGB_days_to_train": 360,
    "LGB_num_boost_round": 950,
    "LGB_lambda_l1": 0.0001,
    "LGB_lambda_l2": 0.009786276249261908,
    "LGB_feature_fraction": 0.20813359498274574,
    "LGB_num_leaves": 191,
    "LGB_max_depth": 9,
    "LGB_learning_rate": 0.008855,
    "LGB_min_data_in_leaf": 350,
    "LGB_min_gain_to_split": 0.10066457576238419,
    "LGB_path_smooth": 0.5935679203578974,
    "LGB_min_sum_hessian_in_leaf": 0.3732876155751053,
    "LGB_max_bin": 850,
}

In [None]:
# Names of the feature groups handed to LoadupSamples.
treegroup = "group_finanTo2011"
timegroup = "group_regOHLCV_over5years"

# Date layout of the base split: training starts at `start_train_date`,
# `split_Date` is the last training date and `eval_date` the last test date.
start_train_date = datetime.date(2020, 1, 1)
split_Date = datetime.date(2025, 1, 1)
eval_date = datetime.date(2025, 7, 13)

# The five calendar days immediately before `eval_date` (eval_date itself is
# not included).
evaldates = [eval_date - datetime.timedelta(days=back) for back in range(1, 6)]

# Load the samples once; each trial later works on copies of this object.
ls = LoadupSamples(
    train_start_date=start_train_date,
    test_dates=evaldates,
    treegroup=treegroup,
    timegroup=timegroup,
    params=params,
)
ls.load_samples(main_path="../src/featureAlchemy/bin/")
ls.split_dataset(
    start_date=start_train_date,
    last_train_date=split_Date,
    last_test_date=eval_date,
)

2025-09-02 17:22:12,996 - Test date 2025-07-12 not found in the database. Omitting.


In [None]:
import optuna

# Reduce root-logger verbosity from DEBUG to INFO for the study.
logger.setLevel(logging.INFO)

# --- study configuration ---
n_reruns = 3              # number of split dates evaluated per trial
time_delta_days = 25      # spacing (in days) between consecutive split dates
n_testdays = 60           # length of the test window after each split date
n_startup_trials = 15     # forwarded to the TPE sampler
studytime = 2 * 60 * 60   # optimisation timeout in seconds (2 hours)
device = "cuda"           # device passed to the LSTM train/predict calls
studyname = f"optuna_LGBMOnFiltered_{formatted_str}"

# Split dates walk backwards from `split_Date` in steps of `time_delta_days`.
split_dates = [
    split_Date - datetime.timedelta(days=time_delta_days * rerun)
    for rerun in range(n_reruns)
]

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def sample_params(trial: optuna.Trial):
    """Draw one set of hyperparameter overrides from the search space.

    Searched values are requested from ``trial``; the remaining entries are
    fixed constants. The returned dict is meant to be laid over the baseline
    ``params``.

    Args:
        trial: The Optuna trial used to suggest values.

    Returns:
        dict mapping parameter names to the sampled or fixed values.
    """
    # Searched dimensions. The suggest calls are issued in the same order in
    # which the values appear in the returned dict.
    q_up = trial.suggest_float("FilterSamples_q_up", 0.985, 0.993)
    lgb_days_to_train = trial.suggest_int("TreeTime_LGB_days_to_train", 100, 2500, step=100)
    num_boost_round = trial.suggest_int("LGB_num_boost_round", 750, 1500, step=50)
    lambda_l1 = trial.suggest_float("LGB_lambda_l1", 1e-6, 5e+1, log=True)  # 5e-2
    num_leaves = trial.suggest_int("LGB_num_leaves", 141, 211, step=10)
    max_depth = trial.suggest_int("LGB_max_depth", 7, 10)
    learning_rate = trial.suggest_float("LGB_learning_rate", 5e-3, 5e-1, log=True)
    min_data_in_leaf = trial.suggest_int("LGB_min_data_in_leaf", 300, 500, step=50)
    max_bin = trial.suggest_int("LGB_max_bin", 750, 950, step=50)

    return {
        "FilterSamples_method": "taylor",
        "FilterSamples_q_up": q_up,

        "TreeTime_LGB_days_to_train": lgb_days_to_train,
        "LGB_num_boost_round": num_boost_round,
        "LGB_lambda_l1": lambda_l1,
        "LGB_lambda_l2": 1e-5,
        "LGB_feature_fraction": 0.7,
        "LGB_num_leaves": num_leaves,
        "LGB_max_depth": max_depth,
        "LGB_learning_rate": learning_rate,
        "LGB_min_data_in_leaf": min_data_in_leaf,
        "LGB_min_gain_to_split": 0.13,
        "LGB_path_smooth": 0.6,
        "LGB_min_sum_hessian_in_leaf": 1.0,
        "LGB_max_bin": max_bin,
    }

def _std_or_one(values, eps=1e-6):
    """Return ``np.std(values)``, falling back to 1.0 when it is below ``eps``.

    The result is used as a divisor, so the fallback keeps a (near-)constant
    input from turning into inf/NaN after the division.
    """
    spread = np.std(values)
    return 1.0 if spread < eps else spread


def evaluate_objective(opt_params, trial_num):
    """Score one hyperparameter set across several train/test splits.

    For every date in ``split_dates[:n_reruns]`` the function:

    1. copies the module-level ``ls`` and re-splits it at that date, with a
       test window of ``n_testdays`` days;
    2. applies the categorical pre-filter masks;
    3. trains the LSTM and appends its standardised predictions to the tree
       features as a column named ``"LSTM_Prediction"``;
    4. applies the main sample filter selected by
       ``opt_params["FilterSamples_method"]`` (``"taylor"`` or ``"lincomb"``);
    5. trains LightGBM on the filtered training rows and predicts the filtered
       test rows;
    6. keeps the top ``m = 5`` predictions per test date and uses the
       geometric mean of their ``target_ratio`` on the last test date as the
       score of that split.

    Reads the module-level names ``split_dates``, ``n_reruns``, ``ls``,
    ``start_train_date``, ``n_testdays``, ``device`` and ``logger``.

    Args:
        opt_params: Full parameter dict (baseline params merged with the
            values sampled for this trial).
        trial_num: Trial number; only used in log messages.

    Returns:
        Mean of the natural log of the per-split scores, where non-finite or
        non-positive scores are first replaced by 1.0 (log 1 = 0). Returns
        ``-inf`` when there is no split to evaluate.

    Raises:
        ValueError: If ``opt_params["FilterSamples_method"]`` is neither
            ``"taylor"`` nor ``"lincomb"``.
    """
    scores = []
    for sd in split_dates[:n_reruns]:
        logger.info(f"Trial number {trial_num}")
        # Work on a copy (deep=True) so the re-split and the feature/mask
        # changes below are applied to `lsc` rather than the shared `ls`.
        lsc = ls.copy(deep=True)
        last_test_day = sd + datetime.timedelta(days=n_testdays)
        lsc.split_dataset(
            start_date      = start_train_date,
            last_train_date = sd,
            last_test_date  = last_test_day)

        ### PRE FILTERING
        fs_pre = FilterSamples(
            Xtree_train = lsc.train_Xtree,
            ytree_train = lsc.train_ytree,
            treenames   = lsc.featureTreeNames,
            Xtree_test  = lsc.test_Xtree,
            ytree_test  = lsc.test_ytree,
            meta_train  = lsc.meta_pl_train,
            meta_test   = lsc.meta_pl_test,
            params      = opt_params
        )
        mask_train_pre, mask_test_pre = fs_pre.categorical_masks()

        ### LSTM ADDITION
        days_to_train = opt_params["TreeTime_LSTM_days_to_train"]
        mask_dates_reduced = fs_pre.get_recent_training_mask(days_to_train)

        # Train on the recent, pre-filtered training rows; the pre-filtered
        # test rows are passed as the second data set.
        mm = MachineModels(opt_params)
        lstm_model, res_dict = mm.run_LSTM_torch(
            lsc.train_Xtime[mask_dates_reduced & mask_train_pre],
            lsc.train_ytime[mask_dates_reduced & mask_train_pre],
            lsc.test_Xtime[mask_test_pre],
            lsc.test_ytime[mask_test_pre],
            device=device
        )
        # Predict on ALL rows (not only the masked ones) so the predictions
        # line up row-for-row with the tree feature matrices.
        preds_train = mm.predict_LSTM_torch(lstm_model, lsc.train_Xtime, batch_size=opt_params["LSTM_batch_size"], device=device)
        preds_test = mm.predict_LSTM_torch(lstm_model, lsc.test_Xtime, batch_size=opt_params["LSTM_batch_size"], device=device)

        logger.info(f"  LSTM RMSE: {res_dict['val_rmse']*2/opt_params['LoadupSamples_time_inc_factor']:.4f}")
        # Diagnostic only: targets of the rows whose LSTM prediction lies in
        # the top (1 - q_up) quantile of the pre-filtered predictions.
        filtered_train = lsc.train_ytree[mask_train_pre][preds_train[mask_train_pre] >= np.quantile(preds_train[mask_train_pre], opt_params["FilterSamples_q_up"])]
        filtered_test  = lsc.test_ytree[mask_test_pre][preds_test[mask_test_pre] >= np.quantile(preds_test[mask_test_pre], opt_params["FilterSamples_q_up"])]
        logger.info(f"  Result of quantile {opt_params['FilterSamples_q_up']:.2f}")
        logger.info(f"    Train set (gmean, unreduced): {scipy.stats.gmean(filtered_train):.4f}")
        logger.info(f"    Test set (gmean): {scipy.stats.gmean(filtered_test):.4f}")

        ## Add LSTM predictions to the tree train and test sets
        # Both divisors use the same near-zero guard, so a constant prediction
        # vector cannot introduce inf/NaN into either feature matrix.
        train_std = _std_or_one(preds_train)
        test_std = _std_or_one(preds_test)
        lsc.train_Xtree = np.hstack((lsc.train_Xtree, ((preds_train-1.0)/train_std).reshape(-1, 1)))
        lsc.test_Xtree = np.hstack((lsc.test_Xtree, ((preds_test-1.0)/test_std).reshape(-1, 1)))
        if isinstance(lsc.featureTreeNames, list):
            lsc.featureTreeNames.append("LSTM_Prediction")
        elif isinstance(lsc.featureTreeNames, np.ndarray):
            lsc.featureTreeNames = np.append(lsc.featureTreeNames, "LSTM_Prediction")

        ### MAIN FILTERING
        lsc.apply_masks(mask_train_pre, mask_test_pre)
        fs = FilterSamples(
            Xtree_train = lsc.train_Xtree,
            ytree_train = lsc.train_ytree,
            treenames   = lsc.featureTreeNames,
            Xtree_test  = lsc.test_Xtree,
            ytree_test  = lsc.test_ytree,
            meta_train  = lsc.meta_pl_train,
            meta_test   = lsc.meta_pl_test,
            params      = opt_params
        )

        # Dispatch on the configured filter; fail with a clear message rather
        # than leaving the masks unbound for an unknown method name.
        filter_method = opt_params["FilterSamples_method"]
        if filter_method == "taylor":
            mask_train, mask_test = fs.taylor_feature_masks()
        elif filter_method == "lincomb":
            mask_train, mask_test = fs.lincomb_masks()
        else:
            raise ValueError(
                f"Unsupported FilterSamples_method {filter_method!r}; "
                "expected 'taylor' or 'lincomb'"
            )

        score_train = fs.evaluate_mask(mask_train, lsc.meta_pl_train['date'], lsc.train_ytree)
        score_test  = fs.evaluate_mask(mask_test,  lsc.meta_pl_test['date'],  lsc.test_ytree)
        logger.info(f"  Filtering Score (train) = {score_train}")
        logger.info(f"  Filtering Score (test)  = {score_test}")

        ### LGBM ASSESSMENT
        mm = MachineModels(opt_params)
        days_to_train_LGB = opt_params["TreeTime_LGB_days_to_train"]
        mask_dates_reduced = fs.get_recent_training_mask(days_to_train_LGB)
        lgb_model, lgb_res_dict = mm.run_LGB(
            X_train=lsc.train_Xtree[mask_dates_reduced & mask_train],
            y_train=lsc.train_ytree[mask_dates_reduced & mask_train],
            X_test=lsc.test_Xtree[mask_test],
            y_test=lsc.test_ytree[mask_test]
        )

        # LGB Predictions
        # `best_iteration` is used when the model exposes it; otherwise None
        # is passed through to `predict`.
        best_iter = getattr(lgb_model, "best_iteration", None)
        y_test_pred_masked  = lgb_model.predict(lsc.test_Xtree[mask_test], num_iteration=best_iter)
        logger.info(f"  Test RMSE LGBM: {lgb_res_dict['best_score']:.4f}")

        # Keep the m highest predictions per date. Ranking uses
        # method="random", so ties are broken randomly.
        m = 5
        meta_pl = lsc.meta_pl_test.filter(pl.Series(mask_test)).with_columns(
            pl.Series("prediction_ratio", y_test_pred_masked)
        )
        meta_pl_filtered = (
            meta_pl
            .sort(["date", "prediction_ratio"], descending=[False, True])
            .with_columns(
                pl.col("prediction_ratio")
                .rank(method="random", descending=True)
                .over("date")
                .alias("prediction_rank")
            ).filter(pl.col("prediction_rank") <= m)
        )
        # Per-date summary; exp(mean(log(x))) is the geometric mean.
        agg_exprs = [
            pl.col("prediction_ratio").max().alias("max_pred"),  # this is also .first()
            pl.col("prediction_ratio").log().mean().exp().alias("mean_pred"),
            pl.col("target_ratio").log().mean().exp().alias("mean_res"),
            pl.col("target_ratio")
                .sort_by(pl.col("prediction_ratio"), descending=True)
                .first()
                .alias("top_res"),
            pl.len().alias("n_entries"),
        ]
        test_df_perdate = meta_pl_filtered.group_by("date").agg(agg_exprs).sort("date")

        if test_df_perdate.height == 0:
            # Nothing survived the filters: use the neutral ratio 1.0.
            pred_meanlast = pred_toplast = res_meanlast = res_toplast = 1.0
        else:
            # Rows are sorted by date, so index -1 is the last test date.
            pred_meanlast = test_df_perdate["mean_pred"].item(-1)
            pred_toplast  = test_df_perdate["max_pred"].item(-1)
            res_meanlast  = test_df_perdate["mean_res"].item(-1)
            res_toplast   = test_df_perdate["top_res"].item(-1)
        res_sum_n = int(test_df_perdate["n_entries"].sum())

        score = res_meanlast

        # Final  Analysis
        logger.info(f"  Final top last prediction ratio: {pred_toplast:.4f}")
        logger.info(f"  Final last mean prediction ratio: {pred_meanlast:.4f}")
        logger.info(f"  Final top last P/L Ratio: {res_toplast:.4f}")
        logger.info(f"  Final mean last P/L Ratio: {res_meanlast:.4f}")
        logger.info(f"  Number of entries: {res_sum_n}")

        scores.append(score)

    # Neutralise unusable scores (NaN/inf/non-positive -> 1.0, i.e. log 0)
    # before averaging in log space.
    s = np.array(scores, dtype=float)
    s[~np.isfinite(s)] = 1.0
    s[s <= 0] = 1.0
    return float(np.log(s).mean()) if s.size else float("-inf")

def make_objective():
    """Build the objective callable handed to ``study.optimize``.

    Returns:
        A function taking an Optuna trial and returning the float produced by
        ``evaluate_objective`` for that trial's parameters.
    """
    def objective(trial: optuna.Trial) -> float:
        # Baseline params overlaid with the values sampled for this trial;
        # a fresh dict is built, so the module-level `params` is not modified.
        merged_params = {**params, **sample_params(trial)}
        return evaluate_objective(merged_params, trial.number)

    return objective

In [None]:
# Forward Optuna's log records to the root logger configured at the top of the
# notebook (console + log file), and switch off Optuna's own default handler
# so each message is not emitted twice.
optuna.logging.enable_propagation()
optuna.logging.disable_default_handler()

sampler = optuna.samplers.TPESampler(n_startup_trials=n_startup_trials)
# The study is stored in a local SQLite file; `load_if_exists=True` reuses a
# stored study with the same name instead of raising.
study = optuna.create_study(
    study_name = studyname,
    storage="sqlite:///sandbox_optuna.db",
    direction="maximize",
    load_if_exists=True,
    sampler=sampler,
)
# Run trials until the `studytime` timeout (seconds) is reached.
study.optimize(make_objective(), timeout=studytime)

logger.info(f"Best parameters: {study.best_params}")
logger.info(f"Best score: {study.best_value}")

# Full trials table, logged in ascending order of objective value.
df: pd.DataFrame = study.trials_dataframe()
logger.info("\nTrials DataFrame:")
logger.info(df.sort_values("value").to_string())

param_importances = optuna.importance.get_param_importances(study)
logger.info("Parameter Importances:")
for key, value in param_importances.items():
    logger.info(f"{key}: {value}")

In [None]:
# Persist the trials table as "<formatted_str>.parquet", without the index.
df.to_parquet(formatted_str + ".parquet", index=False)

In [None]:
# Reload the saved trials table with polars. Note that this rebinds `df`,
# which held the pandas trials DataFrame, to a polars DataFrame.
df = pl.read_parquet(formatted_str + ".parquet")

In [None]:
# Trials ordered by objective value, keeping `value` plus a 10-row rolling
# mean of every "params_*" column (suffix "_rollmean10").
df_roll_mean = df.sort("value").select(
    [
        pl.col("value"),
        *(
            pl.col(name).rolling_mean(window_size=10).alias(f"{name}_rollmean10")
            for name in df.columns
            if name.startswith("params_")
        ),
    ]
)