In [1]:
import sys
import os

# Project directory = parent of the current working directory, as an absolute path.
project_dir = os.path.abspath(os.pardir)

# Make the `src.*` packages importable; skip if the entry is already present.
if sys.path.count(project_dir) == 0:
    sys.path.append(project_dir)
    
from src.predictionModule.LoadupSamples import LoadupSamples
from src.predictionModule.FilterSamples import FilterSamples
from src.predictionModule.MachineModels import MachineModels

import numpy as np
import polars as pl
import pandas as pd
import datetime
import scipy

import logging
# Timestamp used to give this run's log and result files a unique name.
formatted_date = datetime.datetime.now().strftime("%d%b%y_%H%M").lower()

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt="%(asctime)s - %(message)s")

# Re-running this cell must neither stack handlers nor leak the previous run's
# log file: detach every handler currently on the root logger and close it.
# (Closing a StreamHandler leaves the underlying stream, e.g. stdout, open.)
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

# Console handler: records show up in the notebook's output cell.
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)
logger.addHandler(handler)

#Output File handler
formatted_str = f"notebook-FilterSamples-optuna-{formatted_date}"
file_handler = logging.FileHandler(f"{formatted_str}.log", mode="w")
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Usage
logger.info("This will print to the notebook's output cell")

2025-09-02 17:20:18,065 - This will print to the notebook's output cell


In [2]:
# Baseline configuration handed to LoadupSamples, MachineModels and FilterSamples.
# Every optuna trial starts from a copy of this dict and overlays its sampled values.
params = {
    # -- general / LoadupSamples_* --
    "idxAfterPrediction": 5,
    "timesteps": 60,
    "target_option": "last",
    "LoadupSamples_time_scaling_stretch": True,
    "LoadupSamples_time_inc_factor": 61,

    # -- LSTM_* --
    "Treetime_LSTM_days_to_train": 2 * 360,
    "LSTM_units": 32,
    "LSTM_num_layers": 2,
    "LSTM_dropout": 0.006086,
    "LSTM_recurrent_dropout": 0.001341,
    "LSTM_learning_rate": 0.000195,
    "LSTM_optimizer": "adam",
    "LSTM_bidirectional": True,
    "LSTM_batch_size": 2 ** 12,
    "LSTM_epochs": 2,
    "LSTM_l1": 0.000016,
    "LSTM_l2": 0.000121,
    "LSTM_inter_dropout": 0.022068,
    "LSTM_input_gaussian_noise": 0.001,
    "LSTM_conv1d": True,
    "LSTM_conv1d_kernel_size": 3,
    "LSTM_loss": "mse",

    # -- FilterSamples_*: quantile used for the LSTM-prediction report --
    "FilterSamples_q_up": 0.985,

    # -- FilterSamples_lincomb_* --
    "FilterSamples_lincomb_epochs": 5,
    "FilterSamples_lincomb_show_progress": False,
    "FilterSamples_lincomb_featureratio": 0.5,
    "FilterSamples_lincomb_itermax": 1,
    "FilterSamples_lincomb_init_toprand": 3,
    "FilterSamples_lincomb_batch_size": 2 ** 12,

    # -- FilterSamples_* categorical switches and taylor settings --
    "FilterSamples_days_to_train_end": 115,
    "FilterSamples_cat_over20": True,
    "FilterSamples_cat_posOneYearReturn": False,
    "FilterSamples_cat_posFiveYearReturn": False,
    "FilterSamples_taylor_horizon_days": 50,
    "FilterSamples_taylor_roll_window_days": 10,
    "FilterSamples_taylor_weight_slope": 1.268923,
}

In [None]:
# Names of the two feature groups handed to LoadupSamples.
timegroup = "group_regOHLCV_over5years"
treegroup = "group_finanTo2011"

# Test dates: the five calendar days immediately before eval_date (eval_date itself excluded).
eval_date = datetime.date(2025, 7, 13)
evaldates = [eval_date - datetime.timedelta(days=days_back) for days_back in range(1, 6)]

# Training window start and the reference date the optuna split dates are derived from.
start_train_date = datetime.date(2020, 1, 1)
split_Date = datetime.date(2025, 1, 1)

ls = LoadupSamples(
    params=params,
    timegroup=timegroup,
    treegroup=treegroup,
    train_start_date=start_train_date,
    test_dates=evaldates,
)
ls.load_samples(main_path="../src/featureAlchemy/bin/")

2025-09-02 17:20:26,661 - Test date 2025-07-12 not found in the database. Omitting.


In [None]:
import optuna
logger.setLevel(logging.INFO)

# Walk-forward setup: one split date per rerun, each `time_delta_days` earlier than the previous one.
n_reruns = 3
time_delta_days = 25
split_dates = [
    split_Date - datetime.timedelta(days=rerun * time_delta_days)
    for rerun in range(n_reruns)
]
n_testdays = 60

# Study settings: `studytime` is passed to study.optimize as its timeout.
studytime = 5 * 60 * 60
n_startup_trials = 15
objective_name = "taylor"
studyname = f"optuna_{formatted_str}_{objective_name}"

params["FilterSamples_q_up"] = 0.985
device = "cuda"

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Search spaces consumed by sample_params:
#   parameter name -> (kind, low, high, extra keyword arguments for trial.suggest_*)
LINCOMB_SPACE = {
    "FilterSamples_days_to_train_end":       ("int",   250,  350,  dict(step=10)),
    "FilterSamples_lincomb_lr":              ("float", 1e-4, 5e-3, dict(log=False)),
    "FilterSamples_lincomb_epochs":          ("int",   10,   200,  dict(step=10)),
    "FilterSamples_lincomb_probs_noise_std": ("float", 0.1,  0.3,  dict(log=False)),
    "FilterSamples_lincomb_subsample_ratio": ("float", 0.2,  0.4,  dict()),
    "FilterSamples_lincomb_sharpness":       ("float", 2.0,  5.0,  dict(log=False)),
    # Currently not searched:
    #"FilterSamples_lincomb_featureratio": ("float", 0.05, 0.99, {"log": True}),
    #"FilterSamples_lincomb_itermax": ("int", 1, 3, {}),
    "FilterSamples_lincomb_init_toprand":    ("int",   2,    15,   dict()),
}

TAYLOR_SPACE = {
    "FilterSamples_days_to_train_end":       ("int",   5,   130, dict(step=5)),
    "FilterSamples_taylor_horizon_days":     ("int",   5,   80,  dict(step=5)),
    "FilterSamples_taylor_roll_window_days": ("int",   5,   80,  dict(step=5)),
    "FilterSamples_taylor_weight_slope":     ("float", 0.1, 1.5, dict(log=False)),
}
def sample_params(trial, space):
    """Draw one value from ``trial`` for every entry of a search space.

    Args:
        trial: object exposing ``suggest_int`` and ``suggest_float``.
        space: mapping of parameter name to ``(kind, low, high, kwargs)``.
            ``kind == "int"`` selects ``trial.suggest_int``; any other kind
            is routed to ``trial.suggest_float``.

    Returns:
        dict keyed by the full parameter name. The name registered with the
        trial is the same name with every ``"FilterSamples_"`` removed.
    """
    def draw(full_name, spec):
        kind, low, high, extra = spec
        suggest = trial.suggest_float if kind != "int" else trial.suggest_int
        return suggest(full_name.replace("FilterSamples_", ""), low, high, **extra)

    return {full_name: draw(full_name, spec) for full_name, spec in space.items()}

def evaluate_objective(opt_params, mask_getter, trial_num):
    """Score one parameter set over the walk-forward split dates.

    For every date in ``split_dates[:n_reruns]`` the function
      1. deep-copies the loaded samples and splits them into train/test,
      2. fits an LSTM on the time features (restricted by the categorical mask
         and the recent-training mask) and predicts on the full train/test sets,
      3. appends the predictions, shifted by 1.0 and divided by their standard
         deviation, to the tree features as the column ``"LSTM_Prediction"``,
      4. asks ``mask_getter`` for train/test masks and scores them with
         ``FilterSamples.evaluate_mask``.

    Reads notebook-level state: ``ls``, ``split_dates``, ``n_reruns``,
    ``n_testdays``, ``start_train_date``, ``device`` and ``logger``.

    Args:
        opt_params: full parameter dict for this trial.
        mask_getter: callable taking the ``FilterSamples`` instance and
            returning ``(mask_train, mask_test)``.
        trial_num: trial number, used for logging only.

    Returns:
        Mean of the log test scores over all splits. Non-finite scores count
        as 1.0 and scores are floored at 1e-5 before the log. ``-inf`` when
        there are no splits.
    """
    # Predictions with a spread below this are treated as constant; dividing
    # by their standard deviation would put inf/NaN into the feature matrix.
    min_std = 1e-6

    scores = []
    for sd in split_dates[:n_reruns]:
        q_up = opt_params["FilterSamples_q_up"]
        lstm_batch_size = opt_params["LSTM_batch_size"]

        # Private copy: the split and the extra column below are per split.
        lsc = ls.copy(deep=True)
        last_test_day = sd + datetime.timedelta(days=n_testdays)
        lsc.split_dataset(
            start_date=start_train_date,
            last_train_date=sd,
            last_test_date=last_test_day
        )

        fs_pre = FilterSamples(
            Xtree_train = lsc.train_Xtree,
            ytree_train = lsc.train_ytree,
            treenames   = lsc.featureTreeNames,
            Xtree_test  = lsc.test_Xtree,
            ytree_test  = lsc.test_ytree,
            meta_train  = lsc.meta_pl_train,
            meta_test   = lsc.meta_pl_test,
            params      = opt_params
        )
        mask_train_pre, mask_test_pre = fs_pre.categorical_masks()

        mm = MachineModels(opt_params)
        days_to_train = opt_params["Treetime_LSTM_days_to_train"]
        mask_dates_reduced = fs_pre.get_recent_training_mask(days_to_train)
        lstm_train_mask = mask_dates_reduced & mask_train_pre
        lstm_model, res_dict = mm.run_LSTM_torch(
            lsc.train_Xtime[lstm_train_mask],
            lsc.train_ytime[lstm_train_mask],
            lsc.test_Xtime[mask_test_pre],
            lsc.test_ytime[mask_test_pre],
            device=device
        )
        preds_train = mm.predict_LSTM_torch(lstm_model, lsc.train_Xtime, batch_size=lstm_batch_size, device=device)
        preds_test = mm.predict_LSTM_torch(lstm_model, lsc.test_Xtime, batch_size=lstm_batch_size, device=device)

        logger.info(f"  LSTM RSME: {res_dict['val_rmse']*2/opt_params['LoadupSamples_time_inc_factor']:.4f}")

        # Report the targets of the samples whose prediction is in the top quantile.
        preds_train_pre = preds_train[mask_train_pre]
        preds_test_pre = preds_test[mask_test_pre]
        filtered_train = lsc.train_ytree[mask_train_pre][preds_train_pre >= np.quantile(preds_train_pre, q_up)]
        filtered_test  = lsc.test_ytree[mask_test_pre][preds_test_pre >= np.quantile(preds_test_pre, q_up)]
        logger.info(f"  Result of quantile {q_up:.2f}")
        logger.info(f"    Train set (gmean, unreduced): {scipy.stats.gmean(filtered_train):.4f}")
        logger.info(f"    Test set (gmean): {scipy.stats.gmean(filtered_test):.4f}")

        ## Add LSTM predictions to the tree test set
        # Train and test use the same guard: a (near-)constant prediction
        # vector is left unscaled instead of being divided by ~0.
        train_std = np.std(preds_train)
        if train_std < min_std:
            train_std = 1.0
        test_std = np.std(preds_test)
        if test_std < min_std:
            test_std = 1.0
        lsc.train_Xtree = np.hstack((lsc.train_Xtree, ((preds_train-1.0)/train_std).reshape(-1, 1)))
        lsc.test_Xtree = np.hstack((lsc.test_Xtree, ((preds_test-1.0)/test_std).reshape(-1, 1)))
        if isinstance(lsc.featureTreeNames, list):
            lsc.featureTreeNames.append("LSTM_Prediction")
        elif isinstance(lsc.featureTreeNames, np.ndarray):
            lsc.featureTreeNames = np.append(lsc.featureTreeNames, "LSTM_Prediction")

        # Row selectors for the polars metadata, built once per split.
        train_selector = pl.Series(mask_train_pre)
        test_selector = pl.Series(mask_test_pre)

        fs = FilterSamples(
            Xtree_train = lsc.train_Xtree[mask_train_pre],
            ytree_train = lsc.train_ytree[mask_train_pre],
            treenames   = lsc.featureTreeNames,
            Xtree_test  = lsc.test_Xtree[mask_test_pre],
            ytree_test  = lsc.test_ytree[mask_test_pre],
            meta_train  = lsc.meta_pl_train.filter(train_selector),
            meta_test   = lsc.meta_pl_test.filter(test_selector),
            params      = opt_params
        )
        mask_train, mask_test = mask_getter(fs)

        logger.info(f"Trial number {trial_num}")
        score_train = fs.evaluate_mask(mask_train, lsc.meta_pl_train['date'].filter(train_selector), lsc.train_ytree[mask_train_pre])
        score_test  = fs.evaluate_mask(mask_test,  lsc.meta_pl_test['date'].filter(test_selector),  lsc.test_ytree[mask_test_pre])
        logger.info(f"  Score (train) = {score_train}")
        logger.info(f"  Score (test)  = {score_test}")

        scores.append(score_test)

    s = np.array(scores, dtype=float)
    s[~np.isfinite(s)] = 1.0   # a non-finite score contributes log(1) = 0
    s[s <= 1e-5] = 1e-5        # floor keeps the log finite
    return float(np.log(s).mean()) if s.size else float("-inf")

def make_objective(space, mask_getter, static_overrides=None):
    """Build an optuna objective for one search space and one mask strategy.

    Args:
        space: search-space mapping forwarded to ``sample_params``.
        mask_getter: callable forwarded to ``evaluate_objective``.
        static_overrides: optional fixed values that take precedence over
            both the notebook-level ``params`` and the sampled values.

    Returns:
        A function ``objective(trial) -> float`` suitable for ``study.optimize``.
    """
    fixed = static_overrides if static_overrides else {}

    def objective(trial: optuna.Trial) -> float:
        # Layering order (later wins): baseline params < sampled values < fixed overrides.
        trial_params = dict(params)
        for layer in (sample_params(trial, space), fixed):
            trial_params.update(layer)
        return evaluate_objective(trial_params, mask_getter, trial.number)

    return objective

# Objective searching LINCOMB_SPACE; masks come from FilterSamples.lincomb_masks.
# The progress display is forced off for every trial.
objective_lincomb = make_objective(
    space=LINCOMB_SPACE,
    mask_getter=lambda filter_samples: filter_samples.lincomb_masks(),
    static_overrides={"FilterSamples_lincomb_show_progress": False},
)

# Objective searching TAYLOR_SPACE; masks come from FilterSamples.taylor_feature_masks.
objective_taylor = make_objective(
    space=TAYLOR_SPACE,
    mask_getter=lambda filter_samples: filter_samples.taylor_feature_masks(),
)

In [6]:
# Select the objective for this run. An unknown name fails here, instead of
# leaving `objective` undefined or silently bound to a previous run's choice.
_objectives_by_name = {
    "lincomb": objective_lincomb,
    "taylor": objective_taylor,
}
if objective_name not in _objectives_by_name:
    raise ValueError(
        f"Unknown objective_name {objective_name!r}; expected one of {sorted(_objectives_by_name)}"
    )
objective = _objectives_by_name[objective_name]

In [None]:
# Run the optuna study for the selected `objective` and log a summary of the outcome.

# Presumably routes optuna's own log records to the root logger configured in
# the first cell - confirm against the optuna logging documentation.
optuna.logging.enable_propagation()
sampler = optuna.samplers.TPESampler(n_startup_trials=n_startup_trials)
# The study is stored in the SQLite file sandbox_optuna.db. `studyname` embeds
# this run's timestamp and the objective name. Scores are maximized.
study = optuna.create_study(
    study_name = studyname,
    storage="sqlite:///sandbox_optuna.db",
    direction="maximize",
    load_if_exists=True,
    sampler=sampler,
)
# Time-boxed search: `studytime` is 60*60*5 (5 h, assuming the timeout is in seconds).
study.optimize(objective, timeout=studytime)

logger.info(f"Best parameters: {study.best_params}")
logger.info(f"Best score: {study.best_value}")

# `df` is reused by the export cell below.
df: pd.DataFrame = study.trials_dataframe()
logger.info("\nTrials DataFrame:")
# Full trials table sorted by objective value (ascending), written to the log.
logger.info(df.sort_values("value").to_string())

param_importances = optuna.importance.get_param_importances(study)
logger.info("Parameter Importances:")
for key, value in param_importances.items():
    logger.info(f"{key}: {value}")

In [None]:
# Save the trials table from the study cell; the file shares its base name with this run's log file.
df.to_parquet(f"{formatted_str}.parquet", index=False)