In [10]:
import sys
import os

# Get the absolute path to the project directory
project_dir = os.path.abspath("..")

# Append the project directory to sys.path
if project_dir not in sys.path:
    sys.path.append(project_dir)
    
from src.predictionModule.LoadupSamples import LoadupSamples
from src.predictionModule.FilterSamples import FilterSamples
from src.predictionModule.MachineModels import MachineModels

import numpy as np
import polars as pl
import pandas as pd
import datetime

import logging
# Timestamp used to make the log file, study name and output file unique per run.
formatted_date = datetime.datetime.now().strftime("%d%b%y_%H%M").lower()

formatter = logging.Formatter(fmt="%(asctime)s - %(message)s")

# Configure the root logger so records from every module end up in the same sinks.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Detach AND close whatever handlers are already attached (e.g. from an earlier
# run of this cell). Merely overwriting the handler list would leave the previous
# run's log file open for the lifetime of the kernel.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Console handler: writes to stdout so messages show up in the cell output.
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)
logger.addHandler(handler)

# Output file handler: mirrors every record into "<formatted_str>.log" (overwritten on open).
formatted_str = f"notebook-FilterSamples-optuna-{formatted_date}"
file_handler = logging.FileHandler(f"{formatted_str}.log", mode="w")
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Smoke test of the logging configuration.
logger.info("This will print to the notebook's output cell")

2025-08-28 18:16:53,987 - This will print to the notebook's output cell


In [11]:
# Central configuration dictionary handed to LoadupSamples, FilterSamples and
# MachineModels. Keys are grouped by their prefix.
params = {
    # --- sample construction / loading ---
    "idxAfterPrediction": 5,
    "timesteps": 60,
    "target_option": "last",
    "LoadupSamples_time_scaling_stretch": True,
    "LoadupSamples_time_inc_factor": 60,

    # --- LSTM_* settings ---
    "LSTM_units": 32,
    "LSTM_num_layers": 1,
    "LSTM_dropout": 0.001,
    "LSTM_recurrent_dropout": 0.001,
    "LSTM_learning_rate": 0.001,
    "LSTM_optimizer": "adam",
    "LSTM_bidirectional": True,
    "LSTM_batch_size": 2 ** 12,
    "LSTM_epochs": 2,
    "LSTM_l1": 0.001,
    "LSTM_l2": 0.001,
    "LSTM_inter_dropout": 0.001,
    "LSTM_input_gaussian_noise": 0.001,
    "LSTM_conv1d": True,
    "LSTM_conv1d_kernel_size": 3,
    "LSTM_loss": "mse",

    # --- FilterSamples_*: quantile threshold (overwritten later in the notebook) ---
    "FilterSamples_q_up": 0.90,

    # --- FilterSamples_lincomb_* settings ---
    "FilterSamples_lincomb_epochs": 5,
    "FilterSamples_lincomb_show_progress": False,
    "FilterSamples_lincomb_featureratio": 0.5,
    "FilterSamples_lincomb_itermax": 2,
    "FilterSamples_lincomb_init_toprand": 1,

    # --- FilterSamples_*: training window, categorical and taylor settings ---
    "FilterSamples_days_to_train_end": 10,
    "FilterSamples_cat_over20": True,
    "FilterSamples_cat_posOneYearReturn": True,
    "FilterSamples_cat_posFiveYearReturn": True,
    "FilterSamples_taylor_horizon_days": 20,
    "FilterSamples_taylor_roll_window_days": 20,
}

In [12]:
# Names of the feature groups to load for the time-series and tree inputs.
timegroup = "group_regOHLCV_over5years"
treegroup = "group_debug"

# Date boundaries of the experiment.
start_train_date = datetime.date(2021, 1, 1)
split_Date = datetime.date(2025, 1, 1)
eval_date = datetime.date(2025, 6, 13)

# The five calendar days immediately before eval_date (eval_date itself excluded).
evaldates = [eval_date - datetime.timedelta(days=offset) for offset in range(1, 6)]

# Load the samples from disk, then split them into train/test at split_Date.
ls = LoadupSamples(
    train_start_date=start_train_date,
    test_dates=evaldates,
    treegroup=treegroup,
    timegroup=timegroup,
    params=params,
)
ls.load_samples(main_path="../src/featureAlchemy/bin/")
ls.split_dataset(
    start_date=start_train_date,
    last_train_date=split_Date,
    last_test_date=eval_date,
)

2025-08-28 18:17:01,261 - Test date 2025-06-08 not found in the database. Omitting.


In [13]:
# Build a FilterSamples instance on the current train/test split; it is used
# here only to compute the categorical masks.
fs_pre = FilterSamples(
    Xtree_train=ls.train_Xtree,
    ytree_train=ls.train_ytree,
    meta_train=ls.meta_pl_train,
    Xtree_test=ls.test_Xtree,
    ytree_test=ls.test_ytree,
    meta_test=ls.meta_pl_test,
    treenames=ls.featureTreeNames,
    params=params,
)

# Compute the categorical train/test masks and apply them to the loaded samples.
cat_mask_train, cat_mask_test = fs_pre.categorical_masks()
ls.apply_masks(cat_mask_train, cat_mask_test)

In [None]:
"""
Runs LSTM to generate feature(s) to add to the tree data.
"""
# Train the LSTM on the time-series samples, then append its scaled predictions
# as one extra feature column ("LSTM_Prediction") to the tree train/test matrices.
mm = MachineModels(params)

# NOTE(review): hard-coded device; this cell requires a CUDA-capable GPU.
device = 'cuda'
starttime = datetime.datetime.now()
lstm_model, res_dict = mm.run_LSTM_torch(
    ls.train_Xtime,
    ls.train_ytime,
    ls.test_Xtime,
    ls.test_ytime,
    device=device
)
preds_train = mm.predict_LSTM_torch(lstm_model, ls.train_Xtime, batch_size=params["LSTM_batch_size"], device=device)
preds_test = mm.predict_LSTM_torch(lstm_model, ls.test_Xtime, batch_size=params["LSTM_batch_size"], device=device)
endtime = datetime.datetime.now()

# The validation RMSE is rescaled by 2 / time_inc_factor before reporting
# (presumably to undo the target scaling applied by LoadupSamples - TODO confirm).
logger.info(f"  LSTM RMSE: {res_dict['val_rmse']*2/params['LoadupSamples_time_inc_factor']:.4f}")
logger.info(f"  LSTM completed in {endtime - starttime}.")

# For each upper quantile of the train predictions, report the mean tree target
# of the samples whose prediction is at or above that quantile.
for q in [0.95, 0.98, 0.99]:
    q_pred = np.quantile(preds_train, q)
    logger.info(f"  LSTM quantile {q:.2f} of train predictions: {q_pred:.4f}")
    filtered = ls.train_ytree[preds_train >= q_pred]
    logger.info(f"  Result of quantile {q:.2f} of train set (mean): {filtered.mean():.4f}")

## Add LSTM predictions to the tree train and test sets
# Each set is shifted by 1.0 and divided by its own standard deviation. A
# (near-)constant prediction vector would make that a division by ~0, so the
# divisor falls back to 1.0 in that case - for the train set as well as the test set.
train_std = np.std(preds_train)
if train_std < 1e-6:
    train_std = 1.0
test_std = np.std(preds_test)
if test_std < 1e-6:
    test_std = 1.0
ls.train_Xtree = np.hstack((ls.train_Xtree, ((preds_train-1.0)/train_std).reshape(-1, 1)))
ls.test_Xtree = np.hstack((ls.test_Xtree, ((preds_test-1.0)/test_std).reshape(-1, 1)))

# Keep the feature-name container in sync with the new column (list or ndarray).
if isinstance(ls.featureTreeNames, list):
    ls.featureTreeNames.append("LSTM_Prediction")
elif isinstance(ls.featureTreeNames, np.ndarray):
    ls.featureTreeNames = np.append(ls.featureTreeNames, "LSTM_Prediction")

In [15]:
import optuna
# Reduce log verbosity for the optimisation run.
logger.setLevel(logging.INFO)

# Each trial is evaluated on n_reruns train/test splits whose split date moves
# back in steps of time_delta_days, starting at split_Date itself.
n_reruns = 5
time_delta_days = 20
split_dates = [
    split_Date - datetime.timedelta(days=rerun_idx * time_delta_days)
    for rerun_idx in range(n_reruns)
]

# Study settings: studytime is later passed as the `timeout` of study.optimize.
studytime = 60 * 10 * 1
n_startup_trials = 15
objective_name = "lincomb"
studyname = f"optuna_{formatted_str}_{objective_name}"

# Overrides the quantile threshold set in the initial params dictionary.
params["FilterSamples_q_up"] = 0.80

In [None]:
def sample_params(trial, space):
    """Draw one value per entry of ``space`` from an Optuna trial.

    Args:
        trial: Object exposing ``suggest_int`` and ``suggest_float``.
        space: Mapping ``name -> (kind, low, high, kwargs)``. ``kind == "int"``
            selects ``suggest_int``; any other value selects ``suggest_float``.
            ``kwargs`` is forwarded to the suggest call.

    Returns:
        dict mapping each full ``name`` to its suggested value. The name
        registered with the trial has every ``"FilterSamples_"`` occurrence removed.
    """
    sampled = {}
    for full_name, (kind, low, high, extra) in space.items():
        if kind == "int":
            suggest = trial.suggest_int
        else:
            suggest = trial.suggest_float
        short_name = full_name.replace("FilterSamples_", "")
        sampled[full_name] = suggest(short_name, low, high, **extra)
    return sampled

def evaluate_objective(opt_params, mask_getter, trial_num):
    """Score one parameter set over several train/test split dates.

    For each of the first ``n_reruns`` entries of the notebook-level
    ``split_dates``, a deep copy of ``ls`` is re-split at that date, a
    ``FilterSamples`` is built with ``opt_params``, and ``mask_getter`` supplies
    the train/test masks, which are scored with ``fs.evaluate_mask``.

    Args:
        opt_params: Parameter dictionary passed to ``FilterSamples``.
        mask_getter: Callable taking the ``FilterSamples`` instance and
            returning ``(mask_train, mask_test)``.
        trial_num: Trial number, used for logging only.

    Returns:
        Mean of the natural log of the test scores. Non-finite and
        non-positive scores are replaced by 1.0 (log contribution 0). Returns
        ``-inf`` if no score was collected.
    """
    test_scores = []
    for split_date in split_dates[:n_reruns]:
        # Work on a copy so the notebook-level `ls` keeps its original split.
        ls_rerun = ls.copy(deep=True)
        ls_rerun.split_dataset(
            start_date=start_train_date,
            last_train_date=split_date,
            last_test_date=eval_date,
        )

        fs = FilterSamples(
            Xtree_train=ls_rerun.train_Xtree,
            ytree_train=ls_rerun.train_ytree,
            treenames=ls_rerun.featureTreeNames,
            Xtree_test=ls_rerun.test_Xtree,
            ytree_test=ls_rerun.test_ytree,
            meta_train=ls_rerun.meta_pl_train,
            meta_test=ls_rerun.meta_pl_test,
            params=opt_params,
        )
        mask_train, mask_test = mask_getter(fs)

        logger.info(f"Trial number {trial_num}")
        score_train = fs.evaluate_mask(mask_train, ls_rerun.meta_pl_train['date'], ls_rerun.train_ytree)
        score_test = fs.evaluate_mask(mask_test, ls_rerun.meta_pl_test['date'], ls_rerun.test_ytree)
        logger.info(f"  Score (train) = {score_train}")
        logger.info(f"  Score (test)  = {score_test}")

        test_scores.append(score_test)

    score_arr = np.array(test_scores, dtype=float)
    # Neutralise unusable scores in two steps: non-finite first, then non-positive.
    score_arr = np.where(np.isfinite(score_arr), score_arr, 1.0)
    score_arr = np.where(score_arr > 0, score_arr, 1.0)
    if score_arr.size:
        return float(np.log(score_arr).mean())
    return float("-inf")

# Search spaces consumed by sample_params. Each entry maps a parameter name to
# (kind, low, high, extra keyword arguments for the trial's suggest call).
LINCOMB_SPACE = {
    "FilterSamples_days_to_train_end": ("int", 10, 500, dict(step=10)),
    "FilterSamples_lincomb_lr": ("float", 5e-5, 1e-2, dict(log=True)),
    "FilterSamples_lincomb_epochs": ("int", 50, 1000, dict(step=50)),
    "FilterSamples_lincomb_probs_noise_std": ("float", 0.005, 0.2, dict(log=True)),
    "FilterSamples_lincomb_subsample_ratio": ("float", 0.05, 0.5, dict()),
    "FilterSamples_lincomb_sharpness": ("float", 0.2, 2.5, dict(log=True)),
    "FilterSamples_lincomb_featureratio": ("float", 0.05, 0.99, dict(log=True)),
    "FilterSamples_lincomb_itermax": ("int", 1, 2, dict()),
}

TAYLOR_SPACE = {
    "FilterSamples_days_to_train_end": ("int", 10, 500, dict(step=10)),
    "FilterSamples_taylor_horizon_days": ("int", 5, 60, dict(step=5)),
    "FilterSamples_taylor_roll_window_days": ("int", 5, 60, dict(step=5)),
}

def make_objective(space, mask_getter, static_overrides=None):
    """Build an Optuna objective for one search space / mask strategy.

    Args:
        space: Search-space mapping in the format expected by ``sample_params``.
        mask_getter: Callable forwarded to ``evaluate_objective``.
        static_overrides: Optional fixed parameters applied last, so they win
            over both the notebook-level ``params`` and the sampled values.

    Returns:
        A function ``objective(trial) -> float`` suitable for ``study.optimize``.
    """
    overrides = static_overrides if static_overrides else {}

    def objective(trial: optuna.Trial) -> float:
        # Precedence (lowest to highest): base params, sampled values, overrides.
        trial_params = {**params, **sample_params(trial, space)}
        trial_params.update(overrides)
        return evaluate_objective(trial_params, mask_getter, trial.number)

    return objective

# Objective for the "lincomb" masks; progress output is forced off.
objective_lincomb = make_objective(
    space=LINCOMB_SPACE,
    mask_getter=lambda filter_samples: filter_samples.lincomb_masks(),
    static_overrides={"FilterSamples_lincomb_show_progress": False},
)

# Objective for the "taylor" feature masks; no static overrides.
objective_taylor = make_objective(
    space=TAYLOR_SPACE,
    mask_getter=lambda filter_samples: filter_samples.taylor_feature_masks(),
)

In [17]:
# Pick the objective matching objective_name. An unrecognised name is rejected
# explicitly; otherwise `objective` would stay undefined (or keep a stale value
# from an earlier run of this cell) and the wrong thing could be optimised.
if objective_name == "lincomb":
    objective = objective_lincomb
elif objective_name == "taylor":
    objective = objective_taylor
else:
    raise ValueError(
        f"Unknown objective_name {objective_name!r}; expected 'lincomb' or 'taylor'."
    )

In [None]:
# Let Optuna's own log records propagate to the root logger configured above.
optuna.logging.enable_propagation()

# Create (or resume, if a study with this name already exists in the SQLite
# file) the study and optimise until the time budget is used up.
sampler = optuna.samplers.TPESampler(n_startup_trials=n_startup_trials)
study = optuna.create_study(
    study_name = studyname,
    storage="sqlite:///sandbox_optuna.db",
    direction="maximize",
    load_if_exists=True,
    sampler=sampler,
)
study.optimize(objective, timeout=studytime)

# Collect the trial table first so it is available to later cells even if one
# of the summary steps below has nothing to report.
df: pd.DataFrame = study.trials_dataframe()
completed_trials = study.get_trials(
    deepcopy=False,
    states=(optuna.trial.TrialState.COMPLETE,),
)

# study.best_params / study.best_value raise ValueError when no trial has
# completed (e.g. the timeout expired during the first trial).
if completed_trials:
    logger.info(f"Best parameters: {study.best_params}")
    logger.info(f"Best score: {study.best_value}")
else:
    logger.warning("No trial completed within the timeout; no best parameters available.")

logger.info("\nTrials DataFrame:")
# An empty study yields a frame without a "value" column; log it unsorted then.
df_for_log = df.sort_values("value") if "value" in df.columns else df
logger.info(df_for_log.to_string())

# Parameter importances cannot be evaluated from fewer than two completed trials.
if len(completed_trials) > 1:
    param_importances = optuna.importance.get_param_importances(study)
    logger.info("Parameter Importances:")
    for key, value in param_importances.items():
        logger.info(f"{key}: {value}")
else:
    logger.warning("Fewer than two completed trials; skipping parameter importances.")

In [None]:
# Persist the trials table next to the notebook as "<formatted_str>.parquet",
# without writing the DataFrame index.
df.to_parquet(f"{formatted_str}.parquet", index=False)