In [5]:
import sys
import os

# The notebook imports the project as the `src` package, so the parent of the
# current working directory has to be importable.
project_dir = os.path.abspath(os.pardir)

# Register it only once so re-running the cell does not grow sys.path.
if project_dir not in sys.path:
    sys.path.append(project_dir)
    
from src.predictionModule.LoadupSamples import LoadupSamples
from src.predictionModule.MachineModels import MachineModels 
from src.predictionModule.FilterSamples import FilterSamples

import numpy as np
import datetime
import pandas as pd
import polars as pl
import optuna
import scipy

import logging
# Run-specific timestamp (e.g. "02sep25_1712"); it is embedded in the log file
# name below so each run writes to its own file.
formatted_date = datetime.datetime.now().strftime("%d%b%y_%H%M").lower()

# Configure the root logger so records from every module end up in the notebook.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt="%(asctime)s - %(message)s")

# Re-running this cell must neither stack handlers nor leak the previous log
# file: detach AND close whatever is currently installed on the root logger.
# (Closing a StreamHandler only flushes it; closing a FileHandler releases the file.)
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Console handler: writes to stdout so messages show up in the cell output.
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)
logger.addHandler(handler)

#Output File handler
# mode="w" truncates an existing file with the same (minute-resolution) name.
formatted_str = f"notebook-lstm-optuna-{formatted_date}"
file_handler = logging.FileHandler(f"{formatted_str}.log", mode="w")
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Usage
logger.info("This will print to the notebook's output cell")

2025-09-02 17:12:43,970 - This will print to the notebook's output cell


In [6]:
# Baseline hyperparameters. `objective` copies this dict and overrides a subset
# of the entries per Optuna trial; insertion order is kept as-is because the
# dict is logged verbatim.
params_default = dict(
    # --- sample construction ---
    idxAfterPrediction=5,
    timesteps=90,
    target_option="last",
    LoadupSamples_time_scaling_stretch=True,
    LoadupSamples_time_inc_factor=1,
    # --- training window ---
    Treetime_LSTM_days_to_train=500,
    # --- network architecture and optimisation ---
    LSTM_units=32,
    LSTM_num_layers=2,
    LSTM_dropout=0.001,
    LSTM_recurrent_dropout=0.001,
    LSTM_learning_rate=0.001,
    LSTM_optimizer="adam",
    LSTM_bidirectional=True,
    LSTM_batch_size=2 ** 12,
    LSTM_epochs=10,
    # --- regularisation ---
    LSTM_l1=0.001,
    LSTM_l2=0.001,
    LSTM_inter_dropout=0.001,
    LSTM_input_gaussian_noise=0.001,
    # --- optional convolutional front-end and loss ---
    LSTM_conv1d=True,
    LSTM_conv1d_kernel_size=3,
    LSTM_loss="mse",
)

In [None]:
# --- Data selection ---------------------------------------------------------
stock_group = "group_regOHLCV_over5years"
treegroup = "group_finanTo2011"
start_date = datetime.date(2016, 1, 1)
eval_date = datetime.date(2025, 7, 13)

# --- Evaluation splits ------------------------------------------------------
# Each rerun moves the train/test boundary `time_delta_days` further back from
# `split_date`; the test window after each boundary spans `n_testdays` days.
split_date = datetime.date(2025, 1, 1)
n_reruns = 3
time_delta_days = 25
n_testdays = 90
split_dates = [
    split_date - datetime.timedelta(days=time_delta_days * rerun)
    for rerun in range(n_reruns)
]

# --- Optuna study -----------------------------------------------------------
studytime = 1 * 60 * 60  # passed as `timeout` to study.optimize
studyname = f"sandbox_lstm_optuna_{formatted_str}"
n_startup_trials = 7
q_full = 0.986  # prediction quantile used to select the top/bottom test rows
device = "cuda"

In [8]:
# Load the full sample set once; `objective` deep-copies `ls` for every split
# instead of reloading from disk.
# NOTE(review): the recorded output of this cell reports that test date
# 2025-07-13 was not found in the database and was omitted — confirm that
# `eval_date` is meant to be a date present in the data.
ls = LoadupSamples(
    params=params_default,
    treegroup=treegroup,
    timegroup=stock_group,
    train_start_date=start_date,
    test_dates=[eval_date],
)
ls.load_samples(main_path="../src/featureAlchemy/bin/")

def objective(trial: optuna.Trial) -> float:
    """Optuna objective: train an LSTM on several date splits and score its top-quantile picks.

    Hyperparameters are sampled from ``trial`` on top of a copy of the
    module-level ``params_default``. For every date in ``split_dates[:n_reruns]``
    a deep copy of the module-level ``ls`` is split into train/test, an LSTM is
    trained via ``MachineModels.run_LSTM_torch`` and the test rows whose
    prediction lies at or above the ``q_full`` quantile are selected.

    Relies on these notebook globals: ``params_default``, ``ls``, ``split_dates``,
    ``n_reruns``, ``n_testdays``, ``start_date``, ``q_full``, ``device``, ``logger``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean natural log of the per-split scores, where a score is the geometric
        mean of ``target_ratio`` over the selected rows raised to
        ``1 / idxAfterPrediction``. Non-finite scores are replaced by 1.0 and
        scores <= 1e-5 are clamped to 1e-5 before taking the log. ``-inf`` if no
        split was evaluated.
    """

    def quant_dis_in_mask(mask: np.ndarray, q: float) -> float:
        """Return the q-quantile of index gaps between consecutive True entries of ``mask``.

        With fewer than two True entries there is no gap to measure; the mask
        length is returned as a "maximal distance" sentinel. (Previously only the
        zero-True case was guarded, so a single True entry reached np.quantile
        with an empty array.)
        """
        selected_idx = np.where(mask)[0]
        if selected_idx.size < 2:
            return len(mask)
        return np.quantile(np.abs(np.diff(selected_idx)), q)

    # --- Sample / fix hyperparameters for this trial -------------------------
    opt_params = params_default.copy()
    opt_params["idxAfterPrediction"] = 5
    opt_params["LoadupSamples_time_inc_factor"] = trial.suggest_int("LoadupSamples_time_inc_factor", 21, 81, step=5)
    opt_params["timesteps"] = trial.suggest_int("timesteps", 30, 90, step=5)
    opt_params["Treetime_LSTM_days_to_train"] = trial.suggest_int("Treetime_LSTM_days_to_train", 300, 2500, step=100)
    opt_params["LSTM_units"] = 32
    opt_params["LSTM_num_layers"] = trial.suggest_int("LSTM_num_layers", 1, 2)
    opt_params["LSTM_learning_rate"] = trial.suggest_float("LSTM_learning_rate", 1e-5, 1e-3, log=True)
    opt_params["LSTM_epochs"] = 2
    opt_params["LSTM_l1"] = trial.suggest_float("LSTM_l1", 1e-5, 1e-1, log=True)
    opt_params["LSTM_l2"] = trial.suggest_float("LSTM_l2", 1e-4, 5e-1, log=True)
    opt_params["LSTM_dropout"] = trial.suggest_float("LSTM_dropout", 1e-4, 1e-1, log=True)
    opt_params["LSTM_inter_dropout"] = trial.suggest_float("LSTM_inter_dropout", 1e-4, 1e-1, log=True)
    opt_params["LSTM_recurrent_dropout"] = trial.suggest_float("LSTM_recurrent_dropout", 1e-4, 1e-1, log=True)
    opt_params["LSTM_conv1d_kernel_size"] = 3
    opt_params["is_single_feature"] = trial.suggest_categorical("is_single_feature", [False, True])

    scores = []
    for sd in split_dates[:n_reruns]:
        # Each split works on its own deep copy: the split and the ytime
        # overrides below are applied to the copy, not to the shared `ls`.
        lsc = ls.copy(deep=True)
        last_test_day = sd + datetime.timedelta(days=n_testdays)
        lsc.split_dataset(
            start_date=start_date,
            last_train_date=sd,
            last_test_date=last_test_day
        )

        # Rebuild the LSTM targets from the tree targets: centre at 1.0, scale by
        # the sampled factor and squash with tanh into the open interval (0, 1).
        time_factor = opt_params["LoadupSamples_time_inc_factor"]
        lsc.train_ytime = np.tanh((lsc.train_ytree - 1.0) * time_factor) / 2.0 + 0.5
        lsc.test_ytime = np.tanh((lsc.test_ytree - 1.0) * time_factor) / 2.0 + 0.5

        fs_pre = FilterSamples(
            Xtree_train=lsc.train_Xtree,
            ytree_train=lsc.train_ytree,
            treenames=lsc.featureTreeNames,
            Xtree_test=lsc.test_Xtree,
            ytree_test=lsc.test_ytree,
            meta_train=lsc.meta_pl_train,
            meta_test=lsc.meta_pl_test,
            params=opt_params
        )

        cat_mask_train, cat_mask_test = fs_pre.categorical_masks()
        days_reduced = fs_pre.get_recent_training_mask(opt_params["Treetime_LSTM_days_to_train"])

        # Training rows must pass both the recency mask and the categorical mask.
        train_mask = days_reduced & cat_mask_train
        Xtrain = lsc.train_Xtime[train_mask]
        ytrain = lsc.train_ytime[train_mask]
        # NOTE(review): the None fallbacks are kept from the original, but Xtest is
        # sliced unconditionally below, so a missing test set still fails there.
        Xtest = lsc.test_Xtime[cat_mask_test] if lsc.test_Xtime is not None else None
        ytest = lsc.test_ytime[cat_mask_test] if lsc.test_ytime is not None else None

        true_res = lsc.meta_pl_test.filter(pl.Series(cat_mask_test))

        # Keep only the most recent `timesteps` entries along the second axis.
        Xtrain = Xtrain[:, -opt_params["timesteps"]:, :]
        Xtest = Xtest[:, -opt_params["timesteps"]:, :]

        if opt_params["is_single_feature"]:
            # Keep the last axis but restrict it to the first feature.
            Xtrain = Xtrain[:, :, [0]]
            Xtest = Xtest[:, :, [0]]

        mm = MachineModels(opt_params)

        starttime0 = datetime.datetime.now()
        model_lstm0, res_dict0 = mm.run_LSTM_torch(Xtrain, ytrain, Xtest, ytest, device=device)
        preds_test0 = mm.predict_LSTM_torch(model_lstm0, Xtest, batch_size=opt_params["LSTM_batch_size"], device=device)
        endtime0 = datetime.datetime.now()

        # final mask: top and bottom (1 - q0) tails of the test predictions
        q0 = q_full
        mask_pred0_test_above = (preds_test0 >= np.quantile(preds_test0,   q0))
        mask_pred0_test_below = (preds_test0 <= np.quantile(preds_test0, 1-q0))

        true_res_masked_above = true_res.filter(pl.Series(mask_pred0_test_above))
        true_res_masked_below = true_res.filter(pl.Series(mask_pred0_test_below))

        # Geometric means are computed once and reused for both score and logging.
        gmean_all = scipy.stats.gmean(true_res['target_ratio'].to_numpy())
        gmean_above = scipy.stats.gmean(true_res_masked_above['target_ratio'].to_numpy())
        gmean_below = scipy.stats.gmean(true_res_masked_below['target_ratio'].to_numpy())

        # Take the idxAfterPrediction-th root of the selected rows' geometric mean.
        score = gmean_above ** (1/opt_params["idxAfterPrediction"])
        scores.append(score)

        # Log some results
        dist_above = quant_dis_in_mask(mask_pred0_test_above, 0.99)
        dist_below = quant_dis_in_mask(mask_pred0_test_below, 0.99)

        logger.info(f"Trial {trial.number} with params: {opt_params}")
        logger.info(f"  q0: {q0:.4f}")
        logger.info(f"  Duration0: {endtime0 - starttime0}")
        logger.info(f"  Val RMSE0 adjusted: {res_dict0['val_rmse']/opt_params['LoadupSamples_time_inc_factor']:.4f}")
        logger.info(f"  Mean all prediction: {gmean_all:.4f}")
        logger.info(f"  Mean above prediction: {gmean_above:.4f}")
        logger.info(f"  Mean below prediction: {gmean_below:.4f}")
        logger.info(f"  Quantile 0.99 for distance in mask above: {dist_above}")
        logger.info(f"  Quantile 0.99 for distance in mask below: {dist_below}")
        logger.info(f"  Ratio for quantile-distance-to-length above: {dist_above / len(mask_pred0_test_above):.4f}")
        logger.info(f"  Ratio for quantile-distance-to-length below: {dist_below / len(mask_pred0_test_below):.4f}")
        logger.info(f"  Score: {score:.4f}")

    # Aggregate in log space: NaN/inf scores (e.g. an empty selection) count as a
    # neutral 1.0, and tiny/non-positive scores are clamped so the log is defined.
    s = np.array(scores, dtype=float)
    s[~np.isfinite(s)] = 1.0
    s[s <= 1e-5] = 1e-5
    return float(np.log(s).mean()) if s.size else float("-inf")

2025-09-02 17:13:01,647 - Test date 2025-07-13 not found in the database. Omitting.


In [None]:
# Route Optuna's log records through the root logger (notebook output + log
# file) and switch off Optuna's own stderr handler; with both active every
# Optuna message would be shown twice in the cell output.
optuna.logging.enable_propagation()
optuna.logging.disable_default_handler()

# TPE falls back to random sampling for the first `n_startup_trials` trials.
sampler = optuna.samplers.TPESampler(n_startup_trials=n_startup_trials)
study = optuna.create_study(
    study_name = studyname,
    storage="sqlite:///sandbox_optuna.db",
    direction="maximize",
    load_if_exists=True,
    sampler=sampler,
)
# Keep starting new trials until `studytime` seconds have elapsed.
study.optimize(objective, timeout=studytime)

logger.info(f"Best parameters: {study.best_params}")
logger.info(f"Best score: {study.best_value}")

# Full trial table, sorted ascending by objective value (best trials last).
df: pd.DataFrame = study.trials_dataframe()
logger.info("\nTrials DataFrame:")
logger.info(df.sort_values("value").to_string())

param_importances = optuna.importance.get_param_importances(study)
logger.info("Parameter Importances:")
for key, value in param_importances.items():
    logger.info(f"{key}: {value}")

In [None]:
# Persist the trials table under the same run-specific stem as the log file.
df.to_parquet(
    f"{formatted_str}.parquet",
    index=False,
)