In [1]:
import sys
import os

# Get the absolute path to the project directory
project_dir = os.path.abspath("..")

# Append the project directory to sys.path
if project_dir not in sys.path:
    sys.path.append(project_dir)
    
from src.predictionModule.LoadupSamples import LoadupSamples
from src.predictionModule.MachineModels import MachineModels 

import numpy as np
import datetime
import pandas as pd
import polars as pl
import optuna

import logging
formatted_date = datetime.datetime.now().strftime("%d%b%y_%H%M").lower()

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter(fmt="%(asctime)s - %(message)s")
handler.setFormatter(formatter)
if not logger.hasHandlers():
    logger.addHandler(handler)
else:
    logger.handlers[:] = [handler]

#Output File handler
formatted_str = f"notebook-lstm-optuna-{formatted_date}"
file_handler = logging.FileHandler(f"{formatted_str}.log", mode="w")
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Usage
logger.info("This will print to the notebook's output cell")

2025-08-19 18:07:09,219 - This will print to the notebook's output cell


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Default params (keeps previous behavior where params corresponds to idxAfterPrediction=5)
params_default = {
    "idxAfterPrediction": 5,
    'timesteps': 90,
    'target_option': 'last',
    "LoadupSamples_time_scaling_stretch": True,
    "LoadupSamples_time_inc_factor": 10,
    
    "LSTM_units": 32,
    "LSTM_num_layers": 3,
    "LSTM_dropout": 0.001,
    "LSTM_recurrent_dropout": 0.001,
    "LSTM_learning_rate": 0.001,
    "LSTM_optimizer": "adam",
    "LSTM_bidirectional": True,
    "LSTM_batch_size": 2**12,
    "LSTM_epochs": 10,
    "LSTM_l1": 0.001,
    "LSTM_l2": 0.001,
    "LSTM_inter_dropout": 0.001,
    "LSTM_input_gaussian_noise": 0.001,
    "LSTM_conv1d": True,
    "LSTM_conv1d_kernel_size": 3,
    "LSTM_loss": "mse",
}

In [3]:
stock_group = "group_regOHLCV_over5years"
eval_date = datetime.date(year=2025, month=6, day=13)
split_date = datetime.date(year=2023, month=12, day=31)

studytime = 60*60*1
studyname = f"sandbox_lstm_optuna_{formatted_str}"

In [4]:
def objective(trial: optuna.Trial) -> float:
    opt_params = params_default.copy()
    opt_params["year_start"] = trial.suggest_int("year_start", 2019, 2023)
    opt_params["idxAfterPrediction"] = trial.suggest_int("idxAfterPrediction", 3, 5, step=1)
    opt_params["LoadupSamples_time_inc_factor"] = trial.suggest_int("LoadupSamples_time_inc_factor", 21, 71, step=5)
    opt_params["timesteps"] = trial.suggest_int("timesteps", 50, 90, step=5)
    opt_params["LSTM_units"] = 16
    opt_params["LSTM_num_layers"] = 1
    opt_params["LSTM_learning_rate"] = trial.suggest_float("LSTM_learning_rate", 1e-5, 1e-3, log=True)
    opt_params["LSTM_epochs"] = 2
    opt_params["LSTM_l1"] = trial.suggest_float("LSTM_l1", 1e-5, 1e-3, log=True)
    opt_params["LSTM_l2"] = trial.suggest_float("LSTM_l2", 1e-4, 1e-1, log=True)
    opt_params["LSTM_dropout"] = trial.suggest_float("LSTM_dropout", 1e-5, 1e-3, log=True)
    opt_params["LSTM_inter_dropout"] = trial.suggest_float("LSTM_inter_dropout", 1e-4, 1e-1, log=True)
    opt_params["LSTM_recurrent_dropout"] = trial.suggest_float("LSTM_recurrent_dropout", 1e-4, 1e-1, log=True)
    opt_params["LSTM_conv1d_kernel_size"] = 3
    opt_params["is_single_feature"] = trial.suggest_categorical("is_single_feature", [True, False])

    ls = LoadupSamples(
        train_start_date=datetime.date(year=opt_params["year_start"], month=1, day=1),
        test_dates=[eval_date],
        treegroup=None,
        timegroup=stock_group,
        params=opt_params,
    )
    ls.load_samples(main_path = "../src/featureAlchemy/bin/")

    ls.split_dataset(
        start_date=datetime.date(year=opt_params["year_start"], month=1, day=1),
        last_train_date=split_date,
        last_test_date=eval_date,
    )

    Xtrain = ls.train_Xtime
    ytrain = ls.train_ytime
    Xtest = ls.test_Xtime
    ytest = ls.test_ytime

    true_res = ls.meta_pl_test

    Xtrain = Xtrain[:, -opt_params["timesteps"]:, :]
    Xtest = Xtest[:, -opt_params["timesteps"]:, :]

    if opt_params["is_single_feature"]:
        Xtrain = Xtrain[:, :, [0]]
        Xtest = Xtest[:, :, [0]]

    mm = MachineModels(opt_params)

    starttime0 = datetime.datetime.now()
    model_lstm0, res_dict0 = mm.run_LSTM_torch(Xtrain, ytrain, Xtest, ytest, device="cuda")
    preds_train0 = mm.predict_LSTM_torch(model_lstm0, Xtrain, batch_size=opt_params["LSTM_batch_size"], device="cuda")
    preds_test0 = mm.predict_LSTM_torch(model_lstm0, Xtest, batch_size=opt_params["LSTM_batch_size"], device="cuda")
    endtime0 = datetime.datetime.now()

    q = 0.90
    mask_train_above0 = (preds_train0 >= np.quantile(preds_train0, q))
    mask_pred_above0 = (preds_test0 >= np.quantile(preds_test0, q))
    mask_pred_below0 = (preds_test0 < np.quantile(preds_test0, q))
    true_res_masked_0 = true_res.filter(pl.Series(mask_pred_above0))
    true_res_masked_below0 = true_res.filter(pl.Series(mask_pred_below0))

    #run again
    starttime1 = datetime.datetime.now()
    model_lstm1, res_dict1 = mm.run_LSTM_torch(Xtrain[mask_train_above0], ytrain[mask_train_above0], Xtest[mask_pred_above0], ytest[mask_pred_above0], device="cuda")
    preds_test1 = mm.predict_LSTM_torch(model_lstm1, Xtest[mask_pred_above0], batch_size=opt_params["LSTM_batch_size"], device="cuda")
    endtime1 = datetime.datetime.now()

    mask_pred_above1 = (preds_test1 >= np.quantile(preds_test1, q))
    mask_pred_below1 = (preds_test1 < np.quantile(preds_test1, q))
    true_res_masked_1 = true_res_masked_0.filter(pl.Series(mask_pred_above1))
    true_res_masked_below1 = true_res_masked_below0.filter(pl.Series(mask_pred_below1))

    score = (np.mean(true_res_masked_1['target_ratio'].to_numpy())) ** (1/opt_params["idxAfterPrediction"])

    # Log some results
    def quant_dis_in_mask(mask: np.ndarray, q: float) -> int:
        if not mask.any():
            return len(mask)
        return np.quantile(np.abs(np.diff(np.where(mask)[0])), q)

    full_mask = mask_pred_above0.copy()
    full_mask[mask_pred_above0] = mask_pred_above1

    logger.info(f"Trial {trial.number} with params: {opt_params}")
    logger.info(f"  Duration0: {endtime0 - starttime0}")
    logger.info(f"  Duration1: {endtime1 - starttime1}")
    logger.info(f"  Val RSME0 adjusted: {res_dict0['val_rmse']/opt_params['LoadupSamples_time_inc_factor']:.4f}")
    logger.info(f"  Val RSME1 adjusted: {res_dict1['val_rmse']/opt_params['LoadupSamples_time_inc_factor']:.4f}")
    logger.info(f"  Mean all prediction: {np.mean(true_res['target_ratio'].to_numpy()):.4f}")
    logger.info(f"  Mean above prediction0: {np.mean(true_res_masked_0['target_ratio'].to_numpy()):.4f}")
    logger.info(f"  Mean below prediction0: {np.mean(true_res_masked_below0['target_ratio'].to_numpy()):.4f}")
    logger.info(f"  Mean above prediction1: {np.mean(true_res_masked_1['target_ratio'].to_numpy()):.4f}")
    logger.info(f"  Mean below prediction1: {np.mean(true_res_masked_below1['target_ratio'].to_numpy()):.4f}")
    logger.info(f"  Quantile 0.99 for distance in mask above: {quant_dis_in_mask(full_mask, 0.99)}")
    logger.info(f"  Quantile 0.99 for distance in mask below: {quant_dis_in_mask(full_mask, 0.99)}")
    logger.info(f"  Ratio for quantile-distance-to-length above: {quant_dis_in_mask(full_mask, 0.99) / len(full_mask):.4f}")
    logger.info(f"  Ratio for quantile-distance-to-length below: {quant_dis_in_mask(full_mask, 0.99) / len(full_mask):.4f}")
    logger.info(f"  Score: {score:.4f}")

    return score

optuna.logging.enable_propagation()
sampler = optuna.samplers.TPESampler(n_startup_trials=15)
study = optuna.create_study(
    study_name = studyname,
    storage="sqlite:///sandbox_optuna.db",
    direction="maximize",
    load_if_exists=True,
    sampler=sampler,
)
study.optimize(objective, timeout=studytime)

logger.info(f"Best parameters: {study.best_params}")
logger.info(f"Best score: {study.best_value}")

df: pd.DataFrame = study.trials_dataframe()
logger.info("\nTrials DataFrame:")
logger.info(df.sort_values("value").to_string())

param_importances = optuna.importance.get_param_importances(study)
logger.info("Parameter Importances:")
for key, value in param_importances.items():
    logger.info(f"{key}: {value}")

[I 2025-08-19 18:07:10,801] A new study created in RDB with name: sandbox_lstm_optuna_notebook-lstm-optuna-19aug25_1807


2025-08-19 18:07:10,801 - A new study created in RDB with name: sandbox_lstm_optuna_notebook-lstm-optuna-19aug25_1807


[W 2025-08-19 18:07:22,120] Trial 0 failed with parameters: {'year_start': 2023, 'idxAfterPrediction': 4, 'LoadupSamples_time_inc_factor': 41, 'timesteps': 70, 'LSTM_learning_rate': 0.000649918985645458, 'LSTM_l1': 0.00047753902002221107, 'LSTM_l2': 0.0002814140456861457, 'LSTM_dropout': 0.0004739446332724232, 'LSTM_inter_dropout': 0.0001857324807671344, 'LSTM_recurrent_dropout': 0.041310111798629444, 'is_single_feature': False} because of the following error: AssertionError('Torch not compiled with CUDA enabled').
Traceback (most recent call last):
  File "c:\Users\KILightTouch\Desktop\RandomOdyssey\.venv\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\KILightTouch\AppData\Local\Temp\ipykernel_9688\1869135689.py", line 51, in objective
    model_lstm0, res_dict0 = mm.run_LSTM_torch(Xtrain, ytrain, Xtest, ytest, device="cuda")
                             ^^^^^^^^^^^^^^^^^^^^^^^^^

2025-08-19 18:07:22,120 - Trial 0 failed with parameters: {'year_start': 2023, 'idxAfterPrediction': 4, 'LoadupSamples_time_inc_factor': 41, 'timesteps': 70, 'LSTM_learning_rate': 0.000649918985645458, 'LSTM_l1': 0.00047753902002221107, 'LSTM_l2': 0.0002814140456861457, 'LSTM_dropout': 0.0004739446332724232, 'LSTM_inter_dropout': 0.0001857324807671344, 'LSTM_recurrent_dropout': 0.041310111798629444, 'is_single_feature': False} because of the following error: AssertionError('Torch not compiled with CUDA enabled').
Traceback (most recent call last):
  File "c:\Users\KILightTouch\Desktop\RandomOdyssey\.venv\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\KILightTouch\AppData\Local\Temp\ipykernel_9688\1869135689.py", line 51, in objective
    model_lstm0, res_dict0 = mm.run_LSTM_torch(Xtrain, ytrain, Xtest, ytest, device="cuda")
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^

[W 2025-08-19 18:07:22,146] Trial 0 failed with value None.


2025-08-19 18:07:22,146 - Trial 0 failed with value None.


AssertionError: Torch not compiled with CUDA enabled

In [None]:
df.to_parquet(f"sandbox_lstm_optuna_{formatted_str}.parquet", index=False)

In [None]:
# Run with best parameters
best_params = {**params_default, **study.best_params.copy()}
best_params["LSTM_units"] = 16
best_params["LSTM_epochs"] = 2
best_params["LSTM_conv1d_kernel_size"] = 3
best_params["LSTM_num_layers"] = 1

ls = LoadupSamples(
    train_start_date=datetime.date(year=best_params["year_start"], month=1, day=1),
    test_dates=[eval_date],
    group=stock_group,
    group_type="Time",
    params=best_params,
)
ls.load_samples(main_path="../src/featureAlchemy/bin/")

ls.split_dataset(
    start_date=datetime.date(year=best_params["year_start"], month=1, day=1),
    last_train_date=split_date,
    last_test_date=eval_date,
)

Xtrain = ls.train_Xtime
ytrain = ls.train_ytime
Xtest = ls.test_Xtime
ytest = ls.test_ytime

Xtrain = Xtrain[:, -best_params["timesteps"]:, :]
Xtest = Xtest[:, -best_params["timesteps"]:, :]

if best_params["is_single_feature"]:
    Xtrain = Xtrain[:, :, [0]]
    Xtest = Xtest[:, :, [0]]

mm = MachineModels(best_params)
model_lstm, res_dict = mm.run_LSTM_torch(Xtrain, ytrain, Xtest, ytest, device="cuda")
preds = mm.predict_LSTM_torch(model_lstm, Xtest, batch_size=best_params["LSTM_batch_size"], device="cuda")



2025-08-16 18:18:25,292 - NaN values found in training time features. 28 Samples removed.


Epochs:   0%|          | 0/2 [00:00<?, ?it/s]

2025-08-16 18:18:42,097 - Epoch 1/2 — Train RMSE: 0.7566 — Validation RMSE: 0.6959


Epochs:  50%|█████     | 1/2 [00:11<00:11, 11.01s/it]

2025-08-16 18:18:53,391 - Epoch 2/2 — Train RMSE: 0.6707 — Validation RMSE: 0.6077


Epochs: 100%|██████████| 2/2 [00:22<00:00, 11.15s/it]


In [None]:
q = 0.98
mask_pred_above = (preds >= np.quantile(preds, q))
mask_pred_below = (preds <= np.quantile(preds, 1-q))
true_res = ls.meta_pl_test
true_res_masked_above = true_res.filter(pl.Series(mask_pred_above))
true_res_masked_below = true_res.filter(pl.Series(mask_pred_below))
score = (np.mean(true_res_masked_above['target_ratio'].to_numpy())) ** (1/best_params["idxAfterPrediction"])
# Log some results
def quant_dis_in_mask(mask: np.ndarray, q: float) -> int:
    if not mask.any():
        return len(mask)
    return np.quantile(np.abs(np.diff(np.where(mask)[0])), q)
logger.info(f"  Val RSME adjusted: {res_dict['val_rmse']/best_params['LoadupSamples_time_inc_factor']:.4f}")
logger.info(f"  Mean all prediction: {np.mean(true_res['target_ratio'].to_numpy()):.4f}")
logger.info(f"  Mean above prediction: {np.mean(true_res_masked_above['target_ratio'].to_numpy()):.4f}")
logger.info(f"  Mean below prediction: {np.mean(true_res_masked_below['target_ratio'].to_numpy()):.4f}")
logger.info(f"  Quantile 0.99 in mask above: {quant_dis_in_mask(mask_pred_above, 0.99)}")
logger.info(f"  Quantile 0.99 in mask below: {quant_dis_in_mask(mask_pred_below, 0.99)}")
logger.info(f"  Ratio for quantile to length above: {quant_dis_in_mask(mask_pred_above, 0.99) / len(mask_pred_above):.4f}")
logger.info(f"  Ratio for quantile to length below: {quant_dis_in_mask(mask_pred_below, 0.99) / len(mask_pred_below):.4f}")
logger.info(f"  Score: {score:.4f}")

2025-08-16 18:33:38,602 -   Val RSME adjusted: 0.0132
2025-08-16 18:33:38,604 -   Mean all prediction: 1.0010
2025-08-16 18:33:38,605 -   Mean above prediction: 1.0101
2025-08-16 18:33:38,605 -   Mean below prediction: 1.0001
2025-08-16 18:33:38,608 -   Quantile 0.99 in mask above: 450.27999999999884
2025-08-16 18:33:38,609 -   Quantile 0.99 in mask below: 427.0
2025-08-16 18:33:38,611 -   Ratio for quantile to length above: 0.0007
2025-08-16 18:33:38,612 -   Ratio for quantile to length below: 0.0006
2025-08-16 18:33:38,612 -   Score: 1.0025
