In [2]:
import sys
import os

# Get the absolute path to the project directory
project_dir = os.path.abspath("..")

# Append the project directory to sys.path
if project_dir not in sys.path:
    sys.path.append(project_dir)
    
from src.predictionModule.LoadupSamples import LoadupSamples
from src.predictionModule.FilterSamples import FilterSamples

import numpy as np
import polars as pl
import datetime
import random
import matplotlib.pyplot as plt

import logging
# Timestamp shared by the log file name and the parquet export at the end of the notebook.
formatted_date = datetime.datetime.now().strftime("%d%b%y_%H%M").lower()

# Configure the ROOT logger so that records from every module end up in the same sinks.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt="%(asctime)s - %(message)s")

# Re-running this cell must neither stack handlers (duplicated lines) nor leak the
# previously opened log file: detach AND close whatever is currently attached.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Console handler: writes to stdout, i.e. the notebook's output cell.
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)
logger.addHandler(handler)

#Output File handler
formatted_str = f"notebook-lstm-optuna-{formatted_date}"
file_handler = logging.FileHandler(f"{formatted_str}.log", mode="w")
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Usage
logger.info("This will print to the notebook's output cell")

2025-07-11 22:31:40,700 - This will print to the notebook's output cell


In [None]:
# Baseline configuration. The dict is handed both to LoadupSamples (merged with the
# Optuna-suggested overrides, which win on key clashes) and to run(), which reads
# the "TreeTime_*" entries.
params = {
    # --- sample construction ---
    "idxAfterPrediction": 5,
    "timesteps": 60,
    "target_option": "last",

    # --- LSTM architecture ---
    "TreeTime_lstm_units": 32,
    "TreeTime_lstm_num_layers": 3,
    "TreeTime_lstm_dropout": 1e-5,
    "TreeTime_lstm_recurrent_dropout": 1e-5,
    # --- optimisation ---
    "TreeTime_lstm_learning_rate": 1e-3,
    "TreeTime_lstm_optimizer": "adam",
    "TreeTime_lstm_bidirectional": True,
    "TreeTime_lstm_batch_size": 4096,
    "TreeTime_lstm_epochs": 20,
    # --- regularisation ---
    "TreeTime_lstm_l1": 1e-5,
    "TreeTime_lstm_l2": 1e-5,
    "TreeTime_inter_dropout": 1e-5,
    "TreeTime_input_gaussian_noise": 1e-5,
    # --- optional convolutional front-end and loss ---
    "TreeTime_lstm_conv1d": True,
    "TreeTime_lstm_conv1d_kernel_size": 3,
    "TreeTime_lstm_loss": "mse",
}

In [4]:
stock_group = "group_snp500_finanTo2011"

# The jitter below decides which date the whole study is evaluated on, so it has to be
# reproducible across kernel restarts. A private, seeded generator is used instead of the
# global `random` state so that other cells cannot perturb (or be perturbed by) it.
EVAL_DATE_SEED = 42
_eval_date_rng = random.Random(EVAL_DATE_SEED)

# One evaluation date per index i: 2025-02-01 minus i*60 days, jittered by +/-10 days.
eval_dates = sorted(
    datetime.date(2025, 2, 1) - datetime.timedelta(days=i * 60 + _eval_date_rng.randint(-10, 10))
    for i in range(1)
)

# Training window: go back `years_back` * 365 days from each evaluation date, then snap
# to January 1st of the year that lands in.
years_back = 9
start_Dates = [eval_date - datetime.timedelta(days=365 * years_back) for eval_date in eval_dates]
start_Dates = [datetime.date(year=start_date.year, month=1, day=1) for start_date in start_Dates]

In [5]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm, trange
import shap

class TreeTimeLSTM(nn.Module):
    """Sequence regressor: optional Conv1d front-end -> LSTM stack -> linear head.

    The forward pass consumes a batch-first tensor ``(batch, seq, features)`` and
    returns one value per sequence with shape ``(batch, 1)``.

    Args:
        input_size: Number of features per time step.
        lstm_units: Hidden size of the LSTM (and channel count of the Conv1d, if used).
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout between stacked LSTM layers; forced to 0 when
            ``num_layers == 1``.
        recurrent_dropout: Accepted for interface compatibility but not used by
            this implementation.
        bidirectional: Whether the LSTM runs in both directions (doubles the
            width fed to the output layer).
        l1, l2: Stored on the module only; the penalties themselves are not
            applied inside this class.
        use_conv1d: Insert a Conv1d over the time axis before the LSTM.
        conv_kernel: Kernel size of that convolution. Padding is
            ``conv_kernel // 2``, which keeps the sequence length for odd kernels
            (an even kernel lengthens the sequence by one step).
        noise_std: Standard deviation of Gaussian noise added to the input
            while the module is in training mode.
        inter_dropout: Dropout probability applied between the LSTM and the
            output layer; no dropout layer is created when it is not > 0.
    """

    def __init__(self, 
                 input_size,
                 lstm_units,
                 num_layers,
                 dropout,
                 recurrent_dropout,
                 bidirectional,
                 l1=0.0,
                 l2=0.0,
                 use_conv1d=False,
                 conv_kernel=3,
                 noise_std=0.0,
                 inter_dropout=0.0):
        super().__init__()
        self.use_conv1d = use_conv1d
        self.noise_std = noise_std
        self.inter_dropout = inter_dropout

        if use_conv1d:
            self.conv1d = nn.Conv1d(
                in_channels=input_size,
                out_channels=lstm_units,
                kernel_size=conv_kernel,
                padding=conv_kernel//2
            )
            # The LSTM now sees the convolution's channels instead of the raw features.
            input_size = lstm_units

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=lstm_units,
            num_layers=num_layers,
            # Inter-layer dropout is meaningless (and warned about) with a single layer.
            dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=bidirectional,
            batch_first=True
        )
        self.dropout = nn.Dropout(inter_dropout) if inter_dropout > 0 else None
        self.output = nn.Linear(
            lstm_units * (2 if bidirectional else 1),
            1
        )
        self.l1 = l1
        self.l2 = l2

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, features) to predictions of shape (batch, 1)."""
        # Input noise is a training-time regulariser only: in eval mode it would make
        # validation metrics and predictions non-deterministic.
        if self.training and self.noise_std > 0:
            x = x + torch.randn_like(x) * self.noise_std
        if self.use_conv1d:
            # Conv1d convolves over the last axis, so move time there and back.
            x = x.transpose(1, 2)
            x = self.conv1d(x)
            x = x.transpose(1, 2)
        out, _ = self.lstm(x)
        # NOTE(review): with bidirectional=True the backward half of this slice has only
        # processed the final time step; confirm that this is the intended summary.
        out_last = out[:, -1, :]
        if self.dropout is not None:
            out_last = self.dropout(out_last)
        return self.output(out_last)

# Loss functions
def quantile_loss(q):
    """Return a loss callable ``loss_fn(y_pred, y_true)`` for quantile level ``q``.

    For the residual ``e = y_true - y_pred`` the callable returns the mean over
    all elements of ``max(q * e, (q - 1) * e)``.
    """
    def loss_fn(y_pred, y_true):
        residual = y_true - y_pred
        weighted_under = q * residual
        weighted_over = (q - 1) * residual
        return torch.maximum(weighted_under, weighted_over).mean()
    return loss_fn

def r2_metric(y_pred, y_true):
    """Coefficient of determination of ``y_pred`` against ``y_true``.

    Computed as ``1 - SS_res / (SS_tot + 1e-6)``; the small constant keeps the
    division finite when ``y_true`` has no variance.
    """
    residual_ss = ((y_true - y_pred) ** 2).sum()
    centered_truth = y_true - y_true.mean()
    total_ss = (centered_truth ** 2).sum()
    return 1 - residual_ss / (total_ss + 1e-6)

def neg_r2_loss(y_pred, y_true):
    """Negated R² score, so that minimising this loss maximises R²."""
    score = r2_metric(y_pred, y_true)
    return -score


def run(params, train_Xtime, train_ytime, training_ratio=0.95, device='cpu'):
    """Train a TreeTimeLSTM on a leading slice of the data and validate on the rest.

    Args:
        params: Hyperparameter dict. Required keys: ``TreeTime_lstm_units``,
            ``_num_layers``, ``_dropout``, ``_recurrent_dropout``, ``_learning_rate``,
            ``_optimizer`` (``'rmsprop'`` selects RMSprop, anything else Adam),
            ``_bidirectional``, ``_batch_size``, ``_epochs`` and ``_loss``
            (``'mse'``, ``'r2'``, or a name of the form ``"<prefix>_<k>"`` which
            selects the quantile loss with ``q = k / 10``). Optional keys
            (default 0 / False / 3): ``TreeTime_lstm_l1``, ``TreeTime_lstm_l2``,
            ``TreeTime_inter_dropout``, ``TreeTime_input_gaussian_noise``,
            ``TreeTime_lstm_conv1d``, ``TreeTime_lstm_conv1d_kernel_size``.
        train_Xtime: Array-like whose first axis indexes samples and whose last
            axis indexes features.
        train_ytime: Targets aligned with ``train_Xtime`` along the first axis.
        training_ratio: Fraction of samples (taken from the front, order
            preserved) used for training; the remainder is the validation set.
        device: Torch device string the model and batches are moved to.

    Returns:
        ``(best_rmse, model)`` where ``best_rmse`` is the lowest per-epoch mean of
        the per-batch validation RMSEs and ``model`` carries the weights of that
        epoch. If no epoch ever improved on ``inf`` (e.g. zero epochs or NaN
        validation RMSE) the model is returned with its current weights.

    Raises:
        ZeroDivisionError: if the validation split is empty.
    """
    # Hyperparameters
    lstm_units = params['TreeTime_lstm_units']
    num_layers = params['TreeTime_lstm_num_layers']
    dropout = params['TreeTime_lstm_dropout']
    recurrent_dropout = params['TreeTime_lstm_recurrent_dropout']
    learning_rate = params['TreeTime_lstm_learning_rate']
    optimizer_name = params['TreeTime_lstm_optimizer']
    bidirectional = params['TreeTime_lstm_bidirectional']
    batch_size = params['TreeTime_lstm_batch_size']
    epochs = params['TreeTime_lstm_epochs']
    loss_name = params['TreeTime_lstm_loss']
    l1 = params.get('TreeTime_lstm_l1', 0.0)
    l2 = params.get('TreeTime_lstm_l2', 0.0)
    inter_dropout = params.get('TreeTime_inter_dropout', 0.0)
    noise_std = params.get('TreeTime_input_gaussian_noise', 0.0)
    use_conv1d = params.get('TreeTime_lstm_conv1d', False)
    conv_kernel = params.get('TreeTime_lstm_conv1d_kernel_size', 3)

    max_seconds = 3600  # wall-clock budget for the whole training run
    patience = 3        # epochs without validation improvement before stopping

    # Data split: leading part trains, trailing part validates (no shuffling anywhere).
    n_total = train_Xtime.shape[0]
    split_at = int(n_total * training_ratio)
    X_train, y_train = train_Xtime[:split_at], train_ytime[:split_at]
    X_val, y_val = train_Xtime[split_at:], train_ytime[split_at:]

    train_ds = TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32)
    )
    val_ds = TensorDataset(
        torch.tensor(X_val, dtype=torch.float32),
        torch.tensor(y_val, dtype=torch.float32)
    )
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=False)
    val_loader = DataLoader(val_ds, batch_size=batch_size)

    # Model
    model = TreeTimeLSTM(
        input_size=train_Xtime.shape[-1],
        lstm_units=lstm_units,
        num_layers=num_layers,
        dropout=dropout,
        recurrent_dropout=recurrent_dropout,
        bidirectional=bidirectional,
        l1=l1,
        l2=l2,
        use_conv1d=use_conv1d,
        conv_kernel=conv_kernel,
        noise_std=noise_std,
        inter_dropout=inter_dropout
    ).to(device)

    # Loss
    if loss_name == 'mse':
        criterion = nn.MSELoss()
    elif loss_name == 'r2':
        criterion = neg_r2_loss
    else:
        q = int(loss_name.split('_')[1]) / 10.0
        criterion = quantile_loss(q)

    # Optimizer: L2 regularisation is delegated to the optimizer's weight decay.
    if optimizer_name == 'rmsprop':
        optimizer = optim.RMSprop(
            model.parameters(), lr=learning_rate, weight_decay=l2
        )
    else:
        optimizer = optim.Adam(
            model.parameters(), lr=learning_rate, weight_decay=l2
        )
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.5, patience=2
    )

    val_mse = nn.MSELoss()
    best_rmse, wait = float('inf'), 0
    best_state = None
    start_time = time.time()

    for epoch in trange(epochs, desc='Epochs'):
        model.train()
        for X_batch, y_batch in tqdm(
            train_loader, desc='Training', leave=False
        ):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            # squeeze(-1) drops only the output axis; a bare squeeze() would also
            # collapse a size-1 batch to a scalar and mismatch the target's shape.
            preds = model(X_batch).squeeze(-1)
            loss = criterion(preds, y_batch)
            if l1 > 0:
                l1_penalty = sum(p.abs().sum() for p in model.parameters())
                loss = loss + l1 * l1_penalty
            loss.backward()
            optimizer.step()
            if time.time() - start_time > max_seconds:
                break

        model.eval()
        val_rmses = []
        # No autograd bookkeeping is needed (or wanted) while validating.
        with torch.no_grad():
            for X_batch, y_batch in tqdm(
                val_loader, desc='Validation', leave=False
            ):
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                preds = model(X_batch).squeeze(-1)
                val_rmses.append(torch.sqrt(val_mse(preds, y_batch)).item())
        # Unweighted mean of per-batch RMSEs (a smaller final batch counts as much as a full one).
        val_rmse = sum(val_rmses) / len(val_rmses)
        scheduler.step(val_rmse)

        if val_rmse < best_rmse:
            best_rmse, wait = val_rmse, 0
            # state_dict() hands out references to the live tensors, which later
            # optimizer steps overwrite in place - clone to keep a real snapshot.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            wait += 1
            if wait >= patience:
                break
        if time.time() - start_time > max_seconds:
            break

    if best_state is not None:
        model.load_state_dict(best_state)
    return best_rmse, model

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def shorttest(model, train_Xtime, train_ytree, train_ytime, device='cpu'):
    """Score ``model`` on the last 20000 samples and log a few diagnostics.

    Predictions are made in a single forward pass in eval mode. The samples
    whose prediction falls in the top 1% and in the bottom 1% of all predictions
    are selected, and the mean of ``train_ytree`` over each selection is reported.

    Returns:
        Mean of ``train_ytree`` over the top-1% selection plus its mean over the
        bottom-1% selection.
    """
    tail = 20000
    q = 0.99

    # Inference on the trailing window, gradients disabled.
    inputs = torch.tensor(train_Xtime[-tail:], dtype=torch.float32).to(device)
    model.eval()
    with torch.no_grad():
        raw = model(inputs)            # (N, 1) tensor
        flat = raw.squeeze(-1)         # (N,) tensor
    preds = flat.cpu().numpy()

    true_val = train_ytree[-tail:]

    # Root-mean-square error against the time-series targets.
    rmse = np.sqrt(np.mean((preds - train_ytime[-tail:])**2))

    upper_cut = np.quantile(preds, q)
    top_mask = preds >= upper_cut
    lower_cut = np.quantile(preds, 1-q)
    bottom_mask = preds <= lower_cut

    # NOTE(review): this label says "Mean error" but the value is an RMSE.
    logger.info(f"  Mean error: {rmse:.4f}")
    logger.info(f"  Mean all prediction: {np.mean(true_val):.4f}")
    logger.info(f"  Mean above prediction: {np.mean(true_val[top_mask]):.4f}")
    logger.info(f"  Mean below prediction: {np.mean(true_val[bottom_mask]):.4f}")
    # NOTE(review): the next two labels do not describe the logged quantities - the first
    # is the share of samples in the top selection, the second is the SUM of true values in
    # the bottom selection divided by the sample count. Confirm the intended metric.
    top_share = np.sum(top_mask)/len(top_mask)
    logger.info(f"  True values above zero: {top_share:.4f}")
    bottom_ratio = np.sum(true_val[bottom_mask])/len(bottom_mask)
    logger.info(f"  True values below zero: {bottom_ratio:.4f}")

    return np.mean(true_val[top_mask]) + np.mean(true_val[bottom_mask])

In [None]:
import optuna
split_f = 0.90  # Fraction of samples run() trains on; the remainder is its validation split
def objective(trial: optuna.Trial) -> float:
    """Optuna objective: tune sample-loading options for the LSTM pipeline.

    For every evaluation date the samples are loaded with the suggested options,
    an LSTM is trained with the fixed ``params`` and scored with ``shorttest``.
    A date whose pipeline raises ``ValueError``, or whose score is ``None``/NaN,
    contributes the neutral score 1.0 (log = 0).

    Args:
        trial: The Optuna trial used to sample ``idxAfterPrediction``,
            ``time_scaling_stretch``, ``time_inc_factor`` and ``doFeatureReduce``.

    Returns:
        Mean of the natural log of the per-date scores.
    """
    # Suggest hyperparameters
    opt_params = {
        "idxAfterPrediction": trial.suggest_int("idxAfterPrediction", 1, 5),
        "LoadupSamples_time_scaling_stretch": trial.suggest_categorical("time_scaling_stretch", [True, False]),
        "LoadupSamples_time_inc_factor": trial.suggest_int("time_inc_factor", 1, 20),
    }
    doFeatureReduce = trial.suggest_categorical("doFeatureReduce", [True, False])

    # Loop-invariant: pick the device once per trial.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load samples, train and score once per evaluation date
    scores_test = []
    for i, date in enumerate(eval_dates):
        ls = LoadupSamples(
            train_start_date=start_Dates[i],
            test_dates=[date],
            group=stock_group,
            params={**params, **opt_params}
        )
        try:
            ls.load_samples(main_path = "../src/featureAlchemy/bin/")
            train_ytree = ls.train_ytree
            train_Xtime = ls.train_Xtime
            train_ytime = ls.train_ytime

            timenames = ls.featureTimeNames

            if doFeatureReduce:
                # Keep only four named features along the last axis.
                idx1 = np.where(timenames == "MathFeature_TradedPrice")[0][0]
                idx2 = np.where(timenames == "FeatureTA_High")[0][0]
                idx3 = np.where(timenames == "FeatureTA_Low")[0][0]
                idx4 = np.where(timenames == "FeatureTA_volume_obv")[0][0]

                train_Xtime = train_Xtime[:, :, [idx1, idx2, idx3, idx4]]

            # run() only reads "TreeTime_*" keys, none of which are tuned here, so the
            # base `params` is sufficient.
            val_rmse, model = run(params, train_Xtime, train_ytime, training_ratio=split_f, device=device)
            logger.info(f"Validation RMSE: {val_rmse:.4f}")
            a = shorttest(model, train_Xtime, train_ytree, train_ytime, device=device)

            score = 1.0 if a is None or np.isnan(a) else a

        except ValueError:
            # Best effort: keep the study going, but leave a trace of what went wrong
            # instead of silently substituting the neutral score.
            logger.warning(
                f"Trial {trial.number}: ValueError for eval date {date}; using neutral score 1.0",
                exc_info=True,
            )
            score = 1.0

        scores_test.append(score)

    logger.info(f"Trial number {trial.number}")
    logger.info(f"All scores (test) = {scores_test}")

    # NOTE(review): a non-positive score makes np.log return nan/-inf - confirm that
    # shorttest's result is always > 0 for this target definition.
    fin_score = np.mean(np.log(scores_test))
    logger.info(f"Log mean of scores (test) {fin_score}")

    return float(fin_score)

# Let Optuna's own log records propagate to the root logger configured at the top of the notebook.
optuna.logging.enable_propagation()
# Higher objective value is better (objective returns the mean log score).
study = optuna.create_study(
    direction="maximize",
)
# Time-boxed search: timeout is given in seconds (60*60*1 = one hour); no fixed number of trials.
study.optimize(objective, timeout=60*60*1)

logger.info(f"Best parameters: {study.best_params}")
logger.info(f"Best score: {study.best_value}")

# Full trial history as a DataFrame; `df` is reused by the export cell below.
df = study.trials_dataframe()
logger.info("\nTrials DataFrame:")
# Sorted ascending by objective value, so the best trials are printed last.
logger.info(df.sort_values("value").to_string())

# Relative importance of each tuned parameter, one log line per parameter.
param_importances = optuna.importance.get_param_importances(study)
logger.info("Parameter Importances:")
for key, value in param_importances.items():
    logger.info(f"{key}: {value}")

In [8]:
# Persist the trials table under the same timestamped stem as the log file.
# Requires `df` from the study cell above and `formatted_str` from the logging setup cell.
df.to_parquet(f"{formatted_str}.parquet", index=False)