In [1]:
import sys
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, GaussianNoise, LSTM, Bidirectional, Dropout, Dense, Conv1D
from tensorflow.keras import regularizers, backend as K
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, Callback
from tensorflow.keras.metrics import RootMeanSquaredError, R2Score

# Get the absolute path to the project directory
project_dir = os.path.abspath("..")

# Append the project directory to sys.path
if project_dir not in sys.path:
    sys.path.append(project_dir)
    
from src.predictionModule.TreeTimeML_save import TreeTimeML

import pandas as pd
import numpy as np
import polars as pl
import datetime
import seaborn as sns
import lightgbm as lgb
import random
import matplotlib.pyplot as plt
import logging
import time
import re

In [14]:
# Experiment configuration: general TreeTime settings first, followed by the
# hyperparameters that `run` reads when it builds and trains the LSTM.
params = dict(
    # --- general settings ---
    daysAfterPrediction=5,
    timesteps=25,
    target_option="last",
    TreeTime_isFiltered=True,
    TreeTime_volatility_atr_qup=0.90,
    TreeTime_top_highest=10,
    # --- LSTM architecture / optimisation ---
    TreeTime_lstm_units=64,
    TreeTime_lstm_num_layers=3,
    TreeTime_lstm_dropout=1e-5,
    TreeTime_lstm_recurrent_dropout=1e-5,
    TreeTime_lstm_learning_rate=0.001,
    TreeTime_lstm_optimizer="adam",
    TreeTime_lstm_bidirectional=True,
    TreeTime_lstm_batch_size=16_384,  # 2**14
    TreeTime_lstm_epochs=4,
    # --- regularisation ---
    TreeTime_lstm_l1=1e-5,
    TreeTime_lstm_l2=1e-5,
    TreeTime_inter_dropout=1e-5,
    TreeTime_input_gaussian_noise=1e-5,
    # --- optional Conv1D front-end and loss ---
    TreeTime_lstm_conv1d=True,
    TreeTime_lstm_conv1d_kernel_size=3,
    TreeTime_lstm_loss="mse",
)

In [3]:
# Stock universe plus the training start date and the single evaluation date.
stock_group = "group_snp500_finanTo2011"

start_train_date = datetime.date(2014, 1, 1)
eval_date = datetime.date(2025, 2, 13)

# `test_dates` receives a list, so the one evaluation date is wrapped in one.
treetimeML = TreeTimeML(
    params=params,
    group=stock_group,
    train_start_date=start_train_date,
    test_dates=[eval_date],
)

In [4]:
# Load/filter the data sets from the feature directory (relative to the
# notebook's working directory); the next cell reads the resulting attributes.
treetimeML.load_and_filter_sets(main_path="../src/featureAlchemy/bin/")

In [5]:
# Short module-level aliases for the arrays held by `treetimeML`.
# "tree" and "time" are the two feature families the object exposes.
train_Xtree, train_ytree = treetimeML.train_Xtree, treetimeML.train_ytree
train_Xtime, train_ytime = treetimeML.train_Xtime, treetimeML.train_ytime

test_Xtree, test_ytree = treetimeML.test_Xtree, treetimeML.test_ytree
test_Xtime, test_ytime = treetimeML.test_Xtime, treetimeML.test_ytime

# Feature-name collections matching the two feature families.
treenames, timenames = treetimeML.featureTreeNames, treetimeML.featureTimeNames

In [6]:
# Restrict the time-series inputs to four named features (last axis).
selected_time_features = [
    "MathFeature_TradedPrice",
    "FeatureTA_High",
    "FeatureTA_Low",
    "FeatureTA_volume_obv",
]

# np.asarray makes the element-wise comparison work whether `timenames` is a
# list or an array; fail with a readable message instead of a bare IndexError.
time_feature_names = np.asarray(timenames)
missing_time_features = [
    name for name in selected_time_features if not np.any(time_feature_names == name)
]
assert not missing_time_features, f"Missing time features: {missing_time_features}"

# idx1..idx4 are kept as module-level names for backward compatibility.
idx1, idx2, idx3, idx4 = (
    np.where(time_feature_names == name)[0][0] for name in selected_time_features
)

# Slice from the untouched arrays on `treetimeML` rather than from the
# module-level `train_Xtime`/`test_Xtime`, so re-running this cell always
# yields the same result instead of indexing an already-sliced array.
train_Xtime = treetimeML.train_Xtime[:, :, [idx1, idx2, idx3, idx4]]
test_Xtime = treetimeML.test_Xtime[:, :, [idx1, idx2, idx3, idx4]]

In [7]:
def run(params, train_Xtime=train_Xtime, train_ytime=train_ytime, training_ratio=0.95):
    """Build, train and validate a Keras (Bi)LSTM regressor.

    The first ``training_ratio`` fraction of the samples (in the order given)
    is used for fitting; the remainder is passed to Keras as validation data.
    Training runs without shuffling.

    Note: the default values of ``train_Xtime`` / ``train_ytime`` are bound
    when this ``def`` is executed, so later rebinding of the module-level
    arrays is not picked up unless the arrays are passed explicitly.

    Args:
        params: dict of hyperparameters. Required keys:
            ``TreeTime_lstm_units``, ``TreeTime_lstm_num_layers``,
            ``TreeTime_lstm_dropout``, ``TreeTime_lstm_recurrent_dropout``,
            ``TreeTime_lstm_learning_rate``, ``TreeTime_lstm_optimizer``
            ("adam" or "rmsprop"), ``TreeTime_lstm_bidirectional``,
            ``TreeTime_lstm_batch_size``, ``TreeTime_lstm_epochs``,
            ``TreeTime_lstm_loss`` ("mse", "r2" or "<name>_<digit>", e.g.
            "quantile_5" for the 0.5 quantile).
            Optional keys (default 0.0 / False / 3): ``TreeTime_lstm_l1``,
            ``TreeTime_lstm_l2``, ``TreeTime_inter_dropout``,
            ``TreeTime_input_gaussian_noise``, ``TreeTime_lstm_conv1d``,
            ``TreeTime_lstm_conv1d_kernel_size``.
        train_Xtime: input array; ``shape[1:]`` is used as the model input shape.
        train_ytime: targets aligned with ``train_Xtime`` along axis 0.
        training_ratio: fraction of samples used for fitting.

    Returns:
        ``(best_val_rmse, model)`` where ``best_val_rmse`` is the minimum
        ``val_rmse`` over all epochs and ``model`` is the trained Keras model.

    Raises:
        ValueError: for an unknown optimizer or an unsupported loss name.
    """
    # Hyperparameters to tune
    lstm_units = params["TreeTime_lstm_units"]
    num_layers = params["TreeTime_lstm_num_layers"]
    dropout = params["TreeTime_lstm_dropout"]
    recurrent_dropout = params["TreeTime_lstm_recurrent_dropout"]
    learning_rate = params["TreeTime_lstm_learning_rate"]
    optimizer_name = params["TreeTime_lstm_optimizer"]
    bidirectional = params["TreeTime_lstm_bidirectional"]
    batch_size = params["TreeTime_lstm_batch_size"]
    epochs = params["TreeTime_lstm_epochs"]
    loss_name = params["TreeTime_lstm_loss"]

    # Regularization hyperparameters
    l1 = params.get("TreeTime_lstm_l1", 0.0)
    l2 = params.get("TreeTime_lstm_l2", 0.0)
    inter_dropout = params.get("TreeTime_inter_dropout", 0.0)
    noise_std = params.get("TreeTime_input_gaussian_noise", 0.0)

    # Conv1D option (the convolution uses as many filters as there are LSTM units)
    use_conv1d = params.get("TreeTime_lstm_conv1d", False)
    conv_filters = lstm_units
    conv_kernel = params.get("TreeTime_lstm_conv1d_kernel_size", 3)

    # Ordered split: leading part for fitting, trailing part as hold-out
    X_full, y_full = train_Xtime, train_ytime
    n_total = X_full.shape[0]
    split_at = int(n_total * training_ratio)
    X_train, X_holdout = X_full[:split_at], X_full[split_at:]
    y_train, y_holdout = y_full[:split_at], y_full[split_at:]

    # Build model
    model = Sequential([Input(shape=X_full.shape[1:])])
    # Add Gaussian noise to inputs
    if noise_std > 0:
        model.add(GaussianNoise(noise_std))
    # Add Conv1D layer if opted in
    if use_conv1d:
        model.add(Conv1D(filters=conv_filters,
                        kernel_size=conv_kernel,
                        padding='same',
                        activation='linear'))
    # Add LSTM layers with regularization and dropout
    for i in range(num_layers):
        # Every layer but the last must emit the full sequence for the next LSTM
        return_seq = i < (num_layers - 1)
        lstm_layer = LSTM(
            lstm_units,
            return_sequences=return_seq,
            dropout=dropout,
            recurrent_dropout=recurrent_dropout,
            kernel_regularizer=regularizers.L1L2(l1=l1, l2=l2)
        )
        if bidirectional:
            model.add(Bidirectional(lstm_layer))
        else:
            model.add(lstm_layer)
        # Add dropout between layers
        if inter_dropout > 0 and return_seq:
            model.add(Dropout(inter_dropout))
    # Output layer
    model.add(Dense(1, activation='linear', kernel_regularizer=regularizers.L1L2(l1=l1, l2=l2)))

    # Optimizer
    if optimizer_name == "adam":
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_name == "rmsprop":
        optimizer = RMSprop(learning_rate=learning_rate)
    else:
        raise ValueError(f"Unknown optimizer: {optimizer_name}")

    def quantile_loss(q):
        """Return the pinball loss for quantile ``q``."""
        def loss(y_true, y_pred):
            e = y_true - y_pred
            return tf.reduce_mean(tf.maximum(q*e, (q-1)*e))
        return loss

    def r2_keras(y_true, y_pred):
        """
        Returns R^2 metric: 1 - SS_res / SS_tot
        """
        # Plain tf ops (as in quantile_loss) instead of K.sum/K.square/K.mean,
        # so the loss does not depend on the legacy Keras backend functions.
        ss_res = tf.reduce_sum(tf.square(y_true - y_pred))
        ss_tot = tf.reduce_sum(tf.square(y_true - tf.reduce_mean(y_true)))
        # avoid division by zero
        return 1 - ss_res/(ss_tot + K.epsilon())

    def neg_r2_loss(y_true, y_pred):
        """
        Loss function to *maximize* R^2 by minimizing its negative.
        """
        return -r2_keras(y_true, y_pred)

    if loss_name == "mse":
        loss_lstm = MeanSquaredError()
    elif loss_name == "r2":
        loss_lstm = neg_r2_loss
    else:
        # handles quantile_1,3,5,7,9 etc.: the number after "_" is the decile
        _, _, decile = str(loss_name).partition("_")
        decile = decile.split("_")[0]
        if not decile.isdigit():
            raise ValueError(
                f"Unsupported loss: {loss_name!r} (expected 'mse', 'r2' or e.g. 'quantile_5')"
            )
        q = int(decile) / 10.0
        if not 0.0 < q < 1.0:
            raise ValueError(f"Quantile must lie strictly between 0 and 1, got {q} from {loss_name!r}")
        loss_lstm = quantile_loss(q)

    # Compile
    model.compile(
        optimizer=optimizer,
        loss=loss_lstm,
        metrics=[MeanSquaredError(name='mse'),
                RootMeanSquaredError(name='rmse')]
    )

    # Callbacks
    class TimeLimit(Callback):
        """Request a training stop once ``max_seconds`` of wall time have elapsed."""

        def __init__(self, max_seconds):
            super().__init__()
            self.max_seconds = max_seconds

        def on_train_begin(self, logs=None):
            self.t0 = time.time()

        def on_batch_end(self, batch, logs=None):
            if time.time() - self.t0 > self.max_seconds:
                self.model.stop_training = True

    # NOTE(review): both callbacks monitor the *training* loss although
    # validation data is supplied; confirm that 'val_loss' is not intended.
    es = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)
    rlrop = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=2)
    time_cb = TimeLimit(3600)  # one-hour wall-clock budget

    # Train
    history = model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_holdout, y_holdout),
        callbacks=[es, rlrop, time_cb],
        shuffle=False,
    )

    return min(history.history['val_rmse']), model

In [8]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

def run_gpt(params,
        train_Xtime_np,
        train_ytime_np,
        training_ratio=0.95,
        subset_ratio=0.1,
        device='cuda' if torch.cuda.is_available() else 'cpu'):
    """
    Train a single-layer LSTM + linear head on the first `training_ratio` of
    (train_Xtime_np, train_ytime_np), then select the top `subset_ratio`
    fraction of samples by predicted score on both the training part and the
    held-out remainder, and report the mean of their actual targets.

    Args:
        params: dict with optional keys 'hidden_size' (default 64),
            'lr' (default 1e-3), 'batch_size' (default 512) and
            'epochs' (default 10).
        train_Xtime_np: numpy array of inputs; the LSTM is built with
            batch_first=True and input size equal to the last axis.
        train_ytime_np: numpy array of targets aligned with the inputs on axis 0.
        training_ratio: fraction of leading samples used for training.
        subset_ratio: fraction of highest-scoring samples to select
            (at least one sample is selected from a non-empty split).
        device: device used for training; scoring is always done on the CPU.

    Returns:
        dict with keys 'mean_train_all', 'mean_train_subset', 'mean_val_all',
        'mean_val_subset' (floats; NaN for an empty split),
        'train_subset_idx', 'val_subset_idx' (numpy index arrays relative to
        the respective split) and 'model' (the nn.Sequential container, left
        on the CPU).
    """
    # -- split train / val by sample order --
    features = torch.from_numpy(train_Xtime_np).float()
    targets = torch.from_numpy(train_ytime_np).float()
    split = int(features.shape[0] * training_ratio)
    X_train = features[:split].to(device)
    y_train = targets[:split].to(device)
    X_val   = features[split:].to(device)
    y_val   = targets[split:].to(device)

    # -- model setup --
    D = X_train.size(-1)
    H = params.get('hidden_size', 64)
    # The Sequential is only a container: nn.LSTM returns a tuple, so the
    # stages are invoked individually below and the Flatten at index 1 is
    # never called. The layout is kept so callers can still use model[0]/model[2].
    model = nn.Sequential(
        nn.LSTM(input_size=D, hidden_size=H, batch_first=True),
        nn.Flatten(start_dim=0, end_dim=1),
        nn.Linear(H, 1)
    ).to(device)
    lstm, head = model[0], model[2]

    def predict(x):
        """Score a batch: linear head applied to the LSTM output at the last step."""
        seq_out, _ = lstm(x)
        # squeeze(-1) rather than squeeze(): a batch of one sample must stay
        # 1-D instead of collapsing to a 0-d tensor.
        return head(seq_out[:, -1, :]).squeeze(-1)

    optimizer = torch.optim.Adam(model.parameters(), lr=params.get('lr', 1e-3))
    loss_fn = nn.MSELoss()

    # -- training loop --
    batch_size = params.get('batch_size', 512)
    loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
    for epoch in range(params.get('epochs', 10)):
        model.train()
        for xb, yb in loader:
            optimizer.zero_grad()
            loss = loss_fn(predict(xb), yb)
            loss.backward()
            optimizer.step()

    # -- scoring and subset selection (on CPU, whole split at once) --
    cpu = torch.device('cpu')
    model.to(cpu)
    X_train, y_train = X_train.to(cpu), y_train.to(cpu)
    X_val, y_val = X_val.to(cpu), y_val.to(cpu)

    def summarize(x, y):
        """Return (mean of y, mean of y over top-scored subset, subset indices)."""
        n = x.shape[0]
        if n == 0:
            # Nothing to score (e.g. training_ratio == 1.0)
            return float('nan'), float('nan'), torch.empty(0, dtype=torch.long)
        scores = predict(x)
        # At least one sample, never more than are available
        k = min(n, max(1, int(n * subset_ratio)))
        top = torch.topk(scores, k).indices
        return y.mean().item(), y[top].mean().item(), top

    model.eval()
    with torch.no_grad():
        mean_tr_all, mean_tr_subset, top_tr = summarize(X_train, y_train)
        mean_val_all, mean_val_subset, top_val = summarize(X_val, y_val)

    # -- results --
    torch.cuda.empty_cache()
    return {
        'mean_train_all': mean_tr_all,
        'mean_train_subset': mean_tr_subset,
        'mean_val_all': mean_val_all,
        'mean_val_subset': mean_val_subset,
        'train_subset_idx': top_tr.cpu().numpy(),
        'val_subset_idx': top_val.cpu().numpy(),
        'model': model
    }


In [9]:
# Report the CUDA version torch was built with and whether a GPU is usable.
print(torch.version.cuda)
print(torch.cuda.is_available())      # should print True
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # your GPU name
else:
    # get_device_name raises without a CUDA device; run_gpt falls back to CPU.
    print("No CUDA device available - running on CPU")
torch.cuda.empty_cache()

11.8
True
NVIDIA GeForce RTX 3060


In [10]:
# Hyperparameters for the PyTorch baseline. They live in their own dict so the
# TreeTime `params` (needed later by `run`) are not overwritten.
gpt_params = {
    'hidden_size': 128,
    'lr': 1e-3,
    'batch_size': 128,
    'epochs': 50
}
gpt_subset_ratio = 0.2

results = run_gpt(gpt_params, train_Xtime, train_ytime,
              training_ratio=0.90, subset_ratio=gpt_subset_ratio)

# inspect outcome (labels are derived from the ratio actually used)
print(f"Mean y (all train):   {results['mean_train_all']:.3f}")
print(f"Mean y (top {gpt_subset_ratio:.0%} train):{results['mean_train_subset']:.3f}")
print(f"Mean y (all val):     {results['mean_val_all']:.3f}")
print(f"Mean y (top {gpt_subset_ratio:.0%} val): {results['mean_val_subset']:.3f}")

# indices of selected samples
print("Selected train indices:", results['train_subset_idx'])
print("Selected val   indices:", results['val_subset_idx'])


Mean y (all train):   0.533
Mean y (top 10% train):0.616
Mean y (all val):     0.522
Mean y (top 10% val): 0.527
Selected train indices: [37726 40720 40654 ... 45491 17313 50097]
Selected val   indices: [2993 5093 5617 ... 1115  367 1676]


In [11]:
# Spacing between neighbouring selected validation indices once sorted.
d = results["val_subset_idx"]
d_sorted = np.sort(d)
ddiff = np.diff(d_sorted)

In [12]:
# Override a handful of LSTM settings for the iterative filtering experiment;
# every key not listed here keeps its current value in `params`.
params.update(
    TreeTime_lstm_units=16,
    TreeTime_lstm_num_layers=4,
    TreeTime_lstm_learning_rate=0.002,
    TreeTime_lstm_conv1d=True,
    TreeTime_lstm_batch_size=2_048,  # 2**11
    TreeTime_lstm_epochs=2,
    TreeTime_lstm_dropout=0.0,
    TreeTime_inter_dropout=0.0,
    TreeTime_input_gaussian_noise=0.0,
    TreeTime_lstm_loss="mse",
)

In [15]:
# Iteratively retrain on a shrinking subset: after each fit, keep only the
# samples whose prediction lies above the q-quantile of the predictions.
q = 0.2
itermax = 5

mask = np.ones(train_Xtime.shape[0], dtype=bool)
for i in range(itermax):
    print(f"Iteration {i+1}/{itermax}")

    # Samples still active in this round
    X_active = train_Xtime[mask]
    y_active = train_ytime[mask]

    val_rmse, model = run(params, train_Xtime=X_active, train_ytime=y_active, training_ratio=0.9)
    print(f"Validation RMSE: {val_rmse:.4f}")

    # Diagnostics on the active samples
    y_train_pred = model.predict(X_active, batch_size=params['TreeTime_lstm_batch_size'])[:,0]
    rsme_train = np.sqrt(np.mean((y_train_pred - y_active) ** 2))
    print(f"Train RMSE: {rsme_train:.4f}")
    print(f"Sqrt Variance: {np.sqrt(np.var(y_train_pred)):.4f}")
    print(f"mean y time: {np.mean(y_active):.4f}")
    print(f"pred Quantile 0.5: {np.mean(y_active[y_train_pred > np.quantile(y_train_pred, 0.5)]):.4f}")
    print(f"pred Quantile 0.8: {np.mean(y_active[y_train_pred > np.quantile(y_train_pred, 0.8)]):.4f}")

    # Update mask based on the prediction quantile (not on the RMSE)
    mask_loop = (y_train_pred > np.quantile(y_train_pred, q))
    if not mask_loop.any():
        # E.g. (near-)constant predictions: applying the filter would leave
        # zero samples and the next fit would fail on empty data.
        print("No prediction exceeds the quantile threshold, stopping early")
        break
    prev_count = mask.sum()
    # Boolean self-indexing writes the keep/drop decision back to the
    # positions that were active in this round.
    mask[mask] = mask_loop
    new_count = mask.sum()
    print(f"Mask size: {new_count/train_Xtime.shape[0]}")
    if new_count == prev_count:
        print("No change in mask, stopping early")
        break

Iteration 1/5
Epoch 1/4
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 8s/step - loss: 0.3672 - mse: 0.2512 - rmse: 0.4989 - val_loss: 0.1738 - val_mse: 0.0611 - val_rmse: 0.2472 - learning_rate: 0.0010
Epoch 2/4
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 8s/step - loss: 0.1861 - mse: 0.0744 - rmse: 0.2705 - val_loss: 0.1918 - val_mse: 0.0798 - val_rmse: 0.2825 - learning_rate: 0.0010
Epoch 3/4
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 8s/step - loss: 0.1859 - mse: 0.0738 - rmse: 0.2723 - val_loss: 0.1674 - val_mse: 0.0564 - val_rmse: 0.2375 - learning_rate: 0.0010
Epoch 4/4
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 8s/step - loss: 0.1759 - mse: 0.0650 - rmse: 0.2548 - val_loss: 0.1640 - val_mse: 0.0540 - val_rmse: 0.2323 - learning_rate: 0.0010
Validation RMSE: 0.2323
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step
Train RMSE: 0.2496
Sqrt Variance: 0.0849
mean y time: 0.5315
pred Quant

In [9]:
# Score the complete (unmasked) training set with the current `model` and
# report the error together with the spread of the predictions.
y_train_pred = model.predict(
    train_Xtime,
    batch_size=params["TreeTime_lstm_batch_size"],
)[:, 0]
rsme_train = np.sqrt(np.mean(np.square(y_train_pred - train_ytime)))
print("Train RMSE: {:.4f}".format(rsme_train))
print("Sqrt Variance: {:.4f}".format(np.sqrt(np.var(y_train_pred))))

[1m2613/2613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 16ms/step
Train RMSE: 0.1545
Sqrt Variance: 0.0001


In [10]:
# Mean target overall, then restricted to samples whose prediction lies above
# the median / the 80th percentile of the predictions.
print("mean y time: {:.4f}".format(np.mean(train_ytime)))
for level in (0.5, 0.8):
    above_level = y_train_pred > np.quantile(y_train_pred, level)
    print(f"pred Quantile {level}: {np.mean(train_ytime[above_level]):.4f}")

mean y time: 0.5085
pred Quantile 0.5: 0.5106
pred Quantile 0.8: 0.5124


In [11]:
# Same diagnostics as for the training set, evaluated on the test arrays.
y_test_pred = model.predict(
    test_Xtime,
    batch_size=params["TreeTime_lstm_batch_size"],
)[:, 0]
rsme_test = np.sqrt(np.mean(np.square(y_test_pred - test_ytime)))
print("Test RMSE: {:.4f}".format(rsme_test))
print("Sqrt Variance: {:.4f}".format(np.sqrt(np.var(y_test_pred))))

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Test RMSE: 0.1250
Sqrt Variance: 0.0001


In [12]:
# Three scratch counters, all starting at zero.
z = zz = zzz = 0

In [None]:
import optuna
def objective(trial: optuna.Trial):
    """Optuna objective: train one LSTM configuration and return its best validation RMSE.

    The sampled values override a copy of the module-level `params`; training
    uses the default data bound to `run`.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        The minimum validation RMSE reported by `run` (lower is better).

    Raises:
        optuna.TrialPruned: if the pruner asks to stop this trial.
    """
    # sample hyperparameters
    optparams = params.copy()
    optparams.update({
        "TreeTime_lstm_units": trial.suggest_categorical("units", [64, 128, 256]),
        "TreeTime_lstm_num_layers": trial.suggest_int("num_layers", 3, 8),
        "TreeTime_lstm_learning_rate": trial.suggest_float("learning_rate", 1e-3, 5e-2),
        "TreeTime_lstm_conv1d": trial.suggest_categorical("use_conv1d", [False, True]),
        "TreeTime_lstm_batch_size": trial.suggest_categorical("batch_size", [2**12, 2**13, 2**14]),
    })
    # run and report validation RMSE
    # `run` returns (val_rmse, model); only the scalar may be reported/returned.
    val_rmse, _ = run(optparams)
    trial.report(val_rmse, step=0)
    if trial.should_prune():
        raise optuna.TrialPruned()
    return val_rmse


# Minimise the validation RMSE with a TPE sampler and a median pruner,
# searching for at most eight hours of wall-clock time.
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
    sampler=optuna.samplers.TPESampler(),
    direction="minimize",
)
study.optimize(objective, timeout=8 * 60 * 60)

print("Best RMSE:", study.best_value)
print("Best params:", study.best_trial.params)