In [1]:
import sys
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, GaussianNoise, LSTM, Bidirectional, Dropout, Dense, Conv1D
from tensorflow.keras import regularizers, backend as K
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, Callback
from tensorflow.keras.metrics import RootMeanSquaredError, R2Score

# Get the absolute path to the project directory
project_dir = os.path.abspath("..")

# Append the project directory to sys.path
if project_dir not in sys.path:
    sys.path.append(project_dir)
    
from src.predictionModule.LoadupSamples import LoadupSamples

import pandas as pd
import numpy as np
import polars as pl
import datetime
import seaborn as sns
import lightgbm as lgb
import random
import matplotlib.pyplot as plt
import logging
import time
import re

In [2]:
params = {
    "idxAfterPrediction": 3,
    'timesteps': 60,
    'target_option': 'mean',
    "LoadupSamples_time_scaling_stretch": True,
    "LoadupSamples_time_inc_factor": 10,
    
    "TreeTime_lstm_units": 32,
    "TreeTime_lstm_num_layers": 3,
    "TreeTime_lstm_dropout": 0.00001,
    "TreeTime_lstm_recurrent_dropout": 0.00001,
    "TreeTime_lstm_learning_rate": 0.001,
    "TreeTime_lstm_optimizer": "adam",
    "TreeTime_lstm_bidirectional": True,
    "TreeTime_lstm_batch_size": 2**12,
    "TreeTime_lstm_epochs": 2,
    "TreeTime_lstm_l1": 0.00001,
    "TreeTime_lstm_l2": 0.00001,
    "TreeTime_inter_dropout": 0.00001,
    "TreeTime_input_gaussian_noise": 0.00001,
    "TreeTime_lstm_conv1d": True,
    "TreeTime_lstm_conv1d_kernel_size": 3,
    "TreeTime_lstm_loss": "mse",
}

In [3]:
stock_group = "group_regOHLCV_over5years"

eval_date = datetime.date(year=2025, month=6, day=13)
start_train_date = datetime.date(year=2020, month=1, day=1)

ls = LoadupSamples(
    train_start_date=start_train_date,
    test_dates=[eval_date],
    group=stock_group,
    group_type="Time",
    params=params,
)

In [4]:
ls.load_samples(main_path = "../src/featureAlchemy/bin/")

NaN values found in training time features. 1806 Samples removed.


In [5]:
train_Xtree = ls.train_Xtree
train_ytree = ls.train_ytree
train_Xtime = ls.train_Xtime
train_ytime = ls.train_ytime

test_Xtree = ls.test_Xtree
test_ytree = ls.test_ytree
test_Xtime = ls.test_Xtime
test_ytime = ls.test_ytime

treenames = ls.featureTreeNames
timenames = ls.featureTimeNames

In [6]:
print(np.mean(train_ytree)) if stock_group == "Time" else None
print(np.mean(train_ytime)) if stock_group == "Tree" else None

In [7]:
print(timenames)
print(train_Xtime.shape)

['FeatureLSTM_Price' 'FeatureLSTM_Volume']
(2477819, 90, 2)


In [8]:
def run_tf(params, train_Xtime=train_Xtime, train_ytime=train_ytime, training_ratio=0.95):
    # Hyperparameters to tune
    lstm_units = params["TreeTime_lstm_units"]
    num_layers = params["TreeTime_lstm_num_layers"]
    dropout = params["TreeTime_lstm_dropout"]
    recurrent_dropout = params["TreeTime_lstm_recurrent_dropout"]
    learning_rate = params["TreeTime_lstm_learning_rate"]
    optimizer_name = params["TreeTime_lstm_optimizer"]
    bidirectional = params["TreeTime_lstm_bidirectional"]
    batch_size = params["TreeTime_lstm_batch_size"]
    epochs = params["TreeTime_lstm_epochs"]
    loss_name = params["TreeTime_lstm_loss"]

    # Regularization hyperparameters
    l1 = params.get("TreeTime_lstm_l1", 0.0)
    l2 = params.get("TreeTime_lstm_l2", 0.0)
    inter_dropout = params.get("TreeTime_inter_dropout", 0.0)
    noise_std = params.get("TreeTime_input_gaussian_noise", 0.0)

    # Conv1D option
    use_conv1d = params.get("TreeTime_lstm_conv1d", False)
    conv_filters = lstm_units
    conv_kernel = params.get("TreeTime_lstm_conv1d_kernel_size", 3)
    X_full, y_full = train_Xtime, train_ytime
    n_total = X_full.shape[0]
    split_at = int(n_total * training_ratio)
    X_train, X_holdout = X_full[:split_at], X_full[split_at:]
    y_train, y_holdout = y_full[:split_at], y_full[split_at:]

    # Build model
    model = Sequential([Input(shape=train_Xtime.shape[1:])])
    # Add Gaussian noise to inputs
    if noise_std > 0:
        model.add(GaussianNoise(noise_std))
    # Add Conv1D layer if opted in
    if use_conv1d:
        model.add(Conv1D(filters=conv_filters,
                        kernel_size=conv_kernel,
                        padding='same',
                        activation='linear'))
    # Add LSTM layers with regularization and dropout
    for i in range(num_layers):
        return_seq = i < (num_layers - 1)
        lstm_layer = LSTM(
            lstm_units,
            return_sequences=return_seq,
            dropout=dropout,
            recurrent_dropout=recurrent_dropout,
            kernel_regularizer=regularizers.L1L2(l1=l1, l2=l2)
        )
        if bidirectional:
            model.add(Bidirectional(lstm_layer))
        else:
            model.add(lstm_layer)
        # Add dropout between layers
        if inter_dropout > 0 and return_seq:
            model.add(Dropout(inter_dropout))
    # Output layer
    model.add(Dense(1, activation='linear', kernel_regularizer=regularizers.L1L2(l1=l1, l2=l2)))
    # Optimizer
    if optimizer_name == "adam":
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_name == "rmsprop":
        optimizer = RMSprop(learning_rate=learning_rate)
    else:
        raise ValueError(f"Unknown optimizer: {optimizer_name}")
    def quantile_loss(q):
        def loss(y_true, y_pred):
            e = y_true - y_pred
            return tf.reduce_mean(tf.maximum(q*e, (q-1)*e))
        return loss
    def r2_keras(y_true, y_pred):
        """
        Returns R^2 metric: 1 - SS_res / SS_tot
        """
        ss_res =  K.sum(K.square(y_true - y_pred)) 
        ss_tot = K.sum(K.square(y_true - K.mean(y_true)))
        # avoid division by zero
        return 1 - ss_res/(ss_tot + K.epsilon())
    def neg_r2_loss(y_true, y_pred):
        """
        Loss function to *maximize* R^2 by minimizing its negative.
        """
        return -r2_keras(y_true, y_pred)
    if loss_name == "mse":
        loss_lstm = MeanSquaredError()
    elif loss_name == "r2":
        loss_lstm = neg_r2_loss
    else:
        # handles quantile_1,3,5,7,9 etc.
        q = int(loss_name.split("_")[1]) / 10.0
        loss_lstm = quantile_loss(q)
    # Compile
    model.compile(
        optimizer=optimizer,
        loss=loss_lstm,
        metrics=[MeanSquaredError(name='mse'),
                RootMeanSquaredError(name='rmse')]
    )
    # Callbacks
    class TimeLimit(Callback):
        def __init__(self, max_seconds): super().__init__(); self.max_seconds = max_seconds
        def on_train_begin(self, logs=None): self.t0 = time.time()
        def on_batch_end(self, batch, logs=None): (time.time() - self.t0 > self.max_seconds) and setattr(self.model, 'stop_training', True)
    es = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)
    rlrop = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=2)
    time_cb = TimeLimit(3600) 
    # Train
    history = model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_holdout, y_holdout),
        callbacks=[es, rlrop, time_cb],
        shuffle=False,
    )
    
    return min(history.history['val_rmse']), model

In [9]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm, trange
import shap

class TreeTimeLSTM(nn.Module):
    def __init__(self, 
                 input_size,
                 lstm_units,
                 num_layers,
                 dropout,
                 recurrent_dropout,
                 bidirectional,
                 l1=0.0,
                 l2=0.0,
                 use_conv1d=False,
                 conv_kernel=3,
                 noise_std=0.0,
                 inter_dropout=0.0):
        super().__init__()
        self.use_conv1d = use_conv1d
        self.noise_std = noise_std
        self.inter_dropout = inter_dropout

        if use_conv1d:
            self.conv1d = nn.Conv1d(
                in_channels=input_size,
                out_channels=lstm_units,
                kernel_size=conv_kernel,
                padding=conv_kernel//2
            )
            input_size = lstm_units

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=lstm_units,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=bidirectional,
            batch_first=True
        )
        self.dropout = nn.Dropout(inter_dropout) if inter_dropout > 0 else None
        self.output = nn.Linear(
            lstm_units * (2 if bidirectional else 1),
            1
        )
        self.l1 = l1
        self.l2 = l2

    def forward(self, x):
        if self.noise_std > 0:
            x = x + torch.randn_like(x) * self.noise_std
        if self.use_conv1d:
            x = x.transpose(1, 2)
            x = self.conv1d(x)
            x = x.transpose(1, 2)
        out, _ = self.lstm(x)
        out_last = out[:, -1, :]
        if self.dropout:
            out_last = self.dropout(out_last)
        return self.output(out_last)

# Loss functions
def quantile_loss(q):
    def loss_fn(y_pred, y_true):
        e = y_true - y_pred
        return torch.mean(torch.max(q * e, (q - 1) * e))
    return loss_fn

def r2_metric(y_pred, y_true):
    ss_res = torch.sum((y_true - y_pred) ** 2)
    ss_tot = torch.sum((y_true - torch.mean(y_true)) ** 2)
    return 1 - ss_res / (ss_tot + 1e-6)

def neg_r2_loss(y_pred, y_true):
    return -r2_metric(y_pred, y_true)


def run(params, train_Xtime, train_ytime, training_ratio=0.95, device='cpu'):
    # Hyperparameters
    lstm_units = params['TreeTime_lstm_units']
    num_layers = params['TreeTime_lstm_num_layers']
    dropout = params['TreeTime_lstm_dropout']
    recurrent_dropout = params['TreeTime_lstm_recurrent_dropout']
    learning_rate = params['TreeTime_lstm_learning_rate']
    optimizer_name = params['TreeTime_lstm_optimizer']
    bidirectional = params['TreeTime_lstm_bidirectional']
    batch_size = params['TreeTime_lstm_batch_size']
    epochs = params['TreeTime_lstm_epochs']
    loss_name = params['TreeTime_lstm_loss']
    l1 = params.get('TreeTime_lstm_l1', 0.0)
    l2 = params.get('TreeTime_lstm_l2', 0.0)
    inter_dropout = params.get('TreeTime_inter_dropout', 0.0)
    noise_std = params.get('TreeTime_input_gaussian_noise', 0.0)
    use_conv1d = params.get('TreeTime_lstm_conv1d', False)
    conv_kernel = params.get('TreeTime_lstm_conv1d_kernel_size', 3)

    # Data split
    n_total = train_Xtime.shape[0]
    split_at = int(n_total * training_ratio)
    X_train, y_train = train_Xtime[:split_at], train_ytime[:split_at]
    X_val, y_val = train_Xtime[split_at:], train_ytime[split_at:]

    train_ds = TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32)
    )
    val_ds = TensorDataset(
        torch.tensor(X_val, dtype=torch.float32),
        torch.tensor(y_val, dtype=torch.float32)
    )
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=False)
    val_loader = DataLoader(val_ds, batch_size=batch_size)

    # Model
    model = TreeTimeLSTM(
        input_size=train_Xtime.shape[-1],
        lstm_units=lstm_units,
        num_layers=num_layers,
        dropout=dropout,
        recurrent_dropout=recurrent_dropout,
        bidirectional=bidirectional,
        l1=l1,
        l2=l2,
        use_conv1d=use_conv1d,
        conv_kernel=conv_kernel,
        noise_std=noise_std,
        inter_dropout=inter_dropout
    ).to(device)

    # Loss & optimizer
    if loss_name == 'mse':
        criterion = nn.MSELoss()
    elif loss_name == 'r2':
        criterion = lambda pred, true: neg_r2_loss(pred, true)
    else:
        q = int(loss_name.split('_')[1]) / 10.0
        criterion = quantile_loss(q)
    optimizer = optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=l2
    )
    if optimizer_name == 'rmsprop':
        optimizer = optim.RMSprop(
            model.parameters(), lr=learning_rate, weight_decay=l2
        )
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.5, patience=2
    )

    best_rmse, wait = float('inf'), 0
    start_time = time.time()

    for epoch in trange(epochs, desc='Epochs'):
        model.train()
        sum_sq_error = 0.0
        total_samples = 0

        for X_batch, y_batch in tqdm(train_loader, desc='Training', leave=False):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()

            preds = model(X_batch).squeeze()
            # compute loss (with optional L1)
            loss = criterion(preds, y_batch)
            if l1 > 0:
                loss = loss + l1 * sum(p.abs().sum() for p in model.parameters())
            loss.backward()
            optimizer.step()

            # accumulate squared error for RMSE
            # note: detach so it doesn't track grads
            se = ((preds.detach() - y_batch) ** 2).sum().item()
            sum_sq_error += se
            total_samples += y_batch.numel()

            if time.time() - start_time > 3600:
                break

        train_rmse = (sum_sq_error / total_samples) ** 0.5

        # validation as before
        model.eval()
        val_rmses = []
        with torch.no_grad():
            for X_batch, y_batch in tqdm(val_loader, desc='Validation', leave=False):
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                preds = model(X_batch).squeeze()
                mse = nn.MSELoss()(preds, y_batch)
                val_rmses.append(torch.sqrt(mse).item())
        val_rmse = sum(val_rmses) / len(val_rmses)

        print(f"Epoch {epoch+1}/{epochs} — "
            f"Train RMSE: {train_rmse:.4f} — "
            f"Validation RMSE: {val_rmse:.4f}")

        scheduler.step(val_rmse)
        
        if val_rmse < best_rmse:
            best_rmse, wait = val_rmse, 0
            best_state = model.state_dict()
        else:
            wait += 1
            if wait >= 3:
                break
        if time.time() - start_time > 3600:
            break

    model.load_state_dict(best_state)
    return best_rmse, model

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
val_rmse, model = run(params, train_Xtime, train_ytime, training_ratio=0.9, device=device)
print(f"Validation RMSE: {val_rmse:.4f}")

Epochs:  50%|█████     | 1/2 [01:05<01:05, 65.07s/it]

Epoch 1/2 — Train RMSE: 0.1182 — Validation RMSE: 0.0919


Epochs: 100%|██████████| 2/2 [02:08<00:00, 64.14s/it]


Epoch 2/2 — Train RMSE: 0.0978 — Validation RMSE: 0.0919
Validation RMSE: 0.0919


In [121]:
X_new = np.random.randn(5, train_Xtime.shape[1], train_Xtime.shape[2])

# Convert to torch tensor and send to device
n = 100000
X_tensor = torch.tensor(train_Xtime[-n:], dtype=torch.float32).to(device)

# Put model into eval mode and disable grad
model.eval()
with torch.no_grad():
    preds = model(X_tensor)        # (N, 1) tensor
    preds = preds[:,0]      # (N,) tensor

# Bring back to CPU NumPy array if you like
preds = preds.cpu().numpy()

true_val = train_ytime[-n:]

rsme_err = np.sqrt(np.mean((preds - train_ytime[-n:])**2))
q = 0.95
mask_pred_above = preds >= np.quantile(preds, q)
mask_pred_below = preds <= np.quantile(preds, 1-q)
print(f"Mean error: {rsme_err:.4f}")
print(f"Mean all prediction: {np.mean(true_val):.4f}")
print(f"Mean above prediction: {np.mean(true_val[mask_pred_above]):.4f}")
print(f"Mean below prediction: {np.mean(true_val[mask_pred_below]):.4f}")
print(f"True values above zero: {np.sum(mask_pred_above)/len(mask_pred_above):.4f}")
print(f"True values below zero: {np.sum(true_val[mask_pred_below])/len(mask_pred_below):.4f}")

Mean error: 0.0983
Mean all prediction: 0.5034
Mean above prediction: 0.5712
Mean below prediction: 0.4662
True values above zero: 0.0500
True values below zero: 0.0233


In [23]:
np.diff(np.where(mask_pred_above)[0])

array([239, 239, 239, 239,  92, 147,  69,  23, 147,  92, 147,  92, 147,
        10,  82, 147, 239,  92,  77, 162,  77,  70,   2,  67, 170,  69,
       100,  70, 169,  70, 391,  81, 158,  81, 239, 158,  81, 158,  81,
       158,  97, 142,  17,  64,   6,  10, 142,  17,  64,   6,  10, 142,
        17,  70,  10, 142,  97, 239, 142,  97, 620, 239, 256, 222,  97,
       142,  97, 239, 142,  97, 239, 231,   8, 231,   8, 190,  49, 229,
        10, 190,  39,  10, 159,  31,  39,  10, 159,  31,  39,   2,   8,
        59,  83,  17,  31,  39,   2,   8,  59, 100,  70,   2,   8,  59,
       100,  70,  10,  59, 100,  66,  10,  59,  99, 135,  99, 135,  99,
       135,  99, 132, 114, 153,  25,  89,  45, 267,  45,  65, 202,   6,
        39,  27,  38, 199,   3,  45,  65,  56, 143,   3,   6,  39,  27,
        38,  23,  33,  17, 126,   3,  45,  65,  23,  33, 143,   3,  45,
        65,  23,  33, 143,   3, 110,  56, 146, 110,  56, 143, 169, 143,
        48, 264,  48, 121, 772, 307,   5,  49, 258,   5,   4,  6

In [12]:
val_rmse, model = run_tf(params, train_Xtime, train_ytime, training_ratio=0.9)
print(f"Validation RMSE: {val_rmse:.4f}")

Epoch 1/30
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m792s[0m 5s/step - loss: 0.1599 - mse: 0.0145 - rmse: 0.1049 - val_loss: 0.1010 - val_mse: 9.3428e-04 - val_rmse: 0.0306 - learning_rate: 0.0010
Epoch 2/30
[1m 55/148[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m7:58[0m 5s/step - loss: 0.0960 - mse: 6.3655e-04 - rmse: 0.0251

KeyboardInterrupt: 

In [12]:
import torch

def explain_grad_input(model, test_data, device='cpu'):
    """
    Compute feature importances via Gradient x Input for an LSTM model.

    model: trained TreeTimeLSTM
    test_data: numpy array (N, seq, features)
    """
    model.to(device).eval()
    inputs = torch.tensor(test_data, dtype=torch.float32, device=device, requires_grad=True)
    
    # forward pass
    outputs = model(inputs).squeeze()               # (N,)
    
    # backpropagate to get ∂y/∂x
    grads = torch.autograd.grad(
        outputs,
        inputs,
        grad_outputs=torch.ones_like(outputs),
        retain_graph=False,
        create_graph=False
    )[0]                                             # shape: (N, seq, features)
    
    # gradient × input
    attributions = grads * inputs                   # elementwise
    # aggregate over sequence dimension to get per-feature importance
    feature_importance = attributions.abs().mean(dim=1)  # shape: (N, features)
    
    return feature_importance.cpu().detach().numpy()


In [13]:
# usage
test_samples = train_Xtime[-1000:-1]
importances = explain_grad_input(model, test_samples)
print("Importances shape:", importances.shape)  # -> (5, n_features)

importances_mean = importances.mean(axis=0)

for i, name in enumerate(timenames):
    print(f"{name}: {importances_mean[i]*1e5:.4f}")

Importances shape: (999, 5)
MathFeature_TradedPrice: 0.0000
MathFeature_TradedPrice_sp0: 0.0000
MathFeature_TradedPrice_sp1: 0.0000
MathFeature_Return: 0.0000
MathFeature_PriceAdjustment: 0.0000


IndexError: index 5 is out of bounds for axis 0 with size 5

In [12]:
model = model.to(device).eval()

# pick k random indices
idx = random.sample(range(len(train_Xtime)), 200)
Xtime_sample = train_Xtime[-1000:-1]
ytime_true  = train_ytime[-1000:-1].reshape(-1)
ytree_true = train_ytree[-1000:-1].reshape(-1)

# prepare tensor
X_tensor = torch.tensor(Xtime_sample, dtype=torch.float32)

# tensor on correct device
X_tensor = torch.from_numpy(Xtime_sample).float().to(device)
with torch.no_grad():
    preds = model(X_tensor).squeeze().cpu().numpy()
    
preds_tree = (preds-0.5)/5 +1.0
rmse = np.sqrt(((ytree_true - preds_tree)**2).mean())

print(f"RMSE on sample: {rmse:.4f}")

mask = preds_tree > np.quantile(preds_tree, 0.8)
mean_true_masked = (ytree_true[mask]).mean()
print(f"Mean on masked : {mean_true_masked:.4f}")

RMSE on sample: 0.0234
Mean on masked : nan


  mean_true_masked = (ytree_true[mask]).mean()
  ret = ret.dtype.type(ret / rcount)


In [88]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

def run_gpt(params,
        train_Xtime_np,
        train_ytime_np,
        training_ratio=0.95,
        subset_ratio=0.1,
        device='cuda' if torch.cuda.is_available() else 'cpu'):
    """
    Train an LSTM on the first `training_ratio` of (train_Xtime, train_ytime),
    then select the top `subset_ratio` fraction of samples by predicted score
    on both train and held-out validation, and report their actual means.
    """
    # -- split train / val by time order --
    train_Xtime = torch.from_numpy(train_Xtime_np).float()
    train_ytime = torch.from_numpy(train_ytime_np).float()
    N = train_Xtime.shape[0]
    split = int(N * training_ratio)
    X_train = train_Xtime[:split].to(device)
    y_train = train_ytime[:split].to(device)
    X_val   = train_Xtime[split:].to(device)
    y_val   = train_ytime[split:].to(device)

    # -- model setup --
    D = X_train.size(-1)
    H = params.get('hidden_size', 64)
    model = nn.Sequential(
        nn.LSTM(input_size=D, hidden_size=H, batch_first=True),
        nn.Flatten(start_dim=0, end_dim=1),                     # (h, _) → (N, H)
        nn.Linear(H, 1)
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=params.get('lr', 1e-3))
    loss_fn = nn.MSELoss()

    # -- training loop --
    batch_size = params.get('batch_size', 512)
    train_ds = TensorDataset(X_train, y_train)
    loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    for epoch in range(params.get('epochs', 10)):
        model.train()
        for xb, yb in loader:
            optimizer.zero_grad()
            preds, _ = model[0](xb)        # unpack LSTM
            last = preds[:, -1, :]
            out = model[2](last)
            loss = loss_fn(out.squeeze(), yb)
            loss.backward()
            optimizer.step()

    # -- scoring and subset selection --
    device = torch.device('cpu')
    # move model parts
    model0 = model[0].to(device)
    model2 = model[2].to(device)

    # move your data
    X_train = X_train.to(device)
    y_train = y_train.to(device)
    X_val   = X_val.to(device)
    y_val   = y_val.to(device)
    model.eval()
    with torch.no_grad():
        # train
        preds_tr, _ = model0(X_train)
        scores_tr = model2(preds_tr[:, -1, :]).squeeze()
        k_tr = max(1, int(len(scores_tr) * subset_ratio))
        top_tr = torch.topk(scores_tr, k_tr).indices
        mean_tr_subset = y_train[top_tr].mean().item()
        mean_tr_all    = y_train.mean().item()

        # validation
        preds_val, _ = model0(X_val)
        scores_val = model2(preds_val[:, -1, :]).squeeze()
        k_val = max(1, int(len(scores_val) * subset_ratio))
        top_val = torch.topk(scores_val, k_val).indices
        mean_val_subset = y_val[top_val].mean().item()
        mean_val_all    = y_val.mean().item()

    # -- results --
    torch.cuda.empty_cache()
    return {
        'mean_train_all': mean_tr_all,
        'mean_train_subset': mean_tr_subset,
        'mean_val_all': mean_val_all,
        'mean_val_subset': mean_val_subset,
        'train_subset_idx': top_tr.cpu().numpy(),
        'val_subset_idx': top_val.cpu().numpy(),
        'model': model
    }


In [89]:
print(torch.version.cuda)
print(torch.cuda.is_available())      # should print True
print(torch.cuda.get_device_name(0))  # your GPU name
torch.cuda.empty_cache()

11.8
True
NVIDIA GeForce RTX 3060


In [90]:
# set your LSTM params
params = {
    'hidden_size': 128,
    'lr': 1e-3,
    'batch_size': 128,
    'epochs': 50
}

results = run_gpt(params, train_Xtime, train_ytime,
              training_ratio=0.90, subset_ratio=0.2)

# inspect outcome
print(f"Mean y (all train):   {results['mean_train_all']:.3f}")
print(f"Mean y (top 10% train):{results['mean_train_subset']:.3f}")
print(f"Mean y (all val):     {results['mean_val_all']:.3f}")
print(f"Mean y (top 10% val): {results['mean_val_subset']:.3f}")

# indices of selected samples
print("Selected train indices:", results['train_subset_idx'])
print("Selected val   indices:", results['val_subset_idx'])


KeyboardInterrupt: 

In [None]:
d = results['val_subset_idx']
ddiff = np.diff(np.sort(d))

In [None]:
params.update({
    "TreeTime_lstm_units": 16,
    "TreeTime_lstm_num_layers": 4,
    "TreeTime_lstm_learning_rate": 0.002,
    "TreeTime_lstm_conv1d": True,
    "TreeTime_lstm_batch_size": 2**11,
    "TreeTime_lstm_epochs": 2,
    "TreeTime_lstm_dropout": 0.00,
    "TreeTime_inter_dropout": 0.00,
    "TreeTime_input_gaussian_noise": 0.0,
    "TreeTime_lstm_loss": "mse",
})

In [None]:
q = 0.2
itermax = 5

mask = np.ones(train_Xtime.shape[0], dtype=bool)
for i in range(itermax):
    print(f"Iteration {i+1}/{itermax}")
    
    val_rmse, model = run_torch(params, train_Xtime=train_Xtime[mask], train_ytime=train_ytime[mask], training_ratio=0.9)
    print(f"Validation RMSE: {val_rmse:.4f}")
    
    # Update mask based on validation RMSE
    y_train_pred = model.predict(train_Xtime[mask], batch_size=params['TreeTime_lstm_batch_size'])[:,0]
    rsme_train = np.sqrt(np.mean((y_train_pred - train_ytime[mask]) ** 2))
    print(f"Train RMSE: {rsme_train:.4f}")
    print(f"Sqrt Variance: {np.sqrt(np.var(y_train_pred)):.4f}")
    print(f"mean y time: {np.mean(train_ytime[mask]):.4f}")
    print(f"pred Quantile 0.5: {np.mean(train_ytime[mask][y_train_pred > np.quantile(y_train_pred, 0.5)]):.4f}")
    print(f"pred Quantile 0.8: {np.mean(train_ytime[mask][y_train_pred > np.quantile(y_train_pred, 0.8)]):.4f}")
    
    mask_loop = (y_train_pred > np.quantile(y_train_pred, q))
    prev_count = mask.sum()
    mask[mask] = mask_loop
    new_count = mask.sum()
    print(f"Mask size: {new_count/train_Xtime.shape[0]}")
    if new_count == prev_count:
        print("No change in mask, stopping early")
        break

Iteration 1/5
Epoch 1/4
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 8s/step - loss: 0.3672 - mse: 0.2512 - rmse: 0.4989 - val_loss: 0.1738 - val_mse: 0.0611 - val_rmse: 0.2472 - learning_rate: 0.0010
Epoch 2/4
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 8s/step - loss: 0.1861 - mse: 0.0744 - rmse: 0.2705 - val_loss: 0.1918 - val_mse: 0.0798 - val_rmse: 0.2825 - learning_rate: 0.0010
Epoch 3/4
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 8s/step - loss: 0.1859 - mse: 0.0738 - rmse: 0.2723 - val_loss: 0.1674 - val_mse: 0.0564 - val_rmse: 0.2375 - learning_rate: 0.0010
Epoch 4/4
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 8s/step - loss: 0.1759 - mse: 0.0650 - rmse: 0.2548 - val_loss: 0.1640 - val_mse: 0.0540 - val_rmse: 0.2323 - learning_rate: 0.0010
Validation RMSE: 0.2323
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step
Train RMSE: 0.2496
Sqrt Variance: 0.0849
mean y time: 0.5315
pred Quant

In [None]:
y_train_pred = model.predict(train_Xtime, batch_size=params['TreeTime_lstm_batch_size'])[:,0]
rsme_train = np.sqrt(np.mean((y_train_pred - train_ytime) ** 2))
print(f"Train RMSE: {rsme_train:.4f}")
print(f"Sqrt Variance: {np.sqrt(np.var(y_train_pred)):.4f}")

[1m2613/2613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 16ms/step
Train RMSE: 0.1545
Sqrt Variance: 0.0001


In [None]:
print(f"mean y time: {np.mean(train_ytime):.4f}")
print(f"pred Quantile 0.5: {np.mean(train_ytime[y_train_pred > np.quantile(y_train_pred, 0.5)]):.4f}")
print(f"pred Quantile 0.8: {np.mean(train_ytime[y_train_pred > np.quantile(y_train_pred, 0.8)]):.4f}")

mean y time: 0.5085
pred Quantile 0.5: 0.5106
pred Quantile 0.8: 0.5124


In [None]:
y_test_pred = model.predict(test_Xtime, batch_size=params['TreeTime_lstm_batch_size'])[:,0]
rsme_test = np.sqrt(np.mean((y_test_pred - test_ytime) ** 2))
print(f"Test RMSE: {rsme_test:.4f}")
print(f"Sqrt Variance: {np.sqrt(np.var(y_test_pred)):.4f}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Test RMSE: 0.1250
Sqrt Variance: 0.0001


In [None]:
z = 0
zz = 0
zzz = 0

In [None]:
import optuna
def objective(trial: optuna.Trial):
    # sample hyperparameters
    optparams = params.copy()
    optparams.update({
        "TreeTime_lstm_units": trial.suggest_categorical("units", [64, 128, 256]),
        "TreeTime_lstm_num_layers": trial.suggest_int("num_layers", 3, 8),
        "TreeTime_lstm_learning_rate": trial.suggest_float("learning_rate", 1e-3, 5e-2),
        "TreeTime_lstm_conv1d": trial.suggest_categorical("use_conv1d", [False, True]),
        "TreeTime_lstm_batch_size": trial.suggest_categorical("batch_size", [2**12, 2**13, 2**14]),
    })
    # run and report validation RMSE
    val_rmse = run_torch(optparams)
    trial.report(val_rmse, step=0)
    if trial.should_prune():
        raise optuna.TrialPruned()
    return val_rmse


study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5)
)
study.optimize(objective, timeout=60*60*8)

print("Best RMSE:", study.best_value)
print("Best params:", study.best_trial.params)