In [12]:
import sys
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, GaussianNoise, LSTM, Bidirectional, Dropout, Dense, Conv1D
from tensorflow.keras import regularizers, backend as K
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, Callback
from tensorflow.keras.metrics import RootMeanSquaredError, R2Score

# Get the absolute path to the project directory
project_dir = os.path.abspath("..")

# Append the project directory to sys.path
if project_dir not in sys.path:
    sys.path.append(project_dir)
    
from src.predictionModule.LoadupSamples import LoadupSamples

import pandas as pd
import numpy as np
import polars as pl
import datetime
import seaborn as sns
import lightgbm as lgb
import random
import matplotlib.pyplot as plt
import logging
import time
import re

In [13]:
params = {
    "idxAfterPrediction": 3,
    'timesteps': 60,
    'target_option': 'last',
    "LoadupSamples_time_scaling_stretch": True,
    
    "TreeTime_lstm_units": 64,
    "TreeTime_lstm_num_layers": 4,
    "TreeTime_lstm_dropout": 0.00001,
    "TreeTime_lstm_recurrent_dropout": 0.00001,
    "TreeTime_lstm_learning_rate": 0.001,
    "TreeTime_lstm_optimizer": "adam",
    "TreeTime_lstm_bidirectional": True,
    "TreeTime_lstm_batch_size": 2**12,
    "TreeTime_lstm_epochs": 30,
    "TreeTime_lstm_l1": 0.00001,
    "TreeTime_lstm_l2": 0.00001,
    "TreeTime_inter_dropout": 0.00001,
    "TreeTime_input_gaussian_noise": 0.00001,
    "TreeTime_lstm_conv1d": True,
    "TreeTime_lstm_conv1d_kernel_size": 3,
    "TreeTime_lstm_loss": "mse",
}

In [14]:
stock_group = "group_snp500_finanTo2011"

eval_date = datetime.date(year=2025, month=2, day=13)
start_train_date = datetime.date(year=2014, month=1, day=1)


params["idxAfterPrediction"] = 5
ls = LoadupSamples(
    train_start_date=start_train_date,
    test_dates=[eval_date],
    group=stock_group,
    params=params,
)

In [15]:
ls.load_samples(main_path = "../src/featureAlchemy/bin/")

In [16]:
train_Xtree = ls.train_Xtree
train_Xtime = ls.train_Xtime

treenames = ls.featureTreeNames
timenames = ls.featureTimeNames

train_ytree = ls.train_ytree
train_ytime = ls.train_ytime

meta = ls.meta_pl_train
idx_after = params["idxAfterPrediction"]
train_yhot = meta.select([f"target_close_at{i}" for i in range(1, idx_after + 1)]).to_numpy()
train_yhot = (train_yhot == train_yhot.max(axis=1, keepdims=True)).astype(int)


In [17]:
print(np.mean(train_ytree))
print(np.mean(train_ytime))

print(train_yhot.shape)

1.0025045
0.5120161
(668702, 5)


In [18]:
idx1 = np.where(timenames == "MathFeature_TradedPrice")[0][0]
idx2 = np.where(timenames == "FeatureTA_High")[0][0]
idx3 = np.where(timenames == "FeatureTA_Low")[0][0]
idx4 = np.where(timenames == "FeatureTA_volume_obv")[0][0]
idx5 = np.where(timenames == "FeatureTA_volume_vpt")[0][0]

#train_Xtime = train_Xtime[:, :, [idx1, idx2, idx3, idx4, idx5]]

In [19]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm, trange
import shap

class TreeTimeLSTM(nn.Module):
    def __init__(self, 
                 input_size,
                 lstm_units,
                 num_layers,
                 dropout,
                 recurrent_dropout,
                 bidirectional,
                 output_size,         # new: number of classes m
                 l1=0.0,
                 l2=0.0,
                 use_conv1d=False,
                 conv_kernel=3,
                 noise_std=0.0,
                 inter_dropout=0.0):
        super().__init__()
        self.use_conv1d = use_conv1d
        self.noise_std = noise_std
        self.inter_dropout = inter_dropout

        if use_conv1d:
            self.conv1d = nn.Conv1d(
                in_channels=input_size,
                out_channels=lstm_units,
                kernel_size=conv_kernel,
                padding=conv_kernel//2
            )
            input_size = lstm_units

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=lstm_units,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=bidirectional,
            batch_first=True
        )
        self.dropout = nn.Dropout(inter_dropout) if inter_dropout > 0 else None

        self.output = nn.Linear(
            lstm_units * (2 if bidirectional else 1),
            output_size        # now outputs m logits
        )
        self.l1 = l1
        self.l2 = l2

    def forward(self, x):
        if self.noise_std > 0:
            x = x + torch.randn_like(x) * self.noise_std
        if self.use_conv1d:
            x = x.transpose(1, 2)
            x = self.conv1d(x)
            x = x.transpose(1, 2)
        out, _ = self.lstm(x)
        out_last = out[:, -1, :]
        if self.dropout:
            out_last = self.dropout(out_last)
        return self.output(out_last)

class TreeTimeGRU(nn.Module):
    def __init__(self, 
                 input_size,
                 lstm_units,
                 num_layers,
                 dropout,
                 recurrent_dropout,
                 bidirectional,
                 output_size,         # new: number of classes m
                 l1=0.0,
                 l2=0.0,
                 use_conv1d=False,
                 conv_kernel=3,
                 noise_std=0.0,
                 inter_dropout=0.0):
        super().__init__()
        self.use_conv1d = use_conv1d
        self.noise_std = noise_std
        self.inter_dropout = inter_dropout

        if use_conv1d:
            self.conv1d = nn.Conv1d(
                in_channels=input_size,
                out_channels=lstm_units,
                kernel_size=conv_kernel,
                padding=conv_kernel//2
            )
            input_size = lstm_units

        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=lstm_units,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=bidirectional,
            batch_first=True
        )
        self.dropout = nn.Dropout(inter_dropout) if inter_dropout > 0 else None

        self.output = nn.Linear(
            lstm_units * (2 if bidirectional else 1),
            output_size
        )
        self.l1 = l1
        self.l2 = l2

    def forward(self, x):
        if self.noise_std > 0:
            x = x + torch.randn_like(x) * self.noise_std
        if self.use_conv1d:
            x = x.transpose(1, 2)
            x = self.conv1d(x)
            x = x.transpose(1, 2)
        out, _ = self.gru(x)
        out_last = out[:, -1, :]
        if self.dropout:
            out_last = self.dropout(out_last)
        return self.output(out_last)

# Loss functions
def quantile_loss(q):
    def loss_fn(y_pred, y_true):
        e = y_true - y_pred
        return torch.mean(torch.max(q * e, (q - 1) * e))
    return loss_fn

def r2_metric(y_pred, y_true):
    ss_res = torch.sum((y_true - y_pred) ** 2)
    ss_tot = torch.sum((y_true - torch.mean(y_true)) ** 2)
    return 1 - ss_res / (ss_tot + 1e-6)

def neg_r2_loss(y_pred, y_true):
    return -r2_metric(y_pred, y_true)


def run(params, train_Xtime, train_ytime, training_ratio=0.90, device='cpu', type='lstm'):
    num_classes = train_ytime.shape[1]
    
    # Hyperparameters
    lstm_units = params['TreeTime_lstm_units']
    num_layers = params['TreeTime_lstm_num_layers']
    dropout = params['TreeTime_lstm_dropout']
    recurrent_dropout = params['TreeTime_lstm_recurrent_dropout']
    learning_rate = params['TreeTime_lstm_learning_rate']
    optimizer_name = params['TreeTime_lstm_optimizer']
    bidirectional = params['TreeTime_lstm_bidirectional']
    batch_size = params['TreeTime_lstm_batch_size']
    epochs = params['TreeTime_lstm_epochs']
    l1 = params.get('TreeTime_lstm_l1', 0.0)
    l2 = params.get('TreeTime_lstm_l2', 0.0)
    inter_dropout = params.get('TreeTime_inter_dropout', 0.0)
    noise_std = params.get('TreeTime_input_gaussian_noise', 0.0)
    use_conv1d = params.get('TreeTime_lstm_conv1d', False)
    conv_kernel = params.get('TreeTime_lstm_conv1d_kernel_size', 3)

    # Data split
    split_at = int(train_Xtime.shape[0] * training_ratio)
    X_train, y_train = train_Xtime[:split_at], train_ytime[:split_at]
    X_val,   y_val   = train_Xtime[split_at:],   train_ytime[split_at:]

    train_loader = DataLoader(
        TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                      torch.tensor(y_train, dtype=torch.float32)),
        batch_size=batch_size, shuffle=False
    )
    val_loader = DataLoader(
        TensorDataset(torch.tensor(X_val, dtype=torch.float32),
                      torch.tensor(y_val, dtype=torch.float32)),
        batch_size=batch_size
    )

    # Model
    if type == 'gru':
        model = TreeTimeGRU(
            input_size=X_train.shape[-1],
            lstm_units=lstm_units,
            num_layers=num_layers,
            dropout=dropout,
            recurrent_dropout=recurrent_dropout,
            bidirectional=bidirectional,
            output_size=num_classes,    # <-- here
            l1=l1,
            l2=l2,
            use_conv1d=use_conv1d,
            conv_kernel=conv_kernel,
            noise_std=noise_std,
            inter_dropout=inter_dropout
        ).to(device)
    else:  # default to LSTM
        model = TreeTimeLSTM(
            input_size=X_train.shape[-1],
            lstm_units=lstm_units,
            num_layers=num_layers,
            dropout=dropout,
            recurrent_dropout=recurrent_dropout,
            bidirectional=bidirectional,
            output_size=num_classes,    # <-- here
            l1=l1,
            l2=l2,
            use_conv1d=use_conv1d,
            conv_kernel=conv_kernel,
            noise_std=noise_std,
            inter_dropout=inter_dropout
        ).to(device)

    # Loss: BCEWithLogits for one-hot targets
    criterion = nn.CrossEntropyLoss()

    # Optimizer & scheduler as before…
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=l2)
    if optimizer_name == 'rmsprop':
        optimizer = optim.RMSprop(model.parameters(), lr=learning_rate, weight_decay=l2)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=2)

    history = {'train_loss': [], 'val_loss': [], 'val_acc': []}
    best_loss, wait = float('inf'), 0

    best_loss, wait = float('inf'), 0
    for epoch in range(1, epochs+1):
        # --- Training ---
        model.train()
        train_losses = []
        for Xb, yb in train_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            optimizer.zero_grad()
            logits = model(Xb)
            loss = criterion(logits, yb)
            if l1>0:
                l1_pen = sum(p.abs().sum() for p in model.parameters())
                loss += l1 * l1_pen
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
        avg_train = sum(train_losses) / len(train_losses)

        # --- Validation ---
        model.eval()
        val_losses, correct, total = [], 0, 0
        with torch.no_grad():
            for Xb, yb in val_loader:
                Xb, yb = Xb.to(device), yb.to(device)
                logits = model(Xb)
                loss = criterion(logits, yb)
                val_losses.append(loss.item())
                preds = logits.argmax(dim=1)
                targets = yb.argmax(dim=1)
                correct += (preds == targets).sum().item()
                total += yb.size(0)
        avg_val = sum(val_losses) / len(val_losses)
        val_acc = correct / total

        # record & print
        history['train_loss'].append(avg_train)
        history['val_loss'].append(avg_val)
        history['val_acc'].append(val_acc)
        print(f"Epoch {epoch}/{epochs} — "
              f"train_loss: {avg_train:.4f}, "
              f"val_loss: {avg_val:.4f}, "
              f"val_acc: {val_acc:.4f}")

        scheduler.step(avg_val)
        # early stopping
        if avg_val < best_loss:
            best_loss, best_state = avg_val, model.state_dict()
            wait = 0
        else:
            wait += 1
            if wait >= 3: break

    model.load_state_dict(best_state)
    return best_loss, val_acc, model

In [20]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
best_loss, val_acc, model = run(params, train_Xtime, train_yhot, training_ratio=0.9, device=device, type='lstm')
print(f"Validation RMSE: {val_acc:.4f}")

Epoch 1/30 — train_loss: 1.6915, val_loss: 1.5757, val_acc: 0.2972
Epoch 2/30 — train_loss: 1.6064, val_loss: 1.5754, val_acc: 0.2972
Epoch 3/30 — train_loss: 1.5926, val_loss: 1.5752, val_acc: 0.2972


KeyboardInterrupt: 

In [11]:
X_new = np.random.randn(5, train_Xtime.shape[1], train_Xtime.shape[2])

# Convert to torch tensor and send to device
n = 20000
X_tensor = torch.tensor(train_Xtime[-n:], dtype=torch.float32).to(device)

# Put model into eval mode and disable grad
model.eval()
with torch.no_grad():
    preds = model(X_tensor)        # (N, 1) tensor
    preds = preds.squeeze(-1)      # (N,) tensor

# Bring back to CPU NumPy array if you like
preds = preds.cpu().numpy()

true_val = train_ytree[-n:]

rsme_err = np.sqrt(np.mean((preds - train_ytime[-n:])**2))
q = 0.995
mask_pred_above = preds >= np.quantile(preds, q)
mask_pred_below = preds <= np.quantile(preds, 1-q)
print(f"Mean error: {rsme_err:.4f}")
print(f"Mean all prediction: {np.mean(true_val):.4f}")
print(f"Mean above prediction: {np.mean(true_val[mask_pred_above]):.4f}")
print(f"Mean below prediction: {np.mean(true_val[mask_pred_below]):.4f}")
print(f"True values above zero: {np.sum(mask_pred_above)/len(mask_pred_above):.4f}")
print(f"True values below zero: {np.sum(true_val[mask_pred_below])/len(mask_pred_below):.4f}")

Mean error: 0.0465
Mean all prediction: 1.0001
Mean above prediction: 1.0001
Mean below prediction: 1.0001
True values above zero: 1.0000
True values below zero: 1.0001


In [12]:
import torch

def explain_grad_input(model, test_data, device='cpu'):
    """
    Compute feature importances via Gradient x Input for an LSTM model.

    model: trained TreeTimeLSTM
    test_data: numpy array (N, seq, features)
    """
    model.to(device).eval()
    inputs = torch.tensor(test_data, dtype=torch.float32, device=device, requires_grad=True)
    
    # forward pass
    outputs = model(inputs).squeeze()               # (N,)
    
    # backpropagate to get ∂y/∂x
    grads = torch.autograd.grad(
        outputs,
        inputs,
        grad_outputs=torch.ones_like(outputs),
        retain_graph=False,
        create_graph=False
    )[0]                                             # shape: (N, seq, features)
    
    # gradient × input
    attributions = grads * inputs                   # elementwise
    # aggregate over sequence dimension to get per-feature importance
    feature_importance = attributions.abs().mean(dim=1)  # shape: (N, features)
    
    return feature_importance.cpu().detach().numpy()


In [13]:
# usage
test_samples = train_Xtime[-1000:-1]
importances = explain_grad_input(model, test_samples)
print("Importances shape:", importances.shape)  # -> (5, n_features)

importances_mean = importances.mean(axis=0)

for i, name in enumerate(timenames):
    print(f"{name}: {importances_mean[i]*1e5:.4f}")

Importances shape: (999, 5)
MathFeature_TradedPrice: 0.0000
MathFeature_TradedPrice_sp0: 0.0000
MathFeature_TradedPrice_sp1: 0.0000
MathFeature_Return: 0.0000
MathFeature_PriceAdjustment: 0.0000


IndexError: index 5 is out of bounds for axis 0 with size 5

In [12]:
model = model.to(device).eval()

# pick k random indices
idx = random.sample(range(len(train_Xtime)), 200)
Xtime_sample = train_Xtime[-1000:-1]
ytime_true  = train_ytime[-1000:-1].reshape(-1)
ytree_true = train_ytree[-1000:-1].reshape(-1)

# prepare tensor
X_tensor = torch.tensor(Xtime_sample, dtype=torch.float32)

# tensor on correct device
X_tensor = torch.from_numpy(Xtime_sample).float().to(device)
with torch.no_grad():
    preds = model(X_tensor).squeeze().cpu().numpy()
    
preds_tree = (preds-0.5)/5 +1.0
rmse = np.sqrt(((ytree_true - preds_tree)**2).mean())

print(f"RMSE on sample: {rmse:.4f}")

mask = preds_tree > np.quantile(preds_tree, 0.8)
mean_true_masked = (ytree_true[mask]).mean()
print(f"Mean on masked : {mean_true_masked:.4f}")

RMSE on sample: 0.0234
Mean on masked : nan


  mean_true_masked = (ytree_true[mask]).mean()
  ret = ret.dtype.type(ret / rcount)
