### Jane Street Real-Time Market Data Forecasting with GRU

The GRU model is trained with all available data except one for testing.

Link to the competition: https://www.kaggle.com/competitions/jane-street-real-time-market-data-forecasting/overview

In [1]:
import numpy as np
import math
import polars as pls
from pathlib import Path

import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
#from torch.optim.lr_scheduler import StepLR
#from torchmetrics.functional import r2_score

#import plotly.express as px

import wandb

In [2]:
data_path = "/home/yang/kaggle/jane/data"

In [3]:
# for each training set, we take 20% of the data for validation
#frac_train = 0.8
train_raw_data_num = ["0", "1", "2", "4", "5", "6", "8", "9"]
# a completely new dataset for testing
test_raw_data_num = "7"

In [4]:
train_feature_list = ["time_id", "symbol_id"] + [f"feature_{idx:02d}" for idx in range(79)]

In [5]:
num_features = len(train_feature_list)

In [6]:
sample_testing_data = pls.read_parquet(Path(data_path, "test.parquet", f"date_id=0", "part-0.parquet"))
num_sample_testing_data = len(sample_testing_data)

In [7]:
class TimeseriesDataset(Dataset):
    def __init__(self, df: pls.DataFrame):
        df = df.fill_null(0)
        self.features = torch.tensor(df.select([col for col in df.columns if col in train_feature_list]).to_numpy(), dtype=torch.float32)
        self.target = torch.tensor(df.select(pls.col("responder_6")).to_numpy(), dtype=torch.float32)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.target[idx]

In [8]:
class DataModule(pl.LightningDataModule):
    def __init__(self, file_path: str, batch_size: int = 32):
        super().__init__()
        self.file_path = file_path
        self.batch_size = batch_size

    def train_dataloader(self):
        df = pls.read_parquet(self.file_path)  # Adjust for your file format (e.g., CSV, Parquet)
        dataset = TimeseriesDataset(df)
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=False, num_workers=15, multiprocessing_context='fork')

In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :].to(x.device)

In [9]:
class TransformerRegressor(pl.LightningModule):
    def __init__(self, input_dim: int, d_model=128, nhead=4, num_layers=2, lr: float = 1e-3):
        super().__init__()
        self.save_hyperparameters()
        self.lr = lr

        self.input_projection = nn.Linear(input_dim, d_model)
        self.positional_encoding = PositionalEncoding(d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(d_model, 1)

        self.training_step_outputs = []
        self.validation_step_outputs = []
        self.test_step_outputs = []
        self.criterion = nn.MSELoss()
        #self.criterion = r2_score

    def forward(self, x):
        if x.dim() == 2:  # Handle single sequence case (no batch dimension)
            x = x.unsqueeze(0)  # Add batch dimension: (1, sequence_length, features)
        # x: (batch_size, seq_len, input_dim)
        x = self.input_projection(x)
        x = self.positional_encoding(x)
        # Transformer expects input of shape (seq_len, batch_size, d_model)
        x = x.permute(1, 0, 2)
        x = self.transformer(x)
        # Use the output of the last time step
        x = x[-1]
        return self.fc(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = self.criterion(y_hat, y.squeeze())
        self.training_step_outputs.append(loss.item())
        self.log("train_loss", loss)
        return loss
    
    def on_train_epoch_end(self):
        epoch_average = torch.tensor(self.training_step_outputs).mean()
        self.log("training_epoch_average", epoch_average)
        self.training_step_outputs.clear()  # free memory

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = self.criterion(y_hat, y.squeeze())
        self.validation_step_outputs.append(loss.item())
        self.log("val_loss", loss, prog_bar=True)
        return {"val_loss": loss}

    def on_validation_epoch_end(self):
        avg_val_loss = torch.tensor(self.validation_step_outputs).mean()
        self.log("avg_val_loss", avg_val_loss)
        self.validation_step_outputs.clear()

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = self.criterion(y_hat, y.squeeze())
        self.test_step_outputs.append(loss.item())
        self.log("test_loss", loss)
        return {"test_loss": loss}
    
    def on_test_epoch_end(self):
        epoch_average = torch.tensor(self.test_step_outputs).mean()
        self.log("test_epoch_average", epoch_average)
        self.test_step_outputs.clear()  # free memory

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [10]:
# define hyperparameters and the 
parameters = dict(
    epoch = 20,
    input_dim = num_features,
    d_model = 128,
    nhead = 4,
    num_layers = 2,
    batch_size = 1024,
    #dropout = 0.0,
    learning_rate = 0.05,
    dataset = 'Jane street market data',
    architecture = 'GRU'
)

# initialize weights & biases service
mode = 'online'
#mode = 'disabled'
wandb.init(config=parameters, project='jane_street', entity='git-yang', mode=mode)
config = wandb.config
wandb_logger = WandbLogger(log_model="all")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mgit-yang[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [11]:
%%time
file_paths = [Path(data_path, "train.parquet", f"partition_id={i}", "part-0.parquet") for i in train_raw_data_num]

model = TransformerRegressor(input_dim=config.input_dim, hidden_dim=config.hidden_dim, num_layers=config.num_layers, lr=config.learning_rate)

#model = TransformerRegressor.load_from_checkpoint("model_init/mlp_hidden_64_checkpoint.ckpt")
#model = TransformerRegressor.load_from_checkpoint("model_init/jane_mlp_hidden_32_epoch_30.ckpt")
wandb.watch(model)

for file_path in file_paths:
    print(f"Traing on dataset: {file_path}")

    # Initialize DataModule and model
    datamodule = DataModule(file_path, batch_size=config.batch_size)

    # Training using PyTorch Lightning
    trainer = pl.Trainer(max_epochs=config.epoch, accelerator="auto", devices="auto", logger=wandb_logger)

    # Train with dataframes sequentially
    trainer.fit(model, train_dataloaders=datamodule.train_dataloader())

trainer.save_checkpoint("model_checkpoint.ckpt")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Traing on dataset: /home/yang/kaggle/jane/data/train.parquet/partition_id=0/part-0.parquet


/home/yang/.pyenv/versions/3.11.10/envs/ml/lib/python3.11/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
You are using a CUDA device ('NVIDIA GeForce RTX 4070 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/yang/.pyenv/versions/3.11.10/envs/ml/lib/python3.11/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type    | Params | Mode 
------------------------------

Epoch 1:   0%|          | 0/195 [00:00<?, ?it/s, v_num=t69c]          

In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  self.pid = os.fork()


Epoch 6:   0%|          | 0/195 [00:00<?, ?it/s, v_num=t69c]               

In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  self.pid = os.fork()


Epoch 7:   0%|          | 0/195 [00:00<?, ?it/s, v_num=t69c]          

In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  self.pid = os.fork()


Epoch 19: 100%|██████████| 195/195 [00:08<00:00, 22.73it/s, v_num=t69c]    

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 195/195 [00:08<00:00, 22.08it/s, v_num=t69c]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Traing on dataset: /home/yang/kaggle/jane/data/train.parquet/partition_id=1/part-0.parquet


/home/yang/.pyenv/versions/3.11.10/envs/ml/lib/python3.11/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
/home/yang/.pyenv/versions/3.11.10/envs/ml/lib/python3.11/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory ./lightning_logs/vhxzt69c/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type    | Params | Mode 
----------------------------------------------
0 | gru       | GRU     | 180 K  | train
1 | fc        | Linear  | 129    | train
2 | criterion | MSELoss | 0      | train
----------------------------------------------
180 K     Trainable params
0         Non-trainable params
180 K     Total params
0.721     Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode


Epoch 19: 100%|██████████| 281/281 [00:09<00:00, 28.67it/s, v_num=t69c]   

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 281/281 [00:10<00:00, 27.81it/s, v_num=t69c]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Traing on dataset: /home/yang/kaggle/jane/data/train.parquet/partition_id=2/part-0.parquet


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type    | Params | Mode 
----------------------------------------------
0 | gru       | GRU     | 180 K  | train
1 | fc        | Linear  | 129    | train
2 | criterion | MSELoss | 0      | train
----------------------------------------------
180 K     Trainable params
0         Non-trainable params
180 K     Total params
0.721     Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode


Epoch 19: 100%|██████████| 304/304 [00:13<00:00, 22.66it/s, v_num=t69c]   

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 304/304 [00:13<00:00, 22.21it/s, v_num=t69c]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores



Traing on dataset: /home/yang/kaggle/jane/data/train.parquet/partition_id=4/part-0.parquet


HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type    | Params | Mode 
----------------------------------------------
0 | gru       | GRU     | 180 K  | train
1 | fc        | Linear  | 129    | train
2 | criterion | MSELoss | 0      | train
----------------------------------------------
180 K     Trainable params
0         Non-trainable params
180 K     Total params
0.721     Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode


Epoch 19: 100%|██████████| 503/503 [00:17<00:00, 28.11it/s, v_num=t69c]   

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 503/503 [00:18<00:00, 27.66it/s, v_num=t69c]
Traing on dataset: /home/yang/kaggle/jane/data/train.parquet/partition_id=5/part-0.parquet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type    | Params | Mode 
----------------------------------------------
0 | gru       | GRU     | 180 K  | train
1 | fc        | Linear  | 129    | train
2 | criterion | MSELoss | 0      | train
----------------------------------------------
180 K     Trainable params
0         Non-trainable params
180 K     Total params
0.721     Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode


Epoch 19: 100%|██████████| 535/535 [00:19<00:00, 27.05it/s, v_num=t69c]    

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 535/535 [00:20<00:00, 26.68it/s, v_num=t69c]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Traing on dataset: /home/yang/kaggle/jane/data/train.parquet/partition_id=6/part-0.parquet


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type    | Params | Mode 
----------------------------------------------
0 | gru       | GRU     | 180 K  | train
1 | fc        | Linear  | 129    | train
2 | criterion | MSELoss | 0      | train
----------------------------------------------
180 K     Trainable params
0         Non-trainable params
180 K     Total params
0.721     Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode


Epoch 19: 100%|██████████| 621/621 [00:24<00:00, 25.44it/s, v_num=t69c]    

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 621/621 [00:24<00:00, 25.15it/s, v_num=t69c]


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Traing on dataset: /home/yang/kaggle/jane/data/train.parquet/partition_id=8/part-0.parquet


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type    | Params | Mode 
----------------------------------------------
0 | gru       | GRU     | 180 K  | train
1 | fc        | Linear  | 129    | train
2 | criterion | MSELoss | 0      | train
----------------------------------------------
180 K     Trainable params
0         Non-trainable params
180 K     Total params
0.721     Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode


Epoch 19: 100%|██████████| 615/615 [00:24<00:00, 25.51it/s, v_num=t69c]  

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 615/615 [00:24<00:00, 25.23it/s, v_num=t69c]


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Traing on dataset: /home/yang/kaggle/jane/data/train.parquet/partition_id=9/part-0.parquet


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type    | Params | Mode 
----------------------------------------------
0 | gru       | GRU     | 180 K  | train
1 | fc        | Linear  | 129    | train
2 | criterion | MSELoss | 0      | train
----------------------------------------------
180 K     Trainable params
0         Non-trainable params
180 K     Total params
0.721     Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode


Epoch 19: 100%|██████████| 628/628 [00:25<00:00, 24.73it/s, v_num=t69c]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 628/628 [00:25<00:00, 24.45it/s, v_num=t69c]
CPU times: user 42min 46s, sys: 7min 21s, total: 50min 8s
Wall time: 50min 5s


### Evaluation

In [12]:
# Evaluation with testing dataset
def test_dataloader(df: pls.DataFrame, batch_size: int = 10000):
    dataset = TimeseriesDataset(df)
    return DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=15, multiprocessing_context='fork')

In [13]:
test_data = pls.read_parquet(Path(data_path, "train.parquet", f"partition_id={test_raw_data_num}", "part-0.parquet"))
data_loader = test_dataloader(test_data, batch_size=10000)

In [14]:
test_results = trainer.test(model, data_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 634/634 [00:13<00:00, 47.29it/s]    
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   test_epoch_average        0.699215292930603
        test_loss           0.6991668343544006
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [15]:
wandb.finish()

0,1
epoch,▂▄▆▃▄▁▂▂▄▇▁▂▅▅▇▂▄▅▅▅█▃▄▅▆▇█▁▂▃▄▆▇▇█▃▅▅▆█
test_epoch_average,▁
test_loss,▁
train_loss,▃▄▄▄▅▃▅▃█▄▅▅▃▇▃▄▅▃▃▃▂▁▃▂▄▃▆▇▄▄▃▂▅▃▃▂▃▂▄▂
trainer/global_step,▃▃▁▂▃▃▁▂▃▄▁▂▂▄▄▂▂▃▄▄▇▇▁▆█▄▅▆▆▇▂▃▄▄▄▅▅▆▇█
training_epoch_average,▃▃▄▄▅▅▅▅▅▅▇▇▇███▅▅▅▅▄▅▅▅▅▅▅▅▅▃▃▃▃▃▃▁▁▁▁▁

0,1
epoch,20.0
test_epoch_average,0.69922
test_loss,0.69917
train_loss,0.66779
trainer/global_step,12560.0
training_epoch_average,0.7319


## Evaluation using the given metric

In [16]:
# load model
model = GRURegressor.load_from_checkpoint("model_init/jane_gru_hidden_64_layer_2_rmse.ckpt")

In [17]:
def sample_weighted_zero_mean_r2(y_pred, y_truth, weight):
    """
    Zero-mean R-squared metrics.

    Args:
        y_pred: Array of predicted values.
        y_truth: Array of true values.
        weight: Array of sample weights.

    Returns:
        1-corr: Zero-mean R-squared.
    """

    # Ensure weights are valid
    weight = weight if weight is not None else np.ones_like(y_pred)
    
    corr = np.sum((weight * (y_truth - y_pred) ** 2)) / np.sum(weight * y_truth ** 2)
    
    return 1 - corr 

In [18]:
# Assuming your model is already defined as `model`
device = "cuda" if torch.cuda.is_available() else "cpu"

In [19]:
# Move the model to GPU
model.to(device)

GRURegressor(
  (gru): GRU(81, 64, num_layers=2, batch_first=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (criterion): MSELoss()
)

In [20]:
# inference without batch is slow. We will try to accerlate it with batch processing
# %%time

# test_data_subset = test_data.select([col for col in test_data.columns if col in train_feature_list])
# test_data_subset = test_data_subset.fill_null(0)

# model.eval()
# with torch.no_grad():
#     y_pred = model(torch.tensor(test_data_subset.to_numpy(), dtype=torch.float32)).squeeze().numpy()


# score = sample_weighted_zero_mean_r2(y_pred, test_data.select(pls.col("responder_6")).to_numpy()[:,0],
#                                      test_data.select(pls.col("weight")).to_numpy()[:,0])
# score

In [21]:
%%time

all_predictions = []

model.eval()
with torch.no_grad():
    for batch in data_loader:
        x, y = batch
        y_pred = model(x.to(device)).squeeze()
        all_predictions.append(y_pred)

predictions = torch.cat(all_predictions, dim=0).cpu().numpy()

score = sample_weighted_zero_mean_r2(predictions, test_data.select(pls.col("responder_6")).to_numpy()[:,0],
                                     test_data.select(pls.col("weight")).to_numpy()[:,0])
score

CPU times: user 4.92 s, sys: 3.48 s, total: 8.4 s
Wall time: 11.8 s


np.float32(-0.0010317564)