### Jane Street Real-Time Market Data Forecasting with MLP

The MLP model is trained with all available data except one for testing.

Link to the competition: https://www.kaggle.com/competitions/jane-street-real-time-market-data-forecasting/overview

In [1]:
import numpy as np
import polars as pls
from pathlib import Path

import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
#from torch.optim.lr_scheduler import StepLR
from torchmetrics.functional import r2_score

#import plotly.express as px

import wandb

In [2]:
data_path = "/home/yang/kaggle/jane/data"

In [3]:
# for each training set, we take 20% of the data for validation
#frac_train = 0.8
train_raw_data_num = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
# a completely new dataset for testing
test_raw_data_num = "0"

In [4]:
train_feature_list = ["time_id", "symbol_id"] + [f"feature_{idx:02d}" for idx in range(79)]

In [5]:
num_features = len(train_feature_list)

In [6]:
sample_testing_data = pls.read_parquet(Path(data_path, "test.parquet", f"date_id=0", "part-0.parquet"))
num_sample_testing_data = len(sample_testing_data)

In [7]:
class TimeseriesDataset(Dataset):
    def __init__(self, df: pls.DataFrame):
        df = df.fill_null(0)
        self.features = torch.tensor(df.select([col for col in df.columns if col in train_feature_list]).to_numpy(), dtype=torch.float32)
        self.target = torch.tensor(df.select(pls.col("responder_6")).to_numpy(), dtype=torch.float32)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.target[idx]

In [8]:
class DataModule(pl.LightningDataModule):
    def __init__(self, file_path: str, batch_size: int = 32, shuffle: bool = True):
        super().__init__()
        self.file_path = file_path
        self.batch_size = batch_size
        self.shuffle = shuffle

    def train_dataloader(self):
        df = pls.read_parquet(self.file_path)  # Adjust for your file format (e.g., CSV, Parquet)
        dataset = TimeseriesDataset(df)
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=self.shuffle, num_workers=15, multiprocessing_context='fork')

In [9]:
class MLPRegressor(pl.LightningModule):
    def __init__(self, input_dim: int, hidden_dim: int = 128, lr: float = 1e-3):
        super().__init__()
        self.save_hyperparameters()
        self.lr = lr
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )
        self.training_step_outputs = []
        self.validation_step_outputs = []
        self.test_step_outputs = []
        self.criterion = nn.MSELoss()
        #self.criterion = r2_score

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = self.criterion(y_hat, y.squeeze())
        self.training_step_outputs.append(loss.item())
        self.log("train_loss", loss)
        return loss
    
    def on_train_epoch_end(self):
        epoch_average = torch.tensor(self.training_step_outputs).mean()
        self.log("training_epoch_average", epoch_average)
        self.training_step_outputs.clear()  # free memory

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = self.criterion(y_hat, y.squeeze())
        self.validation_step_outputs.append(loss.item())
        self.log("val_loss", loss, prog_bar=True)
        return {"val_loss": loss}

    def on_validation_epoch_end(self):
        avg_val_loss = torch.tensor(self.validation_step_outputs).mean()
        self.log("avg_val_loss", avg_val_loss)
        self.validation_step_outputs.clear()

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = self.criterion(y_hat, y.squeeze())
        self.test_step_outputs.append(loss.item())
        self.log("test_loss", loss)
        return {"test_loss": loss}
    
    def on_test_epoch_end(self):
        epoch_average = torch.tensor(self.test_step_outputs).mean()
        self.log("test_epoch_average", epoch_average)
        self.test_step_outputs.clear()  # free memory

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [10]:
# define hyperparameters
parameters = dict(
    epoch =  25,
    input_dim = num_features,
    hidden_dim = 256,
    batch_size = 15000,
    #dropout = 0.0,
    learning_rate = 1e-5,
    dataset = 'Jane street market data',
    architecture = 'MLP'
)

# initialize weights & biases service
mode = 'online'
#mode = 'disabled'
wandb.init(config=parameters, project='jane_street', entity='git-yang', mode=mode)
config = wandb.config
wandb_logger = WandbLogger(log_model="all")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mgit-yang[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [11]:
%%time
file_paths = [Path(data_path, "train.parquet", f"partition_id={i}", "part-0.parquet") for i in train_raw_data_num]

model = MLPRegressor(input_dim=config.input_dim, hidden_dim=config.hidden_dim, lr=config.learning_rate)

#model = MLPRegressor.load_from_checkpoint("model_init/mlp_hidden_64_checkpoint.ckpt")
#model = MLPRegressor.load_from_checkpoint("model_init/jane_mlp_hidden_32_epoch_30.ckpt")
wandb.watch(model)

for file_path in file_paths:
    print(f"Traing on dataset: {file_path}")

    # Initialize DataModule and model
    datamodule = DataModule(file_path, batch_size=config.batch_size)

    # Training using PyTorch Lightning
    trainer = pl.Trainer(max_epochs=config.epoch, accelerator="auto", devices="auto", logger=wandb_logger)

    # Train with dataframes sequentially
    trainer.fit(model, train_dataloaders=datamodule.train_dataloader())

trainer.save_checkpoint("model_checkpoint.ckpt")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Traing on dataset: /home/yang/kaggle/jane/data/train.parquet/partition_id=0/part-0.parquet


/home/yang/.pyenv/versions/3.11.10/envs/ml/lib/python3.11/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
You are using a CUDA device ('NVIDIA GeForce RTX 4070 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/yang/.pyenv/versions/3.11.10/envs/ml/lib/python3.11/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type       | Params | Mode 
---------------------------

Epoch 1:   0%|          | 0/130 [00:00<?, ?it/s, v_num=drhm]                

In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  self.pid = os.fork()


Epoch 8:   0%|          | 0/130 [00:00<?, ?it/s, v_num=drhm]          

In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  self.pid = os.fork()


Epoch 9:   0%|          | 0/130 [00:00<?, ?it/s, v_num=drhm]          

In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  self.pid = os.fork()


Epoch 24: 100%|██████████| 130/130 [00:02<00:00, 56.05it/s, v_num=drhm]      

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 130/130 [00:03<00:00, 38.21it/s, v_num=drhm]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Traing on dataset: /home/yang/kaggle/jane/data/train.parquet/partition_id=1/part-0.parquet


/home/yang/.pyenv/versions/3.11.10/envs/ml/lib/python3.11/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
/home/yang/.pyenv/versions/3.11.10/envs/ml/lib/python3.11/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory ./lightning_logs/d8utdrhm/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type       | Params | Mode 
-------------------------------------------------
0 | model     | Sequential | 87.0 K | train
1 | criterion | MSELoss    | 0      | train
-------------------------------------------------
87.0 K    Trainable params
0         Non-trainable params
87.0 K    Total params
0.348     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 24: 100%|██████████| 187/187 [00:04<00:00, 40.53it/s, v_num=drhm]      

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 187/187 [00:07<00:00, 24.52it/s, v_num=drhm]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Traing on dataset: /home/yang/kaggle/jane/data/train.parquet/partition_id=2/part-0.parquet


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type       | Params | Mode 
-------------------------------------------------
0 | model     | Sequential | 87.0 K | train
1 | criterion | MSELoss    | 0      | train
-------------------------------------------------
87.0 K    Trainable params
0         Non-trainable params
87.0 K    Total params
0.348     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 24: 100%|██████████| 203/203 [00:08<00:00, 24.46it/s, v_num=drhm]      

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 203/203 [00:09<00:00, 20.91it/s, v_num=drhm]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Traing on dataset: /home/yang/kaggle/jane/data/train.parquet/partition_id=3/part-0.parquet


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type       | Params | Mode 
-------------------------------------------------
0 | model     | Sequential | 87.0 K | train
1 | criterion | MSELoss    | 0      | train
-------------------------------------------------
87.0 K    Trainable params
0         Non-trainable params
87.0 K    Total params
0.348     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 24: 100%|██████████| 268/268 [00:10<00:00, 24.75it/s, v_num=drhm]

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 268/268 [00:11<00:00, 24.17it/s, v_num=drhm]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Traing on dataset: /home/yang/kaggle/jane/data/train.parquet/partition_id=4/part-0.parquet


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type       | Params | Mode 
-------------------------------------------------
0 | model     | Sequential | 87.0 K | train
1 | criterion | MSELoss    | 0      | train
-------------------------------------------------
87.0 K    Trainable params
0         Non-trainable params
87.0 K    Total params
0.348     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 24: 100%|██████████| 335/335 [00:12<00:00, 26.59it/s, v_num=drhm]      

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 335/335 [00:12<00:00, 26.04it/s, v_num=drhm]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Traing on dataset: /home/yang/kaggle/jane/data/train.parquet/partition_id=5/part-0.parquet


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type       | Params | Mode 
-------------------------------------------------
0 | model     | Sequential | 87.0 K | train
1 | criterion | MSELoss    | 0      | train
-------------------------------------------------
87.0 K    Trainable params
0         Non-trainable params
87.0 K    Total params
0.348     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 24: 100%|██████████| 357/357 [00:13<00:00, 26.88it/s, v_num=drhm]     

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 357/357 [00:13<00:00, 26.36it/s, v_num=drhm]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Traing on dataset: /home/yang/kaggle/jane/data/train.parquet/partition_id=6/part-0.parquet


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type       | Params | Mode 
-------------------------------------------------
0 | model     | Sequential | 87.0 K | train
1 | criterion | MSELoss    | 0      | train
-------------------------------------------------
87.0 K    Trainable params
0         Non-trainable params
87.0 K    Total params
0.348     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 24: 100%|██████████| 414/414 [00:13<00:00, 31.62it/s, v_num=drhm]    

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 414/414 [00:13<00:00, 31.00it/s, v_num=drhm]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Traing on dataset: /home/yang/kaggle/jane/data/train.parquet/partition_id=7/part-0.parquet


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type       | Params | Mode 
-------------------------------------------------
0 | model     | Sequential | 87.0 K | train
1 | criterion | MSELoss    | 0      | train
-------------------------------------------------
87.0 K    Trainable params
0         Non-trainable params
87.0 K    Total params
0.348     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 24: 100%|██████████| 423/423 [00:16<00:00, 26.19it/s, v_num=drhm]

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 423/423 [00:16<00:00, 25.78it/s, v_num=drhm]


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Traing on dataset: /home/yang/kaggle/jane/data/train.parquet/partition_id=8/part-0.parquet


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type       | Params | Mode 
-------------------------------------------------
0 | model     | Sequential | 87.0 K | train
1 | criterion | MSELoss    | 0      | train
-------------------------------------------------
87.0 K    Trainable params
0         Non-trainable params
87.0 K    Total params
0.348     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 24: 100%|██████████| 410/410 [00:15<00:00, 27.21it/s, v_num=drhm]

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 410/410 [00:15<00:00, 26.74it/s, v_num=drhm]


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Traing on dataset: /home/yang/kaggle/jane/data/train.parquet/partition_id=9/part-0.parquet


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type       | Params | Mode 
-------------------------------------------------
0 | model     | Sequential | 87.0 K | train
1 | criterion | MSELoss    | 0      | train
-------------------------------------------------
87.0 K    Trainable params
0         Non-trainable params
87.0 K    Total params
0.348     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 24: 100%|██████████| 419/419 [00:17<00:00, 23.38it/s, v_num=drhm]

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 419/419 [00:18<00:00, 23.04it/s, v_num=drhm]
CPU times: user 25min 59s, sys: 11min 39s, total: 37min 38s
Wall time: 53min 37s


### Evaluation

In [12]:
# Evaluation with testing dataset
def test_dataloader(df: pls.DataFrame, batch_size: int = 15000):
    dataset = TimeseriesDataset(df)
    return DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=15, multiprocessing_context='fork')

In [13]:
test_data = pls.read_parquet(Path(data_path, "train.parquet", f"partition_id={test_raw_data_num}", "part-0.parquet"))
data_loader = test_dataloader(test_data, batch_size=15000)

In [14]:
test_results = trainer.test(model, data_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 130/130 [00:03<00:00, 34.32it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   test_epoch_average       0.7978482246398926
        test_loss           0.7976667284965515
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [15]:
wandb.finish()

0,1
epoch,▂▆▂▆▇██▃▅▇▁▂▄▆▇▅▇█▂▂▆▆▁▂▇█▁▃▅▅▆▇▇▂▂▄▅▇▂▇
test_epoch_average,▁
test_loss,▁
train_loss,▃▃▃▄▄▅▆▆█████▅▆▅▆▄▄▄▄▄▄▃▄▄▃▄▃▂▂▁▃▃▄▁▁▁▂▁
trainer/global_step,▂▂▂▁▁▂▂▃▃▄▄▁▁▁▃▂▃▅▅▅▆▅▅▅▆▄▄▅▅█▂▃▇█▁▃▄▄▅▇
training_epoch_average,▃▃▃▃▃▄▄▄▄▅█▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▂▁▁▁▃▃▃▃▁▁▁

0,1
epoch,25.0
test_epoch_average,0.79785
test_loss,0.79767
train_loss,0.66026
trainer/global_step,10475.0
training_epoch_average,0.6517


## Evaluation using the given metric

In [16]:
def sample_weighted_zero_mean_r2(y_pred, y_truth, weight):
    """
    Zero-mean R-squared metrics.

    Args:
        y_pred: Array of predicted values.
        y_truth: Array of true values.
        weight: Array of sample weights.

    Returns:
        1-corr: Zero-mean R-squared.
    """

    # Ensure weights are valid
    weight = weight if weight is not None else np.ones_like(y_pred)
    
    corr = np.sum((weight * (y_truth - y_pred) ** 2)) / np.sum(weight * y_truth ** 2)
    
    return 1 - corr 

In [17]:
test_data_subset = test_data.select([col for col in test_data.columns if col in train_feature_list])
test_data_subset = test_data_subset.fill_null(0)

In [18]:
model.eval()
with torch.no_grad():
    y_pred = model(torch.tensor(test_data_subset.to_numpy(), dtype=torch.float32)).squeeze().numpy()


score = sample_weighted_zero_mean_r2(y_pred, test_data.select(pls.col("responder_6")).to_numpy()[:,0],
                                     test_data.select(pls.col("weight")).to_numpy()[:,0])
score

np.float32(-0.045132518)