### Jane Street Real-Time Market Data Forecasting with MLP

Link to the competition: https://www.kaggle.com/competitions/jane-street-real-time-market-data-forecasting/overview

In [1]:
import os
from pathlib import Path

import numpy as np
import polars as pls

import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
import torch
from torch.utils.data import DataLoader, Dataset, IterableDataset, ConcatDataset
from torch import nn
from torch.optim.lr_scheduler import StepLR
from torchmetrics.functional import r2_score

import plotly.express as px

import wandb

In [2]:
# Root directory of the competition data. Set the JANE_STREET_DATA_DIR environment
# variable to point somewhere else; the previous hard-coded location is the fallback,
# so an environment without the variable behaves exactly as before.
data_path = os.environ.get("JANE_STREET_DATA_DIR", "/home/yang/kaggle/jane/data")

In [3]:
# Fraction of each training partition meant for fitting; the remaining 20% is
# intended for validation.
frac_train = 0.8

# Held-out partition: it is not in the training list and is only used for evaluation.
test_raw_data_num = "7"

# Training partitions, as the string ids used in the parquet directory names.
# Partition "3" is in neither set.
train_raw_data_num = [str(partition) for partition in (0, 1, 2, 4, 5, 6, 8, 9)]

In [4]:
# Model input columns: the two identifier columns, then feature_00 ... feature_78.
train_feature_list = ["time_id", "symbol_id"]
train_feature_list += [f"feature_{idx:02d}" for idx in range(79)]

In [5]:
# Number of model input columns (one per entry of train_feature_list); used as the
# network's input_dim in the hyperparameter cell.
num_features = len(train_feature_list)

In [6]:
# Read the date_id=0 partition of test.parquet and record how many rows it has.
sample_test_file = Path(data_path) / "test.parquet" / "date_id=0" / "part-0.parquet"
sample_testing_data = pls.read_parquet(sample_test_file)
num_sample_testing_data = sample_testing_data.shape[0]

In [7]:
class TimeseriesDataset(Dataset):
    """Map-style dataset yielding one ``(features, target)`` pair per frame row.

    The feature and target columns are converted to float32 tensors once, at
    construction time. The previous implementation sliced the DataFrame, filled
    nulls and re-selected the columns on every ``__getitem__`` call, which is
    very slow per row and runs polars inside DataLoader worker processes
    (polars' documentation warns that fork-based multiprocessing can deadlock).
    With the conversion done up front, workers only index plain tensors.

    Trade-off: the tensors are held in memory in addition to ``self.data``.

    Args:
        df: Frame with a ``responder_6`` column plus any of the columns named in
            ``train_feature_list``. Null values are replaced with 0.
    """

    def __init__(self, df: pls.DataFrame):
        # Reference only (no copy); kept so code reading `.data` keeps working.
        self.data = df
        # Same selection rule as before: frame column order, restricted to known features.
        feature_columns = [col for col in df.columns if col in train_feature_list]
        self._features = torch.tensor(
            df.select(feature_columns).fill_null(0).to_numpy(), dtype=torch.float32
        )
        self._targets = torch.tensor(
            df.select(pls.col("responder_6")).fill_null(0).to_numpy(), dtype=torch.float32
        )

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return ``(features, target)`` for the integer row index ``idx``.

        Shapes are ``(1, n_features)`` and ``(1, 1)``: the leading axis of
        length 1 mirrors the one-row frame slice the earlier code converted, so
        batches keep the same layout. No state is written to ``self``.
        """
        return self._features[idx].unsqueeze(0), self._targets[idx].unsqueeze(0)

In [8]:
class MultiFileDataModule(pl.LightningDataModule):
    """Serves several parquet files as one concatenated training dataset.

    Args:
        file_paths: Parquet files to load; each one becomes a ``TimeseriesDataset``.
        batch_size: Rows per training batch.
        num_workers: DataLoader worker processes. Defaults to 15, the value that
            used to be hard-coded.
        shuffle: Whether the DataLoader reshuffles rows each epoch. Defaults to
            ``False``, i.e. rows are served in file order as before.
    """

    def __init__(self, file_paths: list, batch_size: int = 32, num_workers: int = 15, shuffle: bool = False):
        super().__init__()
        self.file_paths = file_paths
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.shuffle = shuffle
        # Built on the first train_dataloader() call and then reused, so the
        # parquet files are not re-read every time a loader is requested.
        # Changes made to file_paths after that first call are not picked up.
        self._train_dataset = None

    def _get_train_dataset(self):
        """Read every file once and return the cached concatenated dataset."""
        if self._train_dataset is None:
            self._train_dataset = ConcatDataset(
                [TimeseriesDataset(pls.read_parquet(file_path)) for file_path in self.file_paths]
            )
        return self._train_dataset

    def train_dataloader(self):
        """Return a DataLoader over the rows of all files."""
        return DataLoader(
            self._get_train_dataset(),
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=self.shuffle,
        )

    # NOTE: val_dataloader / test_dataloader / predict_dataloader are not
    # implemented, so this module only provides training data.

In [9]:
class MLPRegressor(pl.LightningModule):
    """ReLU MLP with four linear layers that regresses one scalar per sample.

    Layer widths are ``input_dim -> hidden_dim -> 2*hidden_dim -> hidden_dim -> 1``
    and the loss is mean squared error.

    Logged metrics: ``train_loss`` / ``val_loss`` / ``test_loss`` per step, and
    ``training_epoch_average`` / ``avg_val_loss`` / ``test_epoch_average`` as the
    unweighted mean of the per-batch losses of an epoch.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Base hidden width.
        lr: Learning rate passed to Adam.
    """

    def __init__(self, input_dim: int, hidden_dim: int = 128, lr: float = 1e-3):
        super().__init__()
        self.save_hyperparameters()
        self.lr = lr
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim*2),
            nn.ReLU(),
            nn.Linear(hidden_dim*2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )
        # Per-batch losses (Python floats) of the epoch currently running.
        self.training_step_outputs = []
        self.validation_step_outputs = []
        self.test_step_outputs = []
        self.criterion = nn.MSELoss()

    def forward(self, x):
        return self.model(x)

    def _compute_loss(self, batch):
        """Return the MSE between the flattened predictions and targets of ``batch``."""
        x, y = batch
        # reshape(-1) always yields a 1-D tensor, including for a batch of one
        # sample, where squeeze() would collapse everything to a 0-D scalar.
        y_hat = self(x).reshape(-1)
        return self.criterion(y_hat, y.reshape(-1))

    def _log_epoch_average(self, name, step_losses):
        """Log the mean of ``step_losses`` under ``name`` and empty the list.

        Nothing is logged when no batch was processed; averaging an empty list
        would otherwise record NaN.
        """
        if step_losses:
            self.log(name, torch.tensor(step_losses).mean())
        step_losses.clear()  # free memory

    def training_step(self, batch, batch_idx):
        loss = self._compute_loss(batch)
        self.training_step_outputs.append(loss.item())
        self.log("train_loss", loss)
        return loss

    def on_train_epoch_end(self):
        self._log_epoch_average("training_epoch_average", self.training_step_outputs)

    def validation_step(self, batch, batch_idx):
        loss = self._compute_loss(batch)
        self.validation_step_outputs.append(loss.item())
        self.log("val_loss", loss, prog_bar=True)
        return {"val_loss": loss}

    def on_validation_epoch_end(self):
        self._log_epoch_average("avg_val_loss", self.validation_step_outputs)

    def test_step(self, batch, batch_idx):
        loss = self._compute_loss(batch)
        self.test_step_outputs.append(loss.item())
        self.log("test_loss", loss)
        return {"test_loss": loss}

    def on_test_epoch_end(self):
        self._log_epoch_average("test_epoch_average", self.test_step_outputs)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [10]:
# Weights & Biases run setup.

# Hyperparameters and run metadata stored in the W&B run config.
parameters = {
    "epoch": 2,
    "input_dim": num_features,
    "hidden_dim": 32,
    "batch_size": 10000,
    # "dropout": 0.0,
    # NOTE(review): 0.1 is far above MLPRegressor's default lr of 1e-3 - confirm this is intended.
    "learning_rate": 0.1,
    "dataset": 'Jane street market data',
    "architecture": 'MLP',
}

# Start the run. mode is 'disabled' here; use 'online' to send the run to the W&B service.
mode = 'disabled'
wandb.init(config=parameters, project='jane_street', entity='git-yang', mode=mode)
config = wandb.config
wandb_logger = WandbLogger(log_model="all")

In [11]:
# Training
# One parquet file per training partition.
train_root = Path(data_path, "train.parquet")
file_paths = [train_root / f"partition_id={i}" / "part-0.parquet" for i in train_raw_data_num]

# Data module and model, both configured from the W&B config.
datamodule = MultiFileDataModule(file_paths, batch_size=config.batch_size)
model = MLPRegressor(
    input_dim=config.input_dim,
    hidden_dim=config.hidden_dim,
    lr=config.learning_rate,
)

wandb.watch(model)

# Fit with PyTorch Lightning, then write the final weights to a checkpoint file.
trainer = pl.Trainer(
    max_epochs=config.epoch,
    accelerator="auto",
    devices="auto",
    logger=wandb_logger,
)
train_loader = datamodule.train_dataloader()
trainer.fit(model, train_dataloaders=train_loader)

trainer.save_checkpoint("model_checkpoint.ckpt")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/yang/.pyenv/versions/3.11.10/envs/ml/lib/python3.11/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
You are using a CUDA device ('NVIDIA GeForce RTX 4070 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/yang/.pyenv/versions/3.11.10/envs/ml/lib/python3.11/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`

Epoch 0:   0%|          | 0/3678 [00:00<?, ?it/s] 


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

### Evaluation

In [None]:
# Evaluation with testing dataset
def test_dataloader(df: pls.DataFrame, batch_size: int = 10000, num_workers: int = 15):
    """Build an unshuffled DataLoader over ``df`` for evaluation.

    Args:
        df: Frame to wrap in a ``TimeseriesDataset``.
        batch_size: Rows per batch.
        num_workers: Worker processes; ``0`` loads in the calling process.
            Defaults to 15, the value that used to be hard-coded.

    Returns:
        A DataLoader yielding ``(features, target)`` batches in row order.
    """
    dataset = TimeseriesDataset(df)
    # DataLoader raises if a multiprocessing_context is given while num_workers
    # is 0, so only request 'fork' when worker processes are actually used.
    mp_context = 'fork' if num_workers > 0 else None
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        multiprocessing_context=mp_context,
    )

In [None]:
# Load the held-out partition and wrap it in an evaluation DataLoader.
test_partition_file = Path(data_path) / "train.parquet" / f"partition_id={test_raw_data_num}" / "part-0.parquet"
test_data = pls.read_parquet(test_partition_file)
data_loader = test_dataloader(test_data, batch_size=10000)

In [None]:
# Run the Lightning test loop for the model over the held-out DataLoader and keep
# whatever trainer.test returns in test_results.
test_results = trainer.test(model, data_loader)

In [None]:
# Close the W&B run that was opened with wandb.init in the hyperparameter cell.
wandb.finish()

### Evaluation using the competition metric (sample-weighted zero-mean R²)

In [None]:
def sample_weighted_zero_mean_r2(y_pred, y_truth, weight=None):
    """
    Sample-weighted zero-mean R-squared.

    Computes ``1 - sum(w * (y_truth - y_pred)**2) / sum(w * y_truth**2)``.

    Inputs are converted to flat float64 arrays first. This accepts plain
    lists, avoids low-precision accumulation for float32 inputs, and prevents
    the silent ``(N,)`` vs ``(N, 1)`` broadcast to an ``(N, N)`` matrix that
    would otherwise produce a wrong score without any error.

    Args:
        y_pred: Predicted values (array-like).
        y_truth: True values (array-like) with as many elements as ``y_pred``.
        weight: Per-sample weights (array-like), or ``None`` for uniform weights.

    Returns:
        The score as a NumPy float. It is not finite when the weighted sum of
        squares of ``y_truth`` is zero (plain NumPy division semantics).

    Raises:
        ValueError: If ``y_pred`` and ``y_truth`` hold a different number of elements.
    """
    y_pred = np.asarray(y_pred, dtype=np.float64).ravel()
    y_truth = np.asarray(y_truth, dtype=np.float64).ravel()
    if y_pred.shape != y_truth.shape:
        raise ValueError(
            f"y_pred and y_truth must have the same number of elements, "
            f"got {y_pred.size} and {y_truth.size}"
        )

    if weight is None:
        weight = np.ones_like(y_truth)
    else:
        weight = np.asarray(weight, dtype=np.float64).ravel()

    residual_ss = np.sum(weight * (y_truth - y_pred) ** 2)
    total_ss = np.sum(weight * y_truth ** 2)

    return 1 - residual_ss / total_ss

In [None]:
# Score the model on the held-out partition with the sample-weighted zero-mean R2.
# Inputs are prepared like the training data: the known feature columns in frame
# order, with nulls replaced by 0. Built in one expression so re-running the cell
# always starts from test_data.
test_data_subset = test_data.select(
    [col for col in test_data.columns if col in train_feature_list]
).fill_null(0)
test_inputs = torch.tensor(test_data_subset.to_numpy(), dtype=torch.float32)

# Predict chunk by chunk on whichever device the model currently lives on, and
# bring each result back to the CPU before converting to NumPy. This works for a
# model on CPU or on an accelerator and bounds peak memory for large partitions.
eval_chunk_rows = 100_000
model.eval()
with torch.no_grad():
    y_pred = torch.cat(
        [
            model(chunk.to(model.device)).reshape(-1).cpu()
            for chunk in test_inputs.split(eval_chunk_rows)
        ]
    ).numpy()

score = sample_weighted_zero_mean_r2(
    y_pred,
    test_data.select(pls.col("responder_6")).to_numpy()[:, 0],
    test_data.select(pls.col("weight")).to_numpy()[:, 0],
)
score