## Example notebook submission
This notebook is a successful submission. It was submitted and scored without any issue.

In [1]:
import os
import numpy as np # linear algebra
import pandas as pd
import polars as pls
import lightgbm as lgb

from pathlib import Path

import pytorch_lightning as pl
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

#import kaggle_evaluation.jane_street_inference_server

#### Check GPU availability

In [None]:
# Assuming your model is already defined as `model`
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

#### Load model

In [2]:
# model uploaded via kaggle UI
#path_to_model_lgb = "/kaggle/input/test_lgbm_null_to_0/other/default/1/jane_lgbm.txt"
path_to_model_lgb = "../mono_model/model_init/jane_lgbm_baseline.txt"
#path_to_model_nn = "/kaggle/input/test_lgbm_null_to_0/other/default/1/jane_nn.ckpt"
path_to_model_nn = "../hybrid_model/model_init/jane_gru_hidden_64_layer_2_rmse.ckpt"

In [3]:
class GRURegressor(pl.LightningModule):
    def __init__(self, input_dim: int, hidden_dim: int = 128, num_layers: int = 2, lr: float = 1e-3):
        super().__init__()
        self.save_hyperparameters()
        self.lr = lr
        # Define GRU layer
        self.gru = nn.GRU(input_dim, hidden_dim, num_layers, batch_first=True)

        # Define a fully connected layer to map GRU outputs to a single value
        self.fc = nn.Linear(hidden_dim, 1)

        self.training_step_outputs = []
        self.validation_step_outputs = []
        self.test_step_outputs = []
        self.criterion = nn.MSELoss()
        #self.criterion = r2_score

    def forward(self, x):
        # Forward pass through GRU
        #_, hidden = self.gru(x)  # hidden is the last hidden state
        outputs, _ = self.gru(x)
        
        # Pass the last hidden state through the fully connected layer
        #output = self.fc(hidden[-1])
        output = self.fc(outputs)
        return output

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = self.criterion(y_hat, y.squeeze())
        self.training_step_outputs.append(loss.item())
        self.log("train_loss", loss)
        return loss
    
    def on_train_epoch_end(self):
        epoch_average = torch.tensor(self.training_step_outputs).mean()
        self.log("training_epoch_average", epoch_average)
        self.training_step_outputs.clear()  # free memory

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = self.criterion(y_hat, y.squeeze())
        self.validation_step_outputs.append(loss.item())
        self.log("val_loss", loss, prog_bar=True)
        return {"val_loss": loss}

    def on_validation_epoch_end(self):
        avg_val_loss = torch.tensor(self.validation_step_outputs).mean()
        self.log("avg_val_loss", avg_val_loss)
        self.validation_step_outputs.clear()

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x).squeeze()
        loss = self.criterion(y_hat, y.squeeze())
        self.test_step_outputs.append(loss.item())
        self.log("test_loss", loss)
        return {"test_loss": loss}
    
    def on_test_epoch_end(self):
        epoch_average = torch.tensor(self.test_step_outputs).mean()
        self.log("test_epoch_average", epoch_average)
        self.test_step_outputs.clear()  # free memory

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

#### Load models

In [5]:
# load saved model to make predictions
model_lgb = lgb.Booster(model_file=path_to_model_lgb)
model_nn = GRURegressor.load_from_checkpoint(path_to_model_nn).to(device)

In [6]:
model_nn.eval()

GRURegressor(
  (gru): GRU(81, 64, num_layers=2, batch_first=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (criterion): MSELoss()
)

#### Create data loader for inference

In [7]:
train_feature_list = ["time_id", "symbol_id"] + [f"feature_{idx:02d}" for idx in range(79)]

In [8]:
class TimeseriesDataset(Dataset):
    def __init__(self, df: pls.DataFrame):
        df = df.fill_null(0)
        self.features = torch.tensor(df.select([col for col in df.columns if col in train_feature_list]).to_numpy(), dtype=torch.float32)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx]

In [9]:
# Evaluation with testing dataset
def test_dataloader(df: pls.DataFrame, batch_size: int = 10000):
    dataset = TimeseriesDataset(df)
    return DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=15, multiprocessing_context='fork')

In [10]:
def predict(test: pls.DataFrame, lags: pls.DataFrame | None) -> pls.DataFrame | pd.DataFrame:
    """Make a prediction."""
    # All the responders from the previous day are passed in at time_id == 0. We save them in a global variable for access at every time_id.
    # Use them as extra features, if you like.
    global lags_
    if lags is not None:
        lags_ = lags
    
    # Inference with LGBM
    y_pred_lgb = model_lgb.predict(test.select([col for col in train_feature_list]))

    # Inference with NN
    nn_predictions = []

    # Use dataloader to load all batches
    data_loader = test_dataloader(test, batch_size=10000)

    with torch.no_grad():
        for batch in data_loader:
            y_pred = model_nn(batch.to(device)).squeeze()
            nn_predictions.append(y_pred)

    y_pred_nn = torch.cat(nn_predictions, dim=0).cpu().numpy()

    # Combine the predictions from the two models
    predictions = test.select(pls.col("row_id"))
    y_pred = (y_pred_lgb + y_pred_nn) / 2
    predictions = predictions.with_columns(pls.Series("responder_6", y_pred))

    if isinstance(predictions, pls.DataFrame):
        assert predictions.columns == ['row_id', 'responder_6']
    elif isinstance(predictions, pd.DataFrame):
        assert (predictions.columns == ['row_id', 'responder_6']).all()
    else:
        raise TypeError('The predict function must return a DataFrame')
    # Confirm has as many rows as the test data.
    assert len(predictions) == len(test)

    return predictions

In [11]:
# Verify the pipeline using the example test data
data_path = "/home/yang/kaggle/jane/data"
# load example test data
test_data = pls.read_parquet(Path(data_path, "test.parquet", "date_id=0", "part-0.parquet"))

predictions = predict(test_data, None)
predictions

In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  self.pid = os.fork()
In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  self.pid = os.fork()


row_id,responder_6
i64,f64
0,0.017081
1,0.013513
2,0.013513
3,0.013513
4,0.013513
…,…
34,0.013513
35,0.013513
36,0.013513
37,0.013513


In [None]:
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-realtime-marketdata-forecasting/test.parquet',
            '/kaggle/input/jane-street-realtime-marketdata-forecasting/lags.parquet',
        )
    )