## Example notebook submission
This notebook is a successful submission. It was submitted and scored without any issue.

In [None]:
import os
import numpy as np # linear algebra
import pandas as pd
import polars as pls
import lightgbm as lgb

from pathlib import Path

import pytorch_lightning as pl
import torch
from torch import nn

#import kaggle_evaluation.jane_street_inference_server

In [7]:
# Locations of the two serialized models.
# When the models are uploaded via the Kaggle UI, the corresponding paths are:
#   LightGBM: "/kaggle/input/test_lgbm_null_to_0/other/default/1/jane_lgbm.txt"
#   MLP:      "/kaggle/input/test_lgbm_null_to_0/other/default/1/jane_nn.ckpt"
path_to_model_nn = "./hybrid_model/model_init/jane_mlp_hidden_32_rmse.ckpt"
path_to_model_lgb = "./mono_model/model_init/jane_lgbm_baseline.txt"

In [4]:
class MLPRegressor(pl.LightningModule):
    """Fully connected regressor trained with mean-squared-error loss.

    The network maps ``input_dim`` features through three ReLU-activated
    hidden layers of widths ``hidden_dim``, ``2 * hidden_dim`` and
    ``hidden_dim`` to a single output value.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the first and third hidden layers.
        lr: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, input_dim: int, hidden_dim: int = 128, lr: float = 1e-3):
        super().__init__()
        # Record the constructor arguments so a checkpoint can rebuild the model.
        self.save_hyperparameters()
        self.lr = lr

        wide_dim = hidden_dim * 2
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, wide_dim),
            nn.ReLU(),
            nn.Linear(wide_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )

        # Per-step losses, collected so each epoch-end hook can log their mean.
        self.training_step_outputs = []
        self.validation_step_outputs = []
        self.test_step_outputs = []
        self.criterion = nn.MSELoss()

    def forward(self, x):
        """Run ``x`` through the sequential network."""
        return self.model(x)

    def _batch_loss(self, batch):
        """Return the criterion value for one ``(inputs, targets)`` batch."""
        inputs, targets = batch
        outputs = self(inputs).squeeze()
        return self.criterion(outputs, targets.squeeze())

    def _log_mean_and_reset(self, name, step_losses):
        """Log the mean of ``step_losses`` under ``name``, then empty the list."""
        self.log(name, torch.tensor(step_losses).mean())
        step_losses.clear()  # free memory

    def training_step(self, batch, batch_idx):
        """Compute, record and log the training loss for one batch."""
        loss = self._batch_loss(batch)
        self.training_step_outputs.append(loss.item())
        self.log("train_loss", loss)
        return loss

    def on_train_epoch_end(self):
        """Log the mean training loss of the finished epoch."""
        self._log_mean_and_reset("training_epoch_average", self.training_step_outputs)

    def validation_step(self, batch, batch_idx):
        """Compute, record and log the validation loss for one batch."""
        loss = self._batch_loss(batch)
        self.validation_step_outputs.append(loss.item())
        self.log("val_loss", loss, prog_bar=True)
        return {"val_loss": loss}

    def on_validation_epoch_end(self):
        """Log the mean validation loss of the finished epoch."""
        self._log_mean_and_reset("avg_val_loss", self.validation_step_outputs)

    def test_step(self, batch, batch_idx):
        """Compute, record and log the test loss for one batch."""
        loss = self._batch_loss(batch)
        self.test_step_outputs.append(loss.item())
        self.log("test_loss", loss)
        return {"test_loss": loss}

    def on_test_epoch_end(self):
        """Log the mean test loss of the finished epoch."""
        self._log_mean_and_reset("test_epoch_average", self.test_step_outputs)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters, using ``self.lr``."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [5]:
# Model input columns, in order: the two id columns followed by
# feature_00 .. feature_78 (81 columns in total).
train_feature_list = [
    "time_id",
    "symbol_id",
    *(f"feature_{feature_no:02d}" for feature_no in range(79)),
]

In [8]:
# Load the saved models used by predict().
model_lgb = lgb.Booster(model_file=path_to_model_lgb)
# predict() builds CPU tensors and calls .numpy() on the network output, so the
# network must live on the CPU. Pin the load location explicitly so that a
# checkpoint saved from a GPU run is not restored onto the GPU.
model_nn = MLPRegressor.load_from_checkpoint(path_to_model_nn, map_location="cpu")

In [9]:
# Put the network into evaluation mode before it is used for inference.
model_nn.eval()

MLPRegressor(
  (model): Sequential(
    (0): Linear(in_features=81, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): ReLU()
    (6): Linear(in_features=32, out_features=1, bias=True)
  )
  (criterion): MSELoss()
)

In [17]:
def predict(test: pls.DataFrame, lags: pls.DataFrame | None) -> pls.DataFrame | pd.DataFrame:
    """Predict ``responder_6`` for one batch by averaging the two models.

    Args:
        test: Rows to score. Must contain ``row_id`` and every column named in
            ``train_feature_list``.
        lags: All the responders from the previous day, passed in at
            ``time_id == 0``; ``None`` otherwise. A non-``None`` value is
            stored in the global ``lags_`` for access at every ``time_id``.

    Returns:
        A polars DataFrame with the columns ``row_id`` and ``responder_6`` and
        one row per row of ``test``.

    Raises:
        AssertionError: If the result has unexpected columns or row count.
        TypeError: If the result is not a DataFrame.
    """
    global lags_
    if lags is not None:
        lags_ = lags

    # Select the model inputs once and share them between both models.
    features = test.select(train_feature_list)

    # LightGBM receives the features with nulls left in place.
    y_pred_lgb = model_lgb.predict(features)

    # The network receives the same features with nulls replaced by 0.
    nn_input = torch.tensor(features.fill_null(0).to_numpy(), dtype=torch.float32)
    with torch.no_grad():
        # squeeze(-1) drops only the trailing output dimension, so a one-row
        # batch still yields a 1-D array instead of a 0-d scalar.
        y_pred_nn = model_nn(nn_input).squeeze(-1).numpy()

    y_pred = (y_pred_lgb + y_pred_nn) / 2
    predictions = test.select(pls.col("row_id")).with_columns(
        pls.Series("responder_6", y_pred)
    )

    if isinstance(predictions, pls.DataFrame):
        assert predictions.columns == ['row_id', 'responder_6']
    elif isinstance(predictions, pd.DataFrame):
        assert (predictions.columns == ['row_id', 'responder_6']).all()
    else:
        raise TypeError('The predict function must return a DataFrame')
    # Confirm has as many rows as the test data.
    assert len(predictions) == len(test)

    return predictions

In [18]:
# Smoke-test predict() on the example test partition.
data_path = "/home/yang/kaggle/jane/data"
# Read the example parquet part from under data_path.
test_data = pls.read_parquet(
    Path(data_path).joinpath("test.parquet", "date_id=0", "part-0.parquet")
)

# No lags are supplied for this check.
predictions = predict(test_data, None)
predictions



row_id,responder_6
i64,f64
0,-0.021293
1,-0.021293
2,-0.021293
3,-0.021293
4,-0.021293
…,…
34,-0.021293
35,-0.021293
36,-0.021293
37,-0.021293


In [None]:
# This cell refers to `kaggle_evaluation`, so the module must be imported before
# use; without the import the next line fails with a NameError.
import kaggle_evaluation.jane_street_inference_server

# Wrap predict() in the competition's inference server.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # KAGGLE_IS_COMPETITION_RERUN is set: serve predictions.
    inference_server.serve()
else:
    # Otherwise run the local gateway against the given test and lags files.
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-realtime-marketdata-forecasting/test.parquet',
            '/kaggle/input/jane-street-realtime-marketdata-forecasting/lags.parquet',
        )
    )