### Imports and Set Flag

In [25]:
import os
import pandas as pd
import polars as pl

from typing import List


### Environment Setup ###
IS_KAGGLE = False  # Flag to switch between environments

### Environment Dependent Path Setup

In [26]:
# Setup paths based on environment
if IS_KAGGLE:
    data_path = '/kaggle/input/jane-street-real-time-market-data-forecasting'
    trained_models_path = "/kaggle/input/janestreet-models/trained_models/"
else:
    # For local testing
    import sys

    # Path to main project directory
    PROJ_DIR = os.path.dirname(os.getcwd())
    
    # Add paths to system path
    sys.path.append(os.path.join(PROJ_DIR, "jane-street-real-time-market-data-forecasting"))
    sys.path.append(os.path.join(PROJ_DIR, "training", "src", "utils"))
    
    # Set data path to our local test data
    data_path = os.path.join(os.getcwd(), "local_test_data")
    
    # Import local testing metrics
    from metrics import r2_score_weighted

import kaggle_evaluation.jane_street_inference_server

### Pre-processing

In [27]:
def preprocess_features(df: pl.DataFrame) -> pl.DataFrame:
    """
    Preprocess features consistent with training:
    1. Create null flags for features with nulls
    2. Fill nulls with zeros (starting point)
    """
    feature_cols = [f'feature_{i:02d}' for i in range(79)]
    
    # Create expressions for null flags and filled features
    expressions = []
    
    for col in feature_cols:
        # Create null flag for each feature
        expressions.append(
            pl.col(col).is_null().cast(pl.Int8).alias(f'{col}_is_null')
        )
        # Fill nulls with 0
        expressions.append(
            pl.col(col).fill_null(0).alias(col)
        )
    
    # Add all other columns and apply transformations
    return df.with_columns(expressions)

### Load model

In [28]:
import lightgbm

In [29]:
# My models
# "val_r2_0.0122524.joblib"
# "backup_best_valr2.joblib"

# Benchmark Models
# LGBMV1_1.joblib

model_name = "backup_best_valr2.joblib"

In [30]:
def setup_prediction():
    """Load and setup model and preprocessing"""
    if IS_KAGGLE:
        model_path = trained_models_path + model_name
    else:
        model_path = os.path.join(PROJ_DIR, "training", "trained_models", model_name)
    
    # Load model directly as a Booster
    model = lightgbm.Booster(model_file=model_path)
    
    return model

# Load model globally
model = setup_prediction()

### Predict API

In [31]:
# TODO: FLoat value in predictions?

# Global variables
lags_ : pl.DataFrame | None = None

def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Make a prediction."""
    global lags_
    
    if lags is not None:
        lags_ = lags

    # Preprocess features
    test = preprocess_features(test)
    
    # Get predictions from model using all required features including null flags
    feature_cols = ['symbol_id', 'weight'] + \
                  [f'feature_{i:02d}' for i in range(79)] + \
                  [f'feature_{i:02d}_is_null' for i in range(79)]
                  
    test_data = test.select(feature_cols).to_numpy()
    model_predictions = model.predict(test_data)
    
    # Create prediction DataFrame
    predictions = test.select(
        'row_id'
    ).with_columns([
        pl.Series(model_predictions, dtype=pl.Float64).alias('responder_6')
    ])
    
    assert isinstance(predictions, pl.DataFrame | pd.DataFrame) # The predict function must return a DataFrame
    assert predictions.columns == ['row_id', 'responder_6']     # with columns 'row_id', 'responer_6'
    assert len(predictions) == len(test)                        # and as many rows as the test data.

    return predictions

### Submit

In [32]:
# Set up inference server
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

# Run based on environment
if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        (
            os.path.join(data_path, "test.parquet"),
            os.path.join(data_path, "lags.parquet"),
        )
    )
    
    # Local scoring
    print("Scoring submission...")
    predictions = pl.read_parquet("submission.parquet")                      # Read submission predictions
    test_data = pl.read_parquet(os.path.join(data_path, "test.parquet"))     # Read test data with actual values

    # Score only rows marked for scoring
    mask = test_data['is_scored']
    score = r2_score_weighted(
        test_data.filter(mask)['responder_6'].to_numpy(),
        predictions.filter(mask)['responder_6'].to_numpy(),
        test_data.filter(mask)['weight'].to_numpy()
    )
    print(f"Overall R² score: {score}")

[LightGBM] [Fatal] The number of features in data (160) is not the same as it was in training data (122).
You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.


GatewayRuntimeError: (<GatewayRuntimeErrorType.SERVER_RAISED_EXCEPTION: 3>, 'The number of features in data (160) is not the same as it was in training data (122).\\nYou can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.", grpc_status:2, created_time:"2024-10-26T12:45:13.390118529+11:00"}')