### Imports and Set Flag

In [1]:
import os
import pandas as pd
import polars as pl

### Environment Setup ###
# IS_KAGGLE selects which branch of the path setup and model loading below is
# used: True -> Kaggle input paths, False -> local project-relative paths.
IS_KAGGLE = False  # Flag to switch between environments

### Environment-Dependent Path Setup

In [2]:
# Choose data/model locations for the active environment
if not IS_KAGGLE:
    # Local run: everything is resolved from the current working directory
    import sys

    # Local test data is read from "local_test_data" under the working directory
    data_path = os.path.join(os.getcwd(), "local_test_data")

    # The project root is taken to be the parent of the working directory
    PROJ_DIR = os.path.dirname(os.getcwd())

    # Extend the import search path with two project folders
    # (presumably where kaggle_evaluation and metrics live -- verify locally)
    sys.path.append(os.path.join(PROJ_DIR, "jane-street-real-time-market-data-forecasting"))
    sys.path.append(os.path.join(PROJ_DIR, "training", "src", "utils"))

    # Metric used by the local scoring step; only imported for local runs
    from metrics import r2_score_weighted
else:
    # Kaggle run: inputs are read from fixed /kaggle/input locations
    data_path = '/kaggle/input/jane-street-real-time-market-data-forecasting'
    trained_models_path = "/kaggle/input/janestreet-models/trained_models/"

# Imported after the path setup so the local sys.path additions are in effect
import kaggle_evaluation.jane_street_inference_server

### Load model and Pre-processing

In [3]:
import lightgbm

In [4]:
# Known model file names (set model_name below to one of these)
# My models
# "val_r2_0.0122524.joblib"
# "backup_best_valr2.joblib"

# Benchmark Models
# LGBMV1_1.joblib

# File name of the model to load; setup_prediction() combines it with the
# environment-specific model directory.
model_name = "val_r2_0.0122524.joblib"

In [5]:
def setup_prediction():
    """Load the trained LightGBM model selected by ``model_name``.

    The model directory depends on the environment: ``trained_models_path``
    when ``IS_KAGGLE`` is true, otherwise ``<PROJ_DIR>/training/trained_models``.

    Returns:
        lightgbm.Booster: Booster constructed from the resolved model file.
    """
    if IS_KAGGLE:
        models_dir = trained_models_path
    else:
        models_dir = os.path.join(PROJ_DIR, "training", "trained_models")

    # os.path.join (rather than string concatenation) keeps the path valid
    # whether or not the directory string ends with a separator.
    model_path = os.path.join(models_dir, model_name)

    # Load model directly as a Booster.
    # NOTE(review): the file is read through Booster(model_file=...) even though
    # its name ends in ".joblib" -- confirm it was written with save_model().
    return lightgbm.Booster(model_file=model_path)

# Load the model once at module level so predict() can reuse it on every call
model = setup_prediction()

### Submit

In [6]:
# TODO: Float value in predictions?

# Global variables
# Most recent non-None `lags` frame passed to predict(); None until one arrives
lags_ : pl.DataFrame | None = None

def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Predict ``responder_6`` for every row of ``test``.

    Args:
        test: Batch to score. Must contain ``row_id``, ``symbol_id``, ``weight``
            and ``feature_00`` through ``feature_78``.
        lags: Optional lag data. When not None it is stored in the module-level
            ``lags_``; this function does not otherwise use it.

    Returns:
        A polars DataFrame with columns ``row_id`` and ``responder_6``, one row
        per row of ``test``.
    """
    global lags_

    # Keep the latest lags around; batches without lags leave the stored value alone
    if lags is not None:
        lags_ = lags

    # Model inputs: symbol_id, weight, then feature_00 .. feature_78 (in that order)
    feature_names = ['symbol_id', 'weight']
    feature_names.extend(f'feature_{idx:02d}' for idx in range(79))

    # Run the globally loaded model on the feature matrix
    feature_matrix = test.select(feature_names).to_numpy()
    raw_output = model.predict(feature_matrix)

    # Attach the model output to the row ids
    responder = pl.Series(raw_output, dtype=pl.Float64).alias('responder_6')
    predictions = test.select('row_id').with_columns([responder])

    # Contract checks on the returned frame:
    # it must be a DataFrame ...
    assert isinstance(predictions, pl.DataFrame | pd.DataFrame)
    # ... with exactly the columns 'row_id' and 'responder_6' ...
    assert predictions.columns == ['row_id', 'responder_6']
    # ... and one row per row of the test data.
    assert len(predictions) == len(test)

    return predictions

In [7]:
# Set up inference server
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

# Run based on environment
if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        (
            os.path.join(data_path, "test.parquet"),
            os.path.join(data_path, "lags.parquet"),
        )
    )

    # Local scoring. Only done when IS_KAGGLE is False: r2_score_weighted is
    # imported solely in the local branch of the path setup, so scoring with
    # IS_KAGGLE=True (and no competition rerun) would raise a NameError.
    if not IS_KAGGLE:
        print("Scoring submission...")
        predictions = pl.read_parquet("submission.parquet")                      # Read submission predictions
        test_data = pl.read_parquet(os.path.join(data_path, "test.parquet"))     # Read test data with actual values

        # Score only rows marked for scoring. The same boolean mask is applied to
        # both frames, so rows are matched by position: this relies on
        # submission.parquet having the same row order as test.parquet.
        mask = test_data['is_scored']
        scored_test = test_data.filter(mask)
        scored_predictions = predictions.filter(mask)

        # Argument order: actual values, predicted values, row weights
        score = r2_score_weighted(
            scored_test['responder_6'].to_numpy(),
            scored_predictions['responder_6'].to_numpy(),
            scored_test['weight'].to_numpy(),
        )
        print(f"Overall R² score: {score}")

I0000 00:00:1729837785.434293 1149040 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache


Scoring submission...
Overall R² score: 0.00619853601475151
