In [2]:
import numpy as np
import pandas as pd 
import polars as pl
import gc
import os
import joblib 
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import kaggle_evaluation.jane_street_inference_server as inference_server

In [3]:
# Column the models are trained to predict
target = 'responder_6'

# Lower bound on date_id applied when the training parquet is read
# (rows from earlier days are filtered out at load time)
skip_dates = 1400

# Training data inside the Kaggle input directory
train_path = '/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet'

# The 79 feature columns: feature_00, feature_01, ..., feature_78
features_names = [f"feature_{i:02d}" for i in range(79)]

In [4]:
# ============================
# Reduce Memory Usage Function
# ============================
def reduce_memory_usage(df,float16_as32=False):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    Signed-integer columns are tried against int8, int16 and int32 (int64 is
    the fallback); float columns are tried against the float16 and float32
    ranges (float64 is the fallback). Columns of any other dtype kind are left
    untouched. The choice is based on the column's min/max only, so values
    that fit the float16 *range* are stored with float16 precision unless
    ``float16_as32`` is set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    float16_as32 : bool, default False
        When True, columns whose range fits float16 are stored as float32
        instead of float16.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting. Memory usage before and
        after is printed to stdout.
    """
    bytes_per_mb = 1024**2
    start_mem = df.memory_usage().sum()/bytes_per_mb
    print(f'df memory usage before reduce : {start_mem} MB')

    # Candidate dtypes, narrowest first, as (range to test against, dtype to cast to).
    int_ladder = [(np.iinfo(t), t) for t in (np.int8, np.int16, np.int32)]
    float_ladder = [
        (np.finfo(np.float16), np.float32 if float16_as32 else np.float16),
        (np.finfo(np.float32), np.float32),
    ]

    for name in df.columns:
        kind = df[name].dtype.kind
        if kind == 'i':
            ladder, widest = int_ladder, np.int64
        elif kind == 'f':
            ladder, widest = float_ladder, np.float64
        else:
            # Not a signed integer or float column: leave as is
            continue

        lo = df[name].min()
        hi = df[name].max()

        # First candidate whose range contains [lo, hi]; otherwise the widest type.
        chosen = widest
        for limits, dest in ladder:
            if lo >= limits.min and hi <= limits.max:
                chosen = dest
                break
        df[name] = df[name].astype(chosen)

    end_mem = df.memory_usage().sum()/bytes_per_mb
    print(f"Memory usage after optimization is: {end_mem:.2f} MB")
    print(f"Decreased by {(100 * (start_mem - end_mem) / start_mem):.1f}%")
    return df

In [5]:
# Read only the rows with date_id >= skip_dates, then downcast the numeric
# columns so the frame takes less memory.
date_filter = [('date_id','>=', skip_dates)]
df = reduce_memory_usage(pd.read_parquet(train_path, filters=date_filter))

df memory usage before reduce : 3711.112632751465 MB
Memory usage after optimization is: 1907.97 MB
Decreased by 48.6%


In [6]:
# Inclusive date_id ranges for the training and hold-out windows
START_TRAIN, END_TRAIN = 1500, 1599
VALID_START, VALID_END = 1650, 1698
N_FOLDS = 2

# ---------------------------------
#  Training rows, ordered by date_id
# ---------------------------------
train_df = df.loc[df["date_id"].between(START_TRAIN, END_TRAIN)].sort_values("date_id")

# ---------------------------------
#  Hold-out rows: features, target, sample weights
#  NOTE(review): X_valid / y_valid / w_valid are not referenced by the cells
#  visible in this notebook - confirm whether an evaluation step is missing.
# ---------------------------------
valid_df = df.loc[df["date_id"].between(VALID_START, VALID_END)]
X_valid, y_valid, w_valid = (
    valid_df[features_names],
    valid_df["responder_6"],
    valid_df["weight"],
)

# ---------------------------------
#  Split the training dates into N_FOLDS consecutive chunks
# ---------------------------------
all_dates = np.arange(START_TRAIN, END_TRAIN+1)
folds = np.array_split(all_dates, N_FOLDS)

In [7]:
# ============================
#  Model Dictionary
# ============================
# Each entry maps a display name to a zero-argument factory, so every call
# hands back a brand-new, unfitted estimator.
def _new_lightgbm():
    """Return a fresh LightGBM regressor (50 trees, learning rate 0.1)."""
    return LGBMRegressor(n_estimators=50, learning_rate=0.1)


def _new_xgboost():
    """Return a fresh XGBoost regressor (50 trees, learning rate 0.1)."""
    return XGBRegressor(n_estimators=50, learning_rate=0.1)


def _new_catboost():
    """Return a fresh CatBoost regressor (50 iterations, learning rate 0.1, silent)."""
    return CatBoostRegressor(iterations=50, learning_rate=0.1, verbose=False)


model_dict = {
    "LightGBM": _new_lightgbm,
    "XGBoost": _new_xgboost,
    "CatBoost": _new_catboost,
}

In [8]:
# ============================
#  Create model directory
# ============================
# Folder that receives the serialized models; an existing folder is reused.
os.makedirs(name="models", exist_ok=True)

In [9]:
# ===============================#
#     LOOP THROUGH FOLDS
# ===============================#
# Collected as (model_name, fold_number, fitted_model) tuples
models = []

for fold, train_dates in enumerate(folds, start=1):
    first_day, last_day = train_dates.min(), train_dates.max()
    print(f"--------------------- Fold {fold}/{N_FOLDS} -------------------")
    print(f"Train dates: from day {first_day} to {last_day} ({len(train_dates)} days)")
    print('-'*50)

    # Rows of the training frame whose date falls in this fold's chunk
    fold_df = train_df[train_df["date_id"].isin(train_dates)]
    X_train, y_train, w_train = (
        fold_df[features_names],
        fold_df["responder_6"],
        fold_df["weight"],
    )

    for model_name, make_model in model_dict.items():
        print(f'============== {model_name} with Fold {fold}/{N_FOLDS} =========')

        # The factory returns an unfitted estimator, so nothing carries over
        # from the previous fold.
        model = make_model()
        model.fit(X_train, y_train, sample_weight=w_train)

        # Persist to disk and keep the fitted object in memory as well
        joblib.dump(model, f"models/{model_name}_{fold}.model")
        models.append((model_name, fold, model))

    # Drop this fold's slices before building the next ones
    del X_train, y_train, w_train
    gc.collect()

--------------------- Fold 1/2 -------------------
Train dates: from day 1500 to 1549 (50 days)
--------------------------------------------------
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.823490 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19236
[LightGBM] [Info] Number of data points in the train set: 1862432, number of used features: 79
[LightGBM] [Info] Start training from score -0.011091
--------------------- Fold 2/2 -------------------
Train dates: from day 1550 to 1599 (50 days)
--------------------------------------------------
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.767759 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19234
[LightGBM] [Info] Number of data points in the train set: 1855656, number of used features: 79
[LightGBM] [Info] Start training from score 0.000357


In [10]:
# Empty the in-memory list and refill it from the files written during
# training, in the same (fold, model) order.
models.clear()
for fold in range(1, len(folds) + 1):
    for model_name in model_dict:
        model = joblib.load(f"models/{model_name}_{fold}.model")
        models.append((model_name, fold, model))

In [11]:
# Show the reloaded (model_name, fold, model) tuples as the cell output.
models

[('LightGBM', 1, LGBMRegressor(n_estimators=50)),
 ('XGBoost',
  1,
  XGBRegressor(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=0.1, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=None, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=50, n_jobs=None,
               num_parallel_tree=None, random_state=None, ...)),
 ('CatBoost', 1, <catboost.core.CatBoostRegressor at 0x7fcc92be7c10>),
 ('LightGBM', 2, LGBMRegressor(n_estimators=50)),
 ('XGBoost',
  2,
  XGBRegressor(base_score

In [12]:
# ========================================
# Prediction Using the Ensemble of Models
# ========================================

# Most recent non-null `lags` frame handed to predict(); kept at module level
# so it stays available between calls.
lags_: pl.DataFrame | None = None

def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame:
    """Predict ``responder_6`` for one batch by averaging every trained model.

    Parameters
    ----------
    test : pl.DataFrame
        Batch to score. Must contain ``row_id`` and every column listed in
        ``features_names``.
    lags : pl.DataFrame | None
        Lagged data for the batch. When not None it is stored in the global
        ``lags_``; it is not used for the prediction itself.

    Returns
    -------
    pl.DataFrame
        Two columns, ``row_id`` and ``responder_6`` (float32), one row per
        row of ``test``.

    Raises
    ------
    RuntimeError
        If the global ``models`` list is empty.
    """
    global lags_
    if lags is not None:
        lags_ = lags

    # `models` stores (model_name, fold, model) tuples; pull out the estimator.
    # Bare estimators are accepted too, in case the list was built differently.
    estimators = [
        entry[-1] if isinstance(entry, tuple) else entry
        for entry in models
    ]
    if not estimators:
        raise RuntimeError("predict() called with no models loaded")

    # Convert features to NumPy for model prediction (the config cell names
    # the column list `features_names`).
    feat = test.select(features_names).to_numpy()

    # Ensemble prediction (unweighted average over all models)
    pred = np.mean([estimator.predict(feat) for estimator in estimators], axis=0)

    # Create Polars DataFrame for submission
    predictions = pl.DataFrame({
        'row_id': test['row_id'],
        'responder_6': pred.astype(np.float32)
    })

    # Assertions for safety
    assert isinstance(predictions, pl.DataFrame)
    assert list(predictions.columns) == ['row_id', 'responder_6']
    assert len(predictions) == len(test)

    return predictions


In [None]:
# `inference_server` is the imported *module*; the evaluation API is driven
# through a JSInferenceServer instance that wraps the predict() callback.
js_server = inference_server.JSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    js_server.serve()
else:
    # Local dry run. test.parquet / lags.parquet are read from the same
    # dataset folder that train.parquet was loaded from, so the path cannot
    # drift from `train_path`.
    data_dir = os.path.dirname(train_path)
    js_server.run_local_gateway(
        (
            os.path.join(data_dir, 'test.parquet'),
            os.path.join(data_dir, 'lags.parquet'),
        )
    )

In [None]:
# ===============================
# Submission
# ===============================
# NOTE(review): the preceding cell also launches the inference server; keep
# only one launch cell so the server is not started twice in a single run.

# `inference_server` is the imported *module*; the evaluation API is driven
# through a JSInferenceServer instance that wraps the predict() callback.
js_server = inference_server.JSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Scoring rerun on Kaggle: hand control to the evaluation gateway
    js_server.serve()

elif os.getenv('KAGGLE_IS_COMPETITION_ACTIVE'):
    # NOTE(review): confirm that the environment actually sets
    # KAGGLE_IS_COMPETITION_ACTIVE; otherwise this branch is never taken.
    # Local dry run against the files that sit next to train.parquet.
    data_dir = os.path.dirname(train_path)
    js_server.run_local_gateway(
        (
            os.path.join(data_dir, 'test.parquet'),
            os.path.join(data_dir, 'lags.parquet'),
        )
    )

else:
    # Neither variable is set: score the sample partition directly
    test_df = pl.read_parquet('/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet/date_id=0/part-0.parquet')
    lags_df = pl.read_parquet('/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet/date_id=0/part-0.parquet')

    # Use the previously defined predict() function
    predictions = predict(test_df, lags_df)
    print(predictions)