In [1]:
import numpy as np
import pandas as pd 
import polars as pl
import gc
import os
import joblib 
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping
from catboost import CatBoostRegressor
import kaggle_evaluation.jane_street_inference_server
import warnings
warnings.filterwarnings('ignore')

In [2]:
# ============================
# Reduce Memory Usage Function
# ============================
def reduce_memory_usage(df,float16_as32=False):
    start_mem = df.memory_usage().sum()/1024**2
    print(f'df memory usage before reduce : {start_mem} MB')
    for col in df.columns:
        col_type = df[col].dtype
        
        # Skip non-numeric columns
        if col_type.kind not in ['i','f']:
            continue
        
        c_min = df[col].min()
        c_max = df[col].max()

        # Integer types
        if col_type.kind in ['i']:
            if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            else:
                df[col] = df[col].astype(np.int64)

        # Float types
        else:
            if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float32 if float16_as32 else np.float16)
            elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        
    end_mem = df.memory_usage().sum()/1024**2
    print(f"Memory usage after optimization is: {end_mem:.2f} MB")
    print(f"Decreased by {(100 * (start_mem - end_mem) / start_mem):.1f}%")
    return df

In [3]:
# -------------------------------------------
# Custom Weighted Zero-Mean R² for Lgb Model
# ------------------------------------------
def weighted_zero_mean_r2_lgb(y_true, y_pred, sample_weight):
    y_true_zm = y_true - np.average(y_true, weights=sample_weight)
    y_pred_zm = y_pred - np.average(y_pred, weights=sample_weight)

    numerator = np.sum(sample_weight * (y_true_zm - y_pred_zm) ** 2)
    denominator = np.sum(sample_weight * (y_true_zm) ** 2)

    r2 = 1 - numerator / (denominator + 1e-38)
    return "weighted_zero_mean_r2", r2, True   # maximize=True
# -------------------------------------------
# Custom Weighted Zero-Mean R² for Xgb Model
# -------------------------------------------
def weighted_zero_mean_r2_xgb(y_true, y_pred, sample_weight):
    y_true_zm = y_true - np.average(y_true, weights=sample_weight)
    y_pred_zm = y_pred - np.average(y_pred, weights=sample_weight)
    
    numerator = np.sum(sample_weight * (y_true_zm - y_pred_zm)**2)
    denominator = np.sum(sample_weight * (y_true_zm)**2)
    
    r2 = 1 - numerator / (denominator + 1e-38)
    return r2        

In [4]:
# ============================
#  Model Dictionary
# ============================
model_dict = {
    "LightGBM": lambda:LGBMRegressor(
    n_estimators=3000,
    learning_rate=0.01,
    num_leaves=50,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    max_bin=128,
    device="gpu"
    ),

    "XGBoost": lambda: XGBRegressor(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="reg:squarederror",
    device="cuda",
    tree_method="gpu_hist",
    max_bin=128,
    random_state=42,
    eval_metric=weighted_zero_mean_r2_xgb,
    disable_default_eval_metric=True
    ),
}

In [6]:
# use the Kaggle input directory
train_path = '/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet'

# Responders_Columns
features_cols = [f"feature_{i:02d}"for i in range(79)]

# Define the target 
target = 'responder_6'

#  Create models directory 
os.makedirs("models", exist_ok=True)

# Preapare Valid_df

In [7]:
# prepare valid_df
skip_dates= 1499  # I will use last 200 days for validation
valid_df = pd.read_parquet(train_path, filters=[('date_id','>=', skip_dates)])
valid_df = reduce_memory_usage(valid_df)

# X,y,w 
X_valid = valid_df[features_cols + ['time_id']]
y_valid = (valid_df[target]+ 0.5 * valid_df['responder_7']+ .5 * valid_df['responder_8'])
w_valid =  valid_df["weight"]

df memory usage before reduce : 2510.1318740844727 MB
Memory usage after optimization is: 1290.52 MB
Decreased by 48.6%


# Prepare train data & Train models

In [8]:
START_TRAIN = 1200
END_TRAIN   = 1399
folds = 2

models = []

for i in range(folds):
    print(f'Load train data and apply reduce memory function on Fold {i+1}')
   # load train data
    train_df = pd.read_parquet(
        train_path,
        filters=[[('date_id', '>=', START_TRAIN),
                  ('date_id', '<=', END_TRAIN)]]
    )
    train_df = reduce_memory_usage(train_df)

    # X,y,w
    X_train = train_df[features_cols + ['time_id']]
    y_train = (
        train_df[target]
        + 0.5 * train_df['responder_7']
        + 0.5 * train_df['responder_8']
    )
    w_train = train_df["weight"]
    
    print(f"\n================ Fold {i+1}/{folds} ================")
    print(f"Train dates: from day {train_df['date_id'].min()} to {train_df['date_id'].max()} ({train_df['date_id'].nunique()} days)")

    # Train and evulate models
    for model_name, model_class in model_dict.items():

        print(f'\n============== {model_name} | Fold {i+1} =========')

        model = model_class()

        if model_name == "LightGBM":

            model.fit(
                X_train, y_train,
                sample_weight=w_train,
                eval_set=[(X_valid, y_valid)],
                eval_sample_weight=[w_valid],
                eval_metric=weighted_zero_mean_r2_lgb,
                callbacks=[lgb.early_stopping(100)]
            )

            print("Best iteration:", model.best_iteration_)
            print(
                "Best score:",
                model.best_score_['valid_0']['weighted_zero_mean_r2']
            )

        else:  # XGBoost

            model.fit(
                X_train, y_train,
                sample_weight=w_train,
                eval_set=[(X_valid, y_valid)],
                sample_weight_eval_set=[w_valid],
                callbacks=[EarlyStopping(rounds=100, maximize=True, save_best=True)],
                verbose=20
            )

            print(f"Best iteration: {model.best_iteration}")
            print(f"Best CV score: {model.best_score}\n")

        joblib.dump(model, f"models/{model_name}_{i+1}.model")
        models.append((model_name, i+1, model))

        del model
        gc.collect()

    del train_df, X_train, y_train, w_train
    gc.collect()

    START_TRAIN += 200
    END_TRAIN   += 200


df memory usage before reduce : 2494.772392272949 MB
Memory usage after optimization is: 1282.62 MB
Decreased by 48.6%

Train dates: from day 1200 to 1399 (7389712 days)
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 9913
[LightGBM] [Info] Number of data points in the train set: 7389712, number of used features: 80
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 80 dense feature groups (563.79 MB) transferred to GPU in 0.587026 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.006386
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[376]	valid_0's l2: 1.49738	valid_0's weighted_zero_mean_r2: 0.0107017
Best iteration: 376
Best score: 0.010701737129045652
[0]	validation_0-weighted_zero_mean_r2_xgb:0.00012
[20]	validation_0-weighted_zero_mean_r2_xgb:0.00234
[40]	validation_0-weighted_zero_mean_r2_xgb:0.00408
[60]	validation_0-weighted_zero_mean_r2_xgb:0.00547
[80]	validation_0-weighted_zero_mean_r2_xgb:0.00655
[100]	validation_0-weighted_zero_mean_r2_xgb:0.00741
[120]	validation_0-weighted_zero_mean_r2_xgb:0.00805
[140]	validation_0-weighted_zero_mean_r2_xgb:0.00858
[160]	validation_0-weighted_zero_mean_r2_xgb:0.00907
[180]	validation_0-weighted_zero_mean_r2_xgb