In [1]:
import numpy as np
import pandas as pd 
import polars as pl
import gc
import os
import joblib 
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping
from catboost import CatBoostRegressor
import kaggle_evaluation.jane_street_inference_server
import warnings
warnings.filterwarnings('ignore')

In [None]:
'''
#add axurailty target 
#use weight and basis to track the model 
'''

In [2]:
# use the Kaggle input directory
train_path = '/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet'

# Responders_Columns
features_cols = [f"feature_{i:02d}"for i in range(79)]

# Define the target 
target = 'responder_6'

# Skip_dates
skip_dates = 1200

In [3]:
# ============================
# Reduce Memory Usage Function
# ============================
def reduce_memory_usage(df,float16_as32=False):
    start_mem = df.memory_usage().sum()/1024**2
    print(f'df memory usage before reduce : {start_mem} MB')
    for col in df.columns:
        col_type = df[col].dtype
        
        # Skip non-numeric columns
        if col_type.kind not in ['i','f']:
            continue
        
        c_min = df[col].min()
        c_max = df[col].max()

        # Integer types
        if col_type.kind in ['i']:
            if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            else:
                df[col] = df[col].astype(np.int64)

        # Float types
        else:
            if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float32 if float16_as32 else np.float16)
            elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        
    end_mem = df.memory_usage().sum()/1024**2
    print(f"Memory usage after optimization is: {end_mem:.2f} MB")
    print(f"Decreased by {(100 * (start_mem - end_mem) / start_mem):.1f}%")
    return df

In [4]:
df = pd.read_parquet(train_path, filters=[('date_id','>=', skip_dates)])
df = reduce_memory_usage(df)

df memory usage before reduce : 6205.884635925293 MB
Memory usage after optimization is: 3190.60 MB
Decreased by 48.6%


In [5]:
START_TRAIN = 1200
END_TRAIN   = 1599
VALID_START = 1600
VALID_END   = 1698
N_FOLDS     = 2

# ===============================#
#     LOAD DATA (Train + Final Valid)
# ===============================#
train_df = ( df[df["date_id"].between(START_TRAIN, END_TRAIN)].sort_values("date_id"))

valid_df = df[df["date_id"].between(VALID_START, VALID_END)]
X_valid = valid_df[features_cols + ['time_id']]
y_valid = valid_df[target]
w_valid =  valid_df["weight"]
# ===============================#
#     CREATE FOLDS FROM DATES
# ===============================#
all_dates = np.arange(START_TRAIN, END_TRAIN+1)
folds = np.array_split(all_dates, N_FOLDS)   

In [6]:
del df 
gc.collect()

0

In [7]:
# -------------------------------------------
# Custom Weighted Zero-Mean R² for Lgb Model
# ------------------------------------------
def weighted_zero_mean_r2_lgb(y_true, y_pred, sample_weight):
    y_true_zm = y_true - np.average(y_true, weights=sample_weight)
    y_pred_zm = y_pred - np.average(y_pred, weights=sample_weight)

    numerator = np.sum(sample_weight * (y_true_zm - y_pred_zm) ** 2)
    denominator = np.sum(sample_weight * (y_true_zm) ** 2)

    r2 = 1 - numerator / (denominator + 1e-38)
    return "weighted_zero_mean_r2", r2, True   # maximize=True
# -------------------------------------------
# Custom Weighted Zero-Mean R² for Xgb Model
# -------------------------------------------
def weighted_zero_mean_r2_xgb(y_true, y_pred, sample_weight):
    y_true_zm = y_true - np.average(y_true, weights=sample_weight)
    y_pred_zm = y_pred - np.average(y_pred, weights=sample_weight)
    
    numerator = np.sum(sample_weight * (y_true_zm - y_pred_zm)**2)
    denominator = np.sum(sample_weight * (y_true_zm)**2)
    
    r2 = 1 - numerator / (denominator + 1e-38)
    return r2        

In [8]:
# ============================
#  Model Dictionary
# ============================
model_dict = {
    "LightGBM": lambda:LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.01,
    num_leaves=50,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    max_bin=128,
    device="gpu"
    ),

    "XGBoost": lambda: XGBRegressor(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="reg:squarederror",
    device="cuda",
    tree_method="gpu_hist",
    max_bin=128,
    random_state=42,
    eval_metric=weighted_zero_mean_r2_xgb,
    disable_default_eval_metric=True
    ),
}

In [9]:
# ==========================
#  Create models directory 
# ==========================
os.makedirs("models", exist_ok=True)

In [10]:
# ===============================#
#     LOOP THROUGH FOLDS
# ===============================#
models = []

for fold, train_dates in enumerate(folds, start=1):
    print(f"--------------------- Fold {fold}/{N_FOLDS} -------------------")
    print(f"Train dates: from day {train_dates.min()} to {train_dates.max()} ({len(train_dates)} days)")
    print('-'*50)
    
    fold_df = train_df[train_df["date_id"].isin(train_dates)]
    X_train = fold_df[features_cols + ['time_id']]
    y_train = fold_df[target]
    w_train = fold_df["weight"]

    
    for model_name, model_class in model_dict.items():
        
        if model_name =="LightGBM":
            print(f'============== {model_name} with Fold {fold}/{N_FOLDS} =========')
            # create NEW model object for THIS fold
            model = model_class()
            model.fit(X_train,y_train,
            sample_weight=w_train,
            eval_set=[(X_valid, y_valid)],
            eval_sample_weight=[w_valid],
            eval_metric=weighted_zero_mean_r2_lgb,
            callbacks=[lgb.early_stopping(stopping_rounds=100)])
        
        else:
            print(f'============== {model_name} with Fold {fold}/{N_FOLDS} =========')
            # create NEW model object for THIS fold
            model = model_class()
            model.fit(
            X_train,
            y_train,
            sample_weight=w_train,
            eval_set=[(X_valid, y_valid)],
            sample_weight_eval_set=[w_valid],
            verbose=20,
            callbacks=[EarlyStopping(rounds=100, maximize=True, save_best=True)],)

        
        # Save model
        joblib.dump(model, f"models/{model_name}_{fold}.model")
        models.append((model_name, fold, model))
        
        # Delete model to free memory
        del model
        gc.collect()

    del X_train, y_train, w_train
    gc.collect()

--------------------- Fold 1/2 -------------------
Train dates: from day 1200 to 1399 (200 days)
--------------------------------------------------
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 9952
[LightGBM] [Info] Number of data points in the train set: 7389712, number of used features: 81
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 81 dense feature groups (591.98 MB) transferred to GPU in 0.575752 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.000885
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[534]	valid_0's l2: 0.622352	valid_0's weighted_zero_mean_r2: 0.00528614
[0]	validation_0-weighted_zero_mean_r2_xgb:0.00007
[20]	validation_0-weighted_zero_mean_r2_xgb:0.00132
[40]	validation_0-weighted_zero_mean_r2_xgb:0.00220
[60]	validation_0-weighted_zero_mean_r2_xgb:0.00286
[80]	validation_0-weighted_zero_mean_r2_xgb:0.00338
[100]	validation_0-weighted_zero_mean_r2_xgb:0.00379
[120]	validation_0-weighted_zero_mean_r2_xgb:0.00411
[140]	validation_0-weighted_zero_mean_r2_xgb:0.00435
[160]	validation_0-weighted_zero_mean_r2_xgb:0.00456
[180]	validation_0-weighted_zero_mean_r2_xgb:0.00471
[200]	validation_0-weighted_zero_mean_r2_x

In [None]:
# ======================
# Load Pretrained model 
# ======================
'''
models_path ='/kaggle/input/models/scikitlearn/default/1/models'
for file in os.listdir(models_path):
    if file.endswith(".model"):
        model = joblib.load(os.path.join(models_path, file))
        models.append(model)
'''

In [None]:
# ========================================
# Prediction Using the Ensemble of Models
# ========================================

lags_: pl.DataFrame | None = None

def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame:
    """Make a prediction using the ensemble of models."""
    global lags_
    if lags is not None:
        lags_ = lags

    # Convert features to NumPy for model prediction
    feat = test.select(features_names).to_numpy()
    
    # Ensemble prediction (average over all models)
    pred = np.mean([model.predict(feat) for model in models], axis=0)
    
    # Create Polars DataFrame for submission
    predictions = pl.DataFrame({
        'row_id': test['row_id'],
        'responder_6': pred.astype(np.float32)
    })
    
    # Assertions for safety
    assert isinstance(predictions, pl.DataFrame)
    assert list(predictions.columns) == ['row_id', 'responder_6']
    assert len(predictions) == len(test)

    return predictions


In [None]:
# ===============
# Submission
# ===============
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # If the competition is currently running on Kaggle
    inference_server.serve()

elif os.getenv('KAGGLE_IS_COMPETITION_ACTIVE'):
    # If the competition is still active
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-realtime-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-realtime-market-data-forecasting/lags.parquet',
        )
    )

else:
    # After the competition has ended
    test_df = pl.read_parquet('/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet/date_id=0/part-0.parquet')
    lags_df = pl.read_parquet('/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet/date_id=0/part-0.parquet')
    
    predictions = predict(test_df, lags_df)
    print(predictions)