In [1]:
import numpy as np
import pandas as pd 
import polars as pl
import gc
import os
import joblib 
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping
from catboost import CatBoostRegressor
import kaggle_evaluation.jane_street_inference_server
import warnings
warnings.filterwarnings('ignore')

In [2]:
# ============================
# Reduce Memory Usage Function
# ============================
def reduce_memory_usage(df,float16_as32=False):
    start_mem = df.memory_usage().sum()/1024**2
    print(f'df memory usage before reduce : {start_mem} MB')
    for col in df.columns:
        col_type = df[col].dtype
        
        # Skip non-numeric columns
        if col_type.kind not in ['i','f']:
            continue
        
        c_min = df[col].min()
        c_max = df[col].max()

        # Integer types
        if col_type.kind in ['i']:
            if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            else:
                df[col] = df[col].astype(np.int64)

        # Float types
        else:
            if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float32 if float16_as32 else np.float16)
            elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        
    end_mem = df.memory_usage().sum()/1024**2
    print(f"Memory usage after optimization is: {end_mem:.2f} MB")
    print(f"Decreased by {(100 * (start_mem - end_mem) / start_mem):.1f}%")
    return df

In [3]:
# -------------------------------------------
# Custom Weighted Zero-Mean R² for Lgb Model
# ------------------------------------------
def weighted_zero_mean_r2_lgb(y_true, y_pred, sample_weight):
    y_true_zm = y_true - np.average(y_true, weights=sample_weight)
    y_pred_zm = y_pred - np.average(y_pred, weights=sample_weight)

    numerator = np.sum(sample_weight * (y_true_zm - y_pred_zm) ** 2)
    denominator = np.sum(sample_weight * (y_true_zm) ** 2)

    r2 = 1 - numerator / (denominator + 1e-38)
    return "weighted_zero_mean_r2", r2, True   # maximize=True
# -------------------------------------------
# Custom Weighted Zero-Mean R² for Xgb Model
# -------------------------------------------
def weighted_zero_mean_r2_xgb(y_true, y_pred, sample_weight):
    y_true_zm = y_true - np.average(y_true, weights=sample_weight)
    y_pred_zm = y_pred - np.average(y_pred, weights=sample_weight)
    
    numerator = np.sum(sample_weight * (y_true_zm - y_pred_zm)**2)
    denominator = np.sum(sample_weight * (y_true_zm)**2)
    
    r2 = 1 - numerator / (denominator + 1e-38)
    return r2        

In [19]:
# ============================
#  Model Dictionary
# ============================
model_dict = {
    "LightGBM": lambda:LGBMRegressor(
    n_estimators=3000,          # زيادة عدد الأشجار لتحسين الدقة على التعلم البطيء
    learning_rate=0.003,        # تقليل LR لتحسين الثبات ونتائج أفضل
    num_leaves=128,              # عدد أكبر من العقد لزيادة المرونة، مع الحذر من overfitting
    max_depth=10,               # لتجنب النمو الزائد للشجرة
    min_child_samples=20,       # الحد الأدنى لعدد العينات في الورقة لتقليل overfitting
    subsample=0.8,              # لتقليل التباين (bagging)
    colsample_bytree=0.8,       # لتقليل التباين عن طريق عمود العينات
    reg_alpha=0.1,              # L1 regularization
    reg_lambda=0.2,             # L2 regularization
    random_state=42,
    max_bin=255,                # زيادة bins قد تحسن الدقة على float16
    device="gpu"
    ),

    "XGBoost": lambda: XGBRegressor(
    n_estimators=3000,          # عدد الأشجار لتحسين التعلم البطيء
    learning_rate=0.003,        # تقليل LR
    max_depth=8,                # لتجنب overfitting
    min_child_weight=10,        # لتجنب overfitting
    subsample=0.8,              # bagging
    colsample_bytree=0.8,       # column sampling
    gamma=0.1,                  # regularization على الورقة
    reg_alpha=0.1,              # L1 regularization
    reg_lambda=0.2,             # L2 regularization
    objective="reg:squarederror",
    tree_method="gpu_hist",      # GPU fast training
    eval_metric=weighted_zero_mean_r2_xgb,
    random_state=42
    ),
}

In [5]:
# use the Kaggle input directory
train_path = '/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet'

# Responders_Columns
features_cols = [f"feature_{i:02d}"for i in range(79)]

# Define the target 
target = 'responder_6'

#  Create models directory 
os.makedirs("models", exist_ok=True)

# Preapare Valid_df

In [6]:
# prepare valid_df
skip_dates= 1499  # I will use last 200 days for validation
valid_df = pd.read_parquet(train_path, filters=[('date_id','>=', skip_dates)])
valid_df = reduce_memory_usage(valid_df)

# X,y,w 
X_valid = valid_df[features_cols + ['time_id']]
y_valid = valid_df[target]
w_valid = valid_df["weight"]

df memory usage before reduce : 2510.1318740844727 MB
Memory usage after optimization is: 1290.52 MB
Decreased by 48.6%


# Prepare train data & Tracking Train models with mlflow

In [10]:
%%capture
!pip install wandb -q
import wandb
wandb.login(key="your _wandb_api ")

In [23]:
# =====================================================
# W&B Setup for Kaggle (Clean & Correct)
# =====================================================

import os
import gc
import wandb
import pandas as pd
import joblib

# Create directories
os.makedirs("models", exist_ok=True)

# =========================
# Config
# =========================
START_TRAIN = 1099
END_TRAIN   = 1299
folds = 2

models = []

# =========================================================
# Training Loop
# =========================================================
for i in range(folds):

    print(f'Load train data and apply reduce memory function on Fold {i+1}')

    # ===== Load Train Data =====
    train_df = pd.read_parquet(
        train_path,
        filters=[[('date_id', '>=', START_TRAIN), ('date_id', '<', END_TRAIN)]]
    )
    train_df = reduce_memory_usage(train_df)

    X_train = train_df[features_cols + ['time_id']]
    y_train = (
        train_df[target]
        + 0.5 * train_df['responder_7']
        + 0.5 * train_df['responder_8']
    )
    w_train = train_df["weight"]

    print(f"\n================ Fold {i+1}/{folds} ================")
    print(
        f"Train dates: from day {train_df['date_id'].min()} "
        f"to {train_df['date_id'].max()} "
        f"({train_df['date_id'].nunique()} days)"
    )

    # =====================================================
    # Track X features
    # =====================================================
    x_features_path = f"X_features_fold_{i+1}.txt"
    with open(x_features_path, "w") as f:
        for col in X_train.columns:
            f.write(col + "\n")

    Y_TARGET_EXPR = "target + 0.5*responder_7 + 0.5*responder_8"

    # =====================================================
    # Train Models
    # =====================================================
    for model_name, model_class in model_dict.items():

        run_name = f"{model_name}_Fold_{i+1}_suggest2"

        model = model_class()

        # ===== W&B Run =====
        wandb_run = wandb.init(
            project="JS_Kaggle_Experiments1",
            entity="mohamedzakariaafouda-mansoura-university",
            name=run_name,
            reinit=True,
            group=f"window_{START_TRAIN}_{END_TRAIN}",
            tags=[model_name,"suggest"],
            config={
                "model": model_name,
                "fold": i + 1,
                "n_features": X_train.shape[1],
                "train_window": f"{START_TRAIN}_{END_TRAIN}",
                "y_target_expr": Y_TARGET_EXPR,
                **model.get_params()
            }
        )

        print(f'\n============== {model_name} | Fold {i+1} ==============')

        # ===== Train =====
        if model_name == "LightGBM":

            model.fit(
                X_train, y_train,
                sample_weight=w_train,
                eval_set=[(X_valid, y_valid)],
                eval_sample_weight=[w_valid],
                eval_metric=weighted_zero_mean_r2_lgb,
                callbacks=[lgb.early_stopping(100)]
            )

            best_iter = model.best_iteration_
            best_score = model.best_score_['valid_0']['weighted_zero_mean_r2']

        else:  # XGBoost

            model.fit(
                X_train, y_train,
                sample_weight=w_train,
                eval_set=[(X_valid, y_valid)],
                sample_weight_eval_set=[w_valid],
                callbacks=[EarlyStopping(
                    rounds=100,
                    maximize=True,
                    save_best=True
                )],
                verbose=20
            )

            best_iter = model.best_iteration
            best_score = model.best_score

        print(f"Best iteration: {best_iter}")
        print(f"Best score: {best_score}")

        # ===== Log metrics =====
        wandb.log({
            "best_iteration": best_iter,
            "best_score": best_score
        })

        # ===== Save model & artifacts =====
        model_file = f"models/{model_name}_Fold_{i+1}.model"
        joblib.dump(model, model_file)

        wandb.save(model_file)
        wandb.save(x_features_path)

        models.append((model_name, i + 1, model))

        wandb_run.finish()

        del model
        gc.collect()

    del train_df, X_train, y_train, w_train
    gc.collect()

    if folds > 1:
        START_TRAIN += 200
        END_TRAIN   += 200


Load train data and apply reduce memory function on Fold 1
df memory usage before reduce : 2522.8769760131836 MB
Memory usage after optimization is: 1297.07 MB
Decreased by 48.6%

Train dates: from day 1099 to 1298 (200 days)



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 19634
[LightGBM] [Info] Number of data points in the train set: 7472960, number of used features: 80
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 80 dense feature groups (570.14 MB) transferred to GPU in 0.543681 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.006229
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[237]	valid_0's l2: 0.595514	valid_0's weighted_zero_mean_r2: 0.00433453
Best iteration: 237
Best score: 0.004334531552220078


0,1
best_iteration,▁
best_score,▁

0,1
best_iteration,237.0
best_score,0.00433



[0]	validation_0-rmse:0.77335	validation_0-weighted_zero_mean_r2_xgb:0.00006
[20]	validation_0-rmse:0.77296	validation_0-weighted_zero_mean_r2_xgb:0.00108
[40]	validation_0-rmse:0.77264	validation_0-weighted_zero_mean_r2_xgb:0.00191
[60]	validation_0-rmse:0.77241	validation_0-weighted_zero_mean_r2_xgb:0.00251
[80]	validation_0-rmse:0.77221	validation_0-weighted_zero_mean_r2_xgb:0.00302
[100]	validation_0-rmse:0.77208	validation_0-weighted_zero_mean_r2_xgb:0.00335
[120]	validation_0-rmse:0.77200	validation_0-weighted_zero_mean_r2_xgb:0.00354
[140]	validation_0-rmse:0.77195	validation_0-weighted_zero_mean_r2_xgb:0.00368
[160]	validation_0-rmse:0.77190	validation_0-weighted_zero_mean_r2_xgb:0.00379
[180]	validation_0-rmse:0.77190	validation_0-weighted_zero_mean_r2_xgb:0.00379
[200]	validation_0-rmse:0.77192	validation_0-weighted_zero_mean_r2_xgb:0.00376
[220]	validation_0-rmse:0.77192	validation_0-weighted_zero_mean_r2_xgb:0.00375
[240]	validation_0-rmse:0.77200	validation_0-weighted_zer

0,1
best_iteration,▁
best_score,▁

0,1
best_iteration,176.0
best_score,0.00382


Load train data and apply reduce memory function on Fold 2
df memory usage before reduce : 2454.9030990600586 MB
Memory usage after optimization is: 1262.13 MB
Decreased by 48.6%

Train dates: from day 1299 to 1498 (200 days)



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 19637
[LightGBM] [Info] Number of data points in the train set: 7271616, number of used features: 80
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 80 dense feature groups (554.78 MB) transferred to GPU in 0.517257 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.003815
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[397]	valid_0's l2: 0.593736	valid_0's weighted_zero_mean_r2: 0.00719065
Best iteration: 397
Best score: 0.007190647703161468


0,1
best_iteration,▁
best_score,▁

0,1
best_iteration,397.0
best_score,0.00719



[0]	validation_0-rmse:0.77332	validation_0-weighted_zero_mean_r2_xgb:0.00006
[20]	validation_0-rmse:0.77290	validation_0-weighted_zero_mean_r2_xgb:0.00114
[40]	validation_0-rmse:0.77254	validation_0-weighted_zero_mean_r2_xgb:0.00207
[60]	validation_0-rmse:0.77222	validation_0-weighted_zero_mean_r2_xgb:0.00289
[80]	validation_0-rmse:0.77195	validation_0-weighted_zero_mean_r2_xgb:0.00358
[100]	validation_0-rmse:0.77172	validation_0-weighted_zero_mean_r2_xgb:0.00418
[120]	validation_0-rmse:0.77151	validation_0-weighted_zero_mean_r2_xgb:0.00471
[140]	validation_0-rmse:0.77133	validation_0-weighted_zero_mean_r2_xgb:0.00518
[160]	validation_0-rmse:0.77118	validation_0-weighted_zero_mean_r2_xgb:0.00557
[180]	validation_0-rmse:0.77104	validation_0-weighted_zero_mean_r2_xgb:0.00590
[200]	validation_0-rmse:0.77093	validation_0-weighted_zero_mean_r2_xgb:0.00620
[220]	validation_0-rmse:0.77083	validation_0-weighted_zero_mean_r2_xgb:0.00644
[240]	validation_0-rmse:0.77076	validation_0-weighted_zer

0,1
best_iteration,▁
best_score,▁

0,1
best_iteration,378.0
best_score,0.00728
