In [1]:
import numpy as np
import pandas as pd 
import polars as pl
import gc
import os
import joblib 
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping
import kaggle_evaluation.jane_street_inference_server
import warnings
warnings.filterwarnings('ignore')

In [2]:
# ============================
# Reduce Memory Usage Function
# ============================
def reduce_memory_usage(df,float16_as32=False):
    start_mem = df.memory_usage().sum()/1024**2
    print(f'df memory usage before reduce : {start_mem} MB')
    for col in df.columns:
        col_type = df[col].dtype
        
        # Skip non-numeric columns
        if col_type.kind not in ['i','f']:
            continue
        
        c_min = df[col].min()
        c_max = df[col].max()

        # Integer types
        if col_type.kind in ['i']:
            if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            else:
                df[col] = df[col].astype(np.int64)

        # Float types
        else:
            if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float32 if float16_as32 else np.float16)
            elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        
    end_mem = df.memory_usage().sum()/1024**2
    print(f"Memory usage after optimization is: {end_mem:.2f} MB")
    print(f"Decreased by {(100 * (start_mem - end_mem) / start_mem):.1f}%")
    return df

In [3]:
# -------------------------------------------
# Custom Weighted Zero-Mean R² for Lgb Model
# ------------------------------------------
def weighted_zero_mean_r2_lgb(y_true, y_pred, sample_weight):
    y_true_zm = y_true - np.average(y_true, weights=sample_weight)
    y_pred_zm = y_pred - np.average(y_pred, weights=sample_weight)

    numerator = np.sum(sample_weight * (y_true_zm - y_pred_zm) ** 2)
    denominator = np.sum(sample_weight * (y_true_zm) ** 2)

    r2 = 1 - numerator / (denominator + 1e-38)
    return "weighted_zero_mean_r2", r2, True   # maximize=True
# -------------------------------------------
# Custom Weighted Zero-Mean R² for Xgb Model
# -------------------------------------------
def weighted_zero_mean_r2_xgb(y_true, y_pred, sample_weight):
    y_true_zm = y_true - np.average(y_true, weights=sample_weight)
    y_pred_zm = y_pred - np.average(y_pred, weights=sample_weight)
    
    numerator = np.sum(sample_weight * (y_true_zm - y_pred_zm)**2)
    denominator = np.sum(sample_weight * (y_true_zm)**2)
    
    r2 = 1 - numerator / (denominator + 1e-38)
    return r2        

In [4]:
# use the Kaggle input directory
train_path = '/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet'

# Responders_Columns
features_cols = [f"feature_{i:02d}"for i in range(79)]

# Define the target 
target = 'responder_6'

#  Create models directory 
os.makedirs("models", exist_ok=True)

# Preapare Valid_df

In [5]:
# prepare valid_df
skip_dates= 1499  # I will use last 100 days for validation
valid_df = pd.read_parquet(train_path, filters=[('date_id','>=', skip_dates)])
valid_df = reduce_memory_usage(valid_df)

# X,y,w 
X_valid = valid_df[features_cols + ['time_id']]
y_valid = valid_df[target]
w_valid = valid_df["weight"]
print(f"valid dates: from day {valid_df['date_id'].min()} "
        f"to {valid_df['date_id'].max()} "
        f"({valid_df['date_id'].nunique()} days)")

df memory usage before reduce : 1254.5759353637695 MB
Memory usage after optimization is: 645.01 MB
Decreased by 48.6%
valid dates: from day 1599 to 1698 (100 days)


# Prepare train data & Tracking Train models with mlflow

In [6]:
%%capture
!pip install wandb -q
import wandb
wandb.login(key="e04a3ac3e8b68f363d1fecde23ad2a89bb7d146d")

In [11]:
# =========================
# Config
# =========================
START_TRAIN = 1099
END_TRAIN   = START_TRAIN + 200
folds = 2
model_name = "Xgboost"

models = []


# =========================================================
# Training Loop
# =========================================================
for i in range(folds):

    print(f'Load train data and apply reduce memory function on Fold {i+1}')

    # ===== Load Train Data =====
    train_df = pd.read_parquet(
        train_path,
        filters=[[('date_id', '>=', START_TRAIN), ('date_id', '<', END_TRAIN)]]
    )
    train_df = reduce_memory_usage(train_df)

    X_train = train_df[features_cols + ['time_id']]
    y_train = (
        train_df[target]
        + 0.5 * train_df['responder_7']
        + 0.5 * train_df['responder_8']
    )
    w_train = train_df["weight"]

    print(f"\n================ Fold {i+1}/{folds} ================")
    print(
        f"Train dates: from day {train_df['date_id'].min()} "
        f"to {train_df['date_id'].max()} "
        f"({train_df['date_id'].nunique()} days)"
    )

    # =====================================================
    # Track X features and y_target
    # =====================================================
    x_features_path = f"X_features_fold_{i+1}.txt"
    with open(x_features_path, "w") as f:
        for col in X_train.columns:
            f.write(col + "\n")

    Y_TARGET_EXPR = "target + 0.5*responder_7 + 0.5*responder_8"

    
    # =====================================================
    # Xgboost Model
    # =====================================================
    print(f'\n============== {model_name} | Fold {i+1} ==============')
    
    model = XGBRegressor(
        n_estimators=2000,
        learning_rate=0.01,
        max_depth=6,
        subsample=0.9,
        colsample_bytree=0.9,
        objective="reg:squarederror",
        device="cuda",
        tree_method="gpu_hist",
        max_bin=128,
        random_state=42,
        eval_metric=weighted_zero_mean_r2_xgb,
        disable_default_eval_metric=True,

    )

    model.fit(
    X_train, y_train,
    sample_weight=w_train,
    eval_set=[(X_valid, y_valid)],
    sample_weight_eval_set=[w_valid],
    callbacks=[EarlyStopping(rounds=100,maximize=True,save_best=True)],
    verbose=20
    )


    # =====================================================
    # Results
    # =====================================================
    best_iter = model.best_iteration
    best_score = model.best_score

    print(f"Best iteration: {best_iter}")
    print(f"Best score: {best_score}")

    model_file = f"models/{model_name}_Fold_{i+1}.model"
    joblib.dump(model, model_file)
    models.append((model_name, i + 1, model))



    # =====================================================
    # W&B Run
    # =====================================================
    run_name = f"{model_name}_Fold_{i+1}/{folds}"

    wandb_run = wandb.init(
        project="JS_Kaggle_Track_Xgboost",
        entity="mohamedzakariaafouda-mansoura-university",
        name=run_name,
        reinit=True,
        group=f"window_{START_TRAIN}_{END_TRAIN}",
        tags=["Xgboost","4folds"],
        config={
            "fold": i + 1,
            "n_features": X_train.shape[1],
            "train_window": f"{START_TRAIN}_{END_TRAIN}",
            "Number of days": f"({train_df['date_id'].nunique()} days)",
            "y_target_expr": Y_TARGET_EXPR,
            **model.get_params()
        }
    )


    wandb.log({
        "best_iteration": best_iter,
        "best_score": best_score
    })

    # =====================================================
    # Save artifacts
    # =====================================================
    
    wandb.save(model_file)
    wandb.save(x_features_path)
    wandb_run.finish()

    
    del model
    del train_df, X_train, y_train, w_train
    gc.collect()

    if folds > 1:
        START_TRAIN += 200
        END_TRAIN   = START_TRAIN + 200



Load train data and apply reduce memory function on Fold 1
df memory usage before reduce : 2448.040351867676 MB
Memory usage after optimization is: 1258.60 MB
Decreased by 48.6%

Train dates: from day 999 to 1198 (200 days)

[0]	validation_0-weighted_zero_mean_r2_xgb:0.00015
[20]	validation_0-weighted_zero_mean_r2_xgb:0.00235
[40]	validation_0-weighted_zero_mean_r2_xgb:0.00332
[60]	validation_0-weighted_zero_mean_r2_xgb:0.00374
[80]	validation_0-weighted_zero_mean_r2_xgb:0.00386
[100]	validation_0-weighted_zero_mean_r2_xgb:0.00375
[120]	validation_0-weighted_zero_mean_r2_xgb:0.00351
[140]	validation_0-weighted_zero_mean_r2_xgb:0.00322
[160]	validation_0-weighted_zero_mean_r2_xgb:0.00298
[180]	validation_0-weighted_zero_mean_r2_xgb:0.00268
[183]	validation_0-weighted_zero_mean_r2_xgb:0.00264
Best iteration: 83
Best score: 0.00389




0,1
best_iteration,▁
best_score,▁

0,1
best_iteration,83.0
best_score,0.00389


Load train data and apply reduce memory function on Fold 2
df memory usage before reduce : 2495.4259872436523 MB
Memory usage after optimization is: 1282.96 MB
Decreased by 48.6%

Train dates: from day 1199 to 1398 (200 days)

[0]	validation_0-weighted_zero_mean_r2_xgb:0.00013
[20]	validation_0-weighted_zero_mean_r2_xgb:0.00207
[40]	validation_0-weighted_zero_mean_r2_xgb:0.00317
[60]	validation_0-weighted_zero_mean_r2_xgb:0.00379
[80]	validation_0-weighted_zero_mean_r2_xgb:0.00409
[100]	validation_0-weighted_zero_mean_r2_xgb:0.00411
[120]	validation_0-weighted_zero_mean_r2_xgb:0.00398
[140]	validation_0-weighted_zero_mean_r2_xgb:0.00372
[160]	validation_0-weighted_zero_mean_r2_xgb:0.00354
[180]	validation_0-weighted_zero_mean_r2_xgb:0.00324
[189]	validation_0-weighted_zero_mean_r2_xgb:0.00312
Best iteration: 90
Best score: 0.004162


0,1
best_iteration,▁
best_score,▁

0,1
best_iteration,90.0
best_score,0.00416


Load train data and apply reduce memory function on Fold 3
df memory usage before reduce : 2468.628593444824 MB
Memory usage after optimization is: 1269.18 MB
Decreased by 48.6%

Train dates: from day 1399 to 1598 (200 days)

[0]	validation_0-weighted_zero_mean_r2_xgb:0.00012
[20]	validation_0-weighted_zero_mean_r2_xgb:0.00212
[40]	validation_0-weighted_zero_mean_r2_xgb:0.00336
[60]	validation_0-weighted_zero_mean_r2_xgb:0.00413
[80]	validation_0-weighted_zero_mean_r2_xgb:0.00460
[100]	validation_0-weighted_zero_mean_r2_xgb:0.00494
[120]	validation_0-weighted_zero_mean_r2_xgb:0.00512
[140]	validation_0-weighted_zero_mean_r2_xgb:0.00512
[160]	validation_0-weighted_zero_mean_r2_xgb:0.00503
[180]	validation_0-weighted_zero_mean_r2_xgb:0.00487
[200]	validation_0-weighted_zero_mean_r2_xgb:0.00468
[218]	validation_0-weighted_zero_mean_r2_xgb:0.00445
Best iteration: 119
Best score: 0.005133


0,1
best_iteration,▁
best_score,▁

0,1
best_iteration,119.0
best_score,0.00513


In [9]:
del train_df, X_train, y_train, w_train
gc.collect()

668

In [None]:
# ============================
#  Model Dictionary
# ============================
model_dict = {
    "LightGBM": lambda:LGBMRegressor(
    n_estimators=3000,          # زيادة عدد الأشجار لتحسين الدقة على التعلم البطيء
    learning_rate=0.003,        # تقليل LR لتحسين الثبات ونتائج أفضل
    num_leaves=128,              # عدد أكبر من العقد لزيادة المرونة، مع الحذر من overfitting
    max_depth=10,               # لتجنب النمو الزائد للشجرة
    min_child_samples=20,       # الحد الأدنى لعدد العينات في الورقة لتقليل overfitting
    subsample=0.8,              # لتقليل التباين (bagging)
    colsample_bytree=0.8,       # لتقليل التباين عن طريق عمود العينات
    reg_alpha=0.1,              # L1 regularization
    reg_lambda=0.2,             # L2 regularization
    random_state=42,
    max_bin=255,                # زيادة bins قد تحسن الدقة على float16
    device="gpu"
    ),

    "XGBoost": lambda: XGBRegressor(
    n_estimators=3000,          # عدد الأشجار لتحسين التعلم البطيء
    learning_rate=0.003,        # تقليل LR
    max_depth=8,                # لتجنب overfitting
    min_child_weight=10,        # لتجنب overfitting
    subsample=0.8,              # bagging
    colsample_bytree=0.8,       # column sampling
    gamma=0.1,                  # regularization على الورقة
    reg_alpha=0.1,              # L1 regularization
    reg_lambda=0.2,             # L2 regularization
    objective="reg:squarederror",
    tree_method="gpu_hist",      # GPU fast training
    eval_metric=weighted_zero_mean_r2_xgb,
    random_state=42
    ),
}

In [None]:
# =========================
# Config
# =========================
START_TRAIN = 1099
END_TRAIN   = 1299
folds = 2

models = []

# =========================================================
# Training Loop
# =========================================================
for i in range(folds):

    print(f'Load train data and apply reduce memory function on Fold {i+1}')

    # ===== Load Train Data =====
    train_df = pd.read_parquet(
        train_path,
        filters=[[('date_id', '>=', START_TRAIN), ('date_id', '<', END_TRAIN)]]
    )
    train_df = reduce_memory_usage(train_df)

    X_train = train_df[features_cols + ['time_id']]
    y_train = (
        train_df[target]
        + 0.5 * train_df['responder_7']
        + 0.5 * train_df['responder_8']
    )
    w_train = train_df["weight"]

    print(f"\n================ Fold {i+1}/{folds} ================")
    print(
        f"Train dates: from day {train_df['date_id'].min()} "
        f"to {train_df['date_id'].max()} "
        f"({train_df['date_id'].nunique()} days)"
    )

    # =====================================================
    # Track X features
    # =====================================================
    x_features_path = f"X_features_fold_{i+1}.txt"
    with open(x_features_path, "w") as f:
        for col in X_train.columns:
            f.write(col + "\n")

    Y_TARGET_EXPR = "target + 0.5*responder_7 + 0.5*responder_8"

    # =====================================================
    # Train Models
    # =====================================================
    for model_name, model_class in model_dict.items():

        run_name = f"{model_name}_Fold_{i+1}_suggest2"

        model = model_class()

        # ===== W&B Run =====
        wandb_run = wandb.init(
            project="JS_Kaggle_Experiments1",
            entity="mohamedzakariaafouda-mansoura-university",
            name=run_name,
            reinit=True,
            group=f"window_{START_TRAIN}_{END_TRAIN}",
            tags=[model_name,"suggest"],
            config={
                "model": model_name,
                "fold": i + 1,
                "n_features": X_train.shape[1],
                "train_window": f"{START_TRAIN}_{END_TRAIN}",
                "y_target_expr": Y_TARGET_EXPR,
                **model.get_params()
            }
        )

        print(f'\n============== {model_name} | Fold {i+1} ==============')

        # ===== Train =====
        if model_name == "LightGBM":

            model.fit(
                X_train, y_train,
                sample_weight=w_train,
                eval_set=[(X_valid, y_valid)],
                eval_sample_weight=[w_valid],
                eval_metric=weighted_zero_mean_r2_lgb,
                callbacks=[lgb.early_stopping(100)]
            )

            best_iter = model.best_iteration_
            best_score = model.best_score_['valid_0']['weighted_zero_mean_r2']

        else:  # XGBoost

            model.fit(
                X_train, y_train,
                sample_weight=w_train,
                eval_set=[(X_valid, y_valid)],
                sample_weight_eval_set=[w_valid],
                callbacks=[EarlyStopping(
                    rounds=100,
                    maximize=True,
                    save_best=True
                )],
                verbose=20
            )

            best_iter = model.best_iteration
            best_score = model.best_score

        print(f"Best iteration: {best_iter}")
        print(f"Best score: {best_score}")

        # ===== Log metrics =====
        wandb.log({
            "best_iteration": best_iter,
            "best_score": best_score
        })

        # ===== Save model & artifacts =====
        model_file = f"models/{model_name}_Fold_{i+1}.model"
        joblib.dump(model, model_file)

        wandb.save(model_file)
        wandb.save(x_features_path)

        models.append((model_name, i + 1, model))

        wandb_run.finish()

        del model
        gc.collect()

    del train_df, X_train, y_train, w_train
    gc.collect()

    if folds > 1:
        START_TRAIN += 200
        END_TRAIN   += 200
