In [1]:
import numpy as np
import pandas as pd 
import polars as pl
import gc
import os
import joblib 
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping
from catboost import CatBoostRegressor
import kaggle_evaluation.jane_street_inference_server
import warnings
warnings.filterwarnings('ignore')

In [2]:
# ============================
# Reduce Memory Usage Function
# ============================
def reduce_memory_usage(df,float16_as32=False):
    start_mem = df.memory_usage().sum()/1024**2
    print(f'df memory usage before reduce : {start_mem} MB')
    for col in df.columns:
        col_type = df[col].dtype
        
        # Skip non-numeric columns
        if col_type.kind not in ['i','f']:
            continue
        
        c_min = df[col].min()
        c_max = df[col].max()

        # Integer types
        if col_type.kind in ['i']:
            if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            else:
                df[col] = df[col].astype(np.int64)

        # Float types
        else:
            if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float32 if float16_as32 else np.float16)
            elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        
    end_mem = df.memory_usage().sum()/1024**2
    print(f"Memory usage after optimization is: {end_mem:.2f} MB")
    print(f"Decreased by {(100 * (start_mem - end_mem) / start_mem):.1f}%")
    return df

In [3]:
# -------------------------------------------
# Custom Weighted Zero-Mean RÂ² for Lgb Model
# ------------------------------------------
def weighted_zero_mean_r2_lgb(y_true, y_pred, sample_weight):
    y_true_zm = y_true - np.average(y_true, weights=sample_weight)
    y_pred_zm = y_pred - np.average(y_pred, weights=sample_weight)

    numerator = np.sum(sample_weight * (y_true_zm - y_pred_zm) ** 2)
    denominator = np.sum(sample_weight * (y_true_zm) ** 2)

    r2 = 1 - numerator / (denominator + 1e-38)
    return "weighted_zero_mean_r2", r2, True   # maximize=True
# -------------------------------------------
# Custom Weighted Zero-Mean RÂ² for Xgb Model
# -------------------------------------------
def weighted_zero_mean_r2_xgb(y_true, y_pred, sample_weight):
    y_true_zm = y_true - np.average(y_true, weights=sample_weight)
    y_pred_zm = y_pred - np.average(y_pred, weights=sample_weight)
    
    numerator = np.sum(sample_weight * (y_true_zm - y_pred_zm)**2)
    denominator = np.sum(sample_weight * (y_true_zm)**2)
    
    r2 = 1 - numerator / (denominator + 1e-38)
    return r2        

In [4]:
# ============================
#  Model Dictionary
# ============================
model_dict = {
    "LightGBM": lambda:LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.01,
    num_leaves=50,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    max_bin=128,
    device="gpu"
    ),

    "XGBoost": lambda: XGBRegressor(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="reg:squarederror",
    device="cuda",
    tree_method="gpu_hist",
    max_bin=128,
    random_state=42,
    eval_metric=weighted_zero_mean_r2_xgb,
    disable_default_eval_metric=True
    ),
}

In [5]:
# use the Kaggle input directory
train_path = '/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet'

# Responders_Columns
features_cols = [f"feature_{i:02d}"for i in range(79)]

# Define the target 
target = 'responder_6'

#  Create models directory 
os.makedirs("models", exist_ok=True)

# Preapare Valid_df

In [6]:
# prepare valid_df
skip_dates= 1499  # I will use last 200 days for validation
valid_df = pd.read_parquet(train_path, filters=[('date_id','>=', skip_dates)])
valid_df = reduce_memory_usage(valid_df)

# X,y,w 
X_valid = valid_df[features_cols + ['time_id']]
y_valid = valid_df[target]
w_valid = valid_df["weight"]

df memory usage before reduce : 2510.1318740844727 MB
Memory usage after optimization is: 1290.52 MB
Decreased by 48.6%


# Prepare train data & Tracking Train models with mlflow

In [24]:
%%capture
!pip install mlflow
import mlflow

In [8]:
# =====================================================
# MLflow Offline Setup (Kaggle) 
# =====================================================

MLFLOW_DIR = "/kaggle/working/mlruns"
os.makedirs(MLFLOW_DIR, exist_ok=True)

mlflow.set_tracking_uri(f"file://{MLFLOW_DIR}")
mlflow.set_experiment("JS_Kaggle_Experiments1")

# =========================
# Config 
# =========================

START_TRAIN = 1099
END_TRAIN   = 1299
folds = 2

models = []

# =========================================================
# Training Loop
# =========================================================

for i in range(folds):
    print(f'Load train data and apply reduce memory function on Fold {i+1}')

    # ===== Load Train Data =====
    train_df = pd.read_parquet(train_path,filters=[[('date_id', '>=', START_TRAIN),('date_id', '<', END_TRAIN)]])
    train_df = reduce_memory_usage(train_df)

    X_train = train_df[features_cols + ['time_id']]
    y_train = (train_df[target]+ 0.5 * train_df['responder_7']+ 0.5 * train_df['responder_8'])
    w_train = train_df["weight"]

    print(f"\n================ Fold {i+1}/{folds} ================")
    print(f"Train dates: from day {train_df['date_id'].min()} "f"to {train_df['date_id'].max()} "f"({train_df['date_id'].nunique()} days)")

    # =====================================================
    # Track X features and y target
    # =====================================================
    
    x_features_path = f"X_features_fold_{i+1}.txt"
    with open(x_features_path, "w") as f:
        for col in X_train.columns:
            f.write(col + "\n")

    Y_TARGET_EXPR = "target + 0.5*responder_7 + 0.5*responder_8"


    # =====================================================
    # Train Models
    # =====================================================
    for model_name, model_class in model_dict.items():

        run_name = f"{model_name}_Fold_{i+1}"

        with mlflow.start_run(run_name=run_name):

            model = model_class()

            # ===== MLflow Params =====
            mlflow.log_param("model", model_name)
            mlflow.log_param("fold", i + 1)
            mlflow.log_param("n_features", X_train.shape[1])
            mlflow.log_param("train_window", f"{START_TRAIN}_{END_TRAIN}")
            mlflow.log_param("y_target_expr", Y_TARGET_EXPR)
            mlflow.log_params(model.get_params())

            # ===== Log Artifacts =====
            mlflow.log_artifact(x_features_path)

            print(f'\n============== {model_name} | Fold {i+1} ==============')

            # ===== Train =====
            if model_name == "LightGBM":

                model.fit(
                    X_train, y_train,
                    sample_weight=w_train,
                    eval_set=[(X_valid, y_valid)],
                    eval_sample_weight=[w_valid],
                    eval_metric=weighted_zero_mean_r2_lgb,
                    callbacks=[lgb.early_stopping(100)]
                )

                best_iter = model.best_iteration_
                best_score = model.best_score_['valid_0']['weighted_zero_mean_r2']

            else:  # XGBoost

                model.fit(
                    X_train, y_train,
                    sample_weight=w_train,
                    eval_set=[(X_valid, y_valid)],
                    sample_weight_eval_set=[w_valid],
                    callbacks=[EarlyStopping(
                        rounds=100,
                        maximize=True,
                        save_best=True
                    )],
                    verbose=20
                )

                best_iter = model.best_iteration
                best_score = model.best_score

            print(f"Best iteration: {best_iter}")
            print(f"Best score: {best_score}")

            # ===== MLflow Metrics =====
            mlflow.log_metric("best_iteration", best_iter)
            mlflow.log_metric("best_score", best_score)

            # ===== Save model locally =====
            joblib.dump(model, f"models/{model_name}_Fold_{i+1}.model")

            # ===== Log model =====
            mlflow.sklearn.log_model(model,artifact_path=f"{model_name}_Fold_{i+1}")

            models.append((model_name, i + 1, model))

            del model
            gc.collect()

    del train_df, X_train, y_train, w_train
    gc.collect()

    if folds > 1:
        START_TRAIN += 200
        END_TRAIN   += 200

2025/12/24 17:50:19 INFO mlflow.tracking.fluent: Experiment with name 'JS_Kaggle_Experiments2' does not exist. Creating a new experiment.


Load train data and apply reduce memory function on Fold 1
df memory usage before reduce : 2522.8769760131836 MB
Memory usage after optimization is: 1297.07 MB
Decreased by 48.6%

Train dates: from day 1099 to 1298 (200 days)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 9912
[LightGBM] [Info] Number of data points in the train set: 7472960, number of used features: 80
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 80 dense feature groups (570.14 MB) transferred to GPU in 0.599318 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.006229
Training until validation scores don't improve for 100 rounds




Early stopping, best iteration is:
[74]	valid_0's l2: 0.595266	valid_0's weighted_zero_mean_r2: 0.00471052
Best iteration: 74
Best score: 0.0047105231385636825

[0]	validation_0-weighted_zero_mean_r2_xgb:0.00017
[20]	validation_0-weighted_zero_mean_r2_xgb:0.00272
[40]	validation_0-weighted_zero_mean_r2_xgb:0.00409
[60]	validation_0-weighted_zero_mean_r2_xgb:0.00474
[80]	validation_0-weighted_zero_mean_r2_xgb:0.00504
[100]	validation_0-weighted_zero_mean_r2_xgb:0.00388
[120]	validation_0-weighted_zero_mean_r2_xgb:0.00263
[140]	validation_0-weighted_zero_mean_r2_xgb:0.00120
[160]	validation_0-weighted_zero_mean_r2_xgb:0.00060
[180]	validation_0-weighted_zero_mean_r2_xgb:-0.00070




Best iteration: 81
Best score: 0.005052
Load train data and apply reduce memory function on Fold 2
df memory usage before reduce : 2454.9030990600586 MB
Memory usage after optimization is: 1262.13 MB
Decreased by 48.6%

Train dates: from day 1299 to 1498 (200 days)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 9913
[LightGBM] [Info] Number of data points in the train set: 7271616, number of used features: 80
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 80 dense feature groups (554.78 MB) transferred to GPU in 0.532090 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.003815
Training until validation scores don't improve for 100 rounds




Early stopping, best iteration is:
[119]	valid_0's l2: 0.594277	valid_0's weighted_zero_mean_r2: 0.0062599
Best iteration: 119
Best score: 0.0062599037080367514

[0]	validation_0-weighted_zero_mean_r2_xgb:0.00017
[20]	validation_0-weighted_zero_mean_r2_xgb:0.00270
[40]	validation_0-weighted_zero_mean_r2_xgb:0.00425
[60]	validation_0-weighted_zero_mean_r2_xgb:0.00524
[80]	validation_0-weighted_zero_mean_r2_xgb:0.00578
[100]	validation_0-weighted_zero_mean_r2_xgb:0.00605
[120]	validation_0-weighted_zero_mean_r2_xgb:0.00622
[140]	validation_0-weighted_zero_mean_r2_xgb:0.00619
[160]	validation_0-weighted_zero_mean_r2_xgb:0.00609
[180]	validation_0-weighted_zero_mean_r2_xgb:0.00591
[200]	validation_0-weighted_zero_mean_r2_xgb:0.00572
[220]	validation_0-weighted_zero_mean_r2_xgb:0.00549
[232]	validation_0-weighted_zero_mean_r2_xgb:0.00536




Best iteration: 132
Best score: 0.006234


In [9]:
%%capture
!zip -r mlruns.zip /kaggle/working/mlruns

from IPython.display import display, FileLink
print("ðŸ“¥ Download Links:\n")
display(FileLink('mlruns.zip', result_html_prefix=" Track the models: "))

In [None]:
''' 
#### How to run mlflow ui on local env
1- unzip mlruns.zip
2- write in your terminal after install mlfow
- cd Downloads/kaggle/working
- mlflow ui --backend-store-uri file:./mlruns
'''