In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset/main_training_data_optimized.csv


In [2]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier


In [3]:
DATA_PATH = "/kaggle/input/dataset/main_training_data_optimized.csv"

df = pd.read_csv(DATA_PATH)

In [4]:
y = df["target"].astype(np.int8)
user_ids = df["user_id"]

X = df.drop(columns=["target", "user_id"])
X = X.astype(np.float32)

unique_users = user_ids.unique()

train_users, val_users = train_test_split(
    unique_users,
    test_size=0.2,
    random_state=42
)

train_mask = user_ids.isin(train_users)
val_mask   = user_ids.isin(val_users)

X_train = X[train_mask]
X_val   = X[val_mask]

y_train = y[train_mask]
y_val   = y[val_mask]

assert set(user_ids[train_mask]).isdisjoint(set(user_ids[val_mask]))

print("Train rows:", X_train.shape)
print("Val rows:", X_val.shape)


Train rows: (10627766, 25)
Val rows: (2680187, 25)


In [5]:
def objective(trial):

    model_type = trial.suggest_categorical(
        "model_type", ["lightgbm", "xgboost", "catboost"]
    )

    # =====================
    # LIGHTGBM 
    # =====================
    if model_type == "lightgbm":

        params = {
            "objective": "binary",
            "metric": "auc",
            "boosting_type": "gbdt",
            "verbosity": -1,

            "learning_rate": trial.suggest_float("lgb_lr", 0.01, 0.08),
            "num_leaves": trial.suggest_int("lgb_num_leaves", 32, 128),
            "max_depth": trial.suggest_int("lgb_max_depth", 5, 12),
            "min_child_samples": trial.suggest_int("lgb_min_child", 20, 150),
            "feature_fraction": trial.suggest_float("lgb_feature_frac", 0.6, 1.0),
            "bagging_fraction": trial.suggest_float("lgb_bagging_frac", 0.6, 1.0),
            "bagging_freq": trial.suggest_int("lgb_bagging_freq", 1, 7),
            "lambda_l1": trial.suggest_float("lgb_l1", 0.0, 5.0),
            "lambda_l2": trial.suggest_float("lgb_l2", 0.0, 5.0),

            # CUDA
            "device": "gpu",
            "gpu_platform_id": 0,
            "gpu_device_id": 0,
        }

        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_val, label=y_val)

        model = lgb.train(
            params,
            train_data,
            valid_sets=[valid_data],
            num_boost_round=1400,
            callbacks=[
                lgb.early_stopping(30),
                lgb.log_evaluation(0),
            ],
        )

        preds = model.predict(X_val)
        return roc_auc_score(y_val, preds)

    # =====================
    # XGBOOST 
    # =====================
    elif model_type == "xgboost":

        model = xgb.XGBClassifier(
            objective="binary:logistic",
            eval_metric="auc",
            tree_method="gpu_hist",
            predictor="gpu_predictor",

            learning_rate=trial.suggest_float("xgb_lr", 0.01, 0.08),
            max_depth=trial.suggest_int("xgb_max_depth", 4, 10),
            min_child_weight=trial.suggest_float("xgb_min_child", 1, 15),
            subsample=trial.suggest_float("xgb_subsample", 0.6, 1.0),
            colsample_bytree=trial.suggest_float("xgb_colsample", 0.6, 1.0),
            gamma=trial.suggest_float("xgb_gamma", 0.0, 5.0),
            reg_lambda=trial.suggest_float("xgb_lambda", 0.0, 5.0),
            reg_alpha=trial.suggest_float("xgb_alpha", 0.0, 5.0),

            n_estimators=1400,
            early_stopping_rounds=30,
            verbosity=0,
        )

        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )

        preds = model.predict_proba(X_val)[:, 1]
        return roc_auc_score(y_val, preds)

    # =====================
    # CATBOOST 
    # =====================
    else:

        model = CatBoostClassifier(
            loss_function="Logloss",
            eval_metric="AUC",
            task_type="GPU",
            devices="0",

            learning_rate=trial.suggest_float("cat_lr", 0.01, 0.08),
            depth=trial.suggest_int("cat_depth", 4, 9),
            l2_leaf_reg=trial.suggest_float("cat_l2", 1.0, 8.0),
            random_strength=trial.suggest_float("cat_rand", 0.0, 2.0),
            bagging_temperature=trial.suggest_float("cat_bag", 0.0, 1.0),

            iterations=1400,
            early_stopping_rounds=30,
            verbose=False,
        )

        model.fit(X_train, y_train, eval_set=(X_val, y_val))
        preds = model.predict_proba(X_val)[:, 1]
        return roc_auc_score(y_val, preds)


In [6]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=60)


[I 2025-12-25 13:38:35,924] A new study created in memory with name: no-name-051281de-b138-4e25-bdd4-22379bbef0b3


Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[290]	valid_0's auc: 0.817641


[I 2025-12-25 13:42:16,683] Trial 0 finished with value: 0.8176408807855196 and parameters: {'model_type': 'lightgbm', 'lgb_lr': 0.0741187913684347, 'lgb_num_leaves': 46, 'lgb_max_depth': 10, 'lgb_min_child': 25, 'lgb_feature_frac': 0.7005400647527783, 'lgb_bagging_frac': 0.8291462375780057, 'lgb_bagging_freq': 1, 'lgb_l1': 1.7536357783869132, 'lgb_l2': 4.689166529732533}. Best is trial 0 with value: 0.8176408807855196.
[I 2025-12-25 13:42:50,522] Trial 1 finished with value: 0.8176522023525037 and parameters: {'model_type': 'xgboost', 'xgb_lr': 0.06880111614104577, 'xgb_max_depth': 9, 'xgb_min_child': 8.454647783815767, 'xgb_subsample': 0.9804881873202625, 'xgb_colsample': 0.9121038888272348, 'xgb_gamma': 1.4103678523324699, 'xgb_lambda': 1.5551012624765004, 'xgb_alpha': 4.021558336739586}. Best is trial 1 with value: 0.8176522023525037.
[I 2025-12-25 13:43:51,795] Trial 2 finished with value: 0.8177984973954346 and parameters: {'model_type': 'xgboost', 'xgb_lr': 0.03337850981638362, 

Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[807]	valid_0's auc: 0.817844


[I 2025-12-25 13:57:43,302] Trial 5 finished with value: 0.8178440685367452 and parameters: {'model_type': 'lightgbm', 'lgb_lr': 0.020524119954249836, 'lgb_num_leaves': 62, 'lgb_max_depth': 7, 'lgb_min_child': 68, 'lgb_feature_frac': 0.7048511951639247, 'lgb_bagging_frac': 0.8216334105118741, 'lgb_bagging_freq': 1, 'lgb_l1': 0.5067095801454502, 'lgb_l2': 4.219106216405748}. Best is trial 5 with value: 0.8178440685367452.
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2025-12-25 13:59:06,444] Trial 6 finished with value: 0.8173645101096774 and parameters: {'model_type': 'catboost', 'cat_lr': 0.0485835800634149, 'cat_depth': 4, 'cat_l2': 5.064559290631564, 'cat_rand': 1.8350810579242367, 'cat_bag': 0.3449204827705107}. Best is trial 5 with value: 0.8178440685367452.
[I 2025-12-25 13:59:52,101] Trial 7 finished with value: 0.8177430564258055 and parameters: {'model_type': 'xgboost', 'xgb_lr': 0.03495021044346388, 'xgb_max_depth': 10, 'xgb_min_child': 4.5109916933

Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[259]	valid_0's auc: 0.817758


[I 2025-12-25 14:05:09,865] Trial 8 finished with value: 0.8177582823508418 and parameters: {'model_type': 'lightgbm', 'lgb_lr': 0.058073780061666073, 'lgb_num_leaves': 96, 'lgb_max_depth': 8, 'lgb_min_child': 33, 'lgb_feature_frac': 0.6520504745293689, 'lgb_bagging_frac': 0.7074275234305818, 'lgb_bagging_freq': 2, 'lgb_l1': 2.552136431498635, 'lgb_l2': 1.4316823986695681}. Best is trial 5 with value: 0.8178440685367452.
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2025-12-25 14:06:42,472] Trial 9 finished with value: 0.8167634345191336 and parameters: {'model_type': 'catboost', 'cat_lr': 0.012994716080374065, 'cat_depth': 5, 'cat_l2': 6.529860823701608, 'cat_rand': 0.2743465173975048, 'cat_bag': 0.2615363023742804}. Best is trial 5 with value: 0.8178440685367452.


Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[1208]	valid_0's auc: 0.817701


[I 2025-12-25 14:26:07,036] Trial 10 finished with value: 0.8177012603629761 and parameters: {'model_type': 'lightgbm', 'lgb_lr': 0.011502367770731228, 'lgb_num_leaves': 54, 'lgb_max_depth': 6, 'lgb_min_child': 123, 'lgb_feature_frac': 0.9417666817196498, 'lgb_bagging_frac': 0.9780293237958657, 'lgb_bagging_freq': 7, 'lgb_l1': 0.1393826754107778, 'lgb_l2': 4.159270452600119}. Best is trial 5 with value: 0.8178440685367452.
[I 2025-12-25 14:28:11,405] Trial 11 finished with value: 0.817495729233457 and parameters: {'model_type': 'xgboost', 'xgb_lr': 0.01079076500451347, 'xgb_max_depth': 5, 'xgb_min_child': 1.0742458230444019, 'xgb_subsample': 0.6024942310404537, 'xgb_colsample': 0.6063331357559586, 'xgb_gamma': 4.837176794828675, 'xgb_lambda': 3.6837187755725163, 'xgb_alpha': 4.873199435895291}. Best is trial 5 with value: 0.8178440685367452.
[I 2025-12-25 14:29:07,659] Trial 12 finished with value: 0.817760166139395 and parameters: {'model_type': 'xgboost', 'xgb_lr': 0.0401660332991973

Training until validation scores don't improve for 30 rounds


[W 2025-12-25 14:40:42,173] Trial 13 failed with parameters: {'model_type': 'lightgbm', 'lgb_lr': 0.018330883405237582, 'lgb_num_leaves': 121, 'lgb_max_depth': 5, 'lgb_min_child': 90, 'lgb_feature_frac': 0.812185800672158, 'lgb_bagging_frac': 0.8441377450354582, 'lgb_bagging_freq': 4, 'lgb_l1': 4.889415161405484, 'lgb_l2': 3.098470276617171} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_47/378875036.py", line 37, in objective
    model = lgb.train(
            ^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/lightgbm/engine.py", line 322, in train
    booster.update(fobj=fobj)
  File "/usr/local/lib/python3.11/dist-packages/lightgbm/basic.py", line 4155, in update
    _LIB.LGBM_BoosterUpdateOneIter(
KeyboardInterrupt
[W 2025-12-25 14:40:42,176] Tr

KeyboardInterrupt: 

In [7]:
print("BEST AUC:", study.best_value)
print("BEST PARAMS:")
for k, v in study.best_params.items():
    print(f"{k}: {v}")


BEST AUC: 0.8178440685367452
BEST PARAMS:
model_type: lightgbm
lgb_lr: 0.020524119954249836
lgb_num_leaves: 62
lgb_max_depth: 7
lgb_min_child: 68
lgb_feature_frac: 0.7048511951639247
lgb_bagging_frac: 0.8216334105118741
lgb_bagging_freq: 1
lgb_l1: 0.5067095801454502
lgb_l2: 4.219106216405748
