In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/modelll/submission_xgboost.csv
/kaggle/input/modelll/train_FINAL.pkl
/kaggle/input/modelll/submission_lightgbm.csv
/kaggle/input/modelll/submission_catboost.csv
/kaggle/input/modelll/test_FINAL.pkl


In [8]:
# ================================
# 0. Install libraries (Kaggle GPU)
# ================================
!pip install optuna catboost lightgbm xgboost --quiet

import pandas as pd
import numpy as np
import optuna

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

import joblib
import warnings
warnings.filterwarnings('ignore')

print("GPU test:")
!nvidia-smi

# ================================
# 1. Load final data
# ================================
train = joblib.load("/kaggle/input/modelll/train_FINAL.pkl")
test  = joblib.load("/kaggle/input/modelll/test_FINAL.pkl")

y = train["TARGET"]
X = train.drop(columns=["TARGET"])

test_ids = test["SK_ID_CURR"]
test_X = test.drop(columns=["SK_ID_CURR"])

# Ensure clean column names
X.columns = X.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
test_X.columns = test_X.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)


# ================================
# 2. KFold setup
# ================================
kf = KFold(n_splits=10, shuffle=True, random_state=42)


# ================================
# 3. Optuna search space + trial objective
# ================================
def objective(trial):

    model_type = trial.suggest_categorical("model_type", ["lightgbm", "xgboost", "catboost"])

    # ------------------
    # LightGBM
    # ------------------
    if model_type == "lightgbm":
        params = {
            "objective": "binary",
            "metric": "auc",
            "learning_rate": trial.suggest_float("lr", 0.01, 0.3),
            "num_leaves": trial.suggest_int("num_leaves", 31, 255),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 200),
            "verbosity": -1
        }

        oof_preds = np.zeros(len(X))

        for train_idx, valid_idx in kf.split(X):
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

            dtrain = lgb.Dataset(X_train, y_train)
            dvalid = lgb.Dataset(X_valid, y_valid)

            model = lgb.train(
                params,
                dtrain,
                valid_sets=[dvalid],
                num_boost_round=2000,
                callbacks=[lgb.early_stopping(100, verbose=False)]
            )

            preds = model.predict(X_valid, num_iteration=model.best_iteration)
            oof_preds[valid_idx] = preds

        return roc_auc_score(y, oof_preds)

    # ------------------
    # XGBoost (GPU)
    # ------------------
    if model_type == "xgboost":
        params = {
            "objective": "binary:logistic",
            "eval_metric": "auc",
            "tree_method": "gpu_hist",  # GPU acceleration
            "eta": trial.suggest_float("eta", 0.01, 0.3),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0)
        }

        oof_preds = np.zeros(len(X))

        for train_idx, valid_idx in kf.split(X):
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

            dtrain = xgb.DMatrix(X_train, y_train)
            dvalid = xgb.DMatrix(X_valid, y_valid)

            model = xgb.train(
                params,
                dtrain,
                num_boost_round=2000,
                evals=[(dvalid, "valid")],
                early_stopping_rounds=100,
                verbose_eval=False
            )

            preds = model.predict(xgb.DMatrix(X_valid))
            oof_preds[valid_idx] = preds

        return roc_auc_score(y, oof_preds)

    # ------------------
    # CatBoost (GPU)
    # ------------------
    if model_type == "catboost":
        params = {
            "iterations": 2000,
            "learning_rate": trial.suggest_float("lr", 0.01, 0.3),
            "depth": trial.suggest_int("depth", 4, 10),
            "loss_function": "Logloss",
            "eval_metric": "AUC",
            "task_type": "GPU",
            "verbose": False
        }

        oof_preds = np.zeros(len(X))

        for train_idx, valid_idx in kf.split(X):
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

            model = CatBoostClassifier(**params)
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=100)

            preds = model.predict_proba(X_valid)[:, 1]
            oof_preds[valid_idx] = preds

        return roc_auc_score(y, oof_preds)


# ================================
# 4. Run Optuna with 10 trials
# ================================
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

print("Best params:", study.best_params)
print("Best AUC:", study.best_value)

# ================================
# 5. Train final model on full data using best params
# ================================
best = study.best_params
model_type = best["model_type"]

print("Training final model:", model_type)

if model_type == "lightgbm":
    params = best.copy()
    params["objective"] = "binary"
    params["metric"] = "auc"
    del params["model_type"]

    dtrain = lgb.Dataset(X, y)
    final_model = lgb.train(params, dtrain, num_boost_round=2000)

elif model_type == "xgboost":
    params = best.copy()
    params["objective"] = "binary:logistic"
    params["eval_metric"] = "auc"
    params["tree_method"] = "gpu_hist"
    del params["model_type"]

    final_model = xgb.train(params, xgb.DMatrix(X, y), num_boost_round=2000)

else:  # catboost
    params = best.copy()
    params["loss_function"] = "Logloss"
    params["eval_metric"] = "AUC"
    params["task_type"] = "GPU"
    del params["model_type"]

    final_model = CatBoostClassifier(**params)
    final_model.fit(X, y)


# ================================
# 6. Predict test & save submission
# ================================
if model_type == "lightgbm":
    preds = final_model.predict(test_X)

elif model_type == "xgboost":
    preds = final_model.predict(xgb.DMatrix(test_X))

else:
    preds = final_model.predict_proba(test_X)[:, 1]

sub = pd.DataFrame({
    "SK_ID_CURR": test_ids,
    "TARGET": preds
})

sub.to_csv("submission_optuna.csv", index=False)
print("Saved submission_optuna.csv")

joblib.dump(final_model, "best_optuna_model.pkl")
print("Saved best_optuna_model.pkl")


GPU test:
Fri Nov 21 15:17:52 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   38C    P0             32W /  250W |     259MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                      

[I 2025-11-21 15:17:53,022] A new study created in memory with name: no-name-d739792c-836a-4cd8-bef1-902e4a9fab7c
[I 2025-11-21 15:20:55,794] Trial 0 finished with value: 0.7675338709340829 and parameters: {'model_type': 'xgboost', 'eta': 0.12368271868631443, 'max_depth': 8, 'subsample': 0.7910402116677808, 'colsample_bytree': 0.5544278945624586}. Best is trial 0 with value: 0.7675338709340829.
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/

Best params: {'model_type': 'lightgbm', 'lr': 0.0196748929443546, 'num_leaves': 226, 'feature_fraction': 0.6531519961265604, 'bagging_fraction': 0.7796593941777206, 'bagging_freq': 1, 'min_data_in_leaf': 113}
Best AUC: 0.7792448155176982
Training final model: lightgbm
Saved submission_optuna.csv
Saved best_optuna_model.pkl
