In [1]:
import numpy as np
import pandas as pd
import joblib

# ML models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# LightGBM
import lightgbm as lgb

# XGBoost
import xgboost as xgb

# CatBoost
from catboost import CatBoostClassifier

Load Final Data

In [2]:
 
train = joblib.load("train_FINAL.pkl")
test  = joblib.load("test_FINAL.pkl")


SPLIT TARGET AND FEATURES

In [7]:
y = train["TARGET"]
X = train.drop(columns=["TARGET"])

Test set (drop id)

In [8]:
test_ids = test["SK_ID_CURR"]
test_X = test.drop(columns=["SK_ID_CURR"])


 CLEAN ALL COLUMN NAMES FOR LIGHTGBM

In [9]:
X.columns = X.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
test_X.columns = test_X.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

TRAINâ€“VALID SPLIT

In [10]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [11]:
X_train.columns = X_train.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
X_valid.columns = X_valid.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

MAKE LIGHTGBM DATASETS


In [12]:
train_data = lgb.Dataset(X_train, y_train)
valid_data = lgb.Dataset(X_valid, y_valid)

LIGHTGBM PARAMETERS

In [13]:
params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": -1
}

TRAIN MODEL (USING CALLBACKS)

In [14]:
model = lgb.train(
    params,
    train_data,
    valid_sets=[valid_data],
    num_boost_round=5000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=50)
    ]
)

Training until validation scores don't improve for 200 rounds
[50]	valid_0's auc: 0.757138
[100]	valid_0's auc: 0.768134
[150]	valid_0's auc: 0.77188
[200]	valid_0's auc: 0.773121
[250]	valid_0's auc: 0.773399
[300]	valid_0's auc: 0.773587
[350]	valid_0's auc: 0.774222
[400]	valid_0's auc: 0.774021
[450]	valid_0's auc: 0.774048
[500]	valid_0's auc: 0.773572
[550]	valid_0's auc: 0.773564
Early stopping, best iteration is:
[357]	valid_0's auc: 0.774336


PREDICT ON TEST SET


In [15]:
test_preds = model.predict(test_X, num_iteration=model.best_iteration)

In [22]:
sub_lightgbm = pd.DataFrame({
    "SK_ID_CURR": test_ids,
    "TARGET": test_preds
})

sub_lightgbm.to_csv("submission_lightgbm.csv", index=False)

print("LightGBM submission saved: submission_lightgbm.csv")

LightGBM submission saved: submission_lightgbm.csv


CATBOOST TRAINING

In [16]:
from catboost import CatBoostClassifier

cat_model = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    learning_rate=0.05,
    depth=8,
    iterations=5000,
    silent=True,
    random_seed=42,
    od_type="Iter",
    od_wait=200  # early stopping
)

cat_model.fit(X_train, y_train, eval_set=(X_valid, y_valid))

# Predict validation AUC
cat_pred_valid = cat_model.predict_proba(X_valid)[:, 1]


In [19]:
print("CatBoost Validation AUC: ", roc_auc_score(y_valid, cat_pred_valid))
# 1. Remove ID column from test
 
test_ids = test["SK_ID_CURR"]
 
# 2. Align test columns EXACTLY with training
 
test_X = test_X.reindex(columns=X_train.columns, fill_value=0)
 
# 3. Predict probabilities
 
cat_test_preds = cat_model.predict_proba(test_X)[:, 1]

 
# 4. Build submission file
 
sub_cat = pd.DataFrame({
    "SK_ID_CURR": test_ids,
    "TARGET": cat_test_preds
})

sub_cat.to_csv("submission_catboost.csv", index=False)

print("CatBoost submission saved: submission_catboost.csv")


CatBoost Validation AUC:  0.7780986638740794
CatBoost submission saved: submission_catboost.csv


XGBOOST TRAINING

In [20]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
dtest  = xgb.DMatrix(test_X)

params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "eta": 0.05,
    "max_depth": 8,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "tree_method": "hist",   
}

watchlist = [(dtrain, "train"), (dvalid, "valid")]

xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=5000,
    evals=watchlist,
    early_stopping_rounds=200,
    verbose_eval=50
)

# Validation predictions
xgb_pred_valid = xgb_model.predict(dvalid)


[0]	train-auc:0.73958	valid-auc:0.70612
[50]	train-auc:0.84051	valid-auc:0.76291
[100]	train-auc:0.87712	valid-auc:0.77119
[150]	train-auc:0.89838	valid-auc:0.77434
[200]	train-auc:0.91427	valid-auc:0.77565
[250]	train-auc:0.92862	valid-auc:0.77615
[300]	train-auc:0.94003	valid-auc:0.77605
[350]	train-auc:0.94954	valid-auc:0.77618
[400]	train-auc:0.95814	valid-auc:0.77598
[450]	train-auc:0.96591	valid-auc:0.77551
[459]	train-auc:0.96675	valid-auc:0.77518


In [21]:
xgb_test_preds = xgb_model.predict(dtest)


In [23]:
sub_xgb = pd.DataFrame({
    "SK_ID_CURR": test_ids,
    "TARGET": xgb_test_preds
})

sub_xgb.to_csv("submission_xgboost.csv", index=False)

print("XGBoost submission saved: submission_xgboost.csv")

XGBoost submission saved: submission_xgboost.csv


Dynamic Model Selection + 5-Fold CV with Optuna

In [25]:
import optuna
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np

def objective(trial):

    # Choose which model to test this trial
    model_name = trial.suggest_categorical("model", ["lgbm", "xgb", "cat"])

    # Number of folds
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    oof_preds = []   # store predictions
    oof_true = []    # store targets

    # Loop through folds
    for train_idx, valid_idx in folds.split(X, y):
        X_train_k, X_valid_k = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_k, y_valid_k = y.iloc[train_idx], y.iloc[valid_idx]
        
        # LightGBM
       
        if model_name == "lgbm":
            params = {
                "objective": "binary",
                "metric": "auc",
                "learning_rate": trial.suggest_float("lgb_lr", 0.01, 0.2),
                "num_leaves": trial.suggest_int("lgb_leaves", 31, 256),
                "feature_fraction": trial.suggest_float("lgb_ff", 0.6, 1.0),
                "bagging_fraction": trial.suggest_float("lgb_bf", 0.6, 1.0),
                "bagging_freq": trial.suggest_int("lgb_bf_q", 1, 10),
                "verbose": -1
            }

            train_set = lgb.Dataset(X_train_k, y_train_k)
            valid_set = lgb.Dataset(X_valid_k, y_valid_k)

            model = lgb.train(
                params,
                train_set,
                valid_sets=[valid_set],
                num_boost_round=2000,
                callbacks=[lgb.early_stopping(150)]
            )

            preds = model.predict(X_valid_k)
       
        # XGBoost
        
        elif model_name == "xgb":
            params = {
                "objective": "binary:logistic",
                "eval_metric": "auc",
                "eta": trial.suggest_float("xgb_lr", 0.01, 0.2),
                "max_depth": trial.suggest_int("xgb_depth", 4, 12),
                "subsample": trial.suggest_float("xgb_sub", 0.6, 1.0),
                "colsample_bytree": trial.suggest_float("xgb_col", 0.6, 1.0),
                "tree_method": "hist"
            }

            dtrain = xgb.DMatrix(X_train_k, y_train_k)
            dvalid = xgb.DMatrix(X_valid_k, y_valid_k)

            model = xgb.train(
                params,
                dtrain,
                num_boost_round=2000,
                evals=[(dvalid, "valid")],
                early_stopping_rounds=150,
                verbose_eval=False
            )

            preds = model.predict(dvalid)

        # CatBoost

        else:
            params = {
                "iterations": 2000,
                "learning_rate": trial.suggest_float("cat_lr", 0.01, 0.2),
                "depth": trial.suggest_int("cat_depth", 4, 10),
                "loss_function": "Logloss",
                "eval_metric": "AUC",
                "random_seed": 42,
                "verbose": False
            }

            model = CatBoostClassifier(**params)
            model.fit(X_train_k, y_train_k, eval_set=(X_valid_k, y_valid_k), use_best_model=True)

            preds = model.predict_proba(X_valid_k)[:, 1]

        oof_preds.extend(preds)
        oof_true.extend(y_valid_k)

    # Return mean CV AUC
    cv_auc = roc_auc_score(oof_true, oof_preds)
    return cv_auc


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)


In [None]:
print("Best Model:", study.best_params["model"])
print("Best Params:", study.best_params)
print("Best AUC:", study.best_value)