In [None]:
# ============================
# Tabular Boosting: XGB (native) · LGBM · ExtraTrees
# Early Stopping (XGB/LGBM), Stratified K-Fold CV (N_SPLITS folds), OOF-weighted Blend
# Prints train vs validation AUC per model and ensemble
# ============================

# !pip -q install --upgrade xgboost lightgbm > /dev/null

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.base import clone
from sklearn.ensemble import ExtraTreesClassifier  # try

import xgboost as xgb
from lightgbm import LGBMClassifier
import lightgbm as lgb

#  SETTINGS
RANDOM_STATE = 42           # seed shared by the split, the CV folds and every model
N_SPLITS = 150              # number of stratified CV folds (alternatives noted: 10, 40)
EARLY_ROUNDS = 350          # early-stopping patience (alternatives noted: 200, 185)

DATA_PATH = "train.csv"     # set your path
TARGET_COL = "Exited"       # target

# Load data. Explicit raises are used instead of `assert` so the checks
# still run when Python is started with -O.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"DATA_PATH not found: {DATA_PATH}")
df = pd.read_csv(DATA_PATH)
if TARGET_COL not in df.columns:
    raise KeyError(f"TARGET_COL '{TARGET_COL}' not in: {df.columns.tolist()}")

# Drop obvious IDs (matched case-insensitively by column name)
ID_LIKE_NAMES = {"id", "rowid", "customerid", "customer_id"}
DROP_COLS = [c for c in df.columns if c.lower() in ID_LIKE_NAMES]
if DROP_COLS:
    df = df.drop(columns=DROP_COLS)

y = df[TARGET_COL].values
X = df.drop(columns=[TARGET_COL])

# drop surname
if "Surname" in X.columns:
    X = X.drop(columns=["Surname"])

# Types: object/category columns are treated as categorical, everything else as numeric
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = [c for c in X.columns if c not in cat_cols]
print(f"Rows: {len(df):,} | Features: {X.shape[1]} | Num: {len(num_cols)} | Cat: {len(cat_cols)}")

# Split: stratified 80/20 hold-out
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# LGBM/ET preprocessor: Ordinal on categories, remaining columns passed through.
# Categories unseen at fit time are encoded as -1.
ordinal = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
pre_lgbm = ColumnTransformer(
    transformers=[("cats", ordinal, cat_cols)],
    remainder="passthrough"
)

#  Helpers
def encode_for_xgb(X_fit: pd.DataFrame, X_a: pd.DataFrame, X_b: pd.DataFrame, cat_cols):
    """Ordinal-encode ``cat_cols`` and return ``X_a`` / ``X_b`` as float32 NumPy arrays.

    The encoder is fitted on ``X_fit[cat_cols]`` only; categories not seen
    during fitting are mapped to -1. The input frames are copied, never
    modified. When ``cat_cols`` is empty no encoding takes place and the
    frames are only cast to float32.

    Returns:
        Tuple ``(Xa, Xb)`` of float32 arrays, in the same order as the inputs.
    """
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    has_cats = bool(cat_cols)
    if has_cats:
        encoder.fit(X_fit[cat_cols])

    arrays = []
    for frame in (X_a, X_b):
        work = frame.copy()
        if has_cats:
            work[cat_cols] = encoder.transform(work[cat_cols])
        arrays.append(work.astype(np.float32).to_numpy(copy=False))
    return arrays[0], arrays[1]

def prep_fit_transform(preprocessor: ColumnTransformer, X_tr: pd.DataFrame, X_va: pd.DataFrame):
    """Fit a fresh clone of ``preprocessor`` on ``X_tr`` and encode both frames.

    The template ``preprocessor`` itself is left unfitted. Sparse output is
    densified; whether to densify is decided from the training output alone.

    Returns:
        ``(train_array, valid_array, fitted_preprocessor)`` where both arrays
        are float32.
    """
    fitted = clone(preprocessor)
    train_arr = fitted.fit_transform(X_tr)
    valid_arr = fitted.transform(X_va)
    if hasattr(train_arr, "toarray"):
        train_arr, valid_arr = train_arr.toarray(), valid_arr.toarray()
    return train_arr.astype(np.float32), valid_arr.astype(np.float32), fitted

def xgb_predict_best(booster, dmatrix):
    """Predict with ``booster`` limited to its best iteration, across XGBoost API variants.

    Keyword forms are tried in this order, moving on whenever ``predict``
    raises ``TypeError``:

    1. ``iteration_range=(0, best_iteration + 1)``
    2. ``ntree_limit=best_iteration + 1``
    3. ``ntree_limit=best_ntree_limit`` (only if that attribute is a non-zero integer)

    If none applies or all are rejected, a plain ``booster.predict(dmatrix)``
    is returned.
    """
    integral_types = (int, np.integer)

    def _keyword_attempts():
        # Generated lazily so `best_ntree_limit` is only read once the
        # `best_iteration`-based calls have been rejected.
        best_iter = getattr(booster, "best_iteration", None)
        if isinstance(best_iter, integral_types):
            tree_count = int(best_iter) + 1
            yield {"iteration_range": (0, tree_count)}
            yield {"ntree_limit": tree_count}
        legacy_limit = getattr(booster, "best_ntree_limit", None)
        if isinstance(legacy_limit, integral_types) and legacy_limit:
            yield {"ntree_limit": int(legacy_limit)}

    for kwargs in _keyword_attempts():
        try:
            return booster.predict(dmatrix, **kwargs)
        except TypeError:
            continue
    return booster.predict(dmatrix)

#  CV (XGB native, LGBM numeric arrays, ExtraTrees numeric arrays)
def cv_auc_model_es_xgb_lgbm_et(X, y, n_splits=5, early_rounds=200):
    """Run stratified K-fold CV for XGBoost, LightGBM and ExtraTrees, collecting OOF predictions.

    In every fold the categorical columns are ordinal-encoded with encoders
    fitted on that fold's training part only. XGBoost (native API) and
    LightGBM are trained with early stopping monitored on the fold's
    validation part; ExtraTrees uses a fixed number of trees. Per-fold AUCs
    and the overall out-of-fold (OOF) AUC of each model are printed.

    Note: for XGBoost and LightGBM the validation part that is scored is the
    same data that drives early stopping, so their fold/OOF AUCs are
    optimistic.

    Reads the module-level ``cat_cols``, ``pre_lgbm`` and ``RANDOM_STATE``.

    Args:
        X: feature DataFrame; rows are selected positionally via ``.iloc``.
        y: binary target aligned with ``X`` (any array-like).
        n_splits: number of stratified folds.
        early_rounds: early-stopping patience for XGBoost and LightGBM.

    Returns:
        ``((oof_xgb, iters_xgb), (oof_lgb, iters_lgb), oof_et)`` where each
        ``oof_*`` is a float array of length ``len(y)`` holding positive-class
        scores and each ``iters_*`` lists the best iteration found per fold.
    """
    # Positional indexing below must not turn into label lookup on a Series.
    y = np.asarray(y)
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

    oof_xgb = np.zeros(len(y), dtype=float)
    oof_lgb = np.zeros(len(y), dtype=float)
    oof_et  = np.zeros(len(y), dtype=float)

    iters_xgb, iters_lgb = [], []
    folds_xgb, folds_lgb, folds_et = [], [], []

    # Fold-independent XGBoost parameters; scale_pos_weight is added per fold.
    base_params_xgb = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "eta": 0.02,
        "max_depth": 8, #6
        "subsample": 0.85,
        "colsample_bytree": 0.85,
        "lambda": 1.0,
        "alpha": 0.0,
        "min_child_weight": 6.0,
        "tree_method": "hist",
        "seed": RANDOM_STATE,
    }

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y), 1):
        X_tr, X_va = X.iloc[trn_idx], X.iloc[val_idx]
        y_tr, y_va = y[trn_idx], y[val_idx]

        #  XGBoost (native)
        Xtr_enc, Xva_enc = encode_for_xgb(X_tr, X_tr, X_va, cat_cols)
        dtr, dva = xgb.DMatrix(Xtr_enc, label=y_tr), xgb.DMatrix(Xva_enc, label=y_va)

        params_xgb = dict(base_params_xgb)
        pos = int((y_tr == 1).sum()); neg = int((y_tr == 0).sum())
        if pos > 0:
            # Up-weight positives by the class ratio, never below 1.
            params_xgb["scale_pos_weight"] = max(1.0, neg / pos)

        bst = xgb.train(
            params_xgb, dtr,
            num_boost_round=10000,
            evals=[(dva, "valid")],
            early_stopping_rounds=early_rounds,
            verbose_eval=False
        )
        pred_xgb = xgb_predict_best(bst, dva)
        oof_xgb[val_idx] = pred_xgb
        auc_x = roc_auc_score(y_va, pred_xgb)
        folds_xgb.append(auc_x)
        iters_xgb.append(getattr(bst, "best_iteration", None))

        # ==== LightGBM preprocess ====
        Xtr_lgb, Xva_lgb, _prep_lgb = prep_fit_transform(pre_lgbm, X_tr, X_va)
        # NOTE(review): LightGBM documents that row subsampling (`subsample`)
        # is only active when `subsample_freq` > 0 - confirm whether bagging
        # was intended here.
        lgbm = LGBMClassifier(
            n_estimators=5000, learning_rate=0.02,
            num_leaves=63, min_child_samples=25,
            subsample=0.9, colsample_bytree=0.85,
            reg_alpha=0.0, reg_lambda=0.5,
            objective="binary", random_state=RANDOM_STATE
        )
        try:
            lgbm.fit(
                Xtr_lgb, y_tr,
                eval_set=[(Xva_lgb, y_va)],
                callbacks=[lgb.early_stopping(early_rounds, verbose=False)],
            )
        except TypeError:
            # Fallback for LightGBM versions that take the patience as a fit() keyword.
            lgbm.fit(
                Xtr_lgb, y_tr,
                eval_set=[(Xva_lgb, y_va)],
                early_stopping_rounds=early_rounds,
            )
        pred_lgb = lgbm.predict_proba(Xva_lgb)[:, 1]
        oof_lgb[val_idx] = pred_lgb
        auc_l = roc_auc_score(y_va, pred_lgb)
        folds_lgb.append(auc_l)
        iters_lgb.append(getattr(lgbm, "best_iteration_", None))

        #  ExtraTrees: same ordinal preprocessing as LightGBM, so the arrays
        #  encoded above are reused instead of fitting an identical encoder again.
        Xtr_et, Xva_et = Xtr_lgb, Xva_lgb
        et = ExtraTreesClassifier(
            n_estimators=500,
            max_depth=None,              # or 8–16 if overfitting
            min_samples_leaf=2,
            max_features="sqrt",
            bootstrap=False,
            random_state=RANDOM_STATE,
            n_jobs=-1,
            class_weight="balanced" if y_tr.mean() not in (0,1) else None
        )
        et.fit(Xtr_et, y_tr)
        pred_et = et.predict_proba(Xva_et)[:, 1]
        oof_et[val_idx] = pred_et
        auc_e = roc_auc_score(y_va, pred_et)
        folds_et.append(auc_e)

        print(f"Fold {fold}:  XGB={auc_x:.5f}  LGBM={auc_l:.5f}  ET={auc_e:.5f}")

    print(f"\nXGB  OOF AUC: {roc_auc_score(y, oof_xgb):.5f} | Folds: {[round(a,5) for a in folds_xgb]}")
    print(f"LGBM OOF AUC: {roc_auc_score(y, oof_lgb):.5f} | Folds: {[round(a,5) for a in folds_lgb]}")
    print(f"ET   OOF AUC: {roc_auc_score(y, oof_et):.5f} | Folds: {[round(a,5) for a in folds_et]}")
    return (oof_xgb, iters_xgb), (oof_lgb, iters_lgb), oof_et

# ---- Run CV ----
print("\nTraining (ES, CV)...")
(xgb_oof, xgb_iters), (lgbm_oof, lgbm_iters), et_oof = cv_auc_model_es_xgb_lgbm_et(
    X_train, y_train, n_splits=N_SPLITS, early_rounds=EARLY_ROUNDS
)

# ---- Optimize blend weights on OOF (wx, wl, we >=0, sum=1, step=0.01) ----
# The simplex is enumerated in integer hundredths. Deriving the inner grid size
# from floats (int((1 - wx) * 100) + 1) truncates whenever the product lands just
# below an integer (e.g. 0.71 * 100 == 70.99999999999999), which skews the step
# for that row and can leave the third weight marginally negative.
BLEND_STEPS = 100
best_auc, best_w = -1.0, (1/3, 1/3, 1/3)
for ix in range(BLEND_STEPS + 1):
    wx = ix / BLEND_STEPS
    for il in range(BLEND_STEPS - ix + 1):
        wl = il / BLEND_STEPS
        we = (BLEND_STEPS - ix - il) / BLEND_STEPS
        blend = wx*xgb_oof + wl*lgbm_oof + we*et_oof
        auc = roc_auc_score(y_train, blend)
        # Strict '>' keeps the first weight triple that reaches the best AUC.
        if auc > best_auc:
            best_auc, best_w = auc, (wx, wl, we)
print(f"\nBest OOF blend AUC: {best_auc:.5f} | Weights = {tuple(round(w,3) for w in best_w)}")

# ------------ Final fit (Train vs Valid) ------------
# Each model is fitted on X_train and scored on both X_train and the hold-out
# X_valid. XGBoost and LightGBM also use X_valid for early stopping, so their
# validation AUCs below are measured on the data that chose the stopping point
# and are therefore optimistic.
# The fitted objects (bst, lgbm_final, et_final, prep_final, prep_et_final) and
# best_w are read again by the submission cell.

# XGB final (native)
Xtr_enc, Xva_enc = encode_for_xgb(X_train, X_train, X_valid, cat_cols)
dtr, dva = xgb.DMatrix(Xtr_enc, label=y_train), xgb.DMatrix(Xva_enc, label=y_valid)
# NOTE(review): these parameters differ from the ones used inside the CV
# function (eta, max_depth, lambda, min_child_weight), while the blend weights
# were tuned on the CV models - confirm this is intentional.
params_final = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "eta": 0.01,
    "max_depth": 7,
    "subsample": 0.85,
    "colsample_bytree": 0.85,
    "lambda": 3.0,
    "alpha": 0.0,
    "min_child_weight": 2.0,
    "tree_method": "hist",
    "seed": RANDOM_STATE,
}
pos = int((y_train == 1).sum()); neg = int((y_train == 0).sum())
if pos > 0:
    # Up-weight positives by the class ratio, never below 1.
    params_final["scale_pos_weight"] = max(1.0, neg / pos)

bst = xgb.train(
    params_final, dtr,
    num_boost_round=4000,
    evals=[(dva, "valid")],
    early_stopping_rounds=EARLY_ROUNDS,
    verbose_eval=False
)
p_xgb_valid = xgb_predict_best(bst, dva)
p_xgb_train = xgb_predict_best(bst, dtr)
auc_xgb_train = roc_auc_score(y_train, p_xgb_train)
auc_xgb_valid = roc_auc_score(y_valid, p_xgb_valid)

# LGBM final (preprocess)
Xtr_lgb, Xva_lgb, prep_final = prep_fit_transform(pre_lgbm, X_train, X_valid)
lgbm_final = LGBMClassifier(
    n_estimators=5000, learning_rate=0.02,
    num_leaves=63, min_child_samples=25,
    subsample=0.9, colsample_bytree=0.85,
    reg_alpha=0.0, reg_lambda=0.5,
    objective="binary", random_state=RANDOM_STATE
)
try:
    lgbm_final.fit(
        Xtr_lgb, y_train,
        eval_set=[(Xva_lgb, y_valid)],
        callbacks=[lgb.early_stopping(EARLY_ROUNDS, verbose=False)],
    )
except TypeError:
    # Fallback for LightGBM versions that take the patience as a fit() keyword.
    lgbm_final.fit(
        Xtr_lgb, y_train,
        eval_set=[(Xva_lgb, y_valid)],
        early_stopping_rounds=EARLY_ROUNDS,
    )
p_lgbm_valid = lgbm_final.predict_proba(Xva_lgb)[:, 1]
p_lgbm_train = lgbm_final.predict_proba(Xtr_lgb)[:, 1]
auc_lgbm_train = roc_auc_score(y_train, p_lgbm_train)
auc_lgbm_valid = roc_auc_score(y_valid, p_lgbm_valid)

# ExtraTrees final: it uses the same ordinal preprocessor and the same data as
# LightGBM, so the already-fitted preprocessor and encoded arrays are reused
# instead of fitting an identical encoder a second time.
Xtr_et, Xva_et, prep_et_final = Xtr_lgb, Xva_lgb, prep_final
et_final = ExtraTreesClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_leaf=2,
    max_features="sqrt",
    bootstrap=False,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    class_weight="balanced" if y_train.mean() not in (0,1) else None
)
et_final.fit(Xtr_et, y_train)
p_et_valid = et_final.predict_proba(Xva_et)[:, 1]
p_et_train = et_final.predict_proba(Xtr_et)[:, 1]
auc_et_train = roc_auc_score(y_train, p_et_train)
auc_et_valid = roc_auc_score(y_valid, p_et_valid)

# Blends 3 models: equal weights, and the weights tuned on the OOF predictions
w_xgb, w_lgbm, w_et = best_w
p_blend_eq_train = (p_xgb_train + p_lgbm_train + p_et_train) / 3.0
p_blend_eq_valid = (p_xgb_valid + p_lgbm_valid + p_et_valid) / 3.0
p_blend_opt_train = w_xgb*p_xgb_train + w_lgbm*p_lgbm_train + w_et*p_et_train
p_blend_opt_valid = w_xgb*p_xgb_valid + w_lgbm*p_lgbm_valid + w_et*p_et_valid

auc_blend_eq_train = roc_auc_score(y_train, p_blend_eq_train)
auc_blend_eq_valid = roc_auc_score(y_valid, p_blend_eq_valid)
auc_blend_opt_train = roc_auc_score(y_train, p_blend_opt_train)
auc_blend_opt_valid = roc_auc_score(y_valid, p_blend_opt_valid)

print("\n=== AUC (Train vs Valid) ===")
print(f"XGBoost     : {auc_xgb_train:.6f}  |  {auc_xgb_valid:.6f}")
print(f"LightGBM    : {auc_lgbm_train:.6f}  |  {auc_lgbm_valid:.6f}")
print(f"ExtraTrees  : {auc_et_train:.6f}  |  {auc_et_valid:.6f}")
print(f"Blend=1/3   : {auc_blend_eq_train:.6f}  |  {auc_blend_eq_valid:.6f}")
print(f"Blend*opt   : {auc_blend_opt_train:.6f}  |  {auc_blend_opt_valid:.6f}")

print("\nBest iters (CV):")
print("  XGB :", xgb_iters)
print("  LGBM:", lgbm_iters)


Rows: 15,000 | Features: 10 | Num: 8 | Cat: 2

Training (ES, CV)...
[LightGBM] [Info] Number of positive: 2426, number of negative: 9494
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000342 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 841
[LightGBM] [Info] Number of data points in the train set: 11920, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.203523 -> initscore=-1.364416
[LightGBM] [Info] Start training from score -1.364416
Fold 1:  XGB=0.80078  LGBM=0.77783  ET=0.78906
[LightGBM] [Info] Number of positive: 2426, number of negative: 9494
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000369 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 842
[L

In [None]:
# ============================
# SUBMISSION
# ============================
# Scores test.csv with the models fitted in the training cell and writes a
# submission CSV for the prediction selected by SUB_CHOICE. Relies on names
# created by the training cell: X, X_train, cat_cols, pre_lgbm, and whichever
# of bst / lgbm_final / et_final / prep_final / prep_et_final / best_w exist.

SUB_CHOICE = "xgb"          # <- pick: "xgb" | "lgbm" | "et" | "blend_eq" | "blend_opt"
TEST_PATH = "test.csv"
SAMPLE_SUB_PATH = "sample_submission.csv"
SUB_PATH = f"submission_{SUB_CHOICE}.csv"

import os
import numpy as np
import pandas as pd

# Explicit raise instead of `assert` so the check survives `python -O`.
if not os.path.exists(TEST_PATH):
    raise FileNotFoundError(f"TEST_PATH not found: {TEST_PATH}")
test_df = pd.read_csv(TEST_PATH)

# --- choose ID column if present ---
id_candidates = [c for c in test_df.columns if c.lower() in
                 {"id","rowid","customerid","customer_id","passengerid","rownumber"}]
id_col = id_candidates[0] if id_candidates else None
ids = test_df[id_col] if id_col else pd.Series(range(len(test_df)), name="Id")

# --- build X_test exactly like X used in training ---
# Selecting the training feature columns (in training order) discards IDs,
# Surname and any other extra column in one step, and reports missing
# features with a clear message instead of an opaque indexing error.
missing_features = [c for c in X.columns if c not in test_df.columns]
if missing_features:
    raise KeyError(f"Test data is missing training features: {missing_features}")
X_test = test_df[X.columns].copy()


def _to_float32(encoded):
    """Densify sparse output and cast to float32, matching what prep_fit_transform fed the models."""
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()
    return np.asarray(encoded, dtype=np.float32)


preds_dict = {}

#  XGBoost (native): encoder is re-fitted on X_train, as in the final fit
if 'bst' in globals():
    _, Xte_enc = encode_for_xgb(X_train, X_train, X_test, cat_cols)
    dte = xgb.DMatrix(Xte_enc)
    preds_dict['xgb'] = xgb_predict_best(bst, dte)

#  LightGBM
if 'lgbm_final' in globals():
    if 'prep_final' not in globals():
        # fallback: fit encoder on X_train now
        _, _, prep_final = prep_fit_transform(pre_lgbm, X_train, X_test)
    # Cast to float32: the model was trained on float32 arrays.
    Xte_lgb = _to_float32(prep_final.transform(X_test))
    preds_dict['lgbm'] = lgbm_final.predict_proba(Xte_lgb)[:, 1]

#  ExtraTrees
if 'et_final' in globals():
    if 'prep_et_final' in globals():
        et_prep = prep_et_final          # separate preprocessor saved for ET
    elif 'prep_final' in globals():
        et_prep = prep_final             # same ordinal preprocessor as for LGBM
    else:
        _, _, prep_et_final = prep_fit_transform(pre_lgbm, X_train, X_test)
        et_prep = prep_et_final
    Xte_et = _to_float32(et_prep.transform(X_test))
    preds_dict['et'] = et_final.predict_proba(Xte_et)[:, 1]

#  Blends
keys = set(preds_dict.keys())
# Weights tuned in the training cell, ordered (xgb, lgbm, et); None if that cell was not run.
tuned_w = globals().get("best_w")

if {"xgb", "lgbm", "et"} <= keys:
    # 3-model blend
    preds_dict['blend_eq'] = (preds_dict['xgb'] + preds_dict['lgbm'] + preds_dict['et']) / 3.0
    if tuned_w is not None and len(tuned_w) == 3:
        w_xgb, w_lgbm, w_et = tuned_w
    else:
        w_xgb, w_lgbm, w_et = (1/3, 1/3, 1/3)
    preds_dict['blend_opt'] = w_xgb*preds_dict['xgb'] + w_lgbm*preds_dict['lgbm'] + w_et*preds_dict['et']

elif {"xgb", "lgbm"} <= keys:
    # 2-model blend (fallback, if ET is missing).
    preds_dict['blend_eq'] = (preds_dict['xgb'] + preds_dict['lgbm']) / 2.0
    # Keep the tuned XGB/LGBM proportions by renormalising the first two
    # weights; previously a 3-weight best_w silently degraded to 0.5/0.5.
    w_xgb, w_lgbm = (0.5, 0.5)
    if tuned_w is not None and len(tuned_w) >= 2:
        pair_total = tuned_w[0] + tuned_w[1]
        if pair_total > 0:
            w_xgb, w_lgbm = tuned_w[0] / pair_total, tuned_w[1] / pair_total
    preds_dict['blend_opt'] = w_xgb*preds_dict['xgb'] + w_lgbm*preds_dict['lgbm']

if SUB_CHOICE not in preds_dict:
    raise ValueError(f"SUB_CHOICE='{SUB_CHOICE}' not available. Have: {list(preds_dict)}")
preds = preds_dict[SUB_CHOICE]

#  build submission
if os.path.exists(SAMPLE_SUB_PATH):
    sub = pd.read_csv(SAMPLE_SUB_PATH)
    if id_col and id_col in sub.columns:
        pred_cols = [c for c in sub.columns if c != id_col]
        pred_col = pred_cols[0] if pred_cols else sub.columns[-1]
        sub[id_col] = ids.values
        sub[pred_col] = preds
    else:
        # fallback: first column id, last column target.
        # Columns are replaced by name rather than written through .iloc, so
        # float predictions are not forced into an integer placeholder column.
        sub[sub.columns[0]] = ids.values
        sub[sub.columns[-1]] = preds
else:
    sub = pd.DataFrame({ids.name if ids.name else "Id": ids.values, "prediction": preds})

sub.to_csv(SUB_PATH, index=False)
print(f"Saved: {SUB_PATH}")
print("Preview:")
print(sub.head())


Saved: submission_xgb.csv
Preview:
      id    Exited
0  15000  0.393180
1  15001  0.198907
2  15002  0.284997
3  15003  0.710505
4  15004  0.239940
