In [1]:
# =============================================================================
# HyperBoost V28.2: Cleaned & Hardened Ultra Ensemble
# - Single per-fold imputation
# - Compatible XGBoost early stopping
# - Warnings instead of hard assert for no categoricals
# - Fixed month mapping to avoid Categorical fillna error
# =============================================================================

import os, warnings
import numpy as np
import pandas as pd
from dataclasses import dataclass
from scipy.optimize import minimize
from typing import Optional
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.isotonic import IsotonicRegression
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

warnings.filterwarnings("ignore")

@dataclass
class Config:
    DATA_PATH: str = "/kaggle/input/playground-series-s5e8"
    EXTERNAL_DATA_PATH: str = "/kaggle/input/bankdatset/bank1.csv"
    OUTPUT_NAME: str = "submission_v28_2.csv"
    SEED: int = 2025
    N_SPLITS: int = 10
    N_ESTIMATORS: int = 10000
    EARLY_STOP: int = 500
    USE_GPU: bool = True
    SEEDS: tuple = (0, 42, 123, 456, 789)
    USE_EXTERNAL: bool = True
    USE_LGB: bool = True
    USE_XGB: bool = True
    USE_CAT: bool = True
    USE_META_LR: bool = True
    BLEND_OPTIMIZE: bool = True
    USE_CALIBRATION: bool = True

CFG = Config()

def seed_all(seed=CFG.SEED):
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
seed_all()

def read_data():
    train = pd.read_csv(f"{CFG.DATA_PATH}/train.csv")
    test  = pd.read_csv(f"{CFG.DATA_PATH}/test.csv")
    if 'duration' not in train or 'duration' not in test:
        raise ValueError("Both train and test must contain 'duration'")
    return train, test

def enhanced_features(df):
    out = df.copy()
    month_map = dict(zip(
        ['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec'],
        range(1,13)
    ))
    if 'month' in out:
        # Fix: convert to str before mapping to avoid Categorical fillna error
        month_str = out['month'].astype(str).str.lower()
        out['month_num'] = month_str.map(month_map).fillna(0).astype(int)
        out.drop(columns=['month'], inplace=True)
        out['month_sin'] = np.sin(2*np.pi*out['month_num']/12)
        out['month_cos'] = np.cos(2*np.pi*out['month_num']/12)
    if 'balance' in out and 'age' in out:
        out['balance_per_age'] = out['balance'] / (out['age'].replace(0, np.nan))
    if 'age' in out and 'campaign' in out:
        out['age_x_campaign'] = out['age'] * out['campaign']
    for c in ['age','campaign','balance','duration']:
        if c in out:
            out[f'{c}_sq'] = out[c]**2
            out[f'{c}_sqrt'] = np.sqrt(np.abs(out[c]))
    return out

def add_external_data_features(df, external_df):
    if external_df is None:
        return df
    df_ext = df.copy()
    def norm_str(s):
        s = str(s).lower().strip()
        return ' '.join(s.replace('.', '').replace('-', ' ').split())
    ext = external_df.copy()
    for col in ext.select_dtypes(include='object'):
        ext[col] = ext[col].apply(norm_str)
    for col in df_ext.select_dtypes(include='object'):
        df_ext[col] = df_ext[col].apply(norm_str).astype('category')
    common = [c for c in df_ext.columns if c in ext.columns and df_ext[c].dtype.name=='category']
    for c in common:
        vc = ext[c].value_counts()
        freq = (vc / vc.sum()).astype(float)
        prior = 1.0/max(1, vc.size)
        smooth = ((vc + 20*prior)/(vc.sum()+20)).astype(float)
        cnt_vals = df_ext[c].astype(str).map(vc).fillna(0).astype(float)
        hi = np.percentile(cnt_vals, 99)
        df_ext[f'{c}_ext_cnt'] = np.clip(cnt_vals, 0, hi)
        df_ext[f'{c}_ext_freq'] = df_ext[c].astype(str).map(freq).fillna(0.0).astype(float)
        df_ext[f'{c}_ext_smooth'] = df_ext[c].astype(str).map(smooth).fillna(prior).astype(float)
    return df_ext

def bayesian_target_encoding(X_tr, X_va, X_te, y_tr, cat_cols, alpha=10):
    prior = y_tr.mean()
    te_cols = []
    for c in cat_cols:
        if c not in X_tr: continue
        stats = y_tr.groupby(X_tr[c]).agg(['mean','count'])
        smooth = (stats['mean']*stats['count'] + prior*alpha) / (stats['count'] + alpha)
        for df in (X_tr,X_va,X_te):
            if c in df:
                vals = df[c].map(smooth).astype(float).fillna(prior)
                df[f'{c}_te'] = vals.values
                te_cols.append(f'{c}_te')
        for df in (X_tr,X_va,X_te):
            df.drop(columns=[c], inplace=True, errors='ignore')
    return X_tr, X_va, X_te, te_cols

def add_freq_encoding(X_tr, X_va, X_te, cat_cols):
    fe_cols = []
    for c in cat_cols:
        if c not in X_tr: continue
        freq = (X_tr[c].value_counts()/len(X_tr)).astype(float)
        for df in (X_tr,X_va,X_te):
            vals = df[c].map(freq).astype(float).fillna(0.0)
            df[f'{c}_freq'] = vals.values
            fe_cols.append(f'{c}_freq')
    return X_tr, X_va, X_te, fe_cols

def impute_only(X_tr, X_va, X_te):
    cols = X_tr.select_dtypes(include='number').columns
    imp = SimpleImputer(strategy='median')
    X_tr[cols] = imp.fit_transform(X_tr[cols])
    X_va[cols] = imp.transform(X_va[cols])
    X_te[cols] = imp.transform(X_te[cols])
    return X_tr, X_va, X_te

def get_models():
    models = []
    for offset in CFG.SEEDS:
        seed = CFG.SEED + offset
        if CFG.USE_LGB:
            models.append((f'lgb_{seed}', lgb.LGBMClassifier(
                objective='binary', metric='auc',
                n_estimators=CFG.N_ESTIMATORS, learning_rate=0.015,
                num_leaves=127, subsample=0.85, colsample_bytree=0.85,
                device='gpu' if CFG.USE_GPU else 'cpu',
                random_state=seed, verbosity=-1
            )))
        if CFG.USE_XGB:
            models.append((f'xgb_{seed}', xgb.XGBClassifier(
                objective='binary:logistic', eval_metric='auc',
                n_estimators=CFG.N_ESTIMATORS, learning_rate=0.015,
                max_depth=8, subsample=0.85, colsample_bytree=0.85,
                tree_method='gpu_hist' if CFG.USE_GPU else 'hist',
                random_state=seed
            )))
        if CFG.USE_CAT:
            models.append((f'cat_{seed}', CatBoostClassifier(
                iterations=CFG.N_ESTIMATORS, learning_rate=0.015,
                depth=8, l2_leaf_reg=5, eval_metric='AUC',
                task_type='GPU' if CFG.USE_GPU else 'CPU',
                random_seed=seed, od_type='Iter', od_wait=CFG.EARLY_STOP,
                verbose=False
            )))
    return models

def train_base_models(X, y, T, cat_cols):
    models = get_models()
    oof = np.zeros((len(X), len(models)))
    tst = np.zeros((len(T), len(models)))
    aucs = []
    folds = StratifiedKFold(n_splits=CFG.N_SPLITS, shuffle=True, random_state=CFG.SEED)
    for i,(name,model) in enumerate(models):
        print(f"Training {name}")
        oof_pred = np.zeros(len(X)); tst_pred = np.zeros(len(T))
        for tr,va in folds.split(X,y):
            X_tr, X_va = X.iloc[tr].copy(), X.iloc[va].copy()
            y_tr, y_va = y.iloc[tr], y.iloc[va]
            X_te = T.copy()
            X_tr, X_va, X_te, fe = add_freq_encoding(X_tr,X_va,X_te,cat_cols)
            X_tr, X_va, X_te, te = bayesian_target_encoding(X_tr,X_va,X_te,y_tr,cat_cols)
            if not te:
                warnings.warn("No TE columns created")
            X_tr, X_va, X_te = impute_only(X_tr,X_va,X_te)
            X_va = X_va.reindex(columns=X_tr.columns, fill_value=0)
            X_te = X_te.reindex(columns=X_tr.columns, fill_value=0)
            if name.startswith('lgb'):
                model.fit(X_tr,y_tr,eval_set=[(X_va,y_va)],
                          callbacks=[lgb.early_stopping(CFG.EARLY_STOP,verbose=False)])
            elif name.startswith('xgb'):
                model.fit(X_tr,y_tr,eval_set=[(X_va,y_va)],
                          early_stopping_rounds=CFG.EARLY_STOP, verbose=False)
            else:
                model.fit(X_tr,y_tr,eval_set=[(X_va,y_va)],use_best_model=True)
            oof_pred[va] = model.predict_proba(X_va)[:,1]
            tst_pred += model.predict_proba(X_te)[:,1] / CFG.N_SPLITS
        auc = roc_auc_score(y,oof_pred)
        print(f"{name} OOF AUC: {auc:.5f}")
        oof[:,i] = oof_pred
        tst[:,i] = tst_pred
        aucs.append(auc)
    return oof,tst,aucs

def blend_and_stack(oof,tst,y,aucs):
    comps,names = [],[]
    w = np.array(aucs) / sum(aucs)
    comps.append((oof @ w, tst @ w)); names.append('base_w')
    if CFG.USE_META_LR:
        meta_oof = np.zeros(len(y))
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=CFG.SEED+1)
        for tr,va in folds.split(oof,y):
            m = LogisticRegressionCV(Cs=10, cv=3, scoring='roc_auc', max_iter=2000, n_jobs=-1, random_state=CFG.SEED+1)
            m.fit(oof[tr],y.iloc[tr])
            meta_oof[va] = m.predict_proba(oof[va])[:,1]
        m.fit(oof,y)
        meta_tst = m.predict_proba(tst)[:,1]
        comps.append((meta_oof,meta_tst)); names.append('meta_lr')
        print("Meta LR AUC:", roc_auc_score(y,meta_oof))
    coof = np.column_stack([c[0] for c in comps])
    cst = np.column_stack([c[1] for c in comps])
    def obj(w):
        w = np.clip(w,0,1); w /= w.sum()
        return -roc_auc_score(y, coof @ w)
    bounds = [(0.01,0.8)] * coof.shape[1]
    cons = [{'type':'eq', 'fun':lambda w: w.sum()-1}]
    res = minimize(obj, np.ones(coof.shape[1])/coof.shape[1], method='SLSQP', bounds=bounds, constraints=cons)
    w_opt = res.x / res.x.sum()
    print("Weights:", dict(zip(names, [f"{v:.4f}" for v in w_opt])))
    final_oof = coof @ w_opt
    final_tst = cst @ w_opt
    pre = roc_auc_score(y, final_oof)
    print("Pre-cal AUC:", pre)
    if CFG.USE_CALIBRATION:
        iso = IsotonicRegression(out_of_bounds='clip')
        iso.fit(final_oof, y)
        cal_oof = iso.transform(final_oof)
        cal_tst = iso.transform(final_tst)
        post = roc_auc_score(y, cal_oof)
        if post >= pre:
            final_oof = cal_oof
            final_tst = cal_tst
            print("Cal AUC:", post)
        else:
            print("Cal skipped")
        final_auc = post if post >= pre else pre
    else:
        final_auc = pre
    print("Final AUC:", final_auc)
    return final_tst, final_auc

if __name__ == "__main__":
    train, test = read_data()
    y = train['y'].astype(int)
    X = train.drop(columns=['id','y']).copy()
    T = test.drop(columns=['id']).copy()
    for c in X.select_dtypes(include='object'):
        X[c] = X[c].str.lower().fillna('unknown').astype('category')
        T[c] = T[c].str.lower().fillna('unknown').astype('category')
    X = enhanced_features(X)
    T = enhanced_features(T)
    if CFG.USE_EXTERNAL:
        try:
            ext = pd.read_csv(CFG.EXTERNAL_DATA_PATH)
            X = add_external_data_features(X, ext)
            T = add_external_data_features(T, ext)
        except:
            print("Skip external")
    T = T.reindex(columns=X.columns, fill_value=0)
    assert list(X.columns) == list(T.columns), "Columns mismatch"
    cat_cols = [c for c in X.columns if X[c].dtype.name in ('category','object')]
    if not cat_cols:
        warnings.warn("No categoricals found")
    oof, tst, aucs = train_base_models(X, y, T, cat_cols)
    preds, auc = blend_and_stack(oof, tst, y, aucs)
    sub = pd.DataFrame({'id': test['id'].values, 'y': preds})
    sub.to_csv(CFG.OUTPUT_NAME, index=False)
    print("Saved", CFG.OUTPUT_NAME, "Final OOF AUC:", auc)


Training lgb_2025




lgb_2025 OOF AUC: 0.96913
Training xgb_2025
xgb_2025 OOF AUC: 0.96870
Training cat_2025


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


cat_2025 OOF AUC: 0.96649
Training lgb_2067
lgb_2067 OOF AUC: 0.96915
Training xgb_2067
xgb_2067 OOF AUC: 0.96867
Training cat_2067


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


cat_2067 OOF AUC: 0.96650
Training lgb_2148
lgb_2148 OOF AUC: 0.96908
Training xgb_2148
xgb_2148 OOF AUC: 0.96867
Training cat_2148


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


cat_2148 OOF AUC: 0.96651
Training lgb_2481
lgb_2481 OOF AUC: 0.96912
Training xgb_2481
xgb_2481 OOF AUC: 0.96866
Training cat_2481


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


cat_2481 OOF AUC: 0.96657
Training lgb_2814
lgb_2814 OOF AUC: 0.96913
Training xgb_2814
xgb_2814 OOF AUC: 0.96866
Training cat_2814


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


cat_2814 OOF AUC: 0.96650
Meta LR AUC: 0.9695506595992953
Weights: {'base_w': '0.5000', 'meta_lr': '0.5000'}
Pre-cal AUC: 0.96935987653169
Cal AUC: 0.9694171833696903
Final AUC: 0.9694171833696903
Saved submission_v28_2.csv Final OOF AUC: 0.9694171833696903
