# 02 • Baselines and Stacking (reproducible from cache)

In [1]:
from pathlib import Path
import os, numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from datetime import datetime

PROJ = Path.cwd()
DATA = None
for p in [PROJ/'data', PROJ.parent/'data', PROJ.parent.parent/'data']:
    if (p/'train.csv').exists() and (p/'test.csv').exists():
        DATA = p; break
assert DATA is not None, 'data/train.csv or test.csv not found'

OUT = PROJ/'notebooks'/'outputs'
CACHE = OUT/'cache'
SUB = OUT/'submissions'
for d in [OUT, CACHE, SUB]: d.mkdir(parents=True, exist_ok=True)

BASE_NUM = ['age','balance','day','duration','campaign','pdays','previous']
BASE_CAT = ['job','marital','education','default','housing','loan','contact','month','poutcome']

train = pd.read_csv(DATA/'train.csv')
test  = pd.read_csv(DATA/'test.csv')
y = train['y'].values

EmptyDataError: No columns to parse from file

## Feature engineering used in final blend

In [None]:
import numpy as np
def month_to_num(m):
    d={'jan':1,'feb':2,'mar':3,'apr':4,'may':5,'jun':6,'jul':7,'aug':8,'sep':9,'oct':10,'nov':11,'dec':12}
    return d.get(m,0)

def make_curated_features(df):
    out = df.copy()
    out['duration_clip_99'] = np.minimum(out['duration'], out['duration'].quantile(0.99))
    out['duration_log1p'] = np.log1p(out['duration_clip_99'])
    out['duration_per_call'] = out['duration_clip_99'] / (out['campaign'] + 1.0)
    out['pdays_was_contacted'] = (out['pdays'] != -1).astype(int)
    out['pdays_pos_log'] = np.log1p(out['pdays'].where(out['pdays'] != -1, np.nan)).fillna(0.0)
    out['previous_gt0'] = (out['previous'] > 0).astype(int)
    out['month_num'] = out['month'].map(month_to_num).astype(int)
    out['month_sin'] = np.sin(2*np.pi*out['month_num']/12.0)
    out['month_cos'] = np.cos(2*np.pi*out['month_num']/12.0)
    out['day_sin'] = np.sin(2*np.pi*out['day']/31.0)
    out['day_cos'] = np.cos(2*np.pi*out['day']/31.0)
    out['contact_cellular'] = (out['contact'] == 'cellular').astype(int)
    out['dur_x_cell'] = out['duration_clip_99'] * out['contact_cellular']
    return out

train_f = make_curated_features(train.drop(columns=['id']))
test_f  = make_curated_features(test.drop(columns=['id']))
X_feat = train_f.drop(columns=['y']).copy()
X_test_feat = test_f.copy()
for c in BASE_CAT:
    if c in X_feat.columns: X_feat[c] = X_feat[c].astype('category')
    if c in X_test_feat.columns: X_test_feat[c] = X_test_feat[c].astype('category')

## CV helpers and cache IO

In [None]:
def save_cache(name, oof, pred, cache_dir=None):
    cache_dir = (cache_dir or (Path.cwd()/'notebooks'/'outputs'/'cache'))
    cache_dir.mkdir(parents=True, exist_ok=True)
    np.save(cache_dir/f"{name}_oof.npy", oof)
    np.save(cache_dir/f"{name}_test.npy", pred)

def load_cache(name, cache_dir=None):
    cache_dir = (cache_dir or (Path.cwd()/'notebooks'/'outputs'/'cache'))
    o = np.load(cache_dir/f"{name}_oof.npy")
    t = np.load(cache_dir/f"{name}_test.npy")
    return o, t

def has_cache(name, cache_dir=None):
    cache_dir = (cache_dir or (Path.cwd()/'notebooks'/'outputs'/'cache'))
    return (cache_dir/f"{name}_oof.npy").exists() and (cache_dir/f"{name}_test.npy").exists()

## LightGBM (3 seeds + pos_weight)

In [None]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

def cv_lgbm(X_tr_all, y_all, X_te_all, params, n_splits=5, seed=42, num_boost_round=3500, es_rounds=250, cats=BASE_CAT):
    oof = np.zeros(len(X_tr_all)); pred = np.zeros(len(X_te_all))
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    use_es = params.get('boosting_type','gbdt')!='dart'
    for tr, va in skf.split(X_tr_all, y_all):
        X_tr, X_va = X_tr_all.iloc[tr], X_tr_all.iloc[va]
        y_tr, y_va = y_all[tr], y_all[va]
        dtr = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cats, free_raw_data=False)
        dva = lgb.Dataset(X_va, label=y_va, categorical_feature=cats, free_raw_data=False)
        cbs=[lgb.log_evaluation(period=200)]
        if use_es: cbs.append(lgb.early_stopping(stopping_rounds=es_rounds))
        m = lgb.train(params, dtr, valid_sets=[dtr,dva], valid_names=['train','valid'],
                      num_boost_round=num_boost_round, callbacks=cbs)
        best_iter = m.best_iteration if use_es else num_boost_round
        oof[va] = m.predict(X_va, num_iteration=best_iter)
        pred += m.predict(X_te_all, num_iteration=best_iter)/n_splits
    return oof, pred

pos_weight = (y==0).sum()/(y==1).sum()
def make_params(seed, spw=None):
    p = {
        'objective':'binary','metric':'auc','boosting_type':'gbdt',
        'learning_rate':0.03,'num_leaves':127,'min_data_in_leaf':96,
        'feature_fraction':0.85,'bagging_fraction':0.85,'bagging_freq':1,
        'min_sum_hessian_in_leaf':5.0,'lambda_l2':10.0,'max_bin':511,
        'seed':seed,'n_jobs':-1,'verbosity':-1,'force_row_wise':True
    }
    if spw is not None: p['scale_pos_weight'] = spw
    return p

if not has_cache('lgbF_s42'):
    o1,t1 = cv_lgbm(X_feat, y, X_test_feat, make_params(42), num_boost_round=3200, es_rounds=250)
    save_cache('lgbF_s42', o1, t1)
if not has_cache('lgbF_s7'):
    o2,t2 = cv_lgbm(X_feat, y, X_test_feat, make_params(7), num_boost_round=3200, es_rounds=250)
    save_cache('lgbF_s7', o2, t2)
if not has_cache('lgbF_spw'):
    o3,t3 = cv_lgbm(X_feat, y, X_test_feat, make_params(2025, pos_weight), num_boost_round=3200, es_rounds=250)
    save_cache('lgbF_spw', o3, t3)

## CatBoost (CPU)

In [None]:
from catboost import CatBoostClassifier, Pool

def cv_cat_cpu_feat(Xdf, yarr, Xte, cat_cols, n_splits=5, seed=42):
    oof = np.zeros(len(Xdf)); pred = np.zeros(len(Xte))
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for fold,(tr,va) in enumerate(skf.split(Xdf,yarr),1):
        X_tr, X_va = Xdf.iloc[tr], Xdf.iloc[va]
        y_tr, y_va = yarr[tr], yarr[va]
        trp = Pool(X_tr, y_tr, cat_features=cat_cols)
        vap = Pool(X_va, y_va, cat_features=cat_cols)
        tep = Pool(Xte,           cat_features=cat_cols)
        m = CatBoostClassifier(iterations=2500, depth=6, learning_rate=0.05, l2_leaf_reg=6,
                               loss_function='Logloss', eval_metric='AUC', random_seed=seed+fold,
                               od_type='Iter', od_wait=200, verbose=200, task_type='CPU',
                               thread_count=-1, allow_writing_files=False)
        m.fit(trp, eval_set=vap, use_best_model=True)
        oof[va] = m.predict_proba(vap)[:,1]
        pred += m.predict_proba(tep)[:,1]/n_splits
    return oof, pred

if not has_cache('catF_cpu'):
    oc, tc = cv_cat_cpu_feat(X_feat, y, X_test_feat, BASE_CAT)
    save_cache('catF_cpu', oc, tc)

## XGBoost with OHE

In [None]:
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder

def _make_ohe():
    try: return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError: return OneHotEncoder(handle_unknown='ignore', sparse=False)

def cv_xgb_ohe_native(Xdf, yarr, Xte, base_cat, n_splits=5, seed=42, es_rounds=400, num_boost_round=6000):
    num_base = [c for c in Xdf.columns if c not in base_cat]
    oof = np.zeros(len(Xdf)); pred = np.zeros(len(Xte))
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for fold,(tr,va) in enumerate(skf.split(Xdf,yarr),1):
        X_tr, X_va = Xdf.iloc[tr].copy(), Xdf.iloc[va].copy()
        y_tr, y_va = yarr[tr], yarr[va]
        X_te = Xte.copy()
        ohe = _make_ohe()
        Xtr = np.hstack([X_tr[num_base].to_numpy(dtype=np.float32),
                         ohe.fit_transform(X_tr[base_cat].astype(str)).astype(np.float32)])
        Xva = np.hstack([X_va[num_base].to_numpy(dtype=np.float32),
                         ohe.transform(X_va[base_cat].astype(str)).astype(np.float32)])
        Xtt = np.hstack([X_te[num_base].to_numpy(dtype=np.float32),
                         ohe.transform(X_te[base_cat].astype(str)).astype(np.float32)])
        dtr = xgb.DMatrix(Xtr, label=y_tr); dva = xgb.DMatrix(Xva, label=y_va); dte = xgb.DMatrix(Xtt)
        params = {'objective':'binary:logistic','eval_metric':'auc','eta':0.03,'max_depth':6,
                  'subsample':0.8,'colsample_bytree':0.8,'lambda':5.0,'alpha':0.0,
                  'tree_method':'hist','seed':seed+fold}
        es = xgb.callback.EarlyStopping(rounds=es_rounds, save_best=True, maximize=True)
        bst = xgb.train(params, dtr, num_boost_round=num_boost_round, evals=[(dtr,'train'),(dva,'valid')],
                        callbacks=[es], verbose_eval=False)
        oof[va] = bst.predict(dva)
        pred += bst.predict(dte)/n_splits
    return oof, pred

if not has_cache('xgbF_ohe'):
    ox, tx = cv_xgb_ohe_native(X_feat, y, X_test_feat, BASE_CAT, n_splits=5, seed=42, es_rounds=400, num_boost_round=6000)
    save_cache('xgbF_ohe', ox, tx)

## Report OOF AUC per model

In [None]:
o1,_=load_cache('lgbF_s42'); o2,_=load_cache('lgbF_s7'); o3,_=load_cache('lgbF_spw'); oc,_=load_cache('catF_cpu'); ox,_=load_cache('xgbF_ohe')
print({
    'lgbF_s42': float(roc_auc_score(y,o1)),
    'lgbF_s7': float(roc_auc_score(y,o2)),
    'lgbF_spw': float(roc_auc_score(y,o3)),
    'catF_cpu': float(roc_auc_score(y,oc)),
    'xgbF_ohe': float(roc_auc_score(y,ox)),
})

## Stacking (Logistic Regression) and submission

In [None]:
from sklearn.linear_model import LogisticRegression

o1,t1 = load_cache('lgbF_s42')
o2,t2 = load_cache('lgbF_s7')
o3,t3 = load_cache('lgbF_spw')
oc,tc = load_cache('catF_cpu')
ox,tx = load_cache('xgbF_ohe')

Z_tr = np.vstack([o1,o2,o3,oc,ox]).T
Z_te = np.vstack([t1,t2,t3,tc,tx]).T

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_meta = np.zeros(len(y)); pred_meta = np.zeros(len(test))
for tr, va in skf.split(Z_tr, y):
    m = LogisticRegression(max_iter=1000)
    m.fit(Z_tr[tr], y[tr])
    oof_meta[va] = m.predict_proba(Z_tr[va])[:,1]
    pred_meta += m.predict_proba(Z_te)[:,1]/skf.n_splits

print('Stack OOF AUC:', float(roc_auc_score(y, oof_meta)))
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
sub_path = (PROJ/'notebooks'/'outputs'/'submissions'/f'final_stack_{ts}.csv')
pd.DataFrame({'id': pd.read_csv(DATA/'test.csv')['id'], 'y': pred_meta}).to_csv(sub_path, index=False)
print('Saved:', sub_path)