# Baseline Models

In [12]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

from pathlib import Path
import pandas as pd

# Probe a few locations relative to the current working directory for data/.
cwd = Path.cwd()

candidates = [
    cwd / "data",               # <cwd>/data
    cwd.parent / "data",        # <cwd>/../data
    cwd.parent.parent / "data"  # <cwd>/../../data
]

# First candidate that contains train.csv; None when none of them does.
DATA_DIR = next((p for p in candidates if (p / "train.csv").exists()), None)
if DATA_DIR is None:
    # The (Russian) message reports that data/ was not found and lists the probed paths.
    raise FileNotFoundError(f"Не нашёл data/ рядом с ноутбуком. Проверил: {', '.join(str(p) for p in candidates)}")

print("DATA_DIR:", DATA_DIR)
train = pd.read_csv(DATA_DIR / "train.csv")
test  = pd.read_csv(DATA_DIR / "test.csv")

# Target and feature frames; the "id" column is not used as a feature.
y = train["y"]
X = train.drop(columns=["y", "id"])
X_test = test.drop(columns=["id"])

num_cols = ["age", "balance", "day", "duration", "campaign", "pdays", "previous"]
cat_cols = ["job", "marital", "education", "default", "housing", "loan", "contact", "month", "poutcome"]

# Numeric columns are passed through unchanged (no scaling); categorical columns
# are one-hot encoded with handle_unknown="ignore" so unseen levels do not raise.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ]
)

# Preprocessing and classifier live in one pipeline, so the encoder is refit on
# each training fold inside the CV loop below.
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000, class_weight="balanced", solver="liblinear"))
])

# 5-fold stratified CV. Predictions and labels are appended fold by fold, so the
# two lists line up with each other but not with the row order of X.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = []
oof_y = []

for train_idx, val_idx in cv.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_val)[:, 1]  # probability of the positive class
    oof_preds.extend(preds)
    oof_y.extend(y_val)

print("OOF AUC:", roc_auc_score(oof_y, oof_preds))

DATA_DIR: /Users/mykytasalykin/Desktop/kaggle-bankterm/kaggle-bankterm/data
OOF AUC: 0.9427161029459289


In [17]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Rebuild target/feature frames from the raw data; .copy() so the dtype
# conversion below does not write into `train` / `test`.
y = train["y"]
X = train.drop(columns=["y", "id"]).copy()
X_test = test.drop(columns=["id"]).copy()

num_cols = ["age", "balance", "day", "duration", "campaign", "pdays", "previous"]
cat_cols = ["job", "marital", "education", "default", "housing", "loan", "contact", "month", "poutcome"]

# Categorical columns are converted to pandas "category" dtype for LightGBM.
# NOTE(review): the conversion is done separately on X and X_test, so the
# category sets can differ if a level is missing from one frame — confirm.
for c in cat_cols:
    X[c] = X[c].astype("category")
    X_test[c] = X_test[c].astype("category")

# Ratio of negatives to positives, used as scale_pos_weight.
pos_weight = (y == 0).sum() / (y == 1).sum()

params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 63,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "scale_pos_weight": pos_weight,
    "seed": 42,
    "n_jobs": -1,
}

# Out-of-fold predictions are written back by row position; the test
# prediction is the average of the per-fold models.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(X))
test_pred = np.zeros(len(X_test))

for tr_idx, va_idx in cv.split(X, y):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    dtr = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_cols, free_raw_data=False)
    dva = lgb.Dataset(X_va, label=y_va, categorical_feature=cat_cols, free_raw_data=False)

    # NOTE: `model` rebinds the name used for the sklearn pipeline in the earlier cell.
    model = lgb.train(
        params,
        dtr,
        valid_sets=[dtr, dva],
        valid_names=["train", "valid"],
        num_boost_round=5000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=200),
            lgb.log_evaluation(period=200),
        ],
    )

    # Predict with the iteration selected by early stopping.
    oof[va_idx] = model.predict(X_va, num_iteration=model.best_iteration)
    test_pred += model.predict(X_test, num_iteration=model.best_iteration) / cv.n_splits

print("OOF AUC:", roc_auc_score(y, oof))

# Write the submission to a path relative to the current working directory.
from pathlib import Path
Path("outputs/submissions").mkdir(parents=True, exist_ok=True)
pd.DataFrame({"id": test["id"], "y": test_pred}).to_csv("outputs/submissions/lgbm_baseline.csv", index=False)

[LightGBM] [Info] Number of positive: 72391, number of negative: 527609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016260 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1009
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986273
[LightGBM] [Info] Start training from score -1.986273
Training until validation scores don't improve for 200 rounds
[200]	train's auc: 0.969137	valid's auc: 0.967272
[400]	train's auc: 0.972649	valid's auc: 0.968318
[600]	train's auc: 0.975182	valid's auc: 0.968749
[800]	train's auc: 0.977288	valid's auc: 0.968956
[1000]	train's auc: 0.979097	valid's auc: 0.969039
[1200]	train's auc: 0.980795	valid's auc: 0.969167
[1400]	train's auc: 0.982305	valid's auc: 0.969194
Early stopping, best iter

In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

def month_to_num(m):
    """Map a lowercase three-letter month abbreviation to 1-12; anything else maps to 0."""
    mapping = {'jan':1,'feb':2,'mar':3,'apr':4,'may':5,'jun':6,'jul':7,'aug':8,'sep':9,'oct':10,'nov':11,'dec':12}
    return mapping.get(m, 0)

def make_features(df, duration_cap=None, campaign_high=None):
    """Return a copy of ``df`` with hand-crafted numeric features appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``balance``, ``duration``, ``campaign``, ``pdays``,
        ``previous``, ``month``, ``day``, ``contact``, ``housing``, ``loan``
        and ``default``. The input frame is not modified.
    duration_cap : float, optional
        Upper clip value for ``duration``. Defaults to the 0.99 quantile of
        ``df['duration']``.
    campaign_high : float, optional
        Threshold for the ``campaign_high_q90`` flag. Defaults to the 0.90
        quantile of ``df['campaign']``.

    Notes
    -----
    With the defaults both thresholds are derived from the frame passed in,
    so calling this separately on train and test uses different cut-offs.
    Pass the train-derived values when transforming the test set to keep the
    two consistent.
    """
    out = df.copy()

    # Resolve thresholds once: caller-supplied values win, else this frame's quantiles.
    if duration_cap is None:
        duration_cap = out['duration'].quantile(0.99)
    if campaign_high is None:
        campaign_high = out['campaign'].quantile(0.90)

    # Balance: sign flag plus log of the non-negative part.
    out['balance_is_neg'] = (out['balance'] < 0).astype(int)
    out['balance_log1p'] = np.log1p(np.maximum(out['balance'], 0))

    # Duration: clipped, log-scaled, and normalised by number of contacts.
    out['duration_clip_99'] = np.minimum(out['duration'], duration_cap)
    out['duration_log1p'] = np.log1p(out['duration_clip_99'])
    out['duration_per_call'] = out['duration_clip_99'] / (out['campaign'] + 1.0)

    # pdays == -1 is treated as a sentinel: flagged, and replaced by NaN in pdays_pos.
    out['pdays_was_contacted'] = (out['pdays'] != -1).astype(int)
    out['pdays_pos'] = out['pdays'].where(out['pdays'] != -1, np.nan)
    out['previous_gt0'] = (out['previous'] > 0).astype(int)
    out['campaign_high_q90'] = (out['campaign'] >= campaign_high).astype(int)

    # Cyclic encodings. Unrecognised months map to 0, which lands on the same
    # sin/cos point as month 12.
    out['month_num'] = out['month'].map(month_to_num).astype(int)
    out['month_sin'] = np.sin(2*np.pi*out['month_num']/12.0)
    out['month_cos'] = np.cos(2*np.pi*out['month_num']/12.0)
    out['day_sin'] = np.sin(2*np.pi*out['day']/31.0)
    out['day_cos'] = np.cos(2*np.pi*out['day']/31.0)

    # Binary indicators for selected categorical values.
    out['contact_cellular'] = (out['contact'] == 'cellular').astype(int)
    out['housing_yes'] = (out['housing'] == 'yes').astype(int)
    out['loan_yes'] = (out['loan'] == 'yes').astype(int)
    out['default_yes'] = (out['default'] == 'yes').astype(int)

    # Interactions of clipped duration with two of the indicators.
    out['dur_x_cell'] = out['duration_clip_99'] * out['contact_cellular']
    out['dur_x_prevpos'] = out['duration_clip_99'] * out['previous_gt0']
    return out

# Feature-engineer train and test independently.
# NOTE(review): make_features derives its quantile thresholds from the frame it
# is given, so train_f and test_f are built with different cut-offs.
train_f = make_features(train)
test_f = make_features(test)

# From here on `y` is a NumPy array (earlier cells kept it as a Series).
y = train_f['y'].values
drop_cols = ['y','id']
X = train_f.drop(columns=drop_cols)
X_test = test_f.drop(columns=['id'])

# Convert the raw categorical columns to "category" dtype on the model frames
# only; train_f / test_f keep their original column dtypes.
cat_cols = ["job","marital","education","default","housing","loan","contact","month","poutcome"]
for c in cat_cols:
    X[c] = X[c].astype('category')
    X_test[c] = X_test[c].astype('category')

In [20]:
def kfold_target_encode(train_df, test_df, cols, target, n_splits=5, seed=42, suffix="_te"):
    """Out-of-fold mean target encoding for ``cols``.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        ``train_df`` must contain ``target`` and every column in ``cols``;
        ``test_df`` must contain every column in ``cols``.
    cols : list of str
        Columns to encode.
    target : str
        Name of the target column in ``train_df``.
    n_splits, seed : int
        Configuration of the shuffled ``StratifiedKFold`` used for the
        training-side encoding.
    suffix : str
        Appended to each column name to form the encoded column name.

    Returns
    -------
    (te_train, te_test) : tuple of pandas.DataFrame
        Frames indexed like the inputs with one ``<col><suffix>`` column per
        encoded column. Training rows are encoded with category means from the
        other folds; test rows use means over all of ``train_df``. Categories
        without a mean fall back to the overall target mean of ``train_df``.
    """
    te_train = pd.DataFrame(index=train_df.index)
    te_test = pd.DataFrame(index=test_df.index)

    # Fallback value for categories that have no mean in the fitting rows.
    prior = train_df[target].mean()

    # The splitter has a fixed seed, so every call to split() yields the same
    # folds; materialise them once and reuse them for all columns.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    folds = list(skf.split(train_df, train_df[target]))

    for c in cols:
        te_col = np.zeros(len(train_df))
        for tr_idx, va_idx in folds:
            fold_means = train_df.iloc[tr_idx].groupby(c)[target].mean()
            te_col[va_idx] = train_df[c].iloc[va_idx].map(fold_means).fillna(prior).values
        te_train[c + suffix] = te_col

        full_means = train_df.groupby(c)[target].mean()
        te_test[c + suffix] = test_df[c].map(full_means).fillna(prior).values
    return te_train, te_test

# Target-encode a subset of the categorical columns, fitting on train_f
# (which still contains the target column 'y').
te_cols = ["job","education","contact","month","poutcome"]
te_train, te_test = kfold_target_encode(train_f, test_f, te_cols, target='y', n_splits=5, seed=42)

# Append the encoded columns to the model frames; indices are reset on both
# sides so the concat aligns rows by position.
X_te = pd.concat([X.reset_index(drop=True), te_train.reset_index(drop=True)], axis=1)
X_test_te = pd.concat([X_test.reset_index(drop=True), te_test.reset_index(drop=True)], axis=1)

In [21]:
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from pathlib import Path

# Ratio of negative to positive labels, used below as scale_pos_weight.
pos_weight = (y==0).sum()/(y==1).sum()

def cv_lgbm(X_, y_, params, n_splits=5, seed=42):
    """Cross-validate a LightGBM model and predict the global test frame.

    ``X_`` is indexed positionally with ``.iloc`` and ``y_`` with plain
    ``[]``. The test frame ``X_test_te`` and the categorical column list
    ``cat_cols`` are read from the notebook's global namespace.

    Returns ``(oof, test_pred)``: out-of-fold predictions for every row of
    ``X_`` and the fold-averaged prediction for ``X_test_te``.
    """
    def _as_dataset(frame, labels):
        # Same Dataset options for the fitting and the held-out part.
        return lgb.Dataset(frame, label=labels, categorical_feature=cat_cols, free_raw_data=False)

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_scores = np.zeros(len(X_))
    test_scores = np.zeros(len(X_test_te))

    for fit_rows, held_rows in splitter.split(X_, y_):
        held_frame = X_.iloc[held_rows]
        fit_set = _as_dataset(X_.iloc[fit_rows], y_[fit_rows])
        held_set = _as_dataset(held_frame, y_[held_rows])

        booster = lgb.train(
            params,
            fit_set,
            valid_sets=[fit_set, held_set],
            valid_names=['train', 'valid'],
            num_boost_round=6000,
            callbacks=[
                lgb.early_stopping(stopping_rounds=300),
                lgb.log_evaluation(period=200),
            ],
        )

        # Both predictions use the iteration reported by the booster.
        chosen = booster.best_iteration
        oof_scores[held_rows] = booster.predict(held_frame, num_iteration=chosen)
        test_scores += booster.predict(X_test_te, num_iteration=chosen) / n_splits

    return oof_scores, test_scores

# GBDT configuration shared by both runs below.
params_gbdt = {
    "objective":"binary",
    "metric":"auc",
    "boosting_type":"gbdt",
    "learning_rate":0.03,
    "num_leaves":127,
    "min_data_in_leaf":64,
    "min_sum_hessian_in_leaf":5.0,
    "feature_fraction":0.85,
    "bagging_fraction":0.85,
    "bagging_freq":1,
    "lambda_l1":0.0,
    "lambda_l2":10.0,
    "scale_pos_weight":pos_weight,
    "seed":42,
    "n_jobs":-1,
}

# DART variant: a copy of the GBDT settings with the boosting type switched
# and the dropout-related options added.
params_dart = dict(params_gbdt)
params_dart.update({"boosting_type":"dart", "drop_rate":0.1, "skip_drop":0.5, "max_drop":50})

# Both runs use the target-encoded frame and cv_lgbm's default folds.
oof_gbdt, test_gbdt = cv_lgbm(X_te, y, params_gbdt)
oof_dart, test_dart = cv_lgbm(X_te, y, params_dart)

print("GBDT OOF AUC:", roc_auc_score(y, oof_gbdt))
print("DART OOF AUC:", roc_auc_score(y, oof_dart))


[LightGBM] [Info] Number of positive: 72391, number of negative: 527609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023537 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3042
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986273
[LightGBM] [Info] Start training from score -1.986273
Training until validation scores don't improve for 300 rounds
[200]	train's auc: 0.969369	valid's auc: 0.966888
[400]	train's auc: 0.973957	valid's auc: 0.968911
[600]	train's auc: 0.976792	valid's auc: 0.969503
[800]	train's auc: 0.979076	valid's auc: 0.969851
[1000]	train's auc: 0.981036	valid's auc: 0.970046
[1200]	train's auc: 0.982774	valid's auc: 0.970178
[1400]	train's auc: 0.984228	valid's auc: 0.970205
[1600]	train's auc: 0.985



[200]	train's auc: 0.961993	valid's auc: 0.961096
[400]	train's auc: 0.964632	valid's auc: 0.96345
[600]	train's auc: 0.966684	valid's auc: 0.965003
[800]	train's auc: 0.96947	valid's auc: 0.966885
[1000]	train's auc: 0.971313	valid's auc: 0.967897
[1200]	train's auc: 0.972743	valid's auc: 0.968574
[1400]	train's auc: 0.973787	valid's auc: 0.96891
[1600]	train's auc: 0.974982	valid's auc: 0.969298
[1800]	train's auc: 0.975923	valid's auc: 0.969534
[2000]	train's auc: 0.976796	valid's auc: 0.969688
[2200]	train's auc: 0.977478	valid's auc: 0.969796
[2400]	train's auc: 0.978195	valid's auc: 0.969912
[2600]	train's auc: 0.978884	valid's auc: 0.970027
[2800]	train's auc: 0.979533	valid's auc: 0.970079
[3000]	train's auc: 0.980264	valid's auc: 0.970198
[3200]	train's auc: 0.980923	valid's auc: 0.970284
[3400]	train's auc: 0.981587	valid's auc: 0.970331
[3600]	train's auc: 0.982213	valid's auc: 0.970409
[3800]	train's auc: 0.98289	valid's auc: 0.970455
[4000]	train's auc: 0.983363	valid's au



[200]	train's auc: 0.96219	valid's auc: 0.96039
[400]	train's auc: 0.964881	valid's auc: 0.96263
[600]	train's auc: 0.966932	valid's auc: 0.964156
[800]	train's auc: 0.969812	valid's auc: 0.966097
[1000]	train's auc: 0.971611	valid's auc: 0.966953
[1200]	train's auc: 0.972976	valid's auc: 0.967535
[1400]	train's auc: 0.97403	valid's auc: 0.967879
[1600]	train's auc: 0.975234	valid's auc: 0.968259
[1800]	train's auc: 0.976169	valid's auc: 0.968474
[2000]	train's auc: 0.977069	valid's auc: 0.968621
[2200]	train's auc: 0.977776	valid's auc: 0.96873
[2400]	train's auc: 0.978493	valid's auc: 0.968803
[2600]	train's auc: 0.97917	valid's auc: 0.968907
[2800]	train's auc: 0.979791	valid's auc: 0.968971
[3000]	train's auc: 0.980522	valid's auc: 0.96905
[3200]	train's auc: 0.981142	valid's auc: 0.969131
[3400]	train's auc: 0.981842	valid's auc: 0.969235
[3600]	train's auc: 0.982438	valid's auc: 0.969275
[3800]	train's auc: 0.983097	valid's auc: 0.969292
[4000]	train's auc: 0.983584	valid's auc: 



[200]	train's auc: 0.962156	valid's auc: 0.960231
[400]	train's auc: 0.964865	valid's auc: 0.9625
[600]	train's auc: 0.96696	valid's auc: 0.964076
[800]	train's auc: 0.969772	valid's auc: 0.965929
[1000]	train's auc: 0.971645	valid's auc: 0.96689
[1200]	train's auc: 0.972942	valid's auc: 0.967369
[1400]	train's auc: 0.973999	valid's auc: 0.967668
[1600]	train's auc: 0.975178	valid's auc: 0.968018
[1800]	train's auc: 0.976113	valid's auc: 0.968232
[2000]	train's auc: 0.977068	valid's auc: 0.968452
[2200]	train's auc: 0.97774	valid's auc: 0.968502
[2400]	train's auc: 0.97848	valid's auc: 0.968616
[2600]	train's auc: 0.979118	valid's auc: 0.96867
[2800]	train's auc: 0.979748	valid's auc: 0.968725
[3000]	train's auc: 0.980492	valid's auc: 0.968815
[3200]	train's auc: 0.981151	valid's auc: 0.968909
[3400]	train's auc: 0.9818	valid's auc: 0.968969
[3600]	train's auc: 0.98241	valid's auc: 0.969003
[3800]	train's auc: 0.983079	valid's auc: 0.969064
[4000]	train's auc: 0.983557	valid's auc: 0.9



[200]	train's auc: 0.961908	valid's auc: 0.961326
[400]	train's auc: 0.964658	valid's auc: 0.963397
[600]	train's auc: 0.966785	valid's auc: 0.964954
[800]	train's auc: 0.96957	valid's auc: 0.966838
[1000]	train's auc: 0.971447	valid's auc: 0.967856
[1200]	train's auc: 0.97283	valid's auc: 0.968421
[1400]	train's auc: 0.973862	valid's auc: 0.968679
[1600]	train's auc: 0.975057	valid's auc: 0.969072
[1800]	train's auc: 0.975965	valid's auc: 0.969243
[2000]	train's auc: 0.976938	valid's auc: 0.96948
[2200]	train's auc: 0.977588	valid's auc: 0.969559
[2400]	train's auc: 0.978326	valid's auc: 0.969672
[2600]	train's auc: 0.978981	valid's auc: 0.96974
[2800]	train's auc: 0.979624	valid's auc: 0.969828
[3000]	train's auc: 0.980369	valid's auc: 0.96995
[3200]	train's auc: 0.981003	valid's auc: 0.970032
[3400]	train's auc: 0.981672	valid's auc: 0.970127
[3600]	train's auc: 0.982285	valid's auc: 0.970163
[3800]	train's auc: 0.98297	valid's auc: 0.970213
[4000]	train's auc: 0.983441	valid's auc:



[200]	train's auc: 0.962227	valid's auc: 0.960676
[400]	train's auc: 0.964887	valid's auc: 0.962831
[600]	train's auc: 0.966899	valid's auc: 0.964334
[800]	train's auc: 0.969709	valid's auc: 0.966196
[1000]	train's auc: 0.971652	valid's auc: 0.967282
[1200]	train's auc: 0.972998	valid's auc: 0.96786
[1400]	train's auc: 0.974044	valid's auc: 0.968188
[1600]	train's auc: 0.975256	valid's auc: 0.968581
[1800]	train's auc: 0.976198	valid's auc: 0.96881
[2000]	train's auc: 0.977105	valid's auc: 0.968989
[2200]	train's auc: 0.977797	valid's auc: 0.969076
[2400]	train's auc: 0.978526	valid's auc: 0.96918
[2600]	train's auc: 0.979168	valid's auc: 0.969245
[2800]	train's auc: 0.979778	valid's auc: 0.969277
[3000]	train's auc: 0.9805	valid's auc: 0.969357
[3200]	train's auc: 0.98114	valid's auc: 0.969428
[3400]	train's auc: 0.981783	valid's auc: 0.969467
[3600]	train's auc: 0.982388	valid's auc: 0.969527
[3800]	train's auc: 0.983052	valid's auc: 0.969602
[4000]	train's auc: 0.983529	valid's auc:

In [22]:
from catboost import CatBoostClassifier, Pool

def cv_cat(X_, y_, n_splits=5, seed=42):
    """Cross-validate a CatBoost classifier and predict the global test frame.

    Parameters
    ----------
    X_ : pandas.DataFrame
        Training features, indexed positionally with ``.iloc``.
    y_ : array-like
        Labels, indexed with plain ``[]`` using the fold row positions.
    n_splits, seed : int
        Configuration of the shuffled ``StratifiedKFold``. ``seed`` controls
        the folds only; the model's ``random_seed`` is fixed at 42.

    Returns
    -------
    (oof, test_pred) : tuple of numpy.ndarray
        Out-of-fold positive-class probabilities for ``X_`` and the
        fold-averaged positive-class probabilities for ``X_test_te``.

    Notes
    -----
    ``X_test_te`` and ``cat_cols`` are read from the notebook's global
    namespace.
    """
    oof = np.zeros(len(X_))
    test_pred = np.zeros(len(X_test_te))
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # The test pool does not depend on the fold, so build it once instead of
    # re-wrapping the full test frame on every iteration.
    te_pool = Pool(X_test_te, cat_features=cat_cols)

    for tr, va in skf.split(X_, y_):
        X_tr, X_va = X_.iloc[tr], X_.iloc[va]
        y_tr, y_va = y_[tr], y_[va]
        tr_pool = Pool(X_tr, y_tr, cat_features=cat_cols)
        va_pool = Pool(X_va, y_va, cat_features=cat_cols)
        model = CatBoostClassifier(
            iterations=2500,
            depth=6,
            learning_rate=0.05,
            l2_leaf_reg=6,
            loss_function="Logloss",
            eval_metric="AUC",
            random_seed=42,
            od_type="Iter",   # overfitting detector: stop after od_wait
            od_wait=200,      # iterations without improvement on eval_set
            verbose=200
        )
        # use_best_model keeps the iteration with the best eval-set metric.
        model.fit(tr_pool, eval_set=va_pool, use_best_model=True)
        oof[va] = model.predict_proba(va_pool)[:,1]
        test_pred += model.predict_proba(te_pool)[:,1]/n_splits
    return oof, test_pred

# CatBoost on the same target-encoded frame and default folds as the LightGBM runs.
oof_cat, test_cat = cv_cat(X_te, y)

print("Cat OOF AUC:", roc_auc_score(y, oof_cat))

0:	test: 0.9241523	best: 0.9241523 (0)	total: 165ms	remaining: 6m 52s
200:	test: 0.9611855	best: 0.9611855 (200)	total: 30.9s	remaining: 5m 53s
400:	test: 0.9638017	best: 0.9638017 (400)	total: 1m 3s	remaining: 5m 34s
600:	test: 0.9649772	best: 0.9649772 (600)	total: 1m 38s	remaining: 5m 12s
800:	test: 0.9657168	best: 0.9657168 (800)	total: 2m 15s	remaining: 4m 46s
1000:	test: 0.9662553	best: 0.9662553 (1000)	total: 2m 54s	remaining: 4m 21s
1200:	test: 0.9666308	best: 0.9666308 (1200)	total: 3m 32s	remaining: 3m 50s
1400:	test: 0.9668863	best: 0.9668863 (1400)	total: 4m 10s	remaining: 3m 16s
1600:	test: 0.9671883	best: 0.9671891 (1599)	total: 4m 46s	remaining: 2m 41s
1800:	test: 0.9673948	best: 0.9673948 (1800)	total: 5m 25s	remaining: 2m 6s
2000:	test: 0.9675745	best: 0.9675746 (1999)	total: 6m 5s	remaining: 1m 31s
2200:	test: 0.9677352	best: 0.9677352 (2200)	total: 6m 46s	remaining: 55.2s
2400:	test: 0.9678814	best: 0.9678814 (2400)	total: 7m 26s	remaining: 18.4s
2499:	test: 0.967937

In [24]:
from scipy.stats import rankdata

def rank_avg(*cols):
    """Average the normalised ranks of several equally long score vectors.

    Each vector is replaced by ``rankdata(vector) / len(vector)`` and the
    element-wise mean across vectors is returned.
    """
    scaled_ranks = []
    for scores in cols:
        scaled_ranks.append(rankdata(scores) / len(scores))
    stacked = np.vstack(scaled_ranks)
    return stacked.mean(axis=0)

# Blend weights: each model's OOF AUC, normalised so the weights sum to 1.
w_gbdt = roc_auc_score(y, oof_gbdt)
w_dart = roc_auc_score(y, oof_dart)
w_cat = roc_auc_score(y, oof_cat)
ws = np.array([w_gbdt, w_dart, w_cat])
ws = ws/ws.sum()

# Two blends on the OOF predictions: weighted average of raw scores, and the
# unweighted average of normalised ranks.
oof_blend = ws[0]*oof_gbdt + ws[1]*oof_dart + ws[2]*oof_cat
oof_rank = rank_avg(oof_gbdt, oof_dart, oof_cat)

print("Blend-weighted OOF AUC:", roc_auc_score(y, oof_blend))
print("Blend-rank OOF AUC:", roc_auc_score(y, oof_rank))

# Same two blends applied to the test predictions.
test_blend = ws[0]*test_gbdt + ws[1]*test_dart + ws[2]*test_cat
test_rank = rank_avg(test_gbdt, test_dart, test_cat)

# Submissions are written relative to the current working directory.
Path("outputs/submissions").mkdir(parents=True, exist_ok=True)
pd.DataFrame({"id": test["id"], "y": test_blend}).to_csv("outputs/submissions/blend_weighted.csv", index=False)
pd.DataFrame({"id": test["id"], "y": test_rank}).to_csv("outputs/submissions/blend_rank.csv", index=False)

Blend-weighted OOF AUC: 0.9696346807388398
Blend-rank OOF AUC: 0.9697065582417187


In [25]:
from sklearn.model_selection import StratifiedKFold
import numpy as np, pandas as pd

def kfold_target_encoding_smooth(train_df, test_df, cols, target, n_splits=5, seed=42, min_samples=50, smoothing=10, noise=0.01):
    """Out-of-fold target encoding with additive smoothing and multiplicative noise.

    For a category with ``count`` rows and mean target ``mean`` the encoding is
    ``(count * mean + smoothing * global_mean) / (count + smoothing)`` where
    ``global_mean`` is the target mean over all of ``train_df``.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        ``train_df`` must contain ``target`` and every column in ``cols``;
        ``test_df`` must contain every column in ``cols``.
    cols : list of str
        Columns to encode; each yields a ``<col>_te`` output column.
    target : str
        Target column name in ``train_df``.
    n_splits, seed : int
        Configuration of the shuffled ``StratifiedKFold``. ``seed`` also seeds
        the noise generator, so repeated calls give identical output.
    min_samples : int
        Accepted for interface compatibility; not used by the computation.
    smoothing : float
        Pseudo-count pulling each category mean towards ``global_mean``.
    noise : float
        Scale of the multiplicative Gaussian noise applied to the training
        encoding only: ``value * (1 + noise * N(0, 1))``.

    Returns
    -------
    (te_train, te_test) : tuple of pandas.DataFrame
        Frames indexed like the inputs. Training rows are encoded from the
        other folds (plus noise); test rows from all of ``train_df`` (no noise).
        Unmapped categories fall back to ``global_mean``.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    # Fixed seed means identical folds on every split() call; compute them once.
    folds = list(skf.split(train_df, train_df[target]))
    # Local generator instead of the global NumPy state, so the noise is reproducible.
    rng = np.random.default_rng(seed)

    te_train = pd.DataFrame(index=train_df.index)
    te_test = pd.DataFrame(index=test_df.index)
    global_mean = train_df[target].mean()

    def _smoothed_means(frame, col):
        # Per-category mean shrunk towards the global mean.
        stats = frame.groupby(col)[target].agg(['mean','count'])
        return (stats['count']*stats['mean'] + smoothing*global_mean) / (stats['count'] + smoothing)

    for c in cols:
        oof_vals = np.zeros(len(train_df))
        for tr, va in folds:
            fold_map = _smoothed_means(train_df.iloc[tr], c)
            oof_vals[va] = train_df[c].iloc[va].map(fold_map).fillna(global_mean).values
        te_train[c+"_te"] = oof_vals*(1+noise*rng.standard_normal(len(oof_vals)))
        te_test[c+"_te"] = test_df[c].map(_smoothed_means(train_df, c)).fillna(global_mean).values
    return te_train, te_test

# Pairwise categorical crosses, stored as "<a>__<b>" strings on both frames.
pairs = [('job','month'),('contact','month'),('poutcome','contact')]
for a,b in pairs:
    train_f[f'{a}_{b}'] = train_f[a].astype(str)+'__'+train_f[b].astype(str)
    test_f[f'{a}_{b}'] = test_f[a].astype(str)+'__'+test_f[b].astype(str)

# Smoothed target encoding of the base categoricals and the crosses.
# train_f / test_f already carry the cross columns from the loop above, so the
# frames are passed as they are.
te_cols = ["job","education","contact","month","poutcome","job_month","contact_month","poutcome_contact"]
te_train2, te_test2 = kfold_target_encoding_smooth(
    train_f, test_f, te_cols, target='y', n_splits=5, smoothing=20, noise=0.01)

# Frequency encoding: category counts from train, applied to both frames
# (categories absent from train get 0).
# NOTE(review): these *_freq columns, like the cross columns, are written to
# train_f / test_f only; X2 / X2_test below are built from X / X_test and the
# target encodings, so the frequency features are not part of X2.
freq_cols = ["job","education","contact","month","poutcome"]
for c in freq_cols:
    vc = train_f[c].value_counts()
    train_f[c+"_freq"] = train_f[c].map(vc).fillna(0).astype(int)
    test_f[c+"_freq"] = test_f[c].map(vc).fillna(0).astype(int)

# Model frames for the second feature set: base features + smoothed encodings.
X2 = pd.concat([X, te_train2], axis=1)
X2_test = pd.concat([X_test, te_test2], axis=1)


In [29]:
def train_gbdt(X_, y_, seed, use_pos_weight=False, num_boost_round=4000):
    """Run ``cv_lgbm`` with a fixed GBDT parameter set.

    ``seed`` is placed into the LightGBM parameters. When ``use_pos_weight``
    is true, ``scale_pos_weight`` is set to the negative/positive ratio of
    ``y_``. Returns whatever ``cv_lgbm`` returns.

    NOTE(review): ``num_boost_round`` is accepted but not forwarded to
    ``cv_lgbm``. Also, ``cv_lgbm`` is redefined further down in this file with
    a different positional signature (an extra test-frame argument before
    ``params``), which this three-argument call does not match — confirm which
    definition is meant to be live when this function is called.
    """
    params = {
        "objective":"binary","metric":"auc","boosting_type":"gbdt",
        "learning_rate":0.03,"num_leaves":127,"min_data_in_leaf":64,
        "feature_fraction":0.85,"bagging_fraction":0.85,"bagging_freq":1,
        "lambda_l2":10.0,"seed":seed,"n_jobs":-1,"max_bin":511
    }
    if use_pos_weight:
        params["scale_pos_weight"] = (y_==0).sum()/(y_==1).sum()
    return cv_lgbm(X_, y_, params)  # without num_boost_round


In [32]:
import numpy as np, lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from tqdm.auto import tqdm

def cv_lgbm(X_, y_, params, n_splits=5, seed=42, num_boost_round=4000, es_rounds=300):
    """Cross-validate LightGBM with a progress bar; early stopping is skipped for DART.

    ``X_`` is indexed positionally with ``.iloc`` and ``y_`` with plain
    ``[]``. ``X_test_te`` and ``cat_cols`` are read from the notebook's global
    namespace. When ``params['boosting_type']`` is ``"dart"`` no early-stopping
    callback is attached and predictions use ``num_boost_round`` iterations;
    otherwise the booster's ``best_iteration`` is used.

    Returns ``(oof, test_pred)``: out-of-fold predictions for ``X_`` and the
    fold-averaged prediction for ``X_test_te``.
    """
    booster_kind = params.get("boosting_type", "gbdt")
    early_stop = booster_kind != "dart"

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    fold_indices = list(splitter.split(X_, y_))

    oof_scores = np.zeros(len(X_))
    test_scores = np.zeros(len(X_test_te))

    for fit_rows, held_rows in tqdm(fold_indices, desc=f"LGBM-{booster_kind}", leave=False):
        fit_frame = X_.iloc[fit_rows]
        held_frame = X_.iloc[held_rows]
        fit_set = lgb.Dataset(fit_frame, label=y_[fit_rows],
                              categorical_feature=cat_cols, free_raw_data=False)
        held_set = lgb.Dataset(held_frame, label=y_[held_rows],
                               categorical_feature=cat_cols, free_raw_data=False)

        # Logging is always on; early stopping only for non-DART boosters.
        callbacks = [lgb.log_evaluation(period=200)]
        if early_stop:
            callbacks.append(lgb.early_stopping(stopping_rounds=es_rounds))

        booster = lgb.train(
            params,
            fit_set,
            valid_sets=[fit_set, held_set],
            valid_names=['train', 'valid'],
            num_boost_round=num_boost_round,
            callbacks=callbacks,
        )

        if early_stop:
            rounds_used = booster.best_iteration
        else:
            rounds_used = num_boost_round
        oof_scores[held_rows] = booster.predict(held_frame, num_iteration=rounds_used)
        test_scores += booster.predict(X_test_te, num_iteration=rounds_used) / n_splits

    return oof_scores, test_scores


In [35]:
from copy import deepcopy

def align_train_test(X_tr, X_te, cat_cols):
    """Return copies of the two frames with matching columns and categories.

    * Every column named in ``cat_cols`` that exists in a frame is cast to
      ``category`` dtype there; when it exists in both frames, both copies get
      the sorted union of the two category sets.
    * Columns present in only one frame are added to the other, filled with 0.
    * The training copy is reordered to the test copy's column order.

    The input frames are left untouched.
    """
    train_out = deepcopy(X_tr)
    test_out = deepcopy(X_te)

    for col in cat_cols:
        in_train = col in train_out.columns
        in_test = col in test_out.columns
        if in_train:
            train_out[col] = train_out[col].astype("category")
        if in_test:
            test_out[col] = test_out[col].astype("category")
        if not (in_train and in_test):
            continue
        # Shared, deterministically ordered category list for both frames.
        shared = sorted(
            set(train_out[col].cat.categories) | set(test_out[col].cat.categories)
        )
        train_out[col] = train_out[col].cat.set_categories(shared)
        test_out[col] = test_out[col].cat.set_categories(shared)

    # Work out both one-sided column lists before adding anything.
    missing_in_test = [col for col in train_out.columns if col not in test_out.columns]
    missing_in_train = [col for col in test_out.columns if col not in train_out.columns]
    for col in missing_in_test:
        test_out[col] = 0
    for col in missing_in_train:
        train_out[col] = 0

    return train_out[test_out.columns], test_out

# Aligned copies of the second feature set; X2 / X2_test themselves are not modified.
X2_aligned, X2_test_aligned = align_train_test(X2, X2_test, cat_cols)


In [37]:
import numpy as np, lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm

def cv_lgbm(X_tr_all, y_all, X_te_all, params, n_splits=5, seed=42, num_boost_round=4000, es_rounds=300):
    """Cross-validate LightGBM on ``X_tr_all`` and predict ``X_te_all``.

    ``X_tr_all`` is indexed positionally with ``.iloc`` and ``y_all`` with
    plain ``[]``. The categorical column list ``cat_cols`` is read from the
    notebook's global namespace. For ``boosting_type == "dart"`` no
    early-stopping callback is attached and ``num_boost_round`` iterations are
    used for prediction; otherwise the booster's ``best_iteration`` is used.

    Returns ``(oof, test_pred)``: out-of-fold predictions for ``X_tr_all`` and
    the fold-averaged prediction for ``X_te_all``.
    """
    def _dataset(frame, labels):
        # Identical Dataset options for the fitting and held-out parts.
        return lgb.Dataset(frame, label=labels, categorical_feature=cat_cols, free_raw_data=False)

    def _callbacks(with_early_stopping):
        hooks = [lgb.log_evaluation(period=200)]
        if with_early_stopping:
            hooks.append(lgb.early_stopping(stopping_rounds=es_rounds))
        return hooks

    oof_scores = np.zeros(len(X_tr_all))
    test_scores = np.zeros(len(X_te_all))

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    fold_indices = list(splitter.split(X_tr_all, y_all))

    booster_kind = params.get("boosting_type", "gbdt")
    stops_early = booster_kind != "dart"

    for fit_rows, held_rows in tqdm(fold_indices, desc=f"LGBM-{booster_kind}", leave=False):
        held_frame = X_tr_all.iloc[held_rows]
        fit_set = _dataset(X_tr_all.iloc[fit_rows], y_all[fit_rows])
        held_set = _dataset(held_frame, y_all[held_rows])

        booster = lgb.train(
            params,
            fit_set,
            valid_sets=[fit_set, held_set],
            valid_names=['train', 'valid'],
            num_boost_round=num_boost_round,
            callbacks=_callbacks(stops_early),
        )

        rounds_used = booster.best_iteration if stops_early else num_boost_round
        oof_scores[held_rows] = booster.predict(held_frame, num_iteration=rounds_used)
        test_scores += booster.predict(X_te_all, num_iteration=rounds_used) / n_splits

    return oof_scores, test_scores


In [38]:
# Negative/positive label ratio; read by make_params when use_pos_weight is set.
pos_weight = (y==0).sum()/(y==1).sum()

def make_params(seed, use_pos_weight=False):
    """Build the LightGBM GBDT parameter dict for one seed.

    ``seed`` is stored under ``"seed"``. When ``use_pos_weight`` is true,
    ``"scale_pos_weight"`` is added from the module-level ``pos_weight``.
    A new dict is returned on every call.
    """
    params = dict(
        objective="binary",
        metric="auc",
        boosting_type="gbdt",
        learning_rate=0.03,
        num_leaves=127,
        min_data_in_leaf=64,
        min_sum_hessian_in_leaf=5.0,
        feature_fraction=0.85,
        bagging_fraction=0.85,
        bagging_freq=1,
        lambda_l2=10.0,
        max_bin=511,
        seed=seed,
        n_jobs=-1,
    )
    if use_pos_weight:
        params["scale_pos_weight"] = pos_weight
    return params

# Three GBDT runs on the aligned second feature set. The seed passed to
# make_params only changes the LightGBM seed; the CV folds use cv_lgbm's
# default seed in every run. Only the third run sets scale_pos_weight.
oof_g1, test_g1 = cv_lgbm(X2_aligned, y, X2_test_aligned, make_params(42, False), num_boost_round=3500)
oof_g2, test_g2 = cv_lgbm(X2_aligned, y, X2_test_aligned, make_params(7,  False), num_boost_round=3500)
oof_g3, test_g3 = cv_lgbm(X2_aligned, y, X2_test_aligned, make_params(2025, True), num_boost_round=3500)
print("GBDT seeds:", roc_auc_score(y,oof_g1), roc_auc_score(y,oof_g2), roc_auc_score(y,oof_g3))


LGBM-gbdt:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 72391, number of negative: 527609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020628 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9341
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 44
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986273
[LightGBM] [Info] Start training from score -1.986273
Training until validation scores don't improve for 300 rounds
[200]	train's auc: 0.970279	valid's auc: 0.968
[400]	train's auc: 0.974889	valid's auc: 0.969996
[600]	train's auc: 0.977845	valid's auc: 0.970491
[800]	train's auc: 0.980336	valid's auc: 0.970857
[1000]	train's auc: 0.982449	valid's auc: 0.970954
[1200]	train's auc: 0.98433	valid's auc: 0.971106
[1400]	train's auc: 0.986008	valid's auc: 0.971173
[1600]	train's auc: 0.98751	v

LGBM-gbdt:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 72391, number of negative: 527609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019266 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9338
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 44
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986273
[LightGBM] [Info] Start training from score -1.986273
Training until validation scores don't improve for 300 rounds
[200]	train's auc: 0.97026	valid's auc: 0.967931
[400]	train's auc: 0.974852	valid's auc: 0.969904
[600]	train's auc: 0.97785	valid's auc: 0.970438
[800]	train's auc: 0.980299	valid's auc: 0.970732
[1000]	train's auc: 0.982448	valid's auc: 0.970898
[1200]	train's auc: 0.984323	valid's auc: 0.970995
[1400]	train's auc: 0.985995	valid's auc: 0.971031
[1600]	train's auc: 0.98750

LGBM-gbdt:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 72391, number of negative: 527609
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.071896 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9383
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 44
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986273
[LightGBM] [Info] Start training from score -1.986273
Training until validation scores don't improve for 300 rounds
[200]	train's auc: 0.96991	valid's auc: 0.967132
[400]	train's auc: 0.974949	valid's auc: 0.969227
[600]	train's auc: 0.978147	valid's auc: 0.969785
[800]	train's auc: 0.980781	valid's auc: 0.970116
[1000]	train's auc: 0.98299	valid's auc: 0.970298
[1200]	train's auc: 0.984994	valid's auc: 0.970442
[1400]	train's auc: 0.986737	valid's auc: 0.970547
[1600]	train's auc: 0.98827	valid's auc: 0.970613
[1800]	train's auc: 0.989634	valid's auc:

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# Level-2 inputs: one column per base model. Train rows use the out-of-fold
# predictions, test rows the fold-averaged test predictions.
Z_tr = np.vstack([oof_g1, oof_g2, oof_g3, oof_cat, oof_dart]).T
Z_te = np.vstack([test_g1, test_g2, test_g3, test_cat, test_dart]).T

# Logistic-regression meta-model, itself cross-validated: OOF scores for the
# train rows and a fold-averaged prediction for the test rows.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_meta = np.zeros(len(y)); pred_meta = np.zeros(len(test))
for tr, va in tqdm(list(skf.split(Z_tr, y)), desc="Stack-LogReg", leave=False):
    m = LogisticRegression(max_iter=1000)
    m.fit(Z_tr[tr], y[tr])
    oof_meta[va] = m.predict_proba(Z_tr[va])[:,1]
    pred_meta += m.predict_proba(Z_te)[:,1]/skf.n_splits

print("Stack OOF AUC:", roc_auc_score(y, oof_meta))
# NOTE: unlike the earlier submission cells, this one does not create the
# output directory before writing.
pd.DataFrame({"id": test["id"], "y": pred_meta}).to_csv("outputs/submissions/stack_logreg.csv", index=False)


Stack-LogReg:   0%|          | 0/5 [00:00<?, ?it/s]

Stack OOF AUC: 0.9706332700185369


In [None]:
# Persist the column order of the aligned training frame as a JSON array,
# then re-select both frames with that list so they share the same order.
feature_list = list(X2_aligned.columns)
pd.Series(feature_list).to_json("outputs/feature_list.json", orient="values")
X2_aligned = X2_aligned[feature_list]
X2_test_aligned = X2_test_aligned[feature_list]

In [41]:
import numpy as np, pandas as pd

def _quantile_codes(values, q, ref_values=None):
    """Return integer quantile-bin codes for ``values``.

    With ``ref_values=None`` the quantile edges come from ``values`` itself
    (plain ``pd.qcut``). Otherwise the edges are computed on ``ref_values`` and
    applied to ``values``, so a given code denotes the same value range in both.
    """
    if ref_values is None:
        return pd.qcut(values, q, labels=False, duplicates='drop')
    _, edges = pd.qcut(ref_values, q, labels=False, duplicates='drop', retbins=True)
    # Values outside the reference range go to the outermost bins instead of NaN.
    clipped = values.clip(lower=edges[0], upper=edges[-1])
    return pd.cut(clipped, edges, labels=False, include_lowest=True)


def add_bins_interactions(df, ref=None):
    """Add binned features and duration/pdays interaction features to a copy of ``df``.

    Parameters
    ----------
    df : DataFrame
        Must contain 'duration', 'balance', 'age', 'campaign', 'pdays',
        'contact', 'poutcome', 'month_sin' and 'month_cos'.
    ref : DataFrame, optional
        Frame whose 'duration', 'balance' and 'age' columns define the quantile
        edges (and the balance clipping bounds). When omitted, ``df`` itself is
        used, which reproduces the previous behaviour. Pass the training frame
        when transforming the test frame so bin codes are comparable.

    Returns
    -------
    DataFrame
        Copy of ``df`` with the new columns appended; ``df`` is not modified.
    """
    out = df.copy()
    src = out if ref is None else ref

    def _ref_col(series):
        # Reference series for the binning helper; None means "bin on own quantiles".
        return None if ref is None else series

    out['duration_q50'] = _quantile_codes(out['duration'], 50, _ref_col(src['duration']))

    # Winsorise balance at the 0.1% / 99.9% quantiles of the reference frame.
    bal_lo, bal_hi = src['balance'].quantile(0.001), src['balance'].quantile(0.999)
    out['balance_q20'] = _quantile_codes(
        out['balance'].clip(lower=bal_lo, upper=bal_hi), 20,
        _ref_col(src['balance'].clip(lower=bal_lo, upper=bal_hi)))

    out['age_q20'] = _quantile_codes(out['age'], 20, _ref_col(src['age']))

    # Left-closed fixed bins: [0,1), [1,2), [2,3), [3,5), [5,10), [10,100);
    # values outside these intervals become NaN.
    out['campaign_bin'] = pd.cut(out['campaign'], [0,1,2,3,5,10,100], right=False, labels=False)

    out['pdays_is_miss'] = (out['pdays']==-1).astype(int)
    out['pdays_pos_log'] = np.log1p(out['pdays'].clip(lower=0))

    log_duration = np.log1p(out['duration'])
    out['dur_x_cell'] = log_duration * (out['contact']=='cellular').astype(int)
    out['dur_x_ms'] = log_duration * out['month_sin']
    out['dur_x_mc'] = log_duration * out['month_cos']
    out['pdaysmiss_x_poutsucc'] = out['pdays_is_miss'] * (out['poutcome']=='success').astype(int)
    return out

train_f = add_bins_interactions(train_f)
# Bin the test rows with the training quantile edges so codes match across frames.
test_f = add_bins_interactions(test_f, ref=train_f)


In [42]:
# Pairwise categorical crosses, followed by frequency encodings whose counts
# are taken from the training frame only.
pairs = [('job','contact'),('job','month'),('education','contact'),('poutcome','contact')]
for left, right in pairs:
    combo = f'{left}_{right}'
    for frame in (train_f, test_f):
        frame[combo] = frame[left].astype(str) + '__' + frame[right].astype(str)

freq_sources = ['job','education','contact','month','poutcome'] + [f'{left}_{right}' for left, right in pairs]
for col in freq_sources:
    train_counts = train_f[col].value_counts()
    for frame in (train_f, test_f):
        # Categories never seen in train map to NaN and are counted as 0.
        frame[col + '_freq'] = frame[col].map(train_counts).fillna(0).astype(int)


In [43]:
from sklearn.model_selection import StratifiedKFold
import numpy as np, pandas as pd

def loo_te(train_df, test_df, cols, target, noise=0.01, random_state=None):
    """Leave-one-out target encoding with multiplicative Gaussian noise.

    Parameters
    ----------
    train_df, test_df : DataFrame
        Frames containing every column in ``cols``; ``train_df`` also holds ``target``.
    cols : iterable of str
        Categorical columns to encode.
    target : str
        Name of the target column in ``train_df``.
    noise : float
        Scale of the multiplicative noise ``1 + noise * N(0, 1)`` applied to the
        training encodings only.
    random_state : int, optional
        Seed for the noise. ``None`` (default) keeps the previous behaviour of
        drawing from NumPy's global RNG; pass an int for reproducible encodings.

    Returns
    -------
    (DataFrame, DataFrame)
        ``<col>_loo`` columns for train and test, indexed like the inputs.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    te_tr = pd.DataFrame(index=train_df.index)
    te_te = pd.DataFrame(index=test_df.index)
    gmean = train_df[target].mean()
    for c in cols:
        grp = train_df.groupby(c)[target]
        sum_y = grp.transform('sum')
        cnt = grp.transform('count')
        # Exclude the row's own label; singleton groups have no other rows,
        # so their denominator becomes NaN and they fall back to the global mean.
        val = (sum_y - train_df[target]) / (cnt - 1).replace(0, np.nan)
        te_tr[c+'_loo'] = val.fillna(gmean) * (1 + noise*rng.randn(len(val)))
        # Test rows use the plain per-category training mean (no noise).
        te_te[c+'_loo'] = test_df[c].map(grp.mean()).fillna(gmean)
    return te_tr, te_te

# Columns to target-encode: the base categoricals plus every pairwise cross.
loo_cols = ['job','education','contact','month','poutcome']
loo_cols.extend(f'{a}_{b}' for a, b in pairs)
te_tr2, te_te2 = loo_te(train_f, test_f, loo_cols, 'y', noise=0.01)

# Feature matrices: engineered frame (minus id/target) with the LOO encodings appended.
train_features = train_f.drop(columns=['y','id'])
test_features = test_f.drop(columns=['id'])
X3 = pd.concat([train_features, te_tr2], axis=1)
X3_test = pd.concat([test_features, te_te2], axis=1)


In [45]:
import numpy as np, pandas as pd, lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm

orig_cat = ["job","marital","education","default","housing","loan","contact","month","poutcome"]
pair_cat = ["job_month","contact_month","poutcome_contact","job_contact","education_contact"]
# Only keep the categorical names that actually exist in the training frame.
cat_cols_ext = [name for name in orig_cat + pair_cat if name in train_f.columns]

bin_like = ["duration_q50","balance_q20","age_q20","campaign_bin"]
for frame in (train_f, test_f):
    # Bin codes become plain int32, with missing bins encoded as -1.
    for col in bin_like:
        if col not in frame.columns:
            continue
        nullable_codes = pd.to_numeric(frame[col], errors="coerce").astype("Int32")
        frame[col] = nullable_codes.fillna(-1).astype("int32")
    # Categorical features are stored with pandas' category dtype.
    for col in cat_cols_ext:
        if col in frame.columns:
            frame[col] = frame[col].astype("category")

def align_xy(X_tr, X_te, cats):
    """Give train and test frames the same categories and the same columns.

    Both frames are modified in place: category columns listed in ``cats`` and
    present in both frames get the sorted union of their categories, and a column
    missing from one frame is added there filled with 0. Returns the train frame
    reordered to the test frame's column order, and the test frame.
    """
    shared_cats = [c for c in cats if c in X_tr.columns and c in X_te.columns]
    for col in shared_cats:
        merged = sorted(set(X_tr[col].cat.categories) | set(X_te[col].cat.categories))
        X_tr[col] = X_tr[col].cat.set_categories(merged)
        X_te[col] = X_te[col].cat.set_categories(merged)

    # Work out both differences before adding any column.
    missing_in_te = [c for c in X_tr.columns if c not in X_te.columns]
    missing_in_tr = [c for c in X_te.columns if c not in X_tr.columns]
    for col in missing_in_te:
        X_te[col] = 0
    for col in missing_in_tr:
        X_tr[col] = 0

    return X_tr[X_te.columns], X_te

# Labels as an array; feature frames are copied so align_xy's in-place edits
# do not touch train_f / test_f.
y_arr = train_f["y"].values
X_all, X_te_all = align_xy(
    train_f.drop(columns=["y","id"]).copy(),
    test_f.drop(columns=["id"]).copy(),
    cat_cols_ext,
)

def cv_lgbm(X_tr_all, y_all, X_te_all, params, n_splits=5, seed=42, num_boost_round=4000, es_rounds=300, cats=None):
    """Stratified K-fold LightGBM training.

    Parameters
    ----------
    X_tr_all, X_te_all : DataFrame
        Train / test features (train rows are selected with ``.iloc``).
    y_all : array-like
        Labels, indexed positionally with the fold index arrays.
    params : dict
        Passed to ``lgb.train``. Early stopping is skipped when
        ``boosting_type`` is ``"dart"``.
    n_splits, seed : int
        StratifiedKFold configuration.
    num_boost_round, es_rounds : int
        Maximum boosting rounds and early-stopping patience.
    cats : list of str, optional
        Categorical feature names. Defaults to the module-level ``cat_cols_ext``
        (the previous, implicit behaviour).

    Returns
    -------
    (ndarray, ndarray)
        Out-of-fold predictions for the train rows and the fold-averaged test
        predictions.
    """
    cat_features = cat_cols_ext if cats is None else list(cats)
    oof = np.zeros(len(X_tr_all))
    test_pred = np.zeros(len(X_te_all))
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    folds = list(skf.split(X_tr_all, y_all))
    boosting = params.get("boosting_type","gbdt")
    use_es = boosting != "dart"
    for tr, va in tqdm(folds, desc=f"LGBM-{boosting}", leave=False):
        X_tr, X_va = X_tr_all.iloc[tr], X_tr_all.iloc[va]
        y_tr, y_va = y_all[tr], y_all[va]
        dtr = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_features, free_raw_data=False)
        dva = lgb.Dataset(X_va, label=y_va, categorical_feature=cat_features, free_raw_data=False)
        cbs = [lgb.log_evaluation(period=200)]
        if use_es: cbs.append(lgb.early_stopping(stopping_rounds=es_rounds))
        model = lgb.train(params, dtr, valid_sets=[dtr, dva], valid_names=['train','valid'],
                          num_boost_round=num_boost_round, callbacks=cbs)
        # Without early stopping every requested round is used for prediction.
        best_iter = model.best_iteration if use_es else num_boost_round
        oof[va] = model.predict(X_va, num_iteration=best_iter)
        test_pred += model.predict(X_te_all, num_iteration=best_iter) / n_splits
    return oof, test_pred

# Negative-to-positive class ratio, used as LightGBM's scale_pos_weight.
pos_weight = np.sum(y_arr == 0) / np.sum(y_arr == 1)

def make_params(seed, use_pos_weight=False):
    """Build the LightGBM GBDT parameter dict for a given seed.

    When ``use_pos_weight`` is true, ``scale_pos_weight`` is set to the
    module-level ``pos_weight``.
    """
    params = dict(
        objective="binary", metric="auc", boosting_type="gbdt",
        learning_rate=0.02, num_leaves=127, min_data_in_leaf=96,
        feature_fraction=0.8, bagging_fraction=0.8, bagging_freq=1,
        min_sum_hessian_in_leaf=5.0, lambda_l2=10.0, max_bin=511,
        extra_trees=True, seed=seed, n_jobs=-1,
    )
    if not use_pos_weight:
        return params
    params["scale_pos_weight"] = pos_weight
    return params

# Three GBDT runs: two plain seeds and one with class re-weighting.
seed_runs = [(42, False), (7, False), (2025, True)]
(o1, t1), (o2, t2), (o3, t3) = [
    cv_lgbm(X_all, y_arr, X_te_all, make_params(run_seed, weighted), num_boost_round=5000)
    for run_seed, weighted in seed_runs
]

print("GBDT seeds:", *(roc_auc_score(y_arr, oof_run) for oof_run in (o1, o2, o3)))


LGBM-gbdt:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 72391, number of negative: 527609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039035 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7265
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986273
[LightGBM] [Info] Start training from score -1.986273
Training until validation scores don't improve for 300 rounds
[200]	train's auc: 0.962973	valid's auc: 0.96211
[400]	train's auc: 0.96683	valid's auc: 0.964347
[600]	train's auc: 0.968922	valid's auc: 0.965146
[800]	train's auc: 0.970386	valid's auc: 0.965505
[1000]	train's auc: 0.97154	valid's auc: 0.965694
[1200]	train's auc: 0.972494	valid's auc: 0.965824
[1400]	train's auc: 0.973317	valid's auc: 0.965895
[1600]	train's auc: 0.974088

LGBM-gbdt:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 72391, number of negative: 527609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022140 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7262
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986273
[LightGBM] [Info] Start training from score -1.986273
Training until validation scores don't improve for 300 rounds
[200]	train's auc: 0.962937	valid's auc: 0.962104
[400]	train's auc: 0.966709	valid's auc: 0.964338
[600]	train's auc: 0.968897	valid's auc: 0.965187
[800]	train's auc: 0.970378	valid's auc: 0.965554
[1000]	train's auc: 0.971523	valid's auc: 0.965718
[1200]	train's auc: 0.972485	valid's auc: 0.965847
[1400]	train's auc: 0.97332	valid's auc: 0.965922
[1600]	train's auc: 0.9740

LGBM-gbdt:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 72391, number of negative: 527609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022811 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7329
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986273
[LightGBM] [Info] Start training from score -1.986273
Training until validation scores don't improve for 300 rounds
[200]	train's auc: 0.962914	valid's auc: 0.961574
[400]	train's auc: 0.967258	valid's auc: 0.964145
[600]	train's auc: 0.969623	valid's auc: 0.965015
[800]	train's auc: 0.971215	valid's auc: 0.96537
[1000]	train's auc: 0.972475	valid's auc: 0.965552
[1200]	train's auc: 0.973508	valid's auc: 0.965616
[1400]	train's auc: 0.974402	valid's auc: 0.965631
[1600]	train's auc: 0.9752

In [50]:
import os, json, numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Output layout: submissions and cached OOF/test predictions live under outputs/.
OUT = Path("outputs")
SUB = OUT/"submissions"
CACHE = OUT/"cache"
for out_dir in [OUT,SUB,CACHE]:
    out_dir.mkdir(parents=True, exist_ok=True)

# Look for data/ next to the notebook and up to two levels above it.
data_candidates = [Path.cwd()/ "data", Path.cwd().parent/"data", Path.cwd().parent.parent/"data"]
DATA_DIR = next((p for p in data_candidates if (p/"train.csv").exists()), None)
if DATA_DIR is None:
    # Explicit exception instead of `assert`, which is skipped under `python -O`.
    raise FileNotFoundError(
        "data/train.csv not found; searched: " + ", ".join(str(p) for p in data_candidates)
    )

train = pd.read_csv(DATA_DIR/"train.csv")
test  = pd.read_csv(DATA_DIR/"test.csv")

y = train["y"].values
X = train.drop(columns=["y","id"]).copy()
X_test = test.drop(columns=["id"]).copy()

BASE_NUM = ["age","balance","day","duration","campaign","pdays","previous"]
BASE_CAT = ["job","marital","education","default","housing","loan","contact","month","poutcome"]

X_base = train[BASE_NUM + BASE_CAT].copy()
X_test_base = test[BASE_NUM + BASE_CAT].copy()

for c in BASE_CAT:
    X_base[c] = X_base[c].astype("category")
    X_test_base[c] = X_test_base[c].astype("category")

# Derived from the BASE_* lists so the two sets of names cannot drift apart.
num_cols = list(BASE_NUM)
cat_cols = list(BASE_CAT)

for c in cat_cols:
    X[c] = X[c].astype("category")
    X_test[c] = X_test[c].astype("category")

def save_cache(name, oof, pred):
    """Store OOF and test predictions as ``<name>_oof.npy`` / ``<name>_test.npy`` in CACHE."""
    oof_path = CACHE/f"{name}_oof.npy"
    test_path = CACHE/f"{name}_test.npy"
    np.save(oof_path, oof)
    np.save(test_path, pred)

def load_cache(name):
    """Load the ``(oof, test)`` prediction arrays stored under ``name`` in CACHE."""
    return np.load(CACHE/f"{name}_oof.npy"), np.load(CACHE/f"{name}_test.npy")

def has_cache(name):
    """Return True only when both cached arrays for ``name`` exist in CACHE."""
    required = (CACHE/f"{name}_oof.npy", CACHE/f"{name}_test.npy")
    return all(path.exists() for path in required)


In [51]:
import lightgbm as lgb
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold
import numpy as np

def _align_lgbm_frames(X_tr_all, X_te_all, cats):
    X_tr = X_tr_all.copy()
    X_te = X_te_all.copy()
    tr_only = [c for c in X_tr.columns if c not in X_te.columns]
    te_only = [c for c in X_te.columns if c not in X_tr.columns]
    for c in tr_only: X_te[c] = 0
    for c in te_only: X_tr[c] = 0
    X_tr = X_tr[X_te.columns] 

    for c in cats:
        if c in X_tr.columns and str(X_tr[c].dtype) == "category":
            cats_union = sorted(set(X_tr[c].cat.categories).union(
                               set(X_te[c].cat.categories) if str(X_te[c].dtype)=="category" else []))
            X_tr[c] = X_tr[c].cat.set_categories(cats_union)
            X_te[c] = X_te[c].astype("category").cat.set_categories(cats_union)
    return X_tr, X_te

def cv_lgbm(Xdf, yarr, Xte, params, n_splits=5, seed=42, num_boost_round=3500, es_rounds=250, cats=BASE_CAT):
    """Stratified K-fold LightGBM on column/category-aligned copies of the inputs.

    Returns ``(oof, pred)``: out-of-fold predictions for the train rows and the
    test predictions averaged over folds. Early stopping is applied unless
    ``params["boosting_type"]`` is ``"dart"``, in which case all
    ``num_boost_round`` rounds are used.
    """
    train_X, test_X = _align_lgbm_frames(Xdf, Xte, cats)
    assert list(train_X.columns) == list(test_X.columns), "columns still misaligned"

    boosting = params.get("boosting_type","gbdt")
    early_stop = boosting != "dart"
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    fold_indices = list(splitter.split(train_X, yarr))

    oof = np.zeros(len(train_X))
    pred = np.zeros(len(test_X))
    for fit_idx, val_idx in tqdm(fold_indices, desc=f"LGBM-{boosting}", leave=False):
        fit_X, val_X = train_X.iloc[fit_idx], train_X.iloc[val_idx]
        fit_set = lgb.Dataset(fit_X, label=yarr[fit_idx], categorical_feature=cats, free_raw_data=False)
        val_set = lgb.Dataset(val_X, label=yarr[val_idx], categorical_feature=cats, free_raw_data=False)

        callbacks = [lgb.log_evaluation(period=200)]
        if early_stop:
            callbacks.append(lgb.early_stopping(stopping_rounds=es_rounds))

        booster = lgb.train(
            params, fit_set,
            valid_sets=[fit_set, val_set], valid_names=["train","valid"],
            num_boost_round=num_boost_round, callbacks=callbacks,
        )
        n_iter = booster.best_iteration if early_stop else num_boost_round
        oof[val_idx] = booster.predict(val_X, num_iteration=n_iter)
        pred += booster.predict(test_X, num_iteration=n_iter)/n_splits
    return oof, pred


In [52]:
# Negative-to-positive class ratio, passed as scale_pos_weight for the weighted run.
pos_weight = np.sum(y == 0) / np.sum(y == 1)
def make_params(seed, spw=None):
    """Build the LightGBM GBDT parameter dict for ``seed``.

    ``spw``, when not None, is stored as ``scale_pos_weight``.
    """
    params = dict(
        objective="binary", metric="auc", boosting_type="gbdt",
        learning_rate=0.03, num_leaves=127, min_data_in_leaf=96,
        feature_fraction=0.85, bagging_fraction=0.85, bagging_freq=1,
        min_sum_hessian_in_leaf=5.0, lambda_l2=10.0, max_bin=511,
        seed=seed, n_jobs=-1, verbosity=-1, force_row_wise=True,
    )
    if spw is None:
        return params
    params["scale_pos_weight"] = spw
    return params

# Each run is trained once and cached. On later executions the cached arrays are
# loaded, so o1..t3 are always defined and the OOF AUC is always reported.
if has_cache("lgb_s42"):
    o1,t1 = load_cache("lgb_s42")
else:
    o1,t1 = cv_lgbm(X_base, y, X_test_base, make_params(42), num_boost_round=3200, es_rounds=250, cats=BASE_CAT)
    save_cache("lgb_s42", o1, t1)
print("LGB s42:", roc_auc_score(y, o1))

if has_cache("lgb_s7"):
    o2,t2 = load_cache("lgb_s7")
else:
    o2,t2 = cv_lgbm(X_base, y, X_test_base, make_params(7), num_boost_round=3200, es_rounds=250, cats=BASE_CAT)
    save_cache("lgb_s7", o2, t2)
print("LGB s7:", roc_auc_score(y, o2))

if has_cache("lgb_spw"):
    o3,t3 = load_cache("lgb_spw")
else:
    o3,t3 = cv_lgbm(X_base, y, X_test_base, make_params(2025, pos_weight), num_boost_round=3200, es_rounds=250, cats=BASE_CAT)
    save_cache("lgb_spw", o3, t3)
print("LGB spw:", roc_auc_score(y, o3))


LGBM-gbdt:   0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 250 rounds
[200]	train's auc: 0.969572	valid's auc: 0.967948
[400]	train's auc: 0.973677	valid's auc: 0.970091
[600]	train's auc: 0.976166	valid's auc: 0.970833
[800]	train's auc: 0.978117	valid's auc: 0.971229
[1000]	train's auc: 0.979729	valid's auc: 0.971425
[1200]	train's auc: 0.981158	valid's auc: 0.97154
[1400]	train's auc: 0.982424	valid's auc: 0.971612
[1600]	train's auc: 0.983609	valid's auc: 0.971647
[1800]	train's auc: 0.984694	valid's auc: 0.971671
[2000]	train's auc: 0.985688	valid's auc: 0.971665
Early stopping, best iteration is:
[1815]	train's auc: 0.984777	valid's auc: 0.971679
Training until validation scores don't improve for 250 rounds
[200]	train's auc: 0.969851	valid's auc: 0.966885
[400]	train's auc: 0.97401	valid's auc: 0.969166
[600]	train's auc: 0.976475	valid's auc: 0.969914
[800]	train's auc: 0.978366	valid's auc: 0.970263
[1000]	train's auc: 0.979969	valid's auc: 0.970426
[1200]	train's auc: 0.981398	valid'

LGBM-gbdt:   0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 250 rounds
[200]	train's auc: 0.96957	valid's auc: 0.96794
[400]	train's auc: 0.973729	valid's auc: 0.97014
[600]	train's auc: 0.976168	valid's auc: 0.97086
[800]	train's auc: 0.978107	valid's auc: 0.971225
[1000]	train's auc: 0.979719	valid's auc: 0.971407
[1200]	train's auc: 0.981148	valid's auc: 0.971525
[1400]	train's auc: 0.982435	valid's auc: 0.971582
[1600]	train's auc: 0.983609	valid's auc: 0.971617
[1800]	train's auc: 0.984714	valid's auc: 0.971648
[2000]	train's auc: 0.985719	valid's auc: 0.971646
[2200]	train's auc: 0.986628	valid's auc: 0.971635
Early stopping, best iteration is:
[1957]	train's auc: 0.985517	valid's auc: 0.971653
Training until validation scores don't improve for 250 rounds
[200]	train's auc: 0.969853	valid's auc: 0.966908
[400]	train's auc: 0.974023	valid's auc: 0.969123
[600]	train's auc: 0.97648	valid's auc: 0.969891
[800]	train's auc: 0.978381	valid's auc: 0.970266
[1000]	train's auc: 0.979987	valid's a

LGBM-gbdt:   0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 250 rounds
[200]	train's auc: 0.96912	valid's auc: 0.967158
[400]	train's auc: 0.973914	valid's auc: 0.969766
[600]	train's auc: 0.976571	valid's auc: 0.970462
[800]	train's auc: 0.978624	valid's auc: 0.970826
[1000]	train's auc: 0.980364	valid's auc: 0.971057
[1200]	train's auc: 0.981889	valid's auc: 0.971209
[1400]	train's auc: 0.983233	valid's auc: 0.971301
[1600]	train's auc: 0.984453	valid's auc: 0.971349
[1800]	train's auc: 0.985556	valid's auc: 0.971369
[2000]	train's auc: 0.986577	valid's auc: 0.971376
[2200]	train's auc: 0.987519	valid's auc: 0.971369
Early stopping, best iteration is:
[1979]	train's auc: 0.986471	valid's auc: 0.971381
Training until validation scores don't improve for 250 rounds
[200]	train's auc: 0.969518	valid's auc: 0.966251
[400]	train's auc: 0.974143	valid's auc: 0.968669
[600]	train's auc: 0.976803	valid's auc: 0.969377
[800]	train's auc: 0.97888	valid's auc: 0.969716
[1000]	train's auc: 0.980572	valid'

In [None]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from tqdm.auto import tqdm
import numpy as np
from sklearn.metrics import roc_auc_score
from pathlib import Path

def cv_cat_cpu(Xdf, yarr, Xte, cat_cols, n_splits=5, seed=42):
    """Stratified K-fold CatBoost (CPU) training.

    Parameters
    ----------
    Xdf, Xte : DataFrame
        Train / test features (train rows are selected with ``.iloc``).
    yarr : array-like
        Labels, indexed positionally with the fold index arrays.
    cat_cols : list
        Passed to ``Pool`` as ``cat_features``.
    n_splits, seed : int
        StratifiedKFold configuration; each fold's model uses ``seed + fold``.

    Returns
    -------
    (ndarray, ndarray)
        Out-of-fold probabilities for the train rows and the fold-averaged test
        probabilities.
    """
    oof = np.zeros(len(Xdf), dtype=float)
    pred = np.zeros(len(Xte), dtype=float)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # The test pool does not depend on the fold, so build it once rather than per fold.
    tep = Pool(Xte, cat_features=cat_cols)

    for fold, (tr, va) in enumerate(tqdm(list(skf.split(Xdf, yarr)), desc="CAT-CPU", leave=False), 1):
        X_tr, X_va = Xdf.iloc[tr], Xdf.iloc[va]
        y_tr, y_va = yarr[tr], yarr[va]

        trp = Pool(X_tr, y_tr, cat_features=cat_cols)
        vap = Pool(X_va, y_va, cat_features=cat_cols)

        m = CatBoostClassifier(
            iterations=2500,
            depth=6,
            learning_rate=0.05,
            l2_leaf_reg=6,
            loss_function="Logloss",
            eval_metric="AUC",
            random_seed=seed + fold,
            od_type="Iter",
            od_wait=200,
            verbose=200,
            task_type="CPU",
            thread_count=-1,
            allow_writing_files=False,
        )
        m.fit(trp, eval_set=vap, use_best_model=True)
        oof[va] = m.predict_proba(vap)[:, 1]
        pred += m.predict_proba(tep)[:, 1] / n_splits

    return oof, pred

# Train CatBoost only when no cached predictions exist; the submission below is
# always written from the cache.
if not has_cache("cat_cpu_base"):
    oc, tc = cv_cat_cpu(X_base, getattr(y, "values", y), X_test_base, BASE_CAT)
    print("CAT OOF:", roc_auc_score(y, oc))
    save_cache("cat_cpu_base", oc, tc)

Path("outputs/submissions").mkdir(parents=True, exist_ok=True)
cat_submission = pd.DataFrame({"id": test["id"], "y": load_cache("cat_cpu_base")[1]})
cat_submission.to_csv("outputs/submissions/cat_cpu_base.csv", index=False)


CAT-CPU:   0%|          | 0/5 [00:00<?, ?it/s]

0:	test: 0.9147978	best: 0.9147978 (0)	total: 384ms	remaining: 16m
200:	test: 0.9607437	best: 0.9607437 (200)	total: 1m 14s	remaining: 14m 12s
400:	test: 0.9637184	best: 0.9637184 (399)	total: 2m 54s	remaining: 15m 13s
600:	test: 0.9647026	best: 0.9647026 (600)	total: 4m 32s	remaining: 14m 21s
800:	test: 0.9654050	best: 0.9654050 (800)	total: 6m	remaining: 12m 44s
1000:	test: 0.9659356	best: 0.9659356 (1000)	total: 7m 39s	remaining: 11m 28s
1200:	test: 0.9663450	best: 0.9663454 (1199)	total: 8m 58s	remaining: 9m 42s
1400:	test: 0.9666455	best: 0.9666455 (1400)	total: 10m 1s	remaining: 7m 51s
1600:	test: 0.9669083	best: 0.9669083 (1600)	total: 11m 3s	remaining: 6m 12s
1800:	test: 0.9671197	best: 0.9671197 (1800)	total: 12m 4s	remaining: 4m 41s
2000:	test: 0.9672824	best: 0.9672824 (2000)	total: 13m 3s	remaining: 3m 15s
2200:	test: 0.9674379	best: 0.9674379 (2200)	total: 14m 3s	remaining: 1m 54s
2400:	test: 0.9675780	best: 0.9675783 (2395)	total: 15m 3s	remaining: 37.3s
2499:	test: 0.967

In [58]:
import numpy as np, pandas as pd, xgboost, sklearn
print("sklearn:", sklearn.__version__, "xgboost:", xgboost.__version__)

# Every base categorical must be present; the remaining columns are treated as numeric.
assert set(BASE_CAT) <= set(X_base.columns)
num_inferred = [c for c in X_base.columns if c not in BASE_CAT]
print("num_inferred:", num_inferred)
print("len(X_base), len(X_test_base):", len(X_base), len(X_test_base))

for frame in (X_base, X_test_base):
    # Re-apply the category dtype where it is missing.
    for col in BASE_CAT:
        if str(frame[col].dtype) != "category":
            frame[col] = frame[col].astype("category")
    # Coerce numeric columns; unparseable entries become NaN.
    frame[num_inferred] = frame[num_inferred].apply(pd.to_numeric, errors="coerce")


sklearn: 1.7.1 xgboost: 3.0.4
num_inferred: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
len(X_base), len(X_test_base): 750000 250000


In [61]:
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

def _make_ohe():
    """Create a dense-output OneHotEncoder that ignores unknown categories.

    Tries the ``sparse_output`` keyword first and falls back to ``sparse`` when
    the constructor rejects it with a TypeError.
    """
    try:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
    return encoder

def cv_xgb_ohe_native(Xdf, yarr, Xte, base_cat, n_splits=5, seed=42, es_rounds=400, num_boost_round=6000):
    """Stratified K-fold XGBoost (native API) on one-hot encoded features.

    Columns not listed in ``base_cat`` are used as float32 numerics. The
    ``base_cat`` columns are cast to str and one-hot encoded with an encoder fit
    on each fold's training part. Each fold trains with seed ``seed + fold`` and
    early stopping on the validation AUC, keeping the best model. Returns
    ``(oof, pred)``: out-of-fold predictions and fold-averaged test predictions.
    """
    numeric_cols = [c for c in Xdf.columns if c not in base_cat]

    def _design(frame, encode):
        # Dense float32 matrix: numeric block followed by the one-hot block.
        numeric_block = frame[numeric_cols].to_numpy(dtype=np.float32)
        onehot_block = encode(frame[base_cat].astype(str)).astype(np.float32)
        return np.hstack([numeric_block, onehot_block])

    oof = np.zeros(len(Xdf), dtype=float)
    pred = np.zeros(len(Xte), dtype=float)
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    fold_plan = list(splitter.split(Xdf, yarr))

    for fold, (tr, va) in enumerate(tqdm(fold_plan, desc="XGB-OHE(native)", leave=False), 1):
        encoder = _make_ohe()
        train_matrix = _design(Xdf.iloc[tr], encoder.fit_transform)
        valid_matrix = _design(Xdf.iloc[va], encoder.transform)
        test_matrix = _design(Xte, encoder.transform)

        dtr = xgb.DMatrix(train_matrix, label=yarr[tr])
        dva = xgb.DMatrix(valid_matrix, label=yarr[va])
        dte = xgb.DMatrix(test_matrix)

        params = dict([
            ("objective", "binary:logistic"),
            ("eval_metric", "auc"),
            ("eta", 0.03),
            ("max_depth", 6),
            ("subsample", 0.8),
            ("colsample_bytree", 0.8),
            ("lambda", 5.0),
            ("alpha", 0.0),
            ("tree_method", "hist"),
            ("seed", seed + fold),
        ])

        stopper = xgb.callback.EarlyStopping(rounds=es_rounds, save_best=True, maximize=True)
        booster = xgb.train(
            params, dtr,
            num_boost_round=num_boost_round,
            evals=[(dtr, "train"), (dva, "valid")],
            callbacks=[stopper],
            verbose_eval=False,
        )

        oof[va] = booster.predict(dva)
        pred += booster.predict(dte) / n_splits

    return oof, pred


In [62]:
from sklearn.metrics import roc_auc_score

# Reuse cached XGBoost predictions when available; otherwise train and cache them.
if has_cache("xgb_ohe_base"):
    ox, tx = load_cache("xgb_ohe_base")
    print("XGB(OHE) OOF:", roc_auc_score(y, ox))
else:
    ox, tx = cv_xgb_ohe_native(X_base, y, X_test_base, BASE_CAT, n_splits=5, seed=42, es_rounds=400, num_boost_round=6000)
    print("XGB(OHE) OOF:", roc_auc_score(y, ox))
    save_cache("xgb_ohe_base", ox, tx)

xgb_submission = pd.DataFrame({"id": test["id"], "y": tx})
xgb_submission.to_csv("outputs/submissions/xgb_ohe_base.csv", index=False)


XGB-OHE(native):   0%|          | 0/5 [00:00<?, ?it/s]

XGB(OHE) OOF: 0.968435070082612
