### Import Libraries

In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

sns.set_theme(style="whitegrid", palette="husl")
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['axes.grid'] = True
plt.rcParams['grid.alpha'] = 0.3

### Import Data

In [47]:
train = pd.read_csv('../data/inputs/train.csv', index_col='id')
test = pd.read_csv('../data/inputs/test.csv', index_col='id')

In [48]:
train.info()
train.head()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 17 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   age        750000 non-null  int64 
 1   job        750000 non-null  object
 2   marital    750000 non-null  object
 3   education  750000 non-null  object
 4   default    750000 non-null  object
 5   balance    750000 non-null  int64 
 6   housing    750000 non-null  object
 7   loan       750000 non-null  object
 8   contact    750000 non-null  object
 9   day        750000 non-null  int64 
 10  month      750000 non-null  object
 11  duration   750000 non-null  int64 
 12  campaign   750000 non-null  int64 
 13  pdays      750000 non-null  int64 
 14  previous   750000 non-null  int64 
 15  poutcome   750000 non-null  object
 16  y          750000 non-null  int64 
dtypes: int64(8), object(9)
memory usage: 103.0+ MB


Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1


In [49]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

categorical_cols = train.select_dtypes(exclude=['number']).columns

for col in categorical_cols:
    train[col] = le.fit_transform(train[col])

categorical_cols = test.select_dtypes(exclude=['number']).columns

for col in categorical_cols:
    test[col] = le.fit_transform(test[col])

train.head()
test.head()

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
750000,32,1,1,1,0,1397,1,0,2,21,8,224,1,-1,0,3
750001,44,4,1,2,0,23,1,0,0,3,0,586,2,-1,0,3
750002,36,6,1,0,0,46,1,1,0,13,8,111,2,-1,0,3
750003,58,1,1,1,0,-1380,1,1,2,29,8,125,1,-1,0,3
750004,28,9,2,1,0,1950,1,0,0,22,5,181,1,-1,0,3


### Model Training

In [53]:
def data_process(df):
    df = df.copy()

    def many_no(x):
        if x['default'] == 0 and x['housing'] == 0 and x['loan'] == 0:
            return 21
        if x['default'] == 0 and x['housing'] == 0 \
        or x['default'] == 0 and x['loan'] == 0 \
        or x['housing'] == 0 and x['loan'] == 0:
            return 7
        if x['default'] == 'no' or x['housing'] == 'no' or x['loan'] == 'no':
            return 3
        return 0

    df['many_no'] = df.apply(lambda x:many_no(x), axis=1)

    return df

In [57]:
from sklearn.model_selection import train_test_split, StratifiedKFold

X = train.drop('y', axis=1)
y = train['y']

X_str = data_process(X).astype('int')
test_str = data_process(test).astype('int')

X.head()
X_str.head()

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,many_no
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,42,9,1,1,0,7,0,0,0,25,1,117,3,-1,0,3,21
1,38,1,1,1,0,514,0,0,2,18,6,185,1,-1,0,3,21
2,36,1,1,1,0,602,1,0,2,14,8,111,2,-1,0,3,7
3,27,8,2,1,0,34,1,0,2,28,8,10,2,-1,0,3,7
4,26,9,1,1,0,889,1,0,0,3,3,902,1,-1,0,3,7


In [59]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

xgb_clf = xgb.XGBClassifier(
    max_depth=6,
    objective='binary:logistic',
    eval_metric='auc',
    learning_rate=0.1,
    n_estimators=1000,
    tree_method='hist'
)

N_SPLITS = 5
skfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
test_pred = np.zeros(len(test_str))
roc_scores = []

for fold, (train_idx, test_idx) in enumerate(
    tqdm(skfold.split(X_str, y), total=N_SPLITS, desc="XGB CV")
):
    print(f"Training fold {fold + 1}/{N_SPLITS}...")
    X_train, X_val = X_str.iloc[train_idx], X_str.iloc[test_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

    model = clone(xgb_clf)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    y_pred = model.predict_proba(X_val)[:, 1]
    roc = roc_auc_score(y_val, y_pred)
    roc_scores.append(roc)
    print(f"Fold {fold + 1} ROC AUC: {roc:.4f}")

    test_pred += model.predict_proba(test_str)[:, 1]

print(f"Average Fold ROC AUC: {np.mean(roc_scores):.4f} ± {np.std(roc_scores):.4f}")
test_pred = test_pred / N_SPLITS

XGB CV:   0%|          | 0/5 [00:00<?, ?it/s]

Training fold 1/5...
Fold 1 ROC AUC: 0.9683


XGB CV:  20%|██        | 1/5 [00:42<02:50, 42.68s/it]

Training fold 2/5...
Fold 2 ROC AUC: 0.9673


XGB CV:  40%|████      | 2/5 [01:23<02:05, 41.84s/it]

Training fold 3/5...
Fold 3 ROC AUC: 0.9670


XGB CV:  60%|██████    | 3/5 [02:03<01:21, 40.86s/it]

Training fold 4/5...
Fold 4 ROC AUC: 0.9684


XGB CV:  80%|████████  | 4/5 [02:43<00:40, 40.38s/it]

Training fold 5/5...
Fold 5 ROC AUC: 0.9676


XGB CV: 100%|██████████| 5/5 [03:23<00:00, 40.71s/it]

Average Fold ROC AUC: 0.9677 ± 0.0005





### Submission

In [62]:
sub = pd.read_csv('../data/inputs/sample_submission.csv', index_col='id')
sub['y'] = test_pred
sub.to_csv('../data/outputs/submission_xgb.csv')
sub.head()

Unnamed: 0_level_0,y
id,Unnamed: 1_level_1
750000,0.001331
750001,0.068358
750002,0.000159
750003,2.7e-05
750004,0.012634


In [63]:
!kaggle competitions submit -c playground-series-s5e8 -f ../data/outputs/submission_xgb.csv -m "XGBoost"

100%|██████████████████████████████████████| 6.72M/6.72M [00:01<00:00, 3.83MB/s]
Successfully submitted to Binary Classification with a Bank Dataset

### ハイパーパラメータチューニング

Optunaを使用して、`max_depth, gamma, lambda, alpha`を最適化する。

In [66]:
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split, StratifiedKFold
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.base import clone
from tqdm import tqdm

base_clf = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    learning_rate=0.1,
    n_estimators=1000,
    tree_method="hist",
    n_jobs=-1,
    random_state=42
)


N_SPLITS = 5
skfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

def objective(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 10.0),

        "reg_lambda": trial.suggest_float("lambda", 1e-8, 10.0, log=True),
        "reg_alpha":  trial.suggest_float("alpha",  1e-8, 10.0, log=True),
    }

    fold_scores = []
    for fold, (tr_idx, va_idx) in enumerate(skfold.split(X_str, y), start=1):
        X_tr, X_va = X_str.iloc[tr_idx], X_str.iloc[va_idx]
        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

        model = clone(base_clf)
        model.set_params(**params)

        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            verbose=False
        )

        y_pred = model.predict_proba(X_va)[:, 1]
        fold_scores.append(roc_auc_score(y_va, y_pred))

    return float(np.mean(fold_scores))

In [67]:
study = optuna.create_study(
    direction="maximize",
    sampler=TPESampler(seed=42)
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("Best AUC:", study.best_value)
print("Best params:", study.best_params)

best_params = study.best_params
best_params_renamed = {
    "max_depth": best_params["max_depth"],
    "gamma": best_params["gamma"],
    "reg_lambda": best_params["lambda"],
    "reg_alpha": best_params["alpha"],
}

test_pred = np.zeros(len(test_str))
roc_scores = []

print("\nRe-training with best params and computing out-of-fold AUC & test predictions...")
for fold, (train_idx, test_idx) in enumerate(
    tqdm(skfold.split(X_str, y), total=N_SPLITS, desc="XGB CV (best)")
):
    print(f"Training fold {fold + 1}/{N_SPLITS}...")
    X_train, X_val = X_str.iloc[train_idx], X_str.iloc[test_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

    model = clone(base_clf)
    model.set_params(**best_params_renamed)

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False,
        early_stopping_rounds=50,
    )

    y_pred = model.predict_proba(X_val)[:, 1]
    roc = roc_auc_score(y_val, y_pred)
    roc_scores.append(roc)
    print(f"Fold {fold + 1} ROC AUC: {roc:.4f}")

    test_pred += model.predict_proba(test_str)[:, 1]

print(f"Average Fold ROC AUC: {np.mean(roc_scores):.4f} ± {np.std(roc_scores):.4f}")
test_pred = test_pred / N_SPLITS

[I 2025-08-27 15:12:28,394] A new study created in memory with name: no-name-955c9420-57b7-45b3-9af0-942d3e2253c5
  0%|          | 0/50 [01:34<?, ?it/s]

[W 2025-08-27 15:14:02,398] Trial 0 failed with parameters: {'max_depth': 5, 'gamma': 9.50714306409916, 'lambda': 0.03872090295370417, 'alpha': 0.0024430162614261434} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/kazuki/Documents/Kaggle/.kaggle/lib/python3.13/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/xq/bpxwdpg112n7q8kn_d1dbjg80000gn/T/ipykernel_5162/658933635.py", line 41, in objective
    model.fit(
    ~~~~~~~~~^
        X_tr, y_tr,
        ^^^^^^^^^^^
        eval_set=[(X_va, y_va)],
        ^^^^^^^^^^^^^^^^^^^^^^^^
        verbose=False
        ^^^^^^^^^^^^^
    )
    ^
  File "/Users/kazuki/Documents/Kaggle/.kaggle/lib/python3.13/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
  File "/Users/kazuki/Documents/Kaggle/.kaggle/lib/python3.13/site-packages/xgboost/sklearn.py", line 1682, in fit
    self._Booster = train(




KeyboardInterrupt: 