## Bagging

In [65]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier

In [82]:
# Load the Titanic dataset and drop every row that contains a missing value.
# dropna() returns a new frame instead of mutating in place, so this cell
# gives the same result every time it is re-run.
df = sns.load_dataset('titanic').dropna()


# Prepare features and target.
# 'alive' is excluded together with the target 'survived'
# (NOTE(review): in the seaborn dataset it appears to mirror the target, so
# keeping it would leak the label -- confirm).
features = df.loc[:, ~df.columns.isin(['survived', 'alive'])]
y = df['survived']


# Ordinal (label) encoding of the non-numeric columns.
oe = OrdinalEncoder()


# The encoder returns a NumPy array by default; ask for a pandas DataFrame
# so the encoded columns keep their names and index.
oe.set_output(transform='pandas')


# Select every non-numeric column and encode it.
cat_cols = features.select_dtypes(exclude=np.number).columns.to_numpy()
encoded = oe.fit_transform(features[cat_cols])

# Build X as a new frame whose non-numeric columns are replaced wholesale by
# their numeric codes. Unlike `X.loc[:, cat_cols] = ...`, this never tries to
# write floats into the existing category/object/bool columns and never
# assigns into a frame derived from `df`.
X = features.assign(**{col: encoded[col] for col in cat_cols})


# Hold-out split: 70% train / 30% test, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [83]:
# Training: fit a bagging ensemble with default settings and a fixed seed,
# then predict class probabilities for the held-out test set.
# fit() returns the estimator itself, so construction and fitting can be chained.
clf = BaggingClassifier(random_state=0).fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)

In [84]:
# Evaluation: ROC AUC on the test set, scored with the second column of the
# predicted probabilities.
bagging_auc = roc_auc_score(y_test, y_pred[:, 1])
print(f"bagging AUC:{bagging_auc}")

bagging AUC:0.8846153846153846


## 一本の決定木の精度確認

In [86]:
# Accuracy of a single decision tree, used as a baseline for the bagged ensemble.
# Same train/test split and seed as the bagging model above.
single_tree = DecisionTreeClassifier(random_state=0)
single_tree.fit(X_train, y_train)
y_pred_tree = single_tree.predict_proba(X_test)

# ROC AUC scored with the second column of the predicted probabilities.
print(f"sigle tree AUC:{roc_auc_score(y_test,y_pred_tree[:, 1])}")

sigle tree AUC:0.7596153846153846


バギングで多数のモデルをアンサンブルした方が、単一の決定木よりも精度（AUC）が良い（0.885 vs 0.760）！！