# 機械学習をPythonで実践する-12　　～ Bagging ～

In [56]:
%load_ext autoreload
%autoreload 2
import polars as pl
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OrdinalEncoder
# import statsmodels.api as sma
from sklearn.model_selection import train_test_split ,cross_val_score, KFold, RepeatedKFold,StratifiedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, log_loss, confusion_matrix,ConfusionMatrixDisplay, \
accuracy_score, precision_score, recall_score,precision_recall_curve,f1_score,roc_curve,auc,get_scorer_names,roc_auc_score
from sklearn import tree
from sklearn.ensemble import BaggingClassifier

%matplotlib inline
import matplotlib.pyplot as plt


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Bagging（分類タスク）
sklearn.ensemble.BaggingClassifierクラスを使う。

1. インスタンス生成  
    ・base_estimator: sklearnのモデルインスタンス(デフォルトは決定木)  
    ・n_estimators: Baggingに使うモデルの数(デフォルト10)  
2. .fit(X, y)で学習
3. .predict(X)で予測（多数決）。.predict_probaは確率の平均。


In [7]:
# polarsでタイタニックデータを読み込み
df = pl.from_pandas(sns.load_dataset('titanic'))

In [9]:
df.head(2)

survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
i64,i64,str,f64,i64,i64,f64,str,cat,str,bool,cat,str,str,bool
0,3,"""male""",22.0,1,0,7.25,"""S""","""Third""","""man""",True,,"""Southampton""","""no""",False
1,1,"""female""",38.0,1,0,71.2833,"""C""","""First""","""woman""",False,"""C""","""Cherbourg""","""yes""",False


In [10]:
df.null_count()

survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,177,0,0,0,2,0,0,0,688,2,0,0


In [35]:
# 今回は単純に欠損値を落とす。
df = df.drop_nulls()

In [36]:
# 学習データ、目的変数を定義
X = df.drop(['survived', 'alive'])
y = df.get_column('survived')

In [37]:
X.head(1)

pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
i64,str,f64,i64,i64,f64,str,cat,str,bool,cat,str,bool
1,"""female""",38.0,1,0,71.2833,"""C""","""First""","""woman""",False,"""C""","""Cherbourg""",False


In [38]:
# ラベルエンコーディング（LabelEncoderではなく、OrdinalEncoderを使う）
oe = OrdinalEncoder()
# pandasで返ってくるように指定。polarsは指定できない模様
oe.set_output(transform='pandas')
# polars.DFはそのまま入れられないのでpandasに変換する。
X = pl.from_pandas(oe.fit_transform(X.to_pandas()))

In [39]:
X.head(1)

pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.0,0.0,32.0,1.0,0.0,57.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0


Ordinal Encoderを使用すると、量的変数も質的変数も全てf64にキャストされるっぽい。  
両者を分ける必要がないのは楽だが、メモリの無駄遣い感がある・・・

In [40]:
# hold-out
X_train, X_test, y_train, y_test = train_test_split(X.to_pandas(), y.to_pandas(), test_size=0.3, random_state=0)

In [41]:
# Bagging
clf = BaggingClassifier(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)

In [44]:
clf.classes_

array([0, 1])

1が生存

In [51]:
y_pred[:6]

array([[0.5, 0.5],
       [0.8, 0.2],
       [0.1, 0.9],
       [0. , 1. ],
       [0.3, 0.7],
       [0.7, 0.3]])

In [52]:
y_test[:6]

94     1
18     0
33     1
98     1
181    1
168    0
Name: survived, dtype: int64

概ねあっていそう。

In [55]:
# AUCを計算
roc_auc_score(y_test, y_pred[:,1])

0.8974358974358974

In [86]:
clf.estimators_

[DecisionTreeClassifier(random_state=2087557356),
 DecisionTreeClassifier(random_state=132990059),
 DecisionTreeClassifier(random_state=1109697837),
 DecisionTreeClassifier(random_state=123230084),
 DecisionTreeClassifier(random_state=633163265),
 DecisionTreeClassifier(random_state=998640145),
 DecisionTreeClassifier(random_state=1452413565),
 DecisionTreeClassifier(random_state=2006313316),
 DecisionTreeClassifier(random_state=45050103),
 DecisionTreeClassifier(random_state=395371042)]

Baggingの中で使用したモデルを取得することができる。

### ※決定木だけで予測した場合

In [106]:
dtree = tree.DecisionTreeClassifier(random_state=0, max_depth=5).fit(X_train, y_train)
y_pred_dtree = dtree.predict_proba(X_test)

In [107]:
roc_auc_score(y_test, y_pred_dtree[:,1])

0.8317307692307693

max_depthは適当だが、Baggingの結果の方が精度良さそう。