# スタッキングの実装
ここでは2値分類用のスタッキングのクラスを実装する。  
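scikit-learnにもスタッキング用のクラス（`StackingClassifier`）が存在する。当初は学習時にKFold CVをしておらず過学習気味になる恐れがあるように見えたが、公式ドキュメントによると `final_estimator` は `cv` 引数に基づく `cross_val_predict` の予測値（OOF予測）で学習される（ベースモデル自体は最後に全データで再学習される）。ここでは仕組みを理解するために、KFold CVを使ったスタッキングを自前で実装する。    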
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingClassifier.html

In [2]:
%load_ext autoreload
%autoreload 2
import polars as pl
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, label_binarize, OrdinalEncoder
# import statsmodels.api as sma
from sklearn.model_selection import train_test_split ,LeaveOneOut, cross_val_score, KFold, RepeatedKFold,StratifiedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, log_loss, confusion_matrix,ConfusionMatrixDisplay, \
accuracy_score, precision_score, recall_score,precision_recall_curve,f1_score,roc_curve,auc,get_scorer_names,roc_auc_score
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_openml
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage,dendrogram,fcluster
from sklearn import tree
from xgboost import XGBClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

%matplotlib inline
import matplotlib.pyplot as plt


In [3]:
# Load the Titanic dataset through seaborn and convert it to a polars DataFrame.
df = pl.from_pandas(sns.load_dataset('titanic'))

# Rows containing missing values are deliberately kept (no drop_nulls()).

# Target vector and feature table: `survived` is the target, and `alive`
# is removed from the features together with it.
y = df.get_column('survived')
X = df.drop(['survived', 'alive'])

# Names of the string / categorical / boolean feature columns.
category_cols = X.select(pl.col([pl.Utf8, pl.Categorical, pl.Boolean])).columns

# Ordinal-encode the categorical columns (OrdinalEncoder, not LabelEncoder).
# The encoder is configured to return pandas output.
oe = OrdinalEncoder().set_output(transform='pandas')
# The encoder is fed a pandas frame, so convert polars -> pandas on the way in
# and back to polars before overwriting the same-named columns of X.
X = X.with_columns(
    pl.from_pandas(
        oe.fit_transform(X.select(category_cols).to_pandas())
    )
)

# Hold-out split: 70% train / 30% test, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X.to_pandas(),
    y.to_pandas(),
    test_size=0.3,
    random_state=0,
)

### 実装前の確認

In [4]:
# Shuffled 5-fold splitter with a fixed seed.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
# XGBoost classifier evaluated by AUC; early stopping is configured with
# early_stopping_rounds=10.
estimator = XGBClassifier(
    early_stopping_rounds=10,
    learning_rate=0.01,
    eval_metric='auc',
    random_state=0,
)

In [5]:
%%time
# Collect out-of-fold class probabilities from `estimator`, one array per fold,
# and join them with a single np.concatenate at the end.
X_pd = X.to_pandas()
y_pd = y.to_pandas()
y_pred_proba_li = []

# Layer1
# NOTE(review): X_train / y_train are rebound on every iteration, which
# overwrites the hold-out split created earlier in the notebook -- confirm that
# no later cell still expects the hold-out versions.
for train_index, val_index in cv.split(X_pd ,y_pd):
    X_train, X_val = X_pd.iloc[train_index], X_pd.iloc[val_index]
    y_train, y_val = y_pd.iloc[train_index], y_pd.iloc[val_index]
    
    # Fit the model; the validation fold is passed as eval_set and the
    # per-round metric is printed (verbose=True).
    eval_set = [(X_val, y_val)]
    estimator.fit(X_train, y_train, eval_set=eval_set, verbose=True)
    # Predict probabilities for the validation fold (rows not used for fitting).
    y_pred_proba = estimator.predict_proba(X_val)

    # Keep this fold's predictions.
    y_pred_proba_li.append(y_pred_proba)

# NOTE(review): rows of `result` follow fold order, not the row order of
# X_pd / y_pd; with a shuffled splitter they presumably need re-aligning via
# val_index before being compared against y_pd -- TODO confirm.
result = np.concatenate(y_pred_proba_li)

[0]	validation_0-auc:0.85264
[1]	validation_0-auc:0.85382
[2]	validation_0-auc:0.90184
[3]	validation_0-auc:0.86733
[4]	validation_0-auc:0.86838
[5]	validation_0-auc:0.86838
[6]	validation_0-auc:0.86838
[7]	validation_0-auc:0.86838
[8]	validation_0-auc:0.86838
[9]	validation_0-auc:0.86798
[10]	validation_0-auc:0.86838
[11]	validation_0-auc:0.86825
[12]	validation_0-auc:0.86785
[0]	validation_0-auc:0.82661
[1]	validation_0-auc:0.82990
[2]	validation_0-auc:0.82910
[3]	validation_0-auc:0.82896
[4]	validation_0-auc:0.82896
[5]	validation_0-auc:0.82923
[6]	validation_0-auc:0.82937
[7]	validation_0-auc:0.82977
[8]	validation_0-auc:0.82977
[9]	validation_0-auc:0.82795
[10]	validation_0-auc:0.82795
[11]	validation_0-auc:0.82822
[0]	validation_0-auc:0.87184
[1]	validation_0-auc:0.87270
[2]	validation_0-auc:0.87385
[3]	validation_0-auc:0.87457
[4]	validation_0-auc:0.87514
[5]	validation_0-auc:0.87514
[6]	validation_0-auc:0.87514
[7]	validation_0-auc:0.87514
[8]	validation_0-auc:0.87514
[9]	valid

○ メモ  
ループ内で逐次 `np.concatenate` する場合、呼び出しのたびに配列全体が新しくコピーされるオーバーヘッドがあるので、  
リストに溜めて最後に1回だけconcatenateするよりも時間がかかるらしい。（大規模データの場合）

In [52]:
%%time
# Same out-of-fold prediction loop as the list-based version, but growing the
# result array with np.concatenate on every fold (for timing comparison).
X_pd = X.to_pandas()
y_pd = y.to_pandas()
# Start from an empty (0, 2) array so each fold's predict_proba output
# can be appended row-wise.
result_2 = np.empty((0, 2))

# Layer1
for train_index, val_index in cv.split(X_pd, y_pd):
    X_train, X_val = X_pd.iloc[train_index], X_pd.iloc[val_index]
    y_train, y_val = y_pd.iloc[train_index], y_pd.iloc[val_index]

    # Fit the model; the validation fold is passed as eval_set and the
    # per-round metric is printed (verbose=True).
    eval_set = [(X_val, y_val)]
    estimator.fit(X_train, y_train, eval_set=eval_set, verbose=True)
    # Predict probabilities for the validation fold (rows not used for fitting).
    y_pred_proba = estimator.predict_proba(X_val)

    # Append this fold's predictions to the accumulator. The original code
    # concatenated onto `result` (the previous cell's output), so `result_2`
    # never accumulated its own folds.
    result_2 = np.concatenate((result_2, y_pred_proba))
    

[0]	validation_0-auc:0.85264
[1]	validation_0-auc:0.85382
[2]	validation_0-auc:0.90184
[3]	validation_0-auc:0.86733
[4]	validation_0-auc:0.86838
[5]	validation_0-auc:0.86838
[6]	validation_0-auc:0.86838
[7]	validation_0-auc:0.86838
[8]	validation_0-auc:0.86838
[9]	validation_0-auc:0.86798
[10]	validation_0-auc:0.86838
[11]	validation_0-auc:0.86825
[12]	validation_0-auc:0.86785
[0]	validation_0-auc:0.82661
[1]	validation_0-auc:0.82990
[2]	validation_0-auc:0.82910
[3]	validation_0-auc:0.82896
[4]	validation_0-auc:0.82896
[5]	validation_0-auc:0.82923
[6]	validation_0-auc:0.82937
[7]	validation_0-auc:0.82977
[8]	validation_0-auc:0.82977
[9]	validation_0-auc:0.82795
[10]	validation_0-auc:0.82795
[0]	validation_0-auc:0.87184
[1]	validation_0-auc:0.87270
[2]	validation_0-auc:0.87385
[3]	validation_0-auc:0.87457
[4]	validation_0-auc:0.87514
[5]	validation_0-auc:0.87514
[6]	validation_0-auc:0.87514
[7]	validation_0-auc:0.87514
[8]	validation_0-auc:0.87514
[9]	validation_0-auc:0.88182
[10]	valid

In [522]:
# 呼び出し側
cv = KFold(n_splits=5, shuffle=True, random_state=0)

estimators = [
    ('DecisionTree', tree.DecisionTreeClassifier(max_depth=2)),
    ('XGBoost', XGBClassifier(early_stopping_rounds=10, learning_rate=0.01, eval_metric='auc',random_state=0)),
    ('LightGBM', lgb.LGBMClassifier(boosting_type='goss', max_depth=5, random_state=0))
]

In [None]:
class MyStackingClassifierCV:
    """Two-layer stacking classifier for binary targets, trained on out-of-fold predictions.

    Layer 1: every base estimator is fitted once per CV fold and predicts the
    positive-class probability for that fold's validation rows. The
    out-of-fold predictions of all base estimators form the feature matrix
    (one column per estimator) on which the final estimator is fitted.

    Args:
        estimators: list of ``(name, estimator)`` pairs. Each estimator must
            provide ``fit(X, y)`` and ``predict_proba(X)``.
        final_estimator: layer-2 model with ``fit`` / ``predict`` /
            ``predict_proba``. ``None`` falls back to scikit-learn's
            ``LogisticRegression()`` at fit time.
        cv: splitter exposing ``split(X, y)``. ``None`` falls back to
            ``KFold(n_splits=5, shuffle=True, random_state=0)`` at fit time.

    Attributes set by ``fit``:
        fitted_estimators_: list of ``(name, [fold models])``.
        final_estimator_: fitted copy of the final estimator.
    """

    def __init__(self, estimators, final_estimator=None, cv=None):
        self.estimators = estimators
        self.final_estimator = final_estimator
        # Legacy alias: the attribute was originally stored under this
        # misspelled name; kept so existing attribute access keeps working.
        self.final_estimatores = final_estimator
        self.cv = cv

    @staticmethod
    def _take_rows(data, indices):
        """Select rows by position from a pandas object or an array-like."""
        if hasattr(data, "iloc"):
            # Positional selection; plain [] on a DataFrame would look up columns.
            return data.iloc[indices]
        return np.asarray(data)[indices]

    @staticmethod
    def _positive_proba(model, X):
        """Return the positive-class column of ``model.predict_proba(X)``."""
        return np.asarray(model.predict_proba(X))[:, 1]

    def fit(self, X, y):
        """Fit base estimators per fold, then the final estimator on OOF predictions.

        Args:
            X: feature table (pandas DataFrame or array-like), n_samples rows.
            y: binary target aligned with the rows of ``X``.

        Returns:
            self

        Raises:
            ValueError: if no estimators were given, or if the splitter does
                not place every row in a validation fold.
        """
        import copy

        if not self.estimators:
            raise ValueError("estimators must contain at least one (name, estimator) pair")

        cv = self.cv
        if cv is None:
            from sklearn.model_selection import KFold
            cv = KFold(n_splits=5, shuffle=True, random_state=0)

        final_estimator = self.final_estimator
        if final_estimator is None:
            from sklearn.linear_model import LogisticRegression
            final_estimator = LogisticRegression()

        # Materialise the folds once so every base estimator sees the same splits.
        splits = list(cv.split(X, y))
        n_samples = len(X)
        meta_features = np.zeros((n_samples, len(self.estimators)))
        covered = np.zeros(n_samples, dtype=bool)

        self.fitted_estimators_ = []
        for col, (name, estimator) in enumerate(self.estimators):
            fold_models = []
            for train_index, val_index in splits:
                X_tr = self._take_rows(X, train_index)
                X_val = self._take_rows(X, val_index)
                y_tr = self._take_rows(y, train_index)
                y_val = self._take_rows(y, val_index)

                # Fresh copy per fold so the caller's estimator stays unfitted
                # and fold models do not overwrite each other.
                model = copy.deepcopy(estimator)
                fit_kwargs = {}
                if getattr(model, "early_stopping_rounds", None) is not None:
                    # Estimators configured for early stopping need an eval_set.
                    # The validation fold is used for it, as in the exploratory
                    # cells of this notebook; note this makes the out-of-fold
                    # predictions slightly optimistic.
                    fit_kwargs["eval_set"] = [(X_val, y_val)]
                model.fit(X_tr, y_tr, **fit_kwargs)

                # Write predictions at the validation positions so the meta
                # features stay aligned with the row order of y.
                meta_features[val_index, col] = self._positive_proba(model, X_val)
                covered[val_index] = True
                fold_models.append(model)
            self.fitted_estimators_.append((name, fold_models))

        if not covered.all():
            raise ValueError("cv must assign every sample to a validation fold")

        self.final_estimator_ = copy.deepcopy(final_estimator)
        self.final_estimator_.fit(meta_features, np.asarray(y))
        return self

    def _meta_features(self, X):
        """Build layer-2 features for new data by averaging each estimator's fold models."""
        if not hasattr(self, "fitted_estimators_"):
            raise RuntimeError("MyStackingClassifierCV is not fitted yet; call fit first")
        columns = [
            np.mean([self._positive_proba(model, X) for model in fold_models], axis=0)
            for _, fold_models in self.fitted_estimators_
        ]
        return np.column_stack(columns)

    def predict_proba(self, X):
        """Return the final estimator's class probabilities for ``X``."""
        return self.final_estimator_.predict_proba(self._meta_features(X))

    def predict(self, X):
        """Return the final estimator's predicted labels for ``X``."""
        return self.final_estimator_.predict(self._meta_features(X))
