# スタッキングの実装
ここでは2値分類用のスタッキングのクラスを実装する。  
scikit-learnにもスタッキング用のクラスが存在するが、学習時にKFold CVをしていない模様。なので過学習気味になる恐れ。    
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingClassifier.html

In [43]:
%load_ext autoreload
%autoreload 2
import polars as pl
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, label_binarize, OrdinalEncoder
# import statsmodels.api as sma
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.model_selection import train_test_split ,LeaveOneOut, cross_val_score, KFold, RepeatedKFold,StratifiedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, log_loss, confusion_matrix,ConfusionMatrixDisplay, \
accuracy_score, precision_score, recall_score,precision_recall_curve,f1_score,roc_curve,auc,get_scorer_names,roc_auc_score
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_openml
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage,dendrogram,fcluster
from sklearn import tree
from xgboost import XGBClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

%matplotlib inline
import matplotlib.pyplot as plt


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
# polarsでタイタニックデータを読み込み
df = pl.from_pandas(sns.load_dataset('titanic'))

# 欠損値は落とさなくてよい
df = df.drop_nulls()

# 学習データ、目的変数を定義
X = df.drop(['survived', 'alive'])
y = df.get_column('survived')

# カテゴリ変数のカラム名をリスト化
category_cols = X.select(pl.col([pl.Utf8, pl.Categorical, pl.Boolean])).columns

# ラベルエンコーディング（LabelEncoderではなく、OrdinalEncoderを使う）
oe = OrdinalEncoder()
# pandasで返ってくるように指定。polarsは指定できない模様
oe.set_output(transform='pandas')
# カテゴリ変数をエンコーディング。polars.DFはそのまま入れられないのでpandasに変換する。
X = X.with_columns( pl.from_pandas(oe.fit_transform(X.select(category_cols).to_pandas())) )

# hold-out
X_train, X_test, y_train, y_test = train_test_split(X.to_pandas(), y.to_pandas(), test_size=0.3, random_state=0)

## 実装前の確認

In [5]:
# kfoldの定義
cv = KFold(n_splits=5, shuffle=True, random_state=0)
estimator = XGBClassifier(early_stopping_rounds=10, learning_rate=0.01, eval_metric='auc',random_state=0)

In [15]:
%%time
X_pd = X.to_pandas()
y_pd = y.to_pandas()
y_pred_proba_li = []

# Layer1
for train_index, val_index in cv.split(X_pd):
    X_train, X_val = X_pd.iloc[train_index], X_pd.iloc[val_index]
    y_train, y_val = y_pd.iloc[train_index], y_pd.iloc[val_index]
    
    # モデル学習
    eval_set = [(X_val, y_val)]
    estimator.fit(X_train, y_train, eval_set=eval_set, verbose=True)
    # 検証データ(学習に使っていないデータ)に対する予測値算出
    y_pred_proba = estimator.predict_proba(X_val)

    # 予測値を追加していく
    y_pred_proba_li.append(y_pred_proba)

result = np.concatenate(y_pred_proba_li)

[0]	validation_0-auc:0.85264
[1]	validation_0-auc:0.85382
[2]	validation_0-auc:0.90184
[3]	validation_0-auc:0.86733
[4]	validation_0-auc:0.86838
[5]	validation_0-auc:0.86838
[6]	validation_0-auc:0.86838
[7]	validation_0-auc:0.86838
[8]	validation_0-auc:0.86838
[9]	validation_0-auc:0.86798
[10]	validation_0-auc:0.86838
[11]	validation_0-auc:0.86825
[0]	validation_0-auc:0.82661
[1]	validation_0-auc:0.82990
[2]	validation_0-auc:0.82910
[3]	validation_0-auc:0.82896
[4]	validation_0-auc:0.82896
[5]	validation_0-auc:0.82923
[6]	validation_0-auc:0.82937
[7]	validation_0-auc:0.82977
[8]	validation_0-auc:0.82977
[9]	validation_0-auc:0.82795
[10]	validation_0-auc:0.82795
[11]	validation_0-auc:0.82822
[0]	validation_0-auc:0.87184
[1]	validation_0-auc:0.87270
[2]	validation_0-auc:0.87385
[3]	validation_0-auc:0.87457
[4]	validation_0-auc:0.87514
[5]	validation_0-auc:0.87514
[6]	validation_0-auc:0.87514
[7]	validation_0-auc:0.87514
[8]	validation_0-auc:0.87514
[9]	validation_0-auc:0.88182
[10]	valid

○ メモ  
逐次concatenateする場合繰り返しconcatenateのオーバーヘッドがあるので、  
リストでまとめてconcatするよりも時間がかかるらしい。（大規模データの場合）

In [7]:
%%time
X_pd = X.to_pandas()
y_pd = y.to_pandas()
# 予測値を入れる初期の空の配列を作成
result_2 = np.empty((0, 2))

# Layer1
for train_index, val_index in cv.split(X_pd ,y_pd):
    X_train, X_val = X_pd.iloc[train_index], X_pd.iloc[val_index]
    y_train, y_val = y_pd.iloc[train_index], y_pd.iloc[val_index]
    
    # モデル学習
    eval_set = [(X_val, y_val)]
    estimator.fit(X_train, y_train, eval_set=eval_set, verbose=True)
    # 検証データ(学習に使っていないデータ)に対する予測値算出
    y_pred_proba = estimator.predict_proba(X_val)

    # 予測値を追加していく
    result_2 = np.concatenate((result, y_pred_proba))
    

[0]	validation_0-auc:0.85264
[1]	validation_0-auc:0.85382
[2]	validation_0-auc:0.90184
[3]	validation_0-auc:0.86733
[4]	validation_0-auc:0.86838
[5]	validation_0-auc:0.86838
[6]	validation_0-auc:0.86838
[7]	validation_0-auc:0.86838
[8]	validation_0-auc:0.86838
[9]	validation_0-auc:0.86798
[10]	validation_0-auc:0.86838
[11]	validation_0-auc:0.86825
[0]	validation_0-auc:0.82661
[1]	validation_0-auc:0.82990
[2]	validation_0-auc:0.82910
[3]	validation_0-auc:0.82896
[4]	validation_0-auc:0.82896
[5]	validation_0-auc:0.82923
[6]	validation_0-auc:0.82937
[7]	validation_0-auc:0.82977
[8]	validation_0-auc:0.82977
[9]	validation_0-auc:0.82795
[10]	validation_0-auc:0.82795
[0]	validation_0-auc:0.87184
[1]	validation_0-auc:0.87270
[2]	validation_0-auc:0.87385
[3]	validation_0-auc:0.87457
[4]	validation_0-auc:0.87514
[5]	validation_0-auc:0.87514
[6]	validation_0-auc:0.87514
[7]	validation_0-auc:0.87514
[8]	validation_0-auc:0.87514
[9]	validation_0-auc:0.88182
[10]	validation_0-auc:0.88182
[11]	valid

## 実装

In [100]:
a = [1, 3]
a += [5, 6]

In [101]:
a

[1, 3, 5, 6]

### - 改善前
下記は最初にスクラッチで実装したときのコード。  
CVによってX,yの元のインデックスから変わってしまうことに気づかずに進めてしまった。  
X_valに対するpredict_probaの結果をそのままリストに追加してconcatすると、  
numpyに合わせてインデックスが０からふり直されてしまうことになる。  
これを正しく行うには元々のpandasのインデックスを保持したまま予測結果を格納する必要がある。

In [93]:
class MyStackingClassifierCV:
    def __init__(self, estimators, final_estimator=None, cv=None):
        self.estimators = estimators
        self.final_estimator = final_estimator
        self.cv = cv

    def fit(self, X, y):
        # CVでの学習済みモデルからの予測結果格納用（Layer2学習用）
        self.y_pred_dict_for_layer2 = {}
        # テストデータに対する予測のためのモデル格納用
        self.estimators_for_test = {}

        # Layer1の学習    
        for estimator_name, estimator in self.estimators:
            # モデル名と予測値のリストを対応させる。
            self.y_pred_dict_for_layer2[estimator_name] = []
            # テストデータに対するLayer1での予測値格納用

            # Layer2へ渡す特徴量生成のための学習
            for train_index, val_index in self.cv.split(X):
                # 学習用データと予測値算出用データに分ける
                X_train, X_val = X.iloc[train_index], X.iloc[val_index]
                y_train, y_val = y.iloc[train_index], y.iloc[val_index]
                
                if estimator_name == 'XGB':
                    # XGBoostのモデル学習
                    eval_set = [(X_val, y_val)]
                    estimator.fit(X_train, y_train, eval_set=eval_set, verbose=True)
                elif estimator_name == 'LGBM':
                    # LightGBMのモデル学習
                    eval_set = [(X_val, y_val)]
                    callbacks = [lgb.early_stopping(stopping_rounds=10), lgb.log_evaluation()]
                    estimator.fit(X_train, y_train, eval_set=eval_set, callbacks=callbacks)
                else:
                    # 決定木のモデル学習
                    estimator.fit(X_train, y_train)

                # 学習に使わなかったデータに対する予測値を算出し、リストに追加
                self.y_pred_dict_for_layer2[estimator_name].append(estimator.predict_proba(X_val))

            # 全foldでの予測値を結合してそのモデルの最終的な予測値を算出
            # !!! 結合した結果は元の入力データX,yと順番は異なってしまっていることに注意(CV時にシャッフルしているため)。
            self.y_pred_dict_for_layer2[estimator_name] = np.concatenate(self.y_pred_dict_for_layer2[estimator_name])

            # テストデータに対する予測のための学習
            if estimator_name == 'XGB':
                # XGBoostのモデル学習
                X_train2, X_val2, y_train2, y_val2 = train_test_split(X, y, test_size=0.3, random_state=0)
                eval_set = [(X_val2, y_val2)]
                self.estimators_for_test[estimator_name] = estimator.fit(X_train2, y_train2, eval_set=eval_set, verbose=True)
            elif estimator_name == 'LGBM':
                # LightGBMのモデル学習
                X_train2, X_val2, y_train2, y_val2 = train_test_split(X, y, test_size=0.3, random_state=0)
                eval_set = [(X_val2, y_val2)]
                callbacks = [lgb.early_stopping(stopping_rounds=10), lgb.log_evaluation()]
                self.estimators_for_test[estimator_name] = estimator.fit(X_train2, y_train2, eval_set=eval_set, callbacks=callbacks)
            else:
                # 決定木のモデル学習
                self.estimators_for_test[estimator_name] = estimator.fit(X, y)
        
        # Layer1での予測値をまとめる（Layer2へ渡す用）。array[[モデル1の予測結果×2列, モデル2の予測結果×2列・・・]]の形式。
        # concatenateで横に結合していく。予測結果が×２列になるのはpredict_probaの結果だから。
        self.result_layer1 = np.concatenate(list(self.y_pred_dict_for_layer2.values()), axis=1)

        # Layer2の学習。元々の特徴量＋layer1の結果を特徴量とする。
        X_train_layer2 = np.concatenate([X, self.result_layer1], axis=1)
        self.final_estimator.fit(X_train_layer2, y)

        #return self.y_pred_dict_for_layer2
        return self.result_layer1

    
    def predict_proba(self, X_test):
        # Layer1での学習済みモデルを使ってテストデータに対して予測
        # result_layer1_for_testの予測値・モデルの並びとresult_layer1の並びは同じなので、そのままlayer2に渡してOK
        result_layer1_for_test = [estimator.predict_proba(X_test) for _, estimator in self.estimators_for_test.items()]
        print(result_layer1_for_test)
        result_layer1_for_test = np.concatenate(result_layer1_for_test, axis=1)
        print(result_layer1_for_test)
        
        # テストデータに対する最終的な予測（Layer2）。元々の特徴量＋layer1の結果を特徴量とする。
        X_test_layer2 = np.concatenate([X_test, result_layer1_for_test], axis=1)
        result = self.final_estimator.predict_proba(X_test_layer2)

        return result


In [94]:
# 呼び出し側
cv = KFold(n_splits=5, shuffle=True, random_state=0)

estimators = [
    ('DT', tree.DecisionTreeClassifier(max_depth=2)),
    ('XGB', XGBClassifier(early_stopping_rounds=10, learning_rate=0.01, eval_metric='auc',random_state=0)),
    ('LGBM', lgb.LGBMClassifier(boosting_type='goss', max_depth=5, random_state=0))
]
final_estimator = LogisticRegression()

# スタッキングのインスタンス生成
model = MyStackingClassifierCV(estimators=estimators, final_estimator=final_estimator,cv=cv)

In [95]:
model.fit(X_train, y_train)

[0]	validation_0-auc:0.80303
[1]	validation_0-auc:0.82121
[2]	validation_0-auc:0.82121
[3]	validation_0-auc:0.82121
[4]	validation_0-auc:0.82121
[5]	validation_0-auc:0.82121
[6]	validation_0-auc:0.81212
[7]	validation_0-auc:0.81212
[8]	validation_0-auc:0.78182
[9]	validation_0-auc:0.75152
[10]	validation_0-auc:0.73939
[11]	validation_0-auc:0.73333
[0]	validation_0-auc:0.81818
[1]	validation_0-auc:0.81818
[2]	validation_0-auc:0.81818
[3]	validation_0-auc:0.81818
[4]	validation_0-auc:0.82955
[5]	validation_0-auc:0.82955
[6]	validation_0-auc:0.82955
[7]	validation_0-auc:0.82955
[8]	validation_0-auc:0.82955
[9]	validation_0-auc:0.81250
[10]	validation_0-auc:0.81250
[11]	validation_0-auc:0.81250
[12]	validation_0-auc:0.81250
[13]	validation_0-auc:0.81250
[0]	validation_0-auc:0.83333
[1]	validation_0-auc:0.85417
[2]	validation_0-auc:0.85417
[3]	validation_0-auc:0.85417
[4]	validation_0-auc:0.85417
[5]	validation_0-auc:0.85417
[6]	validation_0-auc:0.85417
[7]	validation_0-auc:0.85417
[8]	vali

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[0.02040816, 0.97959184, 0.49137974, 0.50862026, 0.19883939,
        0.80116061],
       [0.54347826, 0.45652174, 0.50348496, 0.49651507, 0.47315669,
        0.52684331],
       [0.54347826, 0.45652174, 0.50399303, 0.49600694, 0.47315669,
        0.52684331],
       [0.54347826, 0.45652174, 0.49909711, 0.50090289, 0.47315669,
        0.52684331],
       [0.02040816, 0.97959184, 0.49137974, 0.50862026, 0.17161421,
        0.82838579],
       [0.54347826, 0.45652174, 0.50348496, 0.49651507, 0.47315669,
        0.52684331],
       [0.02040816, 0.97959184, 0.49137974, 0.50862026, 0.18468661,
        0.81531339],
       [0.54347826, 0.45652174, 0.49834168, 0.50165832, 0.47315669,
        0.52684331],
       [0.02040816, 0.97959184, 0.49137974, 0.50862026, 0.17161421,
        0.82838579],
       [0.54347826, 0.45652174, 0.49909711, 0.50090289, 0.47315669,
        0.52684331],
       [0.54347826, 0.45652174, 0.50498319, 0.49501678, 0.40412804,
        0.59587196],
       [1.        , 0

In [96]:
y_pred_proba = model.predict_proba(X_test)

[array([[0.54716981, 0.45283019],
       [0.54716981, 0.45283019],
       [0.06349206, 0.93650794],
       [0.06349206, 0.93650794],
       [0.54716981, 0.45283019],
       [0.88888889, 0.11111111],
       [0.54716981, 0.45283019],
       [0.06349206, 0.93650794],
       [0.06349206, 0.93650794],
       [0.06349206, 0.93650794],
       [0.88888889, 0.11111111],
       [0.06349206, 0.93650794],
       [0.06349206, 0.93650794],
       [0.06349206, 0.93650794],
       [0.54716981, 0.45283019],
       [0.54716981, 0.45283019],
       [0.54716981, 0.45283019],
       [0.06349206, 0.93650794],
       [0.06349206, 0.93650794],
       [0.54716981, 0.45283019],
       [0.06349206, 0.93650794],
       [0.06349206, 0.93650794],
       [0.06349206, 0.93650794],
       [0.06349206, 0.93650794],
       [0.06349206, 0.93650794],
       [0.54716981, 0.45283019],
       [0.54716981, 0.45283019],
       [0.06349206, 0.93650794],
       [0.54716981, 0.45283019],
       [0.54716981, 0.45283019],
       [0

In [97]:
y_pred_proba

array([[0.3843401 , 0.6156599 ],
       [0.52687786, 0.47312214],
       [0.10489626, 0.89510374],
       [0.11116677, 0.88883323],
       [0.4437343 , 0.5562657 ],
       [0.50809146, 0.49190854],
       [0.25311693, 0.74688307],
       [0.17218658, 0.82781342],
       [0.095095  , 0.904905  ],
       [0.07292245, 0.92707755],
       [0.56234944, 0.43765056],
       [0.13246107, 0.86753893],
       [0.15124301, 0.84875699],
       [0.07292245, 0.92707755],
       [0.44183369, 0.55816631],
       [0.38872737, 0.61127263],
       [0.51353424, 0.48646576],
       [0.19401893, 0.80598107],
       [0.07417816, 0.92582184],
       [0.72219955, 0.27780045],
       [0.11909168, 0.88090832],
       [0.2170787 , 0.7829213 ],
       [0.07494327, 0.92505673],
       [0.07475773, 0.92524227],
       [0.15228487, 0.84771513],
       [0.53633805, 0.46366195],
       [0.61844972, 0.38155028],
       [0.20914582, 0.79085418],
       [0.59207036, 0.40792964],
       [0.49626935, 0.50373065],
       [0.

In [98]:
y_test

94     1
18     0
33     1
98     1
181    1
168    0
7      0
138    1
61     1
74     1
5      1
112    1
177    1
130    1
164    0
150    1
118    0
106    1
80     1
155    0
71     1
55     1
37     1
145    1
4      1
66     0
90     0
179    1
45     0
89     1
110    0
63     1
26     1
60     1
170    1
159    1
8      1
44     1
96     0
129    1
83     0
54     1
24     0
30     1
97     0
56     1
123    1
111    1
19     0
139    1
135    0
160    1
16     1
51     1
162    1
Name: survived, dtype: int64

In [99]:
roc_auc_score(y_test, y_pred_proba[:, 1])

0.9102564102564104

In [34]:
X_train.values

array([[ 1.,  1., 42., ...,  3.,  2.,  0.],
       [ 1.,  1., 44., ...,  2.,  1.,  0.],
       [ 1.,  0., 22., ...,  1.,  0.,  0.],
       ...,
       [ 1.,  0., 39., ...,  2.,  0.,  0.],
       [ 1.,  0., 58., ...,  2.,  2.,  0.],
       [ 1.,  0., 39., ...,  4.,  0.,  0.]])

In [106]:
np.zeros((X_train.shape[0],4))

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
