## スタッキング

In [28]:
# データ準備

import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression



## データ準備
df = sns.load_dataset('titanic')
df.dropna(inplace=True)
# X, yを作成
X = df.drop(['survived', 'alive'], axis=1)
y = df['survived']
# ラベルエンコーディング
oe = OrdinalEncoder()
oe.set_output(transform='pandas')
cat_cols = X.select_dtypes(exclude=np.number).columns.to_list()
X[cat_cols] = oe.fit_transform(X[cat_cols])

# hold-out
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [47]:
import warnings
warnings.simplefilter('ignore')

In [96]:
# KFoldでデータを5分割の作成
pred_train_list = []
# pred_train_dict = {}

# cv
cv = KFold(n_splits=5, shuffle=True, random_state=0)

# model
estimator = LogisticRegression()

# 予測学習データの作成（1段階目）
for iter_num, (train_idx, test_idx)  in enumerate(cv.split(X_train)):
    
    # 学習データとテストデータの分割
    X_train_df = X_train.iloc[train_idx, :]
    y_train_df = y_train.iloc[train_idx]   
    pre_test_df = X_train.iloc[test_idx, :]
        
    # 学習、予測
    estimator.fit(X_train_df, y_train_df)
    est_val = estimator.predict(pre_test_df)
    pred_train_list.append(est_val)
    # pred_train_dict[f"model_{iter_num}"] = est_val
    
    # 
    np.concatenate(pred_train_list)
    
    
    
# 予測テストデータの作成（1段階目）
estimator.fit(X_train, y_train)
y_pred = estimator.predict(X_test)


In [78]:
# 一段階目の学習結果をdf化
pred_train_df = pd.DataFrame.from_dict(pred_train_dict, orient="index").T
pd.concat([X_train, pred_train_df], axis=1)

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone,model_0,model_1,model_2,model_3,model_4
621,1.0,1.0,42.0,1.0,0.0,52.5542,2.0,0.0,1.0,1.0,3.0,2.0,0.0,,,,,
245,1.0,1.0,44.0,2.0,0.0,90.0000,1.0,0.0,1.0,1.0,2.0,1.0,0.0,,,,,
539,1.0,0.0,22.0,0.0,2.0,49.5000,0.0,0.0,2.0,0.0,1.0,0.0,0.0,,,,,
136,1.0,0.0,19.0,0.0,2.0,26.2833,2.0,0.0,2.0,0.0,3.0,2.0,0.0,,,,,
789,1.0,1.0,46.0,0.0,0.0,79.2000,0.0,0.0,1.0,1.0,1.0,0.0,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20,,,,,,,,,,,,,,1.0,1.0,0.0,0.0,1.0
21,,,,,,,,,,,,,,1.0,0.0,0.0,1.0,1.0
22,,,,,,,,,,,,,,1.0,1.0,1.0,0.0,1.0
24,,,,,,,,,,,,,,1.0,0.0,0.0,1.0,0.0


In [106]:
for iter_num, (train_idx, test_idx)  in enumerate(cv.split(X_train)):
    print(test_idx)

[  2   7   8  10  16  22  24  30  40  48  51  59  66  71  73  78  85  89
  91  92  95  98 100 101 105 126]
[  3   6  11  13  18  26  27  33  43  45  50  52  54  60  62  63  68  75
  84  94  96 104 107 110 121 124]
[  0   1   4   5  15  17  23  28  34  35  38  41  42  53  55  56  61  74
  76  93  99 109 113 115 118]
[ 14  19  20  25  29  31  32  37  49  57  69  72  77  79  80  82  86  90
  97 106 108 119 120 123 125]
[  9  12  21  36  39  44  46  47  58  64  65  67  70  81  83  87  88 102
 103 111 112 114 116 117 122]


In [98]:
pred_train_list[0]

array([1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1])

### 時間切れのため,正解を見る

## stacking

In [218]:
class StackingClassifireCV():
    def __init__(self, estimators, final_estimator, cv):
        self.estimators = estimators # [("modelname", model),・・・]
        self.final_estimator = final_estimator
        self.cv = cv
        
        
    def fit(self, X, y):
        pred_features = {}
        # 1層目のモデル
        for model_name, model in self.estimators:
            preds = []
            new_y = []
            
            for train_idx, val_idx in self.cv.split(X):
                X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
                y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
                model.fit(X_train, y_train)
                pred = model.predict_proba(X_val)[:, 1].tolist()
                preds += pred
                new_y += y_val.tolist()
            
            # predict_proba用に全データで学習 => self.estimatorsでこのモデルは維持される
            model.fit(X, y)
            pred_features[model_name] = preds
                
        # 2層目のモデル学習
        new_X = pd.DataFrame(pred_features)
        self.final_estimator.fit(new_X, new_y)
    
    def predict_proba(self, X):
        # 1層目のモデルで特徴量（予測値）生成, テストデータの作成？？
        pred_features = {}
        for model_name, model in self.estimators:
            pred = model.predict_proba(X)[:, 1]
            pred_features[model_name] = pred
            
        new_X = pd.DataFrame(pred_features)
        final_pred = self.final_estimator.predict_proba(new_X)
        
        return final_pred

In [226]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, random_state=0, shuffle=True)
final_estimator = LogisticRegression()
stackin_cv = StackingClassifireCV(estimators=[("rf", RandomForestClassifier()), ("knn", KNeighborsClassifier())],
                                  final_estimator=final_estimator,
                                  cv=cv)

stackin_cv.fit(X_train, y_train)
y_pred_stacking_cv = stackin_cv.predict_proba(X_test)

In [227]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, y_pred_stacking_cv[:, 1])

0.7996794871794872