In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

from module.mymodule import test_eval
from features import pipe_1, pipe_2, pipe_3, pipe_4, pipe_5, pipe_6, pipe_7,\
                     pipe_8, pipe_9, pipe_10, pipe_11, pipe_12, pipe_13, pipe_14,\
                     pipe_15

### データセットの用意

In [3]:
# Keep only the rows whose index label is 550 or later.
# NOTE(review): presumably the held-out evaluation slice -- confirm against
# the notebook that trained the models.
data = pd.read_csv('./data/train.csv').loc[550:]

# Features are every column except the last; the last column is the target.
df, y = data.iloc[:, :-1], data.iloc[:, -1]

In [4]:
# Preview the retained rows (features plus the target column).
data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
550,60,1,ASY,125,0,1,Normal,110,0,0.1,Up,1
551,37,1,ASY,140,207,0,Normal,130,1,1.5,Flat,1
552,59,1,ATA,140,287,0,Normal,150,0,0.0,Up,0
553,38,0,ASY,110,0,0,Normal,156,0,0.0,Flat,1
554,59,1,TA,134,204,0,Normal,162,0,0.8,Up,1
...,...,...,...,...,...,...,...,...,...,...,...,...
637,48,1,ASY,106,263,1,Normal,110,0,0.0,Flat,1
638,53,1,ASY,126,0,0,Normal,106,0,0.0,Flat,1
639,54,1,ASY,200,198,0,Normal,142,1,2.0,Flat,1
640,45,0,ATA,130,237,0,Normal,170,0,0.0,Up,0


### パイプライン候補の準備

### データセットの作成

In [5]:
# Keyword arguments shared by every feature-pipeline call.
# NOTE(review): the meaning of 'split_kwrg', 'train_flg' and 'retrain' is
# defined in the `features` module; the False flags presumably select
# transform-only (no refit) behaviour -- confirm there.
to_pipe = {
    'df': df,
    'split_kwrg': {'to_array': True},
    'train_flg': False,
    'retrain': False,
}

# Candidate feature pipelines to evaluate; commented-out entries are excluded.
pipe_lines = [
    pipe_1,     # Base
    # pipe_2,   # StSlopeCat
    # pipe_3,   # CholestMean
    # pipe_4,   # AgeCat
    # pipe_5,   # StSlopeCat CholestMean AgeCat
    # pipe_6,   # RestingBpCat
    # pipe_7,   # OldPeakCat
    # pipe_8,   # RestingBpCat OldPeakCat
    pipe_9,     # Onehot Standard
    pipe_10,    # CholestMean AgeCat Onehot Standard
    pipe_11,    # CholCut
    pipe_12,    # CholCut Onehot
    pipe_13,    # DropByShap
    pipe_14,    # CholCut DropByShap
    pipe_15,    # Chol Regression
]

# Run each pipeline once and key its result by the pipeline function's name.
data_set = {
    candidate.__name__: candidate(**to_pipe)
    for candidate in pipe_lines
}

                                Base(pipe_1)                               


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak
0,60,1,125,0,1,110,0,0.1
1,37,1,140,207,0,130,1,1.5
2,59,1,140,287,0,150,0,0.0
3,38,0,110,0,0,156,0,0.0
4,59,1,134,204,0,162,0,0.8


                          Onehot Standard(pipe_9)                          


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,60,1,125,0,1,110,0,0.1,1,0,0,0,0,1,0,0,0,1
1,37,1,140,207,0,130,1,1.5,1,0,0,0,0,1,0,0,1,0
2,59,1,140,287,0,150,0,0.0,0,1,0,0,0,1,0,0,0,1
3,38,0,110,0,0,156,0,0.0,1,0,0,0,0,1,0,0,1,0
4,59,1,134,204,0,162,0,0.8,0,0,0,1,0,1,0,0,0,1


                CholestMean AgeCat Onehot Standard(pipe_10)                


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,4,1,125,244.824324,1,110,0,0.1,1,0,0,0,0,1,0,0,0,1
1,1,1,140,207.0,0,130,1,1.5,1,0,0,0,0,1,0,0,1,0
2,3,1,140,287.0,0,150,0,0.0,0,1,0,0,0,1,0,0,0,1
3,1,0,110,244.824324,0,156,0,0.0,1,0,0,0,0,1,0,0,1,0
4,3,1,134,204.0,0,162,0,0.8,0,0,0,1,0,1,0,0,0,1


                              CholCut(pipe_11)                             


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak
0,60,1,125,0,1,110,0,0.1
1,37,1,140,207,0,130,1,1.5
2,59,1,140,287,0,150,0,0.0
3,38,0,110,0,0,156,0,0.0
4,59,1,134,204,0,162,0,0.8


                          CholCut Onehot(pipe_12)                          


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,60,1,125,0,1,110,0,0.1,1,0,0,0,0,1,0,0,0,1
1,37,1,140,207,0,130,1,1.5,1,0,0,0,0,1,0,0,1,0
2,59,1,140,287,0,150,0,0.0,0,1,0,0,0,1,0,0,0,1
3,38,0,110,0,0,156,0,0.0,1,0,0,0,0,1,0,0,1,0
4,59,1,134,204,0,162,0,0.8,0,0,0,1,0,1,0,0,0,1


                            DropByShap(pipe_13)                            


Unnamed: 0,Age,Sex,Cholesterol,FastingBS,MaxHR,Oldpeak,ChestPainType_ASY,ST_Slope_Flat,ST_Slope_Up
0,60,1,0,1,110,0.1,1,0,1
1,37,1,207,0,130,1.5,1,1,0
2,59,1,287,0,150,0.0,0,0,1
3,38,0,0,0,156,0.0,1,1,0
4,59,1,204,0,162,0.8,0,0,1


                        CholCut DropByShap(pipe_14)                        


Unnamed: 0,Age,Sex,Cholesterol,FastingBS,MaxHR,Oldpeak,ChestPainType_ASY,ST_Slope_Flat,ST_Slope_Up
0,60,1,0,1,110,0.1,1,0,1
1,37,1,207,0,130,1.5,1,1,0
2,59,1,287,0,150,0.0,0,0,1
3,38,0,0,0,156,0.0,1,1,0
4,59,1,204,0,162,0.8,0,0,1


                          Chol Regression(pipe_15)                         


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,60,1,125,228,1,110,0,0.1,1,0,0,0,0,1,0,0,0,1
1,37,1,140,207,0,130,1,1.5,1,0,0,0,0,1,0,0,1,0
2,59,1,140,287,0,150,0,0.0,0,1,0,0,0,1,0,0,0,1
3,38,0,110,237,0,156,0,0.0,1,0,0,0,0,1,0,0,1,0
4,59,1,134,204,0,162,0,0.8,0,0,0,1,0,1,0,0,0,1


### モデルのロード

In [6]:
# Base name (without extension) of the pickle under ./data that holds the
# fitted models. Alternative file, currently disabled:
#file_name = 'retrained_models'
file_name = 'models'
# SECURITY: pickle.load can execute arbitrary code while deserialising.
# Only open pickle files that were produced by a trusted source.
with open(f'./data/{file_name}.pkl', 'rb') as f:
    train_models = pickle.load(f)

### モデルの評価と選定

In [7]:
# Evaluate the loaded models against each candidate pipeline's dataset.
# `train_models` is provided by the model-loading cell directly above, so the
# same pickle is not re-read and unpickled a second time here.
evaluations = test_eval(train_models, pipe_lines, data_set, y)

In [17]:
# Rank the pipelines by their XGBClassifier score, best first.
xgb_eval = evaluations.sort_values(by=XGBClassifier.__name__, ascending=False)
xgb_eval

Unnamed: 0,XGBClassifier,DecisionTreeClassifier,SVC
pipe_12,0.858696,0.858696,0.771739
pipe_14,0.847826,0.858696,0.782609
pipe_1,0.836957,0.771739,0.793478
pipe_9,0.836957,0.869565,0.880435
pipe_10,0.836957,0.836957,0.858696
pipe_13,0.815217,0.869565,0.880435
pipe_15,0.815217,0.826087,0.858696
pipe_11,0.75,0.728261,0.717391


In [18]:
# Rank the pipelines by their DecisionTreeClassifier score, best first.
tree_eval = evaluations.sort_values(by=DecisionTreeClassifier.__name__, ascending=False)
tree_eval

Unnamed: 0,XGBClassifier,DecisionTreeClassifier,SVC
pipe_9,0.836957,0.869565,0.880435
pipe_13,0.815217,0.869565,0.880435
pipe_12,0.858696,0.858696,0.771739
pipe_14,0.847826,0.858696,0.782609
pipe_10,0.836957,0.836957,0.858696
pipe_15,0.815217,0.826087,0.858696
pipe_1,0.836957,0.771739,0.793478
pipe_11,0.75,0.728261,0.717391


In [19]:
# Rank the pipelines by their SVC score, best first.
svc_eval = evaluations.sort_values(by=SVC.__name__, ascending=False)
svc_eval

Unnamed: 0,XGBClassifier,DecisionTreeClassifier,SVC
pipe_9,0.836957,0.869565,0.880435
pipe_13,0.815217,0.869565,0.880435
pipe_10,0.836957,0.836957,0.858696
pipe_15,0.815217,0.826087,0.858696
pipe_1,0.836957,0.771739,0.793478
pipe_14,0.847826,0.858696,0.782609
pipe_12,0.858696,0.858696,0.771739
pipe_11,0.75,0.728261,0.717391


## Ensemble出力

In [20]:
# Names of the three pipelines ranked highest for XGBClassifier.
xgb_eval.index[:3]

Index(['pipe_12', 'pipe_14', 'pipe_1'], dtype='object')

In [21]:
# Inspect what is stored under the 'XGBClassifier' key of the loaded models.
train_models['XGBClassifier']

{'XGBClassifier': {'pipe_1': GridSearchCV(cv=3,
               estimator=XGBClassifier(base_score=None, booster=None,
                                       callbacks=None, colsample_bylevel=None,
                                       colsample_bynode=None,
                                       colsample_bytree=None,
                                       early_stopping_rounds=50,
                                       enable_categorical=False, eval_metric=None,
                                       feature_types=None, gamma=None,
                                       gpu_id=None, grow_policy=None,
                                       importance_type=None,
                                       interaction_constraints=None,
                                       learning_rate=None, ma...
                                       max_cat_to_onehot=None,
                                       max_delta_step=None, max_depth=None,
                                       max_leaves=None, 

In [None]:
def ensemble(train_models, data_set, model, pipes, y):
    """Score a soft-voting ensemble of one model type across several pipelines.

    For every pipeline in ``pipes`` the estimator stored at
    ``train_models[model.__name__][pipe.__name__]`` predicts class
    probabilities on ``data_set[pipe.__name__]``. The probabilities are summed
    over the pipelines, and each sample is assigned class 0 when the summed
    column 0 is strictly larger than the summed column 1, otherwise class 1
    (ties therefore resolve to class 1).

    Parameters
    ----------
    train_models : mapping
        ``{model class name: {pipeline name: fitted estimator}}``. Every
        estimator that is used must implement ``predict_proba``.
    data_set : mapping
        ``{pipeline name: feature matrix}`` holding the samples to score.
    model : type
        Estimator class; only ``model.__name__`` is used, as a lookup key.
    pipes : iterable of callables
        Pipeline functions; only their ``__name__`` is used, as a lookup key.
        May be any iterable (it is materialised once internally).
    y : array-like
        True labels, row-aligned with every feature matrix. The decision rule
        hard-codes the labels 0 and 1, so binary 0/1 targets are assumed with
        probability column 0 belonging to class 0.

    Returns
    -------
    The value returned by ``accuracy_score`` for the ensemble predictions.

    Raises
    ------
    ValueError
        If ``pipes`` is empty (there is nothing to combine).
    KeyError
        If a model or pipeline name is missing from ``train_models`` or
        ``data_set``.
    """
    # Materialise once so generators are not exhausted before prediction.
    pipes = list(pipes)
    if not pipes:
        raise ValueError('pipes must contain at least one pipeline')

    models = [train_models[model.__name__][pipe.__name__] for pipe in pipes]

    # Unweighted sum of per-pipeline class probabilities (soft voting).
    pred = sum(
        fitted.predict_proba(data_set[pipe.__name__])
        for fitted, pipe in zip(models, pipes)
    )

    # Strict '>' keeps the original tie-break: equal probabilities -> class 1.
    ensemble_pred = np.where(pred[:, 0] > pred[:, 1], 0, 1)

    # np.asarray accepts pandas Series as well as plain lists / ndarrays.
    return accuracy_score(np.asarray(y), ensemble_pred)

In [10]:
# Pipelines whose per-pipeline models are combined by the ensemble calls below.
pipes = [pipe_9, pipe_15]

In [11]:
# Ensemble accuracy of the XGBClassifier models over the pipelines in `pipes`.
model = XGBClassifier
ensemble(train_models, data_set, model, pipes, y)

0.8260869565217391

In [12]:
# Ensemble accuracy of the DecisionTreeClassifier models over the pipelines in `pipes`.
model = DecisionTreeClassifier
ensemble(train_models, data_set, model, pipes, y)

0.8369565217391305

In [13]:
# Ensemble accuracy of the SVC models over the pipelines in `pipes`.
model = SVC
ensemble(train_models, data_set, model, pipes, y)

0.8804347826086957