In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

from module.mymodule import test_eval
from module.features import pipe_1, pipe_2, pipe_3, pipe_4, pipe_5, pipe_6, pipe_7,\
                     pipe_8, pipe_9, pipe_10, pipe_11, pipe_12, pipe_13, pipe_14,\
                     pipe_15

### データセットの用意

In [2]:
# Load the CSV and keep the rows from index label 550 to the end
# (.loc slicing is label-based); presumably this is the held-out
# evaluation split -- TODO confirm against the training notebook.
data = pd.read_csv('./data/train.csv').loc[550:, :]

# Every column except the last is a feature; the last column is used as the
# label vector in the accuracy computations further down.
df, y = data.iloc[:, :-1], data.iloc[:, -1]

In [3]:
# Show the selected rows (the rich display truncates the middle of the frame).
data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
550,60,1,ASY,125,0,1,Normal,110,0,0.1,Up,1
551,37,1,ASY,140,207,0,Normal,130,1,1.5,Flat,1
552,59,1,ATA,140,287,0,Normal,150,0,0.0,Up,0
553,38,0,ASY,110,0,0,Normal,156,0,0.0,Flat,1
554,59,1,TA,134,204,0,Normal,162,0,0.8,Up,1
...,...,...,...,...,...,...,...,...,...,...,...,...
637,48,1,ASY,106,263,1,Normal,110,0,0.0,Flat,1
638,53,1,ASY,126,0,0,Normal,106,0,0.0,Flat,1
639,54,1,ASY,200,198,0,Normal,142,1,2.0,Flat,1
640,45,0,ATA,130,237,0,Normal,170,0,0.0,Up,0


### パイプライン候補の準備

### データセットの作成

In [4]:
# Keyword arguments passed unchanged to every pipeline function below.
to_pipe = {
    'df': df,
    'split_kwrg': {'to_array': True},
    'train_flg': False,
    'retrain': False,
}

# Candidate feature pipelines to evaluate.  Currently disabled candidates:
#   pipe_2 (StSlopeCat), pipe_3 (CholestMean), pipe_4 (AgeCat),
#   pipe_5 (StSlopeCat CholestMean AgeCat), pipe_6 (RestingBpCat),
#   pipe_7 (OldPeakCat), pipe_8 (RestingBpCat OldPeakCat)
pipe_lines = [
    pipe_1,   # Base
    pipe_9,   # Onehot
    pipe_10,  # CholestMean AgeCat Onehot
    pipe_11,  # CholCut
    pipe_12,  # CholCut Onehot
    pipe_13,  # DropByShap
    pipe_14,  # CholCut DropByShap
    pipe_15,  # Chol Regression
]

# Run each pipeline once, in list order, and index its output by the
# pipeline function's name.
data_set = dict((pipe.__name__, pipe(**to_pipe)) for pipe in pipe_lines)

                                Base(pipe_1)                               


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak
0,60,1,125,0,1,110,0,0.1
1,37,1,140,207,0,130,1,1.5
2,59,1,140,287,0,150,0,0.0
3,38,0,110,0,0,156,0,0.0
4,59,1,134,204,0,162,0,0.8


                               Onehot(pipe_9)                              


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,60,1,125,0,1,110,0,0.1,1,0,0,0,0,1,0,0,0,1
1,37,1,140,207,0,130,1,1.5,1,0,0,0,0,1,0,0,1,0
2,59,1,140,287,0,150,0,0.0,0,1,0,0,0,1,0,0,0,1
3,38,0,110,0,0,156,0,0.0,1,0,0,0,0,1,0,0,1,0
4,59,1,134,204,0,162,0,0.8,0,0,0,1,0,1,0,0,0,1


                     CholestMean AgeCat Onehot(pipe_10)                    


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,4,1,125,244.824324,1,110,0,0.1,1,0,0,0,0,1,0,0,0,1
1,1,1,140,207.0,0,130,1,1.5,1,0,0,0,0,1,0,0,1,0
2,3,1,140,287.0,0,150,0,0.0,0,1,0,0,0,1,0,0,0,1
3,1,0,110,244.824324,0,156,0,0.0,1,0,0,0,0,1,0,0,1,0
4,3,1,134,204.0,0,162,0,0.8,0,0,0,1,0,1,0,0,0,1


                              CholCut(pipe_11)                             


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak
0,60,1,125,0,1,110,0,0.1
1,37,1,140,207,0,130,1,1.5
2,59,1,140,287,0,150,0,0.0
3,38,0,110,0,0,156,0,0.0
4,59,1,134,204,0,162,0,0.8


                          CholCut Onehot(pipe_12)                          


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,60,1,125,0,1,110,0,0.1,1,0,0,0,0,1,0,0,0,1
1,37,1,140,207,0,130,1,1.5,1,0,0,0,0,1,0,0,1,0
2,59,1,140,287,0,150,0,0.0,0,1,0,0,0,1,0,0,0,1
3,38,0,110,0,0,156,0,0.0,1,0,0,0,0,1,0,0,1,0
4,59,1,134,204,0,162,0,0.8,0,0,0,1,0,1,0,0,0,1


                            DropByShap(pipe_13)                            


Unnamed: 0,Age,Sex,Cholesterol,FastingBS,MaxHR,Oldpeak,ChestPainType_ASY,ST_Slope_Flat,ST_Slope_Up
0,60,1,0,1,110,0.1,1,0,1
1,37,1,207,0,130,1.5,1,1,0
2,59,1,287,0,150,0.0,0,0,1
3,38,0,0,0,156,0.0,1,1,0
4,59,1,204,0,162,0.8,0,0,1


                        CholCut DropByShap(pipe_14)                        


Unnamed: 0,Age,Sex,Cholesterol,FastingBS,MaxHR,Oldpeak,ChestPainType_ASY,ST_Slope_Flat,ST_Slope_Up
0,60,1,0,1,110,0.1,1,0,1
1,37,1,207,0,130,1.5,1,1,0
2,59,1,287,0,150,0.0,0,0,1
3,38,0,0,0,156,0.0,1,1,0
4,59,1,204,0,162,0.8,0,0,1


                          Chol Regression(pipe_15)                         


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,60,1,125,228,1,110,0,0.1,1,0,0,0,0,1,0,0,0,1
1,37,1,140,207,0,130,1,1.5,1,0,0,0,0,1,0,0,1,0
2,59,1,140,287,0,150,0,0.0,0,1,0,0,0,1,0,0,0,1
3,38,0,110,237,0,156,0,0.0,1,0,0,0,0,1,0,0,1,0
4,59,1,134,204,0,162,0,0.8,0,0,0,1,0,1,0,0,0,1


### モデルのロード

In [5]:
# Base name of the pickle to evaluate (alternative file: 'retrained_models').
file_name = 'models'

# NOTE(review): pickle.load can execute arbitrary code -- only load files
# produced by this project.
with open(f'./data/{file_name}.pkl', 'rb') as model_file:
    train_models = pickle.load(model_file)

### モデルの評価と選定

In [6]:
# train_models was already unpickled in the model-loading cell above, and this
# cell needs that cell's file_name anyway, so the file is not read a second time.
# NOTE(review): if test_eval mutates the models, re-run the loading cell before
# re-running this one.
# The result is used below as a table with one row per pipeline and one
# column per model class name.
evaluations = test_eval(train_models, pipe_lines, data_set, y)

In [7]:
# Rank the pipelines by their XGBClassifier column, highest value first.
xgb_eval = evaluations.sort_values(by=XGBClassifier.__name__, ascending=False)
xgb_eval

Unnamed: 0,XGBClassifier,DecisionTreeClassifier,SVC
pipe_12,0.858696,0.858696,0.771739
pipe_14,0.847826,0.858696,0.782609
pipe_1,0.836957,0.771739,0.793478
pipe_9,0.836957,0.869565,0.880435
pipe_10,0.836957,0.836957,0.858696
pipe_13,0.815217,0.869565,0.880435
pipe_15,0.815217,0.826087,0.858696
pipe_11,0.75,0.728261,0.717391


In [8]:
# Rank the pipelines by their DecisionTreeClassifier column, highest value first.
tree_eval = evaluations.sort_values(by=DecisionTreeClassifier.__name__, ascending=False)
tree_eval

Unnamed: 0,XGBClassifier,DecisionTreeClassifier,SVC
pipe_9,0.836957,0.869565,0.880435
pipe_13,0.815217,0.869565,0.880435
pipe_12,0.858696,0.858696,0.771739
pipe_14,0.847826,0.858696,0.782609
pipe_10,0.836957,0.836957,0.858696
pipe_15,0.815217,0.826087,0.858696
pipe_1,0.836957,0.771739,0.793478
pipe_11,0.75,0.728261,0.717391


In [9]:
# Rank the pipelines by their SVC column, highest value first.
svc_eval = evaluations.sort_values(by=SVC.__name__, ascending=False)
svc_eval

Unnamed: 0,XGBClassifier,DecisionTreeClassifier,SVC
pipe_9,0.836957,0.869565,0.880435
pipe_13,0.815217,0.869565,0.880435
pipe_10,0.836957,0.836957,0.858696
pipe_15,0.815217,0.826087,0.858696
pipe_1,0.836957,0.771739,0.793478
pipe_14,0.847826,0.858696,0.782609
pipe_12,0.858696,0.858696,0.771739
pipe_11,0.75,0.728261,0.717391


## Ensemble出力

In [14]:
def ensemble(train_models, data_set, model, pipes, y):
    """Soft-vote one model family across several feature pipelines.

    For every name in ``pipes`` the estimator ``train_models[model][name]``
    predicts class probabilities on ``data_set[name]``.  The probability
    arrays are added element-wise and each row is labelled 0 when column 0
    is strictly larger than column 1, otherwise 1 (ties go to class 1).
    The accuracy of those labels against ``y`` is printed.

    Parameters
    ----------
    train_models : mapping
        ``train_models[model][pipe_name]`` must expose ``predict_proba``.
    data_set : mapping
        Pipeline name -> input accepted by the matching estimator.
    model : hashable
        Key selecting the model family inside ``train_models``.
    pipes : iterable
        Pipeline names to combine; must contain at least one name.
    y : array-like
        True labels, one per row (pandas Series, ndarray or plain list).

    Returns
    -------
    array
        The summed (not averaged) class probabilities, one row per sample.

    Raises
    ------
    ValueError
        If ``pipes`` is empty.
    """
    # pipes is walked twice below, so materialise it once; this also makes
    # the emptiness check safe for pandas Index objects and generators.
    pipes = list(pipes)
    if not pipes:
        raise ValueError("pipes must contain at least one pipeline name")

    models = [train_models[model][pipe] for pipe in pipes]
    pred = sum(model_.predict_proba(data_set[pipe_]) for model_, pipe_ in zip(models, pipes))
    ensemble_pred = np.where(pred[:, 0] > pred[:, 1], 0, 1)
    # np.asarray accepts Series, ndarray and lists alike, whereas y.values
    # only worked for pandas objects.
    print(accuracy_score(np.asarray(y), ensemble_pred))
    return pred

In [56]:
def ensembles(preds, y):
    """Combine several summed-probability arrays into one final vote.

    The arrays in ``preds`` are added element-wise.  A row is labelled 0
    when its column 0 total is strictly greater than its column 1 total and
    1 otherwise.  The accuracy of those labels against ``y`` is printed and
    the combined totals are returned.
    """
    combined = sum(preds)
    favours_class_zero = combined[:, 0] > combined[:, 1]
    ensemble_pred = np.where(favours_class_zero, 0, 1)
    print(accuracy_score(y, ensemble_pred))
    return combined

In [89]:
# Number of top-ranked pipelines combined for each model family.
indices = 3

# (ranking table, model class) pairs, handled in the order their results
# are printed.
rankings = [
    (xgb_eval, XGBClassifier),
    (tree_eval, DecisionTreeClassifier),
    (svc_eval, SVC),
]

per_model_preds = []
for ranking, model_cls in rankings:
    pipes = ranking.index[:indices]
    model = model_cls.__name__
    print(pipes)
    per_model_preds.append(ensemble(train_models, data_set, model, pipes, y))
xgb_pred, tree_pred, svc_pred = per_model_preds

# Only the decision-tree and SVC ensembles feed the final vote; xgb_pred is
# computed above but currently left out.
preds = [tree_pred, svc_pred]
print(" ensembles ".center(50, '-'))
pred = ensembles(preds, y)

Index(['pipe_12', 'pipe_14', 'pipe_1'], dtype='object')
0.8695652173913043
Index(['pipe_9', 'pipe_13', 'pipe_12'], dtype='object')
0.8804347826086957
Index(['pipe_9', 'pipe_13', 'pipe_10'], dtype='object')
0.8804347826086957
------------------- ensembles --------------------
0.8804347826086957


## 考察欄

In [90]:
# Rows the final ensemble got wrong.
final_label = np.where(pred[:, 0] > pred[:, 1], 0, 1)
df[final_label != y]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
554,59,1,TA,134,204,0,Normal,162,0,0.8,Up
555,64,1,ASY,128,263,0,Normal,105,1,0.2,Flat
569,57,1,ASY,140,192,0,Normal,148,0,0.4,Flat
576,57,1,ATA,154,232,0,LVH,164,0,0.0,Up
580,62,0,NAP,130,263,0,Normal,97,0,1.2,Flat
601,45,1,ASY,104,208,0,LVH,148,1,3.0,Flat
602,57,1,ASY,110,201,0,Normal,126,1,1.5,Flat
609,46,0,ASY,138,243,0,LVH,152,1,0.0,Flat
616,55,1,ASY,140,229,0,Normal,110,1,0.5,Flat
626,49,0,NAP,160,180,0,Normal,156,0,1.0,Flat


In [91]:
# How often each RestingECG category occurs in the evaluated rows.
df['RestingECG'].value_counts()

Normal    64
LVH       17
ST        11
Name: RestingECG, dtype: int64

In [92]:
# Averaged class probabilities for the misclassified rows.
# pred is a sum over len(preds) model families x `indices` pipelines each, so
# divide by that product rather than a hard-coded 2, which silently becomes
# wrong when an entry is added to or removed from preds.
pred[np.where(pred[:, 0] > pred[:, 1], 0, 1) != y] / (indices * len(preds))

array([[0.89069216, 0.10930784],
       [0.05378026, 0.94621974],
       [0.12067093, 0.87932907],
       [0.95017365, 0.04982635],
       [0.53052622, 0.46947378],
       [0.08853865, 0.91146135],
       [0.05051437, 0.94948563],
       [0.31761683, 0.68238317],
       [0.04695501, 0.95304499],
       [0.85963829, 0.14036171],
       [0.26844128, 0.73155872]])