In [1]:
from pathlib import Path
import sys

import pandas as pd
from xgboost import XGBClassifier

rank = 0  # index into Path().resolve().parents; only used by the commented-out line below
#sys.path[0] = f'{Path().resolve().parents[rank]}' # If mymodule lives in a parent directory, adjust rank and uncomment this line
from mymodule import PipeLine, grid_search_cv
from kayano import age_categolize, stSlope_categolize, cholesterol_mean

In [2]:
# Load the training data (path is relative to the notebook's working directory)
# and show the first rows as the cell output.
df = pd.read_csv('./data/train.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,56,1,ASY,155,342,1,Normal,150,1,3.0,Flat,1
1,55,0,ATA,130,394,0,LVH,150,0,0.0,Up,0
2,47,1,NAP,110,0,1,Normal,120,1,0.0,Flat,1
3,34,1,ASY,115,0,1,Normal,154,0,0.2,Up,1
4,54,0,NAP,160,201,0,Normal,163,0,0.0,Up,0


### 河合さんの関数の挙動の方は問題なさそう

In [3]:
# Encode ST_Slope, then confirm which code the original 'Down' rows received.
# The mask is built from the untouched `df`, so it still sees the raw labels.
df_st = stSlope_categolize(df)
df_st.loc[df['ST_Slope'] == 'Down', 'ST_Slope'].value_counts()

-1    49
Name: ST_Slope, dtype: int64

In [4]:
# Same check for rows whose original ST_Slope label was 'Flat'.
df_st.loc[df['ST_Slope'] == 'Flat', 'ST_Slope'].value_counts()

0    316
Name: ST_Slope, dtype: int64

In [5]:
# Same check for rows whose original ST_Slope label was 'Up'.
df_st.loc[df['ST_Slope'] == 'Up', 'ST_Slope'].value_counts()

1    277
Name: ST_Slope, dtype: int64

### パイプライン

In [12]:
def pipe_1(df, test_size, to_grid_search):
    """Baseline run: no extra feature engineering on top of ``PipeLine``.

    Args:
        df: Input data handed to a fresh ``PipeLine`` instance.
        test_size: Forwarded to ``PipeLine.fold_out_split``.
        to_grid_search: Keyword arguments unpacked into ``grid_search_cv``.

    Returns:
        Whatever ``grid_search_cv`` returns for this configuration.
    """
    print('ベースライン'.center(50))  # centred banner labelling this run's output
    pipeline = PipeLine()
    pipeline(df)
    split_pack = pipeline.fold_out_split(test_size)
    return grid_search_cv(split_pack, **to_grid_search)

def pipe_2(df, test_size, to_grid_search):
    """Run the pipeline with the ST_Slope column converted by ``stSlope_categolize``.

    The converted ``ST_Slope`` column is taken from the categorical frame and
    added to a copy of the numeric frame before splitting.

    Args:
        df: Input data handed to a fresh ``PipeLine`` instance.
        test_size: Forwarded to ``PipeLine.fold_out_split``.
        to_grid_search: Keyword arguments unpacked into ``grid_search_cv``.

    Returns:
        Whatever ``grid_search_cv`` returns for this configuration.
    """
    print('ST_Slopカテゴライズ'.center(50))  # centred banner labelling this run's output
    pipeline = PipeLine()
    pipeline(df)

    # Convert ST_Slope inside the categorical frame first ...
    pipeline.df_cat = stSlope_categolize(pipeline.df_cat)

    # ... then attach the converted column to a copy of the numeric frame.
    numeric = pipeline.df_num.copy()
    numeric['ST_Slope'] = pipeline.df_cat['ST_Slope']
    pipeline.df_num = numeric

    return grid_search_cv(pipeline.fold_out_split(test_size), **to_grid_search)

def pipe_3(df, test_size, to_grid_search):
    """Run the pipeline with one-hot encoding plus ``stSlope_categolize``.

    All columns of the categorical frame are passed to ``PipeLine.one_hot``;
    afterwards ``stSlope_categolize`` is applied to the numeric frame.

    NOTE(review): what ``df_num`` contains after ``one_hot`` (in particular
    whether a raw ``ST_Slope`` column is still present) is not visible from
    this notebook -- confirm against ``mymodule.PipeLine``.

    Args:
        df: Input data handed to a fresh ``PipeLine`` instance.
        test_size: Forwarded to ``PipeLine.fold_out_split``.
        to_grid_search: Keyword arguments unpacked into ``grid_search_cv``.

    Returns:
        Whatever ``grid_search_cv`` returns for this configuration.
    """
    print('ワンホットとST_Slopカテゴライズ'.center(50))  # centred banner labelling this run's output
    pipeline = PipeLine()
    pipeline(df)

    categorical_cols = pipeline.df_cat.columns
    pipeline.one_hot(categorical_cols)
    pipeline.df_num = stSlope_categolize(pipeline.df_num)

    return grid_search_cv(pipeline.fold_out_split(test_size), **to_grid_search)

def pipe_4(df, test_size, to_grid_search):
    """Run the pipeline with ``age_categolize`` applied to the numeric frame.

    Args:
        df: Input data handed to a fresh ``PipeLine`` instance.
        test_size: Forwarded to ``PipeLine.fold_out_split``.
        to_grid_search: Keyword arguments unpacked into ``grid_search_cv``.

    Returns:
        Whatever ``grid_search_cv`` returns for this configuration.
    """
    print('Ageカテゴライズ'.center(50))  # centred banner labelling this run's output
    pipeline = PipeLine()
    pipeline(df)
    pipeline.df_num = age_categolize(pipeline.df_num)
    return grid_search_cv(pipeline.fold_out_split(test_size), **to_grid_search)

def pipe_5(df, test_size, to_grid_search):
    """Run the pipeline with both ``stSlope_categolize`` and ``age_categolize``.

    The converted ``ST_Slope`` column is added to a copy of the numeric frame,
    and ``age_categolize`` is then applied to that frame.

    Args:
        df: Input data handed to a fresh ``PipeLine`` instance.
        test_size: Forwarded to ``PipeLine.fold_out_split``.
        to_grid_search: Keyword arguments unpacked into ``grid_search_cv``.

    Returns:
        Whatever ``grid_search_cv`` returns for this configuration.
    """
    print('AgeカテゴライズとST_Slopカテゴライズ'.center(50))  # centred banner labelling this run's output
    pipeline = PipeLine()
    pipeline(df)

    # Convert ST_Slope in the categorical frame and carry it into the numeric frame.
    pipeline.df_cat = stSlope_categolize(pipeline.df_cat)
    numeric = pipeline.df_num.copy()
    numeric['ST_Slope'] = pipeline.df_cat['ST_Slope']
    pipeline.df_num = numeric

    # Age conversion runs last, on the frame that already holds ST_Slope.
    pipeline.df_num = age_categolize(pipeline.df_num)

    return grid_search_cv(pipeline.fold_out_split(test_size), **to_grid_search)

def pipe_6(df, test_size, to_grid_search):
    """Run the pipeline with ``cholesterol_mean`` applied to the numeric frame.

    NOTE(review): the run label says "cholesterol imputation"; presumably
    ``cholesterol_mean`` fills Cholesterol values with a mean -- confirm in
    the ``kayano`` module.

    Args:
        df: Input data handed to a fresh ``PipeLine`` instance.
        test_size: Forwarded to ``PipeLine.fold_out_split``.
        to_grid_search: Keyword arguments unpacked into ``grid_search_cv``.

    Returns:
        Whatever ``grid_search_cv`` returns for this configuration.
    """
    print('コレステロールの補完'.center(50))  # centred banner labelling this run's output
    pipeline = PipeLine()
    pipeline(df)
    pipeline.df_num = cholesterol_mean(pipeline.df_num)
    return grid_search_cv(pipeline.fold_out_split(test_size), **to_grid_search)

def pipe_7(df, test_size, to_grid_search):
    """Run the pipeline with ``cholesterol_mean`` followed by ``age_categolize``.

    Both helpers are applied, in that order, to the numeric frame.

    Args:
        df: Input data handed to a fresh ``PipeLine`` instance.
        test_size: Forwarded to ``PipeLine.fold_out_split``.
        to_grid_search: Keyword arguments unpacked into ``grid_search_cv``.

    Returns:
        Whatever ``grid_search_cv`` returns for this configuration.
    """
    print('コレステロールの補完と年齢カテゴライズ'.center(50))  # centred banner labelling this run's output
    pipeline = PipeLine()
    pipeline(df)

    # Order matters to keep results identical: cholesterol first, then age.
    pipeline.df_num = cholesterol_mean(pipeline.df_num)
    pipeline.df_num = age_categolize(pipeline.df_num)

    return grid_search_cv(pipeline.fold_out_split(test_size), **to_grid_search)

def pipe_8(df, test_size, to_grid_search):
    """Run the pipeline with all three feature steps combined.

    ``cholesterol_mean`` and ``age_categolize`` are applied to the numeric
    frame, then the ``ST_Slope`` column converted by ``stSlope_categolize``
    is added to a copy of that frame.

    Args:
        df: Input data handed to a fresh ``PipeLine`` instance.
        test_size: Forwarded to ``PipeLine.fold_out_split``.
        to_grid_search: Keyword arguments unpacked into ``grid_search_cv``.

    Returns:
        Whatever ``grid_search_cv`` returns for this configuration.
    """
    print('コレステロールの補完と年齢カテゴライズとSlopカテゴライズ'.center(50))  # centred banner labelling this run's output
    pipeline = PipeLine()
    pipeline(df)

    # Numeric-frame steps, in the original order.
    pipeline.df_num = cholesterol_mean(pipeline.df_num)
    pipeline.df_num = age_categolize(pipeline.df_num)

    # Convert ST_Slope in the categorical frame and carry it into the numeric frame.
    pipeline.df_cat = stSlope_categolize(pipeline.df_cat)
    numeric = pipeline.df_num.copy()
    numeric['ST_Slope'] = pipeline.df_cat['ST_Slope']
    pipeline.df_num = numeric

    return grid_search_cv(pipeline.fold_out_split(test_size), **to_grid_search)


### トレーニング

In [17]:
df = pd.read_csv('./data/train.csv')

# ---- Hyperparameter candidates explored by the grid search ----
tree_grid = dict(
    max_depth=[3, 5, 7, 9, 15],
    learning_rate=[0.05, 0.1, 0.3],
    n_estimators=[50, 75, 100, 150],
)

# ---- Arguments handed to every pipeline function ----
to_grid_search = dict(
    model=XGBClassifier,
    param_grid=tree_grid,
    model_arg=dict(random_state=42, early_stopping_rounds=10),
)

pipe_kwargs = dict(
    df=df,
    test_size=0.2,
    to_grid_search=to_grid_search,
)

# ---- Train each prepared pipeline in a loop ----
# The list holds the functions themselves (no call parentheses); they are invoked below.
pipe_lines = [pipe_1, pipe_2, pipe_3, pipe_4, pipe_5, pipe_6, pipe_7, pipe_8]
models = {}  # results keyed by pipeline function name
for pipe in pipe_lines:
    print('#'*20,f'{pipe.__name__}の結果', '#'*20)  # header labelling this pipeline's output
    model = pipe(**pipe_kwargs)
    models[pipe.__name__] = model
    print()

#################### pipe_1の結果 ####################
                      ベースライン                      
-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.861598,0.905512,0.830325,0.86629
test,0.837209,0.917808,0.817073,0.864516



#################### pipe_2の結果 ####################
                  ST_Slopカテゴライズ                   
-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.916179,0.917857,0.927798,0.922801
test,0.875969,0.923077,0.878049,0.9



#################### pipe_3の結果 ####################
               ワンホットとST_Slopカテゴライズ                
-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.918129,0.918149,0.931408,0.924731
test,0.852713,0.898734,0.865854,0.881988



#################### pipe_4の結果 ####################
                    Ageカテゴライズ                     
-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.867446,0.916335,0.830325,0.871212
test,0.79845,0.9,0.768293,0.828947



#################### pipe_5の結果 ####################
             AgeカテゴライズとST_Slopカテゴライズ              
-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.920078,0.921429,0.931408,0.926391
test,0.875969,0.923077,0.878049,0.9



#################### pipe_6の結果 ####################
                    コレステロールの補完                    
-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.97076,0.971223,0.974729,0.972973
test,0.837209,0.896104,0.841463,0.867925



#################### pipe_7の結果 ####################
               コレステロールの補完と年齢カテゴライズ                
-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.935673,0.958647,0.920578,0.939227
test,0.837209,0.906667,0.829268,0.866242



#################### pipe_8の結果 ####################
          コレステロールの補完と年齢カテゴライズとSlopカテゴライズ          
-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.912281,0.914286,0.924188,0.91921
test,0.860465,0.932432,0.841463,0.884615





ベースラインとST_Slopeカテゴライズの挙動が最も汎化性能がありそう<br>
前回精度の向上がみられたAgeカテゴライズについては、過学習を考慮したためか<br>
精度が低下しているよう

In [19]:
# Show the hyperparameters selected for pipe_4 (the Age-categorization run).
models['pipe_4'].best_params_

{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 50}