In [3]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from mymodule import PipeLine

In [4]:
# Dataset used throughout this notebook: read the training CSV and preview
# its first three rows.
train_csv = './data/train.csv'
df = pd.read_csv(train_csv)
df.head(3)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,56,1,ASY,155,342,1,Normal,150,1,3.0,Flat,1
1,55,0,ATA,130,394,0,LVH,150,0,0.0,Up,0
2,47,1,NAP,110,0,1,Normal,120,1,0.0,Flat,1


>
# ベースラインモデル
ベースラインを作成して今後の特徴作成やモデル選択の比較対象にする<br>
カテゴリ変数と欠損の多いコレステロール値のカラムを除いたものとする<br>
モデルは線形分離モデルのSVCを使用

In [5]:
# Baseline run: SVC on the numeric features with the Cholesterol column removed.
valid = 'fold_out_split'  # validation uses the hold-out method

# Pipeline: create it and feed in the original data.
pipe = PipeLine()
pipe(df)

# Remove the Cholesterol column from the numeric feature frame.
pipe.df_num = pipe.df_num.drop(columns=['Cholesterol'])

# Train the model; view=True also displays the features that were used.
model = pipe.training(valid, SVC, view=True)

-------------------- 使用された特徴量 --------------------


Unnamed: 0,Age,Sex,RestingBP,FastingBS,MaxHR,ExerciseAngina,Oldpeak
0,56,1,155,1,150,1,3.0
1,55,0,130,0,150,0,0.0
2,47,1,110,1,120,1,0.0
3,34,1,115,1,154,0,0.2
4,54,0,160,0,163,0,0.0


-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.69265,0.694981,0.753138,0.722892
test,0.694301,0.736434,0.791667,0.763052


### コレステロール値を追加して比較する
今回は欠損値の補完は考えない<br>
欠損値補完は考察のもと別途考える必要

In [4]:
# Same SVC run as the baseline, but the Cholesterol column is kept this time.
valid = 'fold_out_split'  # validation uses the hold-out method

# Pipeline: load the original data; no extra preprocessing step is called here.
pipe = PipeLine()
pipe(df)

# Train the model; view=True also displays the features that were used.
model = pipe.training(valid, SVC, view=True)

-------------------- 使用された特徴量 --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak
0,56,1,155,342,1,150,1,3.0
1,55,0,130,394,0,150,0,0.0
2,47,1,110,0,1,120,1,0.0
3,34,1,115,0,1,154,0,0.2
4,54,0,160,201,0,163,0,0.0


-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.750557,0.774892,0.748954,0.761702
test,0.704663,0.773913,0.741667,0.757447


訓練データの正解率は5%ほど向上したが、検証データの正解率はほぼ改善なし

>
## 標準化の追加（SVCモデル）
コレステロールを含めたベースラインの説明変数に標準化を取り入れる

In [5]:
# SVC on the numeric features (Cholesterol included) with standardization added.
valid = 'fold_out_split'  # validation uses the hold-out method

# Pipeline: load the original data, then add the standardization step.
pipe = PipeLine()
pipe(df)
pipe.standard_scaler()

# Train the model; view=True also displays the features that were used.
model = pipe.training(valid, SVC, view=True)

-------------------- 使用された特徴量 --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak
0,0.256306,0.520852,1.272672,1.323176,1.74291,0.512265,1.252198,1.959903
1,0.151431,-1.91993,-0.116784,1.796972,-0.573753,0.512265,-0.798596,-0.809353
2,-0.687567,0.520852,-1.228348,-1.792941,1.74291,-0.706992,1.252198,-0.809353
3,-2.050938,0.520852,-0.950457,-1.792941,1.74291,0.674833,-0.798596,-0.624736
4,0.046557,-1.91993,1.550563,0.038461,-0.573753,1.04061,-0.798596,-0.809353


-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.864143,0.893805,0.845188,0.868817
test,0.823834,0.898148,0.808333,0.850877


標準化を施すことで<span style="color: orange;">ベースラインより精度が10%以上向上</span>
>

# One Hot Encodeを追加（SVCモデル）
ベースラインで取り除いたカテゴリ変数をOne_Hot_Encodingで説明変数に取り入れて比較する<br>
(比較を行うために標準化は行わない)

In [6]:
# SVC with every categorical column one-hot encoded (no standardization).
valid = 'fold_out_split'  # validation uses the hold-out method

# Pipeline: load the original data.
pipe = PipeLine()
pipe(df)

# Column names of the categorical data, read from the pipeline attribute,
# are handed to the pipeline's own one-hot method.
one_hot_columns = pipe.df_cat.columns
pipe.one_hot(one_hot_columns)

# Train the model; view=True also displays the features that were used.
model = pipe.training(valid, Model=SVC, view=True)

-------------------- 使用された特徴量 --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,56,1,155,342,1,150,1,3.0,1,0,0,0,0,1,0,0,1,0
1,55,0,130,394,0,150,0,0.0,0,1,0,0,1,0,0,0,0,1
2,47,1,110,0,1,120,1,0.0,0,0,1,0,0,1,0,0,1,0
3,34,1,115,0,1,154,0,0.2,1,0,0,0,0,1,0,0,0,1
4,54,0,160,201,0,163,0,0.0,0,0,1,0,0,1,0,0,0,1


-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.752784,0.778261,0.748954,0.763326
test,0.699482,0.77193,0.733333,0.752137


訓練データの精度は向上しているが、検証データの精度はベースラインとほぼ一緒<br>むしろ悪い
>
以下は各カテゴリを別々で追加して精度を確認

In [7]:
# Examine how each categorical column affects the result when it is the only
# one-hot encoded column (SVC, no standardization).
valid = 'fold_out_split'  # set here so the cell does not rely on an earlier cell

# Fetch the categorical column names once, before the loop, so the loop does
# not iterate over a name that is only assigned inside its own body.
pipe = PipeLine()
pipe(df)
one_hot_columns = pipe.df_cat.columns

for cat in one_hot_columns:
    print('#'*40, f'{cat}', '#'*40)
    # Fresh pipeline for every column.
    pipe = PipeLine()
    pipe(df)
    pipe.one_hot(cat)  # one-hot encode this column via the pipeline method
    # Use view=True to also display the features that were used.
    model = pipe.training(valid, Model=SVC, view=False)

######################################## ChestPainType ########################################
-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.755011,0.779221,0.753138,0.765957
test,0.699482,0.77193,0.733333,0.752137


######################################## RestingECG ########################################
-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.752784,0.775862,0.753138,0.764331
test,0.699482,0.77193,0.733333,0.752137


######################################## ST_Slope ########################################
-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.752784,0.775862,0.753138,0.764331
test,0.699482,0.77193,0.733333,0.752137


標準化していないためかカテゴリ変数の効果は薄いよう<br>
モデルの表現力不足のせいかRestingECGとST_Slopeの結果に変化がない

### 標準化とワンホットエンコーディング（SVC)
標準化をしたものにワンホットエンコードがどのように寄与するか確認する

In [8]:
# SVC with both standardization and one-hot encoding of all categorical columns.
valid = 'fold_out_split'  # validation uses the hold-out method

# Pipeline: load the original data.
pipe = PipeLine()
pipe(df)

# Read the categorical column names from the pipeline attribute, then apply
# standardization followed by one-hot encoding (same order as the original cell).
one_hot_columns = pipe.df_cat.columns
pipe.standard_scaler()
pipe.one_hot(one_hot_columns)

# Train the model; view=True also displays the features that were used.
model = pipe.training(valid, Model=SVC, view=True)

-------------------- 使用された特徴量 --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,0.256306,0.520852,1.272672,1.323176,1.74291,0.512265,1.252198,1.959903,1,0,0,0,0,1,0,0,1,0
1,0.151431,-1.91993,-0.116784,1.796972,-0.573753,0.512265,-0.798596,-0.809353,0,1,0,0,1,0,0,0,0,1
2,-0.687567,0.520852,-1.228348,-1.792941,1.74291,-0.706992,1.252198,-0.809353,0,0,1,0,0,1,0,0,1,0
3,-2.050938,0.520852,-0.950457,-1.792941,1.74291,0.674833,-0.798596,-0.624736,1,0,0,0,0,1,0,0,0,1
4,0.046557,-1.91993,1.550563,0.038461,-0.573753,1.04061,-0.798596,-0.809353,0,0,1,0,0,1,0,0,0,1


-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.917595,0.913934,0.933054,0.923395
test,0.875648,0.887097,0.916667,0.901639


標準化を施したベースライン(ワンホットなし)に比べて<span style="color: orange;">5%ほど検証データの正解率が向上</span>
>
以下はカテゴリ別で精度の向上を確認

In [10]:
# Examine how each categorical column affects the result when it is the only
# one-hot encoded column (SVC, with standardization).
valid = 'fold_out_split'  # set here so the cell does not rely on an earlier cell

# Fetch the categorical column names once, before the loop, so the loop does
# not iterate over a name that is only assigned inside its own body.
pipe = PipeLine()
pipe(df)
one_hot_columns = pipe.df_cat.columns

for cat in one_hot_columns:
    print('#'*40, f'{cat}', '#'*40)
    # Fresh pipeline for every column: standardize, then one-hot encode it.
    pipe = PipeLine()
    pipe(df)
    pipe.standard_scaler()
    pipe.one_hot(cat)
    # Use view=True to also display the features that were used.
    model = pipe.training(valid, Model=SVC, view=False)

######################################## ChestPainType ########################################
-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.875278,0.899563,0.861925,0.880342
test,0.865285,0.891667,0.891667,0.891667


######################################## RestingECG ########################################
-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.877506,0.910714,0.853556,0.88121
test,0.80829,0.873874,0.808333,0.839827


######################################## ST_Slope ########################################
-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.904232,0.904959,0.916318,0.910603
test,0.891192,0.902439,0.925,0.91358


適合率、再現率ともに<span style="color: orange;">ST_Slopeの効果がかなり大きい</span><br>
次いでChestPainType

# 分類木による評価

In [16]:
# Decision tree on the numeric features; no standardization or one-hot step
# is called in this cell.
# DecisionTreeClassifier is provided by sklearn.tree.
valid = 'fold_out_split'  # validation uses the hold-out method

# Pipeline: load the original data.
pipe = PipeLine()
pipe(df)

# Train the model; view=True also displays the features that were used.
model = pipe.training(valid, Model=DecisionTreeClassifier, view=True)

-------------------- 使用された特徴量 --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak
0,56,1,155,342,1,150,1,3.0
1,55,0,130,394,0,150,0,0.0
2,47,1,110,0,1,120,1,0.0
3,34,1,115,0,1,154,0,0.2
4,54,0,160,201,0,163,0,0.0


-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,1.0,1.0,1.0,1.0
test,0.772021,0.827586,0.8,0.813559


In [21]:
# Decision tree on the numeric features with standardization added.
valid = 'fold_out_split'  # validation uses the hold-out method

# Pipeline: load the original data, then add the standardization step.
pipe = PipeLine()
pipe(df)
pipe.standard_scaler()

# Train the model; view=True also displays the features that were used.
model = pipe.training(valid, Model=DecisionTreeClassifier, view=True)

-------------------- 使用された特徴量 --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak
0,0.256306,0.520852,1.272672,1.323176,1.74291,0.512265,1.252198,1.959903
1,0.151431,-1.91993,-0.116784,1.796972,-0.573753,0.512265,-0.798596,-0.809353
2,-0.687567,0.520852,-1.228348,-1.792941,1.74291,-0.706992,1.252198,-0.809353
3,-2.050938,0.520852,-0.950457,-1.792941,1.74291,0.674833,-0.798596,-0.624736
4,0.046557,-1.91993,1.550563,0.038461,-0.573753,1.04061,-0.798596,-0.809353


-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,1.0,1.0,1.0,1.0
test,0.777202,0.82906,0.808333,0.818565


In [22]:
# Examine how each categorical column affects the decision tree when it is
# the only one-hot encoded column (with standardization).
valid = 'fold_out_split'  # validation uses the hold-out method
banner = '#'*45

for cat in one_hot_columns:
    print(f'{banner} {cat} {banner}')

    # Fresh pipeline for every column: standardize, then one-hot encode it.
    pipe = PipeLine()
    pipe(df)
    pipe.standard_scaler()
    pipe.one_hot(cat)
    model = pipe.training(valid, Model=DecisionTreeClassifier, view=False)

############################################# ChestPainType #############################################
-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,1.0,1.0,1.0,1.0
test,0.678756,0.733871,0.758333,0.745902


############################################# RestingECG #############################################
-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,1.0,1.0,1.0,1.0
test,0.772021,0.822034,0.808333,0.815126


############################################# ST_Slope #############################################
-------------------- 評価結果 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,1.0,1.0,1.0,1.0
test,0.823834,0.852459,0.866667,0.859504


分類木についてはどれも過学習が強い<br>
正則化を取り入れると効果があるかも