In [1]:
import numpy as np
import pandas as pd

from sklearn.svm import SVC

from mymodule import PipeLine, evaluations

In [2]:
# Dataset used throughout this notebook; preview the first three rows.
df = pd.read_csv('./data/train.csv')
df.head(n=3)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,56,1,ASY,155,342,1,Normal,150,1,3.0,Flat,1
1,55,0,ATA,130,394,0,LVH,150,0,0.0,Up,0
2,47,1,NAP,110,0,1,Normal,120,1,0.0,Flat,1


>
# ベースラインの作成
ベースラインを作成して今後の特徴作成やモデル選択の比較対象とする<br>
カテゴリ変数と欠損の多いコレステロール値のカラムを除いたものとする<br>
モデルはSVC(scikit-learnのデフォルト設定、RBFカーネル)を使用

In [6]:
# Baseline experiment: numeric features only, with the Cholesterol column removed.
df = pd.read_csv('./data/train.csv')

# Preprocessing: the PipeLine instance is called on the raw frame, after which
# its df_num attribute is read and overwritten without the Cholesterol column.
pipe = PipeLine()
pipe(df)
pipe.df_num = pipe.df_num.drop(columns=['Cholesterol'])

rule = '-' * 20
print(rule, ' ベースラインで使用する変数', rule)
display(pipe.df_num.head(3))

# 70/30 hold-out split. Per the original note the seed is configured through
# pipe.random_seed (an int). pack is indexed as (x_train, x_test, y_train, y_test).
pack = pipe.fold_out_split(test_size=0.3)
x_train, y_train = pack[0], pack[2]

# Fit a support vector classifier (scikit-learn defaults) on the training part.
model = SVC()
model.fit(x_train, y_train)

# Show the evaluation table returned by the project helper.
print()
print(rule, '性能評価', rule)
scores = evaluations(model, *pack)
display(scores)

--------------------  ベースラインで使用する変数 --------------------


Unnamed: 0,Age,Sex,RestingBP,FastingBS,MaxHR,ExerciseAngina,Oldpeak
0,56,1,155,1,150,1,3.0
1,55,0,130,0,150,0,0.0
2,47,1,110,1,120,1,0.0


-------------------- 分割されたデータShape --------------------
x_train: (449, 7) x_test: (193, 7)
y_train: (449,) y_test: (193,)

-------------------- 性能評価 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.69265,0.694981,0.753138,0.722892
test,0.694301,0.736434,0.791667,0.763052


### コレステロール値を追加して比較する
今回は欠損値の補完は考えない<br>
欠損値補完は考察のもと別途考える必要がある

In [8]:
# Same experiment as the baseline, but the Cholesterol column is kept.
df = pd.read_csv('./data/train.csv')

# Preprocessing
pipe = PipeLine()
pipe(df)

rule = '-' * 20
print(rule, ' 変数にコレステロール値を追加', rule)
display(pipe.df_num.head(3))

# 70/30 hold-out split; pack is indexed as (x_train, x_test, y_train, y_test).
pack = pipe.fold_out_split(test_size=0.3)
x_train, y_train = pack[0], pack[2]

# Fit a support vector classifier (scikit-learn defaults) on the training part.
model = SVC()
model.fit(x_train, y_train)

# Show the evaluation table returned by the project helper.
print()
print(rule, '性能評価', rule)
scores = evaluations(model, *pack)
display(scores)

--------------------  変数にコレステロール値を追加 --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak
0,56,1,155,342,1,150,1,3.0
1,55,0,130,394,0,150,0,0.0
2,47,1,110,0,1,120,1,0.0


-------------------- 分割されたデータShape --------------------
x_train: (449, 8) x_test: (193, 8)
y_train: (449,) y_test: (193,)

-------------------- 性能評価 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.750557,0.774892,0.748954,0.761702
test,0.704663,0.773913,0.741667,0.757447


訓練データの正解率は5%ほど向上し、検証データの正解率は1%ほどの向上にとどまったが、改善はみられた

>
### ベースラインの標準化
コレステロールを含めたベースラインの説明変数に標準化を取り入れる

In [9]:
# Cholesterol-inclusive feature set with standardization applied.
df = pd.read_csv('./data/train.csv')

# Preprocessing
pipe = PipeLine()
pipe(df)
# NOTE(review): standard_scaler() runs before the train/test split; if it fits on
# the whole frame, test rows influence the scaling statistics — confirm in PipeLine.
pipe.standard_scaler()

# 70/30 hold-out split; pack is indexed as (x_train, x_test, y_train, y_test).
pack = pipe.fold_out_split(test_size=0.3)
x_train, y_train = pack[0], pack[2]

# Fit a support vector classifier (scikit-learn defaults) on the training part.
model = SVC()
model.fit(x_train, y_train)

# Show the evaluation table returned by the project helper.
rule = '-' * 20
print()
print(rule, '性能評価', rule)
scores = evaluations(model, *pack)
display(scores)

-------------------- 標準化されたdf_num --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak
0,0.256306,0.520852,1.272672,1.323176,1.74291,0.512265,1.252198,1.959903
1,0.151431,-1.91993,-0.116784,1.796972,-0.573753,0.512265,-0.798596,-0.809353
2,-0.687567,0.520852,-1.228348,-1.792941,1.74291,-0.706992,1.252198,-0.809353


-------------------- 分割されたデータShape --------------------
x_train: (449, 8) x_test: (193, 8)
y_train: (449,) y_test: (193,)

-------------------- 性能評価 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.864143,0.893805,0.845188,0.868817
test,0.823834,0.898148,0.808333,0.850877


標準化を施すことで<span style="color: orange;">ベースラインより精度が10%以上向上</span>していることがわかる
>

# One Hot Encodeを追加（SVCモデル）
ベースラインで取り除いたカテゴリ変数をOne_Hot_Encodingで説明変数に取り入れて比較する<br>
(比較を行うために標準化は行わない)

In [16]:
# One-hot encode every categorical column; no standardization in this run.
df = pd.read_csv('./data/train.csv')

# Preprocessing
pipe = PipeLine()
pipe(df)

# Column names come from the attribute in which pipe keeps the categorical data.
one_hot_columns = pipe.df_cat.columns
pipe.one_hot(one_hot_columns)  # one-hot encoding via the pipeline's own method

# 70/30 hold-out split; pack is indexed as (x_train, x_test, y_train, y_test).
pack = pipe.fold_out_split(test_size=0.3)
x_train, y_train = pack[0], pack[2]

# Fit a support vector classifier (scikit-learn defaults) on the training part.
model = SVC()
model.fit(x_train, y_train)

# Show the evaluation table returned by the project helper.
rule = '-' * 20
print()
print(rule, '性能評価', rule)
scores = evaluations(model, *pack)
display(scores)

-------------------- ワンホットされたカラムIndex(['ChestPainType', 'RestingECG', 'ST_Slope'], dtype='object') --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,56,1,155,342,1,150,1,3.0,1,0,0,0,0,1,0,0,1,0
1,55,0,130,394,0,150,0,0.0,0,1,0,0,1,0,0,0,0,1
2,47,1,110,0,1,120,1,0.0,0,0,1,0,0,1,0,0,1,0


-------------------- 分割されたデータShape --------------------
x_train: (449, 18) x_test: (193, 18)
y_train: (449,) y_test: (193,)

-------------------- 性能評価 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.752784,0.778261,0.748954,0.763326
test,0.699482,0.77193,0.733333,0.752137


訓練データの正解率はわずかに向上しているが、検証データの正解率はコレステロール値を含むベースラインとほぼ同じで、<br>むしろわずかに低下している
>
以下は各カテゴリを別々で追加して精度を確認

In [35]:
# Add one categorical column at a time (one-hot, no standardization) and compare.
df = pd.read_csv('./data/train.csv')
pipe = PipeLine()
pipe(df)
one_hot_columns = pipe.df_cat.columns  # names of the categorical columns

banner = '#' * 45
rule = '-' * 20

# Examine the effect of each categorical column separately.
for cat in one_hot_columns:
    print(banner, f'{cat}', banner)

    # Fresh pipeline per column so encodings from earlier iterations do not accumulate.
    pipe = PipeLine()
    pipe.viewer = False  # hides the preprocessing output; comment out to show it
    pipe(df)
    pipe.one_hot(cat)
    pack = pipe.fold_out_split(test_size=0.3)
    x_train, y_train = pack[0], pack[2]  # pack: (x_train, x_test, y_train, y_test)

    # Fit a support vector classifier (scikit-learn defaults) on the training part.
    model = SVC()
    model.fit(x_train, y_train)

    # Show the evaluation table returned by the project helper.
    print()
    print(rule, '性能評価', rule)
    scores = evaluations(model, *pack)
    display(scores)


############################################# ChestPainType #############################################

-------------------- 性能評価 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.755011,0.779221,0.753138,0.765957
test,0.699482,0.77193,0.733333,0.752137


############################################# RestingECG #############################################

-------------------- 性能評価 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.752784,0.775862,0.753138,0.764331
test,0.699482,0.77193,0.733333,0.752137


############################################# ST_Slope #############################################

-------------------- 性能評価 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.752784,0.775862,0.753138,0.764331
test,0.699482,0.77193,0.733333,0.752137


標準化していないためかカテゴリ変数の効果は薄いよう<br>
モデルの表現力不足のせいかRestingECGとST_Slopeの結果に変化がない

### 標準化とワンホットエンコーディング（SVC)
標準化をしたものにワンホットエンコードがどのように寄与するか確認する

In [25]:
# One-hot encode all categorical columns, then standardize, then fit the SVC.
df = pd.read_csv('./data/train.csv')

# Preprocessing
pipe = PipeLine()
pipe(df)
one_hot_columns = pipe.df_cat.columns
pipe.one_hot(one_hot_columns)
# NOTE(review): standard_scaler() runs before the train/test split; if it fits on
# the whole frame, test rows influence the scaling statistics — confirm in PipeLine.
pipe.standard_scaler()

# 70/30 hold-out split; pack is indexed as (x_train, x_test, y_train, y_test).
pack = pipe.fold_out_split(test_size=0.3)
x_train, y_train = pack[0], pack[2]

# Fit a support vector classifier (scikit-learn defaults) on the training part.
model = SVC()
model.fit(x_train, y_train)

# Show the evaluation table returned by the project helper.
rule = '-' * 20
print()
print(rule, '性能評価', rule)
scores = evaluations(model, *pack)
display(scores)

-------------------- ワンホットされたカラムIndex(['ChestPainType', 'RestingECG', 'ST_Slope'], dtype='object') --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,56,1,155,342,1,150,1,3.0,1,0,0,0,0,1,0,0,1,0
1,55,0,130,394,0,150,0,0.0,0,1,0,0,1,0,0,0,0,1
2,47,1,110,0,1,120,1,0.0,0,0,1,0,0,1,0,0,1,0


-------------------- 標準化されたdf_num --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,0.256306,0.520852,1.272672,1.323176,1.74291,0.512265,1.252198,1.959903,0.913392,-0.484371,-0.549756,-0.183892,-0.481919,0.785575,-0.489267,-0.287456,1.0157,-0.871151
1,0.151431,-1.91993,-0.116784,1.796972,-0.573753,0.512265,-0.798596,-0.809353,-1.09482,2.064533,-0.549756,-0.183892,2.075039,-1.272953,-0.489267,-0.287456,-0.984543,1.147907
2,-0.687567,0.520852,-1.228348,-1.792941,1.74291,-0.706992,1.252198,-0.809353,-1.09482,-0.484371,1.81899,-0.183892,-0.481919,0.785575,-0.489267,-0.287456,1.0157,-0.871151


-------------------- 分割されたデータShape --------------------
x_train: (449, 18) x_test: (193, 18)
y_train: (449,) y_test: (193,)

-------------------- 性能評価 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.917595,0.913934,0.933054,0.923395
test,0.870466,0.874016,0.925,0.898785


標準化を施したベースライン(ワンホットなし)に比べて<span style="color: orange;">5%ほど検証データの正解率が向上</span>
>
以下はカテゴリ別で精度の向上を確認

In [27]:
# Add one categorical column at a time (one-hot + standardization) and compare.
df = pd.read_csv('./data/train.csv')
pipe = PipeLine()
pipe(df)
one_hot_columns = pipe.df_cat.columns  # names of the categorical columns

# Examine the effect of each categorical column separately.
for cat in one_hot_columns:
    print('#'*45, f'{cat}', '#'*45)
    pipe = PipeLine()  # fresh instance so earlier encodings do not accumulate
    pipe.viewer = False  # hides the intermediate output; comment out to show it
    pipe(df) 
    pipe.one_hot(cat)
    pipe.standard_scaler()
    pack = pipe.fold_out_split(test_size=0.3)

    # Train a support vector machine on the training part.
    model = SVC()
    model.fit(pack[0], pack[2])  # x_train, x_test, y_train, y_test = pack

    # Report the metrics. The original called `out_put`, a name that is never
    # imported or defined in this notebook (NameError on a fresh kernel);
    # `evaluations` is the helper imported from mymodule and used by every other cell.
    print()
    print('-'*20, '性能評価', '-'*20)
    display(evaluations(model, *pack))

############################################# ChestPainType #############################################

-------------------- 性能評価 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.868597,0.875,0.878661,0.876827
test,0.84456,0.868852,0.883333,0.876033


############################################# RestingECG #############################################

-------------------- 性能評価 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.879733,0.914798,0.853556,0.883117
test,0.803109,0.859649,0.816667,0.837607


############################################# ST_Slope #############################################

-------------------- 性能評価 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.902004,0.901235,0.916318,0.908714
test,0.88601,0.895161,0.925,0.909836


適合率、再現率ともに<span style="color: orange;">ST_Slopeの効果がかなり大きい</span><br>
次いでChestPainType

# 分類木による評価

In [30]:
from sklearn.tree import DecisionTreeClassifier

In [47]:
# Decision tree on the standardized numeric features (no one-hot columns).
df = pd.read_csv('./data/train.csv')

# Preprocessing. The unused `one_hot_columns` assignment was dropped: this cell
# never one-hot encodes anything, and cells that need the name assign it themselves.
pipe = PipeLine()
pipe(df)
pipe.standard_scaler()  # add standardization
pack = pipe.fold_out_split(test_size=0.3)

# Train a decision tree limited to depth 3.
model = DecisionTreeClassifier(max_depth=3)
model.fit(pack[0], pack[2])  # x_train, x_test, y_train, y_test = pack

# Report the metrics.
print()
print('-'*20, '性能評価', '-'*20)
display(evaluations(model, *pack))

# Feature importances as a one-row table.
# NOTE(review): the labels assume the split features are pipe.df_num's columns
# in the same order — confirm against fold_out_split.
print('-'*20, ' 説明変数の重要度 ', '-'*20)
display(pd.DataFrame([model.feature_importances_], columns=pipe.df_num.columns))
print()

-------------------- 標準化されたdf_num --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak
0,0.256306,0.520852,1.272672,1.323176,1.74291,0.512265,1.252198,1.959903
1,0.151431,-1.91993,-0.116784,1.796972,-0.573753,0.512265,-0.798596,-0.809353
2,-0.687567,0.520852,-1.228348,-1.792941,1.74291,-0.706992,1.252198,-0.809353


-------------------- 分割されたデータShape --------------------
x_train: (449, 8) x_test: (193, 8)
y_train: (449,) y_test: (193,)

-------------------- 性能評価 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.817372,0.894472,0.74477,0.812785
test,0.766839,0.912088,0.691667,0.78673


--------------------  説明変数の重要度  --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak
0,0.077465,0.0,0.0,0.263856,0.028652,0.090309,0.532049,0.007669





In [31]:
# Decision tree on one-hot encoded + standardized features.
df = pd.read_csv('./data/train.csv')

# Preprocessing
pipe = PipeLine()
pipe(df)
one_hot_columns = pipe.df_cat.columns
pipe.one_hot(one_hot_columns)
pipe.standard_scaler()  # add standardization

# 70/30 hold-out split; pack is indexed as (x_train, x_test, y_train, y_test).
pack = pipe.fold_out_split(test_size=0.3)
x_train, y_train = pack[0], pack[2]

# Train a decision tree limited to depth 3.
model = DecisionTreeClassifier(max_depth=3)
model.fit(x_train, y_train)

# Show the evaluation table returned by the project helper.
rule = '-' * 20
print()
print(rule, '性能評価', rule)
scores = evaluations(model, *pack)
display(scores)

# Feature importances as a one-row table.
# NOTE(review): the labels assume the split features are pipe.df_num's columns
# in the same order — confirm against fold_out_split.
print(rule, ' 説明変数の重要度 ', rule)
importances = pd.DataFrame([model.feature_importances_], columns=pipe.df_num.columns)
display(importances)
print()

-------------------- ワンホットされたカラムIndex(['ChestPainType', 'RestingECG', 'ST_Slope'], dtype='object') --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,56,1,155,342,1,150,1,3.0,1,0,0,0,0,1,0,0,1,0
1,55,0,130,394,0,150,0,0.0,0,1,0,0,1,0,0,0,0,1
2,47,1,110,0,1,120,1,0.0,0,0,1,0,0,1,0,0,1,0


-------------------- 標準化されたdf_num --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,0.256306,0.520852,1.272672,1.323176,1.74291,0.512265,1.252198,1.959903,0.913392,-0.484371,-0.549756,-0.183892,-0.481919,0.785575,-0.489267,-0.287456,1.0157,-0.871151
1,0.151431,-1.91993,-0.116784,1.796972,-0.573753,0.512265,-0.798596,-0.809353,-1.09482,2.064533,-0.549756,-0.183892,2.075039,-1.272953,-0.489267,-0.287456,-0.984543,1.147907
2,-0.687567,0.520852,-1.228348,-1.792941,1.74291,-0.706992,1.252198,-0.809353,-1.09482,-0.484371,1.81899,-0.183892,-0.481919,0.785575,-0.489267,-0.287456,1.0157,-0.871151


-------------------- 分割されたデータShape --------------------
x_train: (449, 18) x_test: (193, 18)
y_train: (449,) y_test: (193,)

-------------------- 性能評価 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.879733,0.883817,0.891213,0.8875
test,0.854922,0.877049,0.891667,0.884298


--------------------  説明変数の重要度  --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,0.0,0.007564,0.0,0.0,0.0,0.032009,0.0,0.101312,0.203358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.655757





In [39]:
# Add one categorical column at a time (one-hot + standardization) and fit a tree.
df = pd.read_csv('./data/train.csv')
pipe = PipeLine()
pipe(df)
one_hot_columns = pipe.df_cat.columns  # names of the categorical columns

banner = '#' * 45
rule = '-' * 20

# Examine the effect of each categorical column separately.
for cat in one_hot_columns:
    print(banner, f'{cat}', banner)

    # Fresh pipeline per column so encodings from earlier iterations do not accumulate.
    pipe = PipeLine()
    pipe.viewer = False  # hides the preprocessing output
    pipe(df)
    pipe.one_hot(cat)
    pipe.standard_scaler()
    pack = pipe.fold_out_split(test_size=0.3)
    x_train, y_train = pack[0], pack[2]  # pack: (x_train, x_test, y_train, y_test)

    # Train a decision tree limited to depth 3.
    model = DecisionTreeClassifier(max_depth=3)
    model.fit(x_train, y_train)

    # Show the evaluation table returned by the project helper.
    print()
    print(rule, '性能評価', rule)
    scores = evaluations(model, *pack)
    display(scores)

    # Feature importances as a one-row table.
    # NOTE(review): the labels assume the split features are pipe.df_num's columns
    # in the same order — confirm against fold_out_split.
    print(rule, ' 説明変数の重要度 ', rule)
    importances = pd.DataFrame([model.feature_importances_], columns=pipe.df_num.columns)
    display(importances)
    print()

############################################# ChestPainType #############################################

-------------------- 性能評価 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.815145,0.825,0.828452,0.826722
test,0.772021,0.806452,0.833333,0.819672


--------------------  説明変数の重要度  --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ASY,ATA,NAP,TA
0,0.0,0.087394,0.0,0.081464,0.0,0.033667,0.0,0.172791,0.624684,0.0,0.0,0.0



############################################# RestingECG #############################################

-------------------- 性能評価 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.817372,0.894472,0.74477,0.812785
test,0.772021,0.913043,0.7,0.792453


--------------------  説明変数の重要度  --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,LVH,Normal,ST
0,0.077465,0.0,0.0,0.246032,0.028652,0.090309,0.532049,0.007669,0.0,0.0,0.017824



############################################# ST_Slope #############################################

-------------------- 性能評価 --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.859688,0.896396,0.832636,0.863341
test,0.849741,0.876033,0.883333,0.879668


--------------------  説明変数の重要度  --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,Down,Flat,Up
0,0.0,0.030911,0.0,0.109583,0.030789,0.057872,0.0,0.075718,0.0,0.0,0.695127



