In [62]:
import pandas as pd

# 使用するデータセット
df = pd.read_csv('./data/train.csv')
df.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,56,1,ASY,155,342,1,Normal,150,1,3.0,Flat,1
1,55,0,ATA,130,394,0,LVH,150,0,0.0,Up,0
2,47,1,NAP,110,0,1,Normal,120,1,0.0,Flat,1
3,34,1,ASY,115,0,1,Normal,154,0,0.2,Up,1
4,54,0,NAP,160,201,0,Normal,163,0,0.0,Up,0


>
# ベースラインの作成
ベースラインを作成して今後の特徴作成やモデル選択の比較対象とする<br>
カテゴリ変数と欠損の多いコレステロール値のカラムを除いたものとする
モデルは線形分離モデルのSVCを使用

In [60]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [3]:
# ベースラインで使わない変数
x = df.drop(['Cholesterol', 'ChestPainType', 'RestingECG', 'ST_Slope', 'HeartDisease'], axis=1)
# 正解データ
y = df.HeartDisease
print('-'*20, ' ベースラインで使用した変数', '-'*20)
display(x)

# ベースラインはホールドアウト法（7:3）
x_train, x_test, y_train, y_test = train_test_split(
                                                    x, y,
                                                    test_size=0.3,
                                                    shuffle=True,
                                                    random_state=42
                                                    )
# モデルはサポートベクターマシンのデフォルト
base = SVC()
# モデル訓練
base.fit(x_train, y_train)
# 訓練データで予測値を出力
print('-'*20, '正解率（訓練データ）', '-'*20)
train_pred = base.predict(x_train)
# 正解率を表示
print(accuracy_score(y_train, train_pred))
# 検証データで予測値を出力
print('-'*20, '正解率（検証データ）', '-'*20)
test_pred = base.predict(x_test)
accuracy_score(y_test, test_pred)

--------------------  ベースラインで使用した変数 --------------------


Unnamed: 0,Age,Sex,RestingBP,FastingBS,MaxHR,ExerciseAngina,Oldpeak
0,56,1,155,1,150,1,3.0
1,55,0,130,0,150,0,0.0
2,47,1,110,1,120,1,0.0
3,34,1,115,1,154,0,0.2
4,54,0,160,0,163,0,0.0
...,...,...,...,...,...,...,...
637,48,1,106,1,110,0,0.0
638,53,1,126,0,106,0,0.0
639,54,1,200,0,142,1,2.0
640,45,0,130,0,170,0,0.0


-------------------- 正解率（訓練データ） --------------------
[[131  79]
 [ 59 180]]
0.6926503340757239
-------------------- 正解率（検証データ） --------------------
[[39 25]
 [34 95]]


0.694300518134715

### コレステロール値を追加して比較する
今回は欠損値の補完は考えない<br>
欠損値補完は考察のもと別途考える必要がある

In [48]:
df = pd.read_csv('./data/train.csv')
# ベースラインで使わない変数
x = df.drop(['ChestPainType', 'RestingECG', 'ST_Slope', 'HeartDisease'], axis=1)
# 正解データ
y = df.HeartDisease
print('-'*20, ' コレステロール値のカラムを追加 ', '-'*20)
display(x)

# ベースラインはホールドアウト法（7:3）
x_train, x_test, y_train, y_test = train_test_split(
                                                    x, y,
                                                    test_size=0.3,
                                                    shuffle=True,
                                                    random_state=42
                                                    )
# モデルはサポートベクターマシンのデフォルト
base = SVC()
# モデル訓練
base.fit(x_train, y_train)
# 訓練データで予測値を出力
print('-'*20, '正解率（訓練データ）', '-'*20)
train_pred = base.predict(x_train)
# 正解率を表示
print(accuracy_score(y_train, train_pred))
# 検証データで予測値を出力
print('-'*20, '正解率（検証データ）', '-'*20)
test_pred = base.predict(x_test)
accuracy_score(y_test, test_pred)

--------------------  コレステロール値のカラムを追加  --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak
0,56,1,155,342,1,150,1,3.0
1,55,0,130,394,0,150,0,0.0
2,47,1,110,0,1,120,1,0.0
3,34,1,115,0,1,154,0,0.2
4,54,0,160,201,0,163,0,0.0
...,...,...,...,...,...,...,...,...
637,48,1,106,263,1,110,0,0.0
638,53,1,126,0,0,106,0,0.0
639,54,1,200,198,0,142,1,2.0
640,45,0,130,237,0,170,0,0.0


-------------------- 正解率（訓練データ） --------------------
0.7505567928730512
-------------------- 正解率（検証データ） --------------------


0.7046632124352331

訓練誤差は5%ほど向上したが検証誤差は1%ほどしか向上してないが改善がみられた

>
### ベースラインの標準化
ベースラインの説明変数に標準化を取り入れる

In [28]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [29]:
# ベースラインで使わない変数
x = df.drop(['Cholesterol', 'ChestPainType', 'RestingECG', 'ST_Slope', 'HeartDisease'], axis=1)
# 正解データ
y = df.HeartDisease

# 標準化
scaler = StandardScaler()
scaler.fit(x)
x = scaler.transform(x)
print('-'*15, f' ベースラインで使用した変数 shape: {x.shape}', '-'*15)
display(x)

# ベースラインはホールドアウト法（7:3）
x_train, x_test, y_train, y_test = train_test_split(
                                                    x, y,
                                                    test_size=0.3,
                                                    shuffle=True,
                                                    random_state=42
                                                    )
# モデルはサポートベクターマシンのデフォルト
base = SVC()
# モデル訓練
base.fit(x_train, y_train)
# 訓練データで予測値を出力
print('-'*20, '正解率（訓練データ）', '-'*20)
train_pred = base.predict(x_train)
# 正解率を表示
print(accuracy_score(y_train, train_pred))
# 検証データで予測値を出力
print('-'*20, '正解率（検証データ）', '-'*20)
test_pred = base.predict(x_test)
accuracy_score(y_test, test_pred)

---------------  ベースラインで使用した変数 shape: (642, 7) ---------------


array([[ 0.25630601,  0.52085231,  1.27267188, ...,  0.5122651 ,
         1.25219807,  1.95990337],
       [ 0.15143128, -1.91993005, -0.11678351, ...,  0.5122651 ,
        -0.79859571, -0.80935339],
       [-0.6875666 ,  0.52085231, -1.22834782, ..., -0.70699168,
         1.25219807, -0.80935339],
       ...,
       [ 0.04655654,  0.52085231,  3.77369158, ...,  0.18712996,
         1.25219807,  1.03681778],
       [-0.89731607, -1.91993005, -0.11678351, ...,  1.32510296,
        -0.79859571, -0.80935339],
       [ 0.15143128,  0.52085231,  0.21668578, ..., -0.25993086,
         1.25219807,  0.29834931]])

-------------------- 正解率（訓練データ） --------------------
0.8307349665924276
-------------------- 正解率（検証データ） --------------------


0.8134715025906736

標準化を施すことで<span style="color: orange;">ベースラインより精度が10%以上向上</span>していることがわかる
>

# ワンホットエンコーディング（SVCモデル）
ベースラインで取り除いたカテゴリ変数をOne_Hot_Encodingで説明変数に取り入れて比較する<br>
(比較を行うために標準化は行わない)

In [63]:
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

from mymodule import one_hot_split

In [10]:
cats = ['ChestPainType', 'RestingECG', 'ST_Slope']
path = './data/train.csv'

# ワンホット化して訓練データとテストデータを分割
path = './data/train.csv'
one_cat = cats
x_train, x_test, y_train, y_test = one_hot_split(path=path, one_cat=one_cat, display_columns=True)

# モデル訓練
model = SVC()
model.fit(x_train, y_train)

# 訓練データで予測値を表示
train_pred = model.predict(x_train)
print('-'*20, '正解率（訓練データ）', '-'*20)
print(accuracy_score(y_train, train_pred))

# 検証データで正解率を表示
test_pred = model.predict(x_test)
print('-'*20, '正解率（検証データ）', '-'*20)
print(accuracy_score(y_test, test_pred))

---------------  ['ChestPainType', 'RestingECG', 'ST_Slope']をエンコード ---------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,56,1,155,342,1,150,1,3.0,1,0,0,0,0,1,0,0,1,0
1,55,0,130,394,0,150,0,0.0,0,1,0,0,1,0,0,0,0,1
2,47,1,110,0,1,120,1,0.0,0,0,1,0,0,1,0,0,1,0
3,34,1,115,0,1,154,0,0.2,1,0,0,0,0,1,0,0,0,1
4,54,0,160,201,0,163,0,0.0,0,0,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
637,48,1,106,263,1,110,0,0.0,1,0,0,0,0,1,0,0,1,0
638,53,1,126,0,0,106,0,0.0,1,0,0,0,0,1,0,0,1,0
639,54,1,200,198,0,142,1,2.0,1,0,0,0,0,1,0,0,1,0
640,45,0,130,237,0,170,0,0.0,0,1,0,0,0,1,0,0,0,1


-------------------- 正解率（訓練データ） --------------------
0.7527839643652561
-------------------- 正解率（検証データ） --------------------
0.6994818652849741


訓練誤差は向上しているが汎化誤差はベースラインとほぼ一緒<br>むしろ悪い
>
以下は各カテゴリを別々で追加して精度を確認

In [45]:
cats = ['ChestPainType', 'RestingECG', 'ST_Slope']
path = './data/train.csv'

dic = {'train': [], 'test': []}
for cat in cats:
    one_cat = [cat]
    path = './data/train.csv'
    x_train, x_test, y_train, y_test = one_hot_split(path=path, one_cat=one_cat, display_columns=True)
    # モデルはサポートベクターマシンのデフォルト
    model = SVC()
    # モデル訓練
    model.fit(x_train, y_train)
    print(' '*20, f'{cat}', ' '*20)
    # 訓練データで予測値を出力
    train_pred = model.predict(x_train)
    # 正解率を表示
    print('-'*20, '正解率（訓練データ）', '-'*20)
    print(accuracy_score(y_train, train_pred))
    # 検証データで正解率を表示
    test_pred = model.predict(x_test)
    print('-'*20, '正解率（検証データ）', '-'*20)
    print(accuracy_score(y_test, test_pred))
    print()
    dic['train'].append(x_train)
    dic['test'].append(x_test)

---------------  ['ChestPainType']をエンコード ---------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,56,1,155,342,1,150,1,3.0,1,0,0,0
1,55,0,130,394,0,150,0,0.0,0,1,0,0
2,47,1,110,0,1,120,1,0.0,0,0,1,0
3,34,1,115,0,1,154,0,0.2,1,0,0,0
4,54,0,160,201,0,163,0,0.0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
637,48,1,106,263,1,110,0,0.0,1,0,0,0
638,53,1,126,0,0,106,0,0.0,1,0,0,0
639,54,1,200,198,0,142,1,2.0,1,0,0,0
640,45,0,130,237,0,170,0,0.0,0,1,0,0


                     ChestPainType                     
-------------------- 正解率（訓練データ） --------------------
0.755011135857461
-------------------- 正解率（検証データ） --------------------
0.6994818652849741

---------------  ['RestingECG']をエンコード ---------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,RestingECG_LVH,RestingECG_Normal,RestingECG_ST
0,56,1,155,342,1,150,1,3.0,0,1,0
1,55,0,130,394,0,150,0,0.0,1,0,0
2,47,1,110,0,1,120,1,0.0,0,1,0
3,34,1,115,0,1,154,0,0.2,0,1,0
4,54,0,160,201,0,163,0,0.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
637,48,1,106,263,1,110,0,0.0,0,1,0
638,53,1,126,0,0,106,0,0.0,0,1,0
639,54,1,200,198,0,142,1,2.0,0,1,0
640,45,0,130,237,0,170,0,0.0,0,1,0


                     RestingECG                     
-------------------- 正解率（訓練データ） --------------------
0.7527839643652561
-------------------- 正解率（検証データ） --------------------
0.6994818652849741

---------------  ['ST_Slope']をエンコード ---------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,56,1,155,342,1,150,1,3.0,0,1,0
1,55,0,130,394,0,150,0,0.0,0,0,1
2,47,1,110,0,1,120,1,0.0,0,1,0
3,34,1,115,0,1,154,0,0.2,0,0,1
4,54,0,160,201,0,163,0,0.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
637,48,1,106,263,1,110,0,0.0,0,1,0
638,53,1,126,0,0,106,0,0.0,0,1,0
639,54,1,200,198,0,142,1,2.0,0,1,0
640,45,0,130,237,0,170,0,0.0,0,0,1


                     ST_Slope                     
-------------------- 正解率（訓練データ） --------------------
0.7527839643652561
-------------------- 正解率（検証データ） --------------------
0.6994818652849741



標準化していないためかカテゴリ変数の効果は薄いよう<br>
モデルの表現力不足のせいかRestingECGとST_Slopeの結果に変化がない

### 標準化とワンホットエンコーディング（SVC)
標準化をしたものにワンホットエンコードがどのように寄与するか確認する

# 分類木による評価

In [64]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

from mymodule import one_hot_split

In [49]:
cats = ['Cholesterol', 'ChestPainType', 'RestingECG', 'ST_Slope']
path = './data/train.csv'

path = './data/train.csv'
one_cat = cats
x_train, x_test, y_train, y_test = one_hot_split(path=path, one_cat=one_cat, display_columns=False)

# 分類木
model = DecisionTreeClassifier(max_depth=3)
# モデル訓練
model.fit(x_train, y_train)
print(' '*20, f'{cat}', ' '*20)

# 訓練データで予測値を出力
train_pred = model.predict(x_train)
# 正解率を表示
print('-'*20, '正解率（訓練データ）', '-'*20)
print(accuracy_score(y_train, train_pred))
# 検証データで正解率を表示
test_pred = model.predict(x_test)
print('-'*20, '正解率（検証データ）', '-'*20)
print(accuracy_score(y_test, test_pred))
print('-'*20, '各説明変数の重要度', '-'*20)
display(pd.DataFrame([model.feature_importances_], columns=model.feature_names_in_))
print()

                     ST_Slope                     
-------------------- 正解率（訓練データ） --------------------
0.8797327394209354
-------------------- 正解率（検証データ） --------------------
0.8549222797927462
-------------------- 各説明変数の重要度 --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,Cholesterol.1,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,0.0,0.007564,0.0,0.0,0.0,0.032009,0.0,0.101312,0.0,0.203358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.655757





In [57]:
cats = ['ChestPainType', 'RestingECG', 'ST_Slope']
path = './data/train.csv'

for cat in cats:
    one_cat = [cat]
    path = './data/train.csv'
    x_train, x_test, y_train, y_test = one_hot_split(path=path, one_cat=one_cat, display_columns=False)
    # 分類木
    model = DecisionTreeClassifier(max_depth=3)
    # モデル訓練
    model.fit(x_train, y_train)
    print(' '*15, f'【 {cat}の追加 】', ' '*15)
    # 訓練データで予測値を出力
    train_pred = model.predict(x_train)
    # 正解率を表示
    print('-'*20, '正解率（訓練データ）', '-'*20)
    print(accuracy_score(y_train, train_pred))
    # 検証データで正解率を表示
    test_pred = model.predict(x_test)
    print('-'*20, '正解率（検証データ）', '-'*20)
    print(accuracy_score(y_test, test_pred))
    print('-'*20, '格説明変数の重要度', '-'*20)
    display(pd.DataFrame([model.feature_importances_], columns=model.feature_names_in_))
    print()

                【 ChestPainTypeの追加 】                
-------------------- 正解率（訓練データ） --------------------
0.8151447661469933
-------------------- 正解率（検証データ） --------------------
0.772020725388601
-------------------- 格説明変数の重要度 --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,0.0,0.087394,0.0,0.081464,0.0,0.033667,0.0,0.172791,0.624684,0.0,0.0,0.0



                【 RestingECGの追加 】                
-------------------- 正解率（訓練データ） --------------------
0.8173719376391982
-------------------- 正解率（検証データ） --------------------
0.7668393782383419
-------------------- 格説明変数の重要度 --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,RestingECG_LVH,RestingECG_Normal,RestingECG_ST
0,0.077465,0.0,0.0,0.263856,0.028652,0.090309,0.532049,0.007669,0.0,0.0,0.0



                【 ST_Slopeの追加 】                
-------------------- 正解率（訓練データ） --------------------
0.8596881959910914
-------------------- 正解率（検証データ） --------------------
0.8497409326424871
-------------------- 格説明変数の重要度 --------------------


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,0.0,0.030911,0.0,0.109583,0.030789,0.057872,0.0,0.075718,0.0,0.0,0.695127



