In [1]:
from pathlib import Path
import sys

import numpy as np
import pandas as pd
from sklearn.svm import SVC

# Make the directory that contains `mymodule` importable.
# `rank` picks an ancestor of the current working directory:
# 0 is the directory directly above it, 1 the one above that, and so on.
# Adjust `rank` when mymodule lives further up the tree. (The original note
# said to uncomment the next line; it is already active.)
rank = 0
# This overwrites the first search-path entry instead of prepending to it.
sys.path[0] = str(Path.cwd().resolve().parents[rank])
from mymodule import PipeLine, k_fold_prediction, ensemble_evals

## 検証方法の比較
ホールドアウト法と交差検証の比較

### ホールドアウト法を使用する場合
PipeLineクラスのtrainingメソッドで valid = 'fold_out_split' を指定する<br>
valid_args = {'test_size': テストサイズ比率} を指定する

In [2]:
# Load the training data used for the hold-out experiment.
df = pd.read_csv('../data/train.csv')

# ----------------- Validation strategy and model selection -----------------
valid = 'fold_out_split'  # select hold-out validation
model = SVC
valid_args = {'test_size': 0.2}  # ratio of the data used as the test split
params = {
    'kernel': 'rbf',
    'probability': True,
}  # parameters for the selected model

# --------------------------------- Pipeline ---------------------------------
pipe = PipeLine()
pipe(df)
pipe.standard_scaler()
pipe.one_hot(['ChestPainType'])
models = pipe.training(
    valid=valid,
    Model=model,
    valid_args=valid_args,
    params=params,
)

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.875244,0.895911,0.870036,0.882784
test,0.844961,0.897436,0.853659,0.875


>
### 交差検証法を使用する場合
PipeLineクラスのtrainingメソッドで valid = 'k_fold' を指定する.<br>
valid_args = {'n_splits': 分割数} を指定する

In [3]:
# Load the training data used for the cross-validation experiment.
df = pd.read_csv('../data/train.csv')

# ----------------- Validation strategy and model selection -----------------
valid = 'k_fold'  # select k-fold cross-validation
model = SVC
valid_args = {'n_splits': 5}  # cross-validation, so specify the number of splits
params = {
    'kernel': 'rbf',
    'probability': True,
}  # parameters for the selected model

# --------------------------------- Pipeline ---------------------------------
pipe = PipeLine()
pipe(df)
pipe.standard_scaler()
pipe.one_hot(['ChestPainType'])
models = pipe.training(
    valid=valid,
    Model=model,
    valid_args=valid_args,
    params=params,
)

-------------------- model0 predict --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.875244,0.895911,0.870036,0.882784
test,0.844961,0.897436,0.853659,0.875


-------------------- model1 predict --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.869396,0.901099,0.86014,0.880143
test,0.868217,0.878378,0.890411,0.884354


-------------------- model2 predict --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.891051,0.912727,0.886926,0.899642
test,0.796875,0.847222,0.802632,0.824324


-------------------- model3 predict --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.879377,0.90493,0.880137,0.892361
test,0.835938,0.838235,0.850746,0.844444


-------------------- model4 predict --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
train,0.875486,0.897959,0.885906,0.891892
test,0.851562,0.862069,0.819672,0.840336


交差検証のため5つのモデルがリストに格納されて出力される<br>
本番環境では5つのモデルの予測を平均して出力する<br>

In [4]:
# Build a fresh pipeline over `df` and apply the same scaling and one-hot
# steps as the training cell, so the data can be scored by the
# cross-validation models stored in `models`.
ensemble = PipeLine()
ensemble.viewer = False  # presumably turns off the pipeline's displayed output -- TODO confirm
ensemble(df)
ensemble.standard_scaler()
ensemble.one_hot(['ChestPainType'])
# k_fold_prediction receives the list of cross-validation models together with
# ensemble.df_num.values; the ground-truth labels (ensemble.df_target.values)
# are passed to ensemble_evals, not to k_fold_prediction.
# NOTE(review): `df` is the same train.csv the fold models were fitted on, so
# this score is presumably optimistic rather than a held-out estimate -- confirm.
pred = k_fold_prediction(models, ensemble.df_num.values)
ensemble_evals(pred, ensemble.df_target.values)

-------------------- ensemble --------------------


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
ensemble,0.861371,0.877095,0.874652,0.875872
