In [1]:
import warnings
warnings.filterwarnings("ignore")

# 교차검증과 그리드 서치
- 머신러닝 모델의 일반화 성능을 안정적으로 추정하고 하이퍼파라미터를 선택하기 위해 널리 사용하는 방법
- 딥러닝에서는 데이터가 충분히 많고 훈련 시간이 오래 걸리므로, 교차검증 대신 검증 세트를 한 번만 떼어 사용하는 경우가 많다

In [2]:
import pandas as pd

# Read the wine CSV into a DataFrame and preview its first five rows.
wine = pd.read_csv(filepath_or_buffer="../Data/wine.csv")
wine.head(n=5)

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [3]:
# Separate the target labels (the "class" column, as a NumPy array) from the
# feature columns (every column except "class").
target = wine["class"].to_numpy()
data = wine.drop(columns=["class"])

# 검증 세트 추가


In [4]:
# Split the full dataset 80:20 into a training set and a test set.
# random_state is fixed so the split is the same on every run.
from sklearn.model_selection import train_test_split

train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Split the training set again 80:20 into a smaller training set (sub_*)
# and a validation set (val_*).
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input,
    train_target,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Print the shape of the training, validation and test feature sets.
for label, subset in (
    ("훈련세트 :", sub_input),
    ("검증세트 :", val_input),
    ("테스트세트 :", test_input),
):
    print(label, subset.shape)

훈련세트 : (4157, 3)
검증세트 : (1040, 3)
테스트세트 : (1300, 3)


In [7]:
# Fit a decision tree on the reduced training set, then score it on that same
# set and on the held-out validation set.

from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

train_score = dt.score(sub_input, sub_target)
valid_score = dt.score(val_input, val_target)
print("Train score :", train_score)
print("Valid score :", valid_score)


Train score : 0.9971133028626413
Valid score : 0.864423076923077


---
# 교차검증

In [8]:
from sklearn.model_selection import cross_validate

# Cross-validate the decision tree on the whole training set with the default
# cv setting, then display the returned dict of fit times, score times and
# per-fold test scores.
scores = cross_validate(estimator=dt, X=train_input, y=train_target)
scores

{'fit_time': array([0.00779176, 0.00901008, 0.00900698, 0.00670004, 0.00729799]),
 'score_time': array([0.00163627, 0.00182295, 0.00352192, 0.00459909, 0.00157022]),
 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}

In [10]:
import numpy as np

In [12]:
# Average the per-fold test scores into a single cross-validation score.
scores["test_score"].mean()

0.855300214703487

---
# StratifiedKFold : 분할기를 사용한 교차검증

In [16]:
from sklearn.model_selection import StratifiedKFold

# Cross-validate with an explicit 10-fold stratified splitter
# (n_splits defaults to 5); shuffling is left off here.
splitter = StratifiedKFold(n_splits=10)
scores = cross_validate(estimator=dt, X=train_input, y=train_target, cv=splitter)
scores

{'fit_time': array([0.01335502, 0.00994492, 0.01031709, 0.009341  , 0.01068091,
        0.00931716, 0.00872111, 0.01114106, 0.00938177, 0.02097201]),
 'score_time': array([0.0016129 , 0.0019989 , 0.00174475, 0.002285  , 0.00172424,
        0.00196004, 0.00297809, 0.00826406, 0.01481009, 0.00208426]),
 'test_score': array([0.84807692, 0.85769231, 0.875     , 0.86730769, 0.88461538,
        0.87692308, 0.875     , 0.86705202, 0.8477842 , 0.81695568])}

In [17]:
# Mean test score across the 10 unshuffled folds.
scores["test_score"].mean()

0.8616407292129834

In [18]:
# 10-fold stratified cross-validation with shuffling enabled.
# shuffle=True shuffles the samples of each class before they are divided
# into folds, so fold membership does not depend on the row order of the
# data; random_state fixes the shuffle so the result is reproducible.
# (n_splits defaults to 5.)
splitter = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)
scores = cross_validate(estimator=dt, X=train_input, y=train_target, cv=splitter)
scores

{'fit_time': array([0.00914216, 0.00867605, 0.00912404, 0.00806618, 0.00857902,
        0.00753617, 0.00804305, 0.00851822, 0.00862002, 0.00848794]),
 'score_time': array([0.00168586, 0.00173211, 0.00191402, 0.00198913, 0.00139809,
        0.00171494, 0.00148797, 0.00177288, 0.0014751 , 0.0018971 ]),
 'test_score': array([0.83461538, 0.87884615, 0.85384615, 0.85384615, 0.84615385,
        0.87307692, 0.85961538, 0.85549133, 0.85163776, 0.86705202])}

In [19]:
# Mean test score across the 10 shuffled folds.
scores["test_score"].mean()

0.8574181117533719

In [23]:
# Re-run the unshuffled 10-fold stratified cross-validation
# (n_splits defaults to 5) and show the mean test score directly.
splitter = StratifiedKFold(n_splits=10)
scores = cross_validate(estimator=dt, X=train_input, y=train_target, cv=splitter)
scores["test_score"].mean()

0.8616407292129834