In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### 교차 검증과 그리드 서치
- 머신러닝을 사용할 때 모델의 정확도를 측정하기 위해 반드시 사용해야 하는 방법
- 딥러닝 시에는 데이터의 크기가 크므로 이 방법을 사용할 필요가 없다

In [2]:
# Load the wine table from the CSV file (path is relative to the notebook's working directory).
wine = pd.read_csv(filepath_or_buffer='../Data/wine.csv')

In [4]:
# Features and target as NumPy arrays:
#   data   <- the three columns alcohol / sugar / pH
#   target <- the 'class' column
data = wine.loc[:, ['alcohol', 'sugar', 'pH']].to_numpy()
target = wine.loc[:, 'class'].to_numpy()

### 검증 세트 추가
- 훈련(64%), 검증(16%), 테스트(20%): 전체의 20%를 테스트 세트로 떼어 내고, 남은 80%에서 다시 20%를 검증 세트로 분리

In [11]:
# Two-stage split, both stages seeded with 42.
# Stage 1: hold out 20% of all samples as the test set.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)
# Stage 2: hold out 20% of the remaining training data as the validation set.
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input,
    train_target,
    test_size=0.2,
    random_state=42,
)


In [12]:
# Show the shape of each split.
for label, subset in (
    ('Train :', sub_input),
    ('Valid :', val_input),
    ('Test  :', test_input),
):
    print(label, subset.shape)

Train : (4157, 3)
Valid : (1040, 3)
Test  : (1300, 3)


In [13]:
# Baseline decision tree: fit on the sub-training split, then compare accuracy
# on that same split with accuracy on the held-out validation split.
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so the scores printed here (and by any later cell that reuses
# `dt`) are the same on every Restart & Run All. Without a seed the fitted
# tree can differ between runs. 42 matches the seed used for the data splits.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

print('Train score :', dt.score(sub_input, sub_target))
print('Valid score :', dt.score(val_input, val_target))

Train score : 0.9971133028626413
Valid score : 0.8711538461538462


In [14]:
# Cross-validation of `dt` on the full training split with the default
# cv setting; the returned dict holds per-fold fit/score times and test scores.
from sklearn.model_selection import cross_validate
scores = cross_validate(estimator=dt, X=train_input, y=train_target)
scores

{'fit_time': array([0.00466299, 0.00435376, 0.00429702, 0.00433612, 0.00424814]),
 'score_time': array([0.00054097, 0.00047708, 0.00044584, 0.00054002, 0.00047588]),
 'test_score': array([0.87019231, 0.84711538, 0.87584216, 0.84889317, 0.84215592])}

In [15]:
# Accuracy estimate after cross-validation: average of the per-fold test scores.
scores['test_score'].mean()

0.8568397867772267

### StratifiedKFold 분할기를 이용한 방법

In [17]:
# Same cross-validation, but with an explicitly constructed StratifiedKFold
# splitter (default settings) passed as `cv`.
from sklearn.model_selection import StratifiedKFold
splitter = StratifiedKFold()
scores = cross_validate(
    estimator=dt,
    X=train_input,
    y=train_target,
    cv=splitter,
)
scores

{'fit_time': array([0.00577283, 0.0050199 , 0.00506091, 0.00475621, 0.00439787]),
 'score_time': array([0.00065494, 0.00052094, 0.00054121, 0.00047183, 0.00043201]),
 'test_score': array([0.87115385, 0.84807692, 0.87776708, 0.85178056, 0.83926853])}

In [18]:
# Accuracy estimate after cross-validation: average of the per-fold test scores.
scores['test_score'].mean()

0.8576093877248834

In [20]:
# Cross-validate with 10 stratified folds; the samples are shuffled
# (seeded with 42) before being divided into folds.
splitter = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)
scores = cross_validate(
    estimator=dt,
    X=train_input,
    y=train_target,
    cv=splitter,
)
scores

{'fit_time': array([0.00597405, 0.00553203, 0.00542998, 0.00485992, 0.00469828,
        0.00506997, 0.00485706, 0.00473976, 0.00445795, 0.00434875]),
 'score_time': array([0.00045419, 0.00045896, 0.00040293, 0.00033998, 0.00048566,
        0.00041914, 0.00044394, 0.00039816, 0.00023723, 0.00023317]),
 'test_score': array([0.84230769, 0.87884615, 0.85769231, 0.84423077, 0.84807692,
        0.87307692, 0.86346154, 0.85163776, 0.85934489, 0.86705202])}

In [21]:
# Accuracy estimate after cross-validation: average of the per-fold test scores.
scores['test_score'].mean()

0.8585726989773234

---
### 그리드 서치(Grid Search)를 이용한 최적의 Hyper Parameter값 찾기

In [22]:
from sklearn.model_selection import GridSearchCV

# Search space: five candidate values for the tree's min_impurity_decrease.
params = {
    'min_impurity_decrease': [
        0.0001,
        0.0002,
        0.0003,
        0.0004,
        0.0005,
    ],
}

In [23]:
# Grid search over `params` using a seeded decision tree as the base model;
# n_jobs=-1 asks scikit-learn to run the candidate fits in parallel.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)

In [25]:
# Run the search on the full training split (cross-validation happens inside fit).
gs.fit(X=train_input, y=train_target)

In [26]:
# Rebind `dt` to the model selected by the grid search and print its accuracy
# on the training split, i.e. the same data the search was fitted on.
dt = gs.best_estimator_
print(dt.score(X=train_input, y=train_target))

0.9615162593804117


In [27]:
# Hyperparameter combination that achieved the best mean cross-validation score.
gs.best_params_

{'min_impurity_decrease': 0.0001}

In [28]:
# Cross-validation results
gs.cv_results_['mean_test_score'] # mean validation score of each candidate; this is the score to judge by

array([0.86819297, 0.86453617, 0.86492226, 0.86780891, 0.86761605])