# 크로스 밸리데이션 실습

### 데이터 불러오기

In [1]:
# Load scikit-learn's bundled wine dataset as a container object
# (features under `.data`, class labels under `.target`).
from sklearn import datasets

raw_wine = datasets.load_wine(return_X_y=False)

### 피처, 타깃 데이터 지정

In [3]:
# Feature matrix and class-label vector taken from the loaded dataset.
X, y = raw_wine.data, raw_wine.target

### 트레이닝/테스트 분할

In [5]:
from sklearn.model_selection import train_test_split

# Hold out a test split; the fixed seed makes the partition reproducible.
# No test size is given, so the library default proportion applies.
X_tn, X_te, y_tn, y_te = train_test_split(
    X,
    y,
    random_state=0,
)

### 데이터 표준화

In [11]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply
# that same fitted scaler to the test split so no test information is
# used when standardizing.
std_scale = StandardScaler()
X_tn_std = std_scale.fit_transform(X_tn)
X_te_std = std_scale.transform(X_te)

### 그리드 서치

In [12]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Candidate hyperparameters: 2 kernels x 4 values of C = 8 combinations.
param_grid = {
    'kernel': ('linear', 'rbf'),
    'C': [0.5, 1, 10, 100],
}

# Shuffled, stratified 5-fold splitter; the seed fixes the fold assignment
# so that later cells reusing `kfold` see the same folds.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

svc = svm.SVC(random_state=0)

# NOTE(review): X_tn_std was standardized with a scaler fit on the whole
# training split, so every validation fold has already influenced the
# scaling statistics. Wrapping the scaler and SVC in a Pipeline would
# avoid that; confirm whether this matters for the exercise.
grid_cv = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=kfold,
)
grid_cv.fit(X_tn_std, y_tn)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             estimator=SVC(random_state=0),
             param_grid={'C': [0.5, 1, 10, 100], 'kernel': ('linear', 'rbf')},
             scoring='accuracy')

### 그리드 서치 결과 확인

In [13]:
# Display the raw dictionary of per-candidate cross-validation results
# (timings, parameter values and per-split scores) from the fitted search.
grid_cv.cv_results_

{'mean_fit_time': array([0.00120139, 0.00139427, 0.0009933 , 0.00100565, 0.00078912,
        0.00100217, 0.00080018, 0.00120058]),
 'std_fit_time': array([4.01142444e-04, 4.83236314e-04, 1.24594183e-05, 1.38060622e-05,
        3.95072158e-04, 6.36719331e-04, 4.00090370e-04, 4.14987042e-04]),
 'mean_score_time': array([0.0002048 , 0.00019913, 0.        , 0.00039473, 0.00041184,
        0.00039988, 0.00019999, 0.00060472]),
 'std_score_time': array([0.0004096 , 0.00039825, 0.        , 0.00048351, 0.00050461,
        0.00048975, 0.00039997, 0.00049382]),
 'param_C': masked_array(data=[0.5, 0.5, 1, 1, 10, 10, 100, 100],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf',
                    'linear', 'rbf'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 ...

### 그리드 서치 결과 확인 (데이터 프레임)

In [15]:
import numpy as np
import pandas as pd

# Show the search results as a table, transposed so that each column is
# one parameter candidate and each row is one recorded statistic.
pd.DataFrame(grid_cv.cv_results_).T

Unnamed: 0,0,1,2,3,4,5,6,7
mean_fit_time,0.001201,0.001394,0.000993,0.001006,0.000789,0.001002,0.0008,0.001201
std_fit_time,0.000401,0.000483,0.000012,0.000014,0.000395,0.000637,0.0004,0.000415
mean_score_time,0.000205,0.000199,0.0,0.000395,0.000412,0.0004,0.0002,0.000605
std_score_time,0.00041,0.000398,0.0,0.000484,0.000505,0.00049,0.0004,0.000494
param_C,0.5,0.5,1,1,10,10,100,100
param_kernel,linear,rbf,linear,rbf,linear,rbf,linear,rbf
params,"{'C': 0.5, 'kernel': 'linear'}","{'C': 0.5, 'kernel': 'rbf'}","{'C': 1, 'kernel': 'linear'}","{'C': 1, 'kernel': 'rbf'}","{'C': 10, 'kernel': 'linear'}","{'C': 10, 'kernel': 'rbf'}","{'C': 100, 'kernel': 'linear'}","{'C': 100, 'kernel': 'rbf'}"
split0_test_score,0.888889,0.962963,0.888889,0.925926,0.888889,0.925926,0.888889,0.925926
split1_test_score,0.962963,1.0,0.962963,0.962963,0.962963,0.962963,0.962963,0.962963
split2_test_score,0.925926,0.962963,0.925926,0.962963,0.925926,0.962963,0.925926,0.962963


### 베스트 스코어 & 하이퍼파라미터

In [16]:
# Display the score recorded by the search for its best parameter candidate.
grid_cv.best_score_

0.9774928774928775

In [17]:
# Display the parameter combination that the search selected as best.
grid_cv.best_params_

{'C': 0.5, 'kernel': 'rbf'}

### 최종 모형

In [19]:
# Take the estimator that the grid search built with its best parameters
# as the final model, and print it to confirm its configuration.
clf = grid_cv.best_estimator_
print(clf)

SVC(C=0.5, random_state=0)


### 크로스 밸리데이션 스코어 확인(1)

In [23]:
from sklearn.model_selection import cross_validate

# Evaluate the selected model on several metrics at once, reusing the
# same fold splitter that the grid search used.
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cv_scores = cross_validate(
    estimator=clf,
    X=X_tn_std,
    y=y_tn,
    scoring=metrics,
    cv=kfold,
)

In [24]:
# Display the per-fold timings and per-metric test scores collected above.
cv_scores

{'fit_time': array([0.00200105, 0.00200248, 0.00200224, 0.00100279, 0.00199604]),
 'score_time': array([0.00299954, 0.00400066, 0.00199938, 0.0039978 , 0.00199914]),
 'test_accuracy': array([0.96296296, 1.        , 0.96296296, 0.96153846, 1.        ]),
 'test_precision_macro': array([0.96296296, 1.        , 0.96969697, 0.96969697, 1.        ]),
 'test_recall_macro': array([0.96666667, 1.        , 0.96296296, 0.95833333, 1.        ]),
 'test_f1_macro': array([0.9628483 , 1.        , 0.96451914, 0.96190476, 1.        ])}

### 크로스 밸리데이션 스코어 확인(2)

In [26]:
from sklearn.model_selection import cross_val_score

# Single-metric variant: per-fold accuracy of the selected model on the
# same folds used by the grid search.
cv_score = cross_val_score(
    estimator=clf,
    X=X_tn_std,
    y=y_tn,
    scoring='accuracy',
    cv=kfold,
)

In [28]:
# Per-fold accuracies, followed by their mean and standard deviation,
# each on its own line.
print(cv_score, cv_score.mean(), cv_score.std(), sep='\n')

[0.96296296 1.         0.96296296 0.96153846 1.        ]
0.9774928774928775
0.01838434849561446


### 예측

In [29]:
# Predict class labels for the standardized test split with the final model
# and print the predicted labels.
pred_svm = clf.predict(X_te_std)
print(pred_svm)

[0 2 1 0 1 1 0 2 1 1 2 2 0 1 2 1 0 0 1 0 1 0 0 1 1 1 1 1 1 2 0 0 1 0 0 0 2
 1 1 2 0 0 1 1 1]


### 정확도

In [30]:
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_te, y_pred=pred_svm)
print(accuracy)

1.0


### confusion matrix 확인

In [31]:
import pandas as pd
from sklearn.metrics import confusion_matrix

# Cross-tabulate true labels against predicted labels on the test split,
# then show the counts as a table.
conf_matrix = confusion_matrix(y_true=y_te, y_pred=pred_svm)
pd.DataFrame(conf_matrix)

Unnamed: 0,0,1,2
0,16,0,0
1,0,21,0
2,0,0,8


### 분류 리포트 확인

In [32]:
from sklearn.metrics import classification_report

# Text summary of per-class precision, recall, F1 and support on the test split.
class_report = classification_report(y_true=y_te, y_pred=pred_svm)
print(class_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00         8

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

