In [13]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
# Load the wine dataset; the target label is stored in the 'class' column.
# NOTE(review): this is an absolute, machine-specific path — consider moving it
# to a configurable data directory so the notebook runs elsewhere.
df = pd.read_csv('c:/reposit/data/철원이형/wine_data.csv')

# Pick the predictor columns by excluding the target *by name* instead of by
# position, so the label can never end up inside X if 'class' is not the last
# column of the CSV. When 'class' is the last column the result is unchanged.
features = df.columns.drop('class')

X = df[features]
y = df['class']

# Stratified hold-out split with a fixed seed; test_size is left at the
# library default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=808, stratify=y)

# Fit the scaler on the training split only, then apply the same
# transformation to both splits so no test-set statistics are used.
std_scale = StandardScaler()
std_scale.fit(X_train)

X_train_std = std_scale.transform(X_train)
X_test_std = std_scale.transform(X_test)

In [5]:
# Grid-search training

# Stratified 5-fold splitter (shuffled, fixed seed); reused by later cells.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=808)

# Base estimator whose hyper-parameters are tuned below.
# NOTE(review): per the scikit-learn docs, SVC's random_state is only used
# when probability=True — confirm whether it is needed here.
svc = svm.SVC(random_state=808)

# Candidate hyper-parameters: 2 kernels x 4 values of C = 8 combinations.
pram_grid = dict(
    kernel=('linear', 'rbf'),
    C=[0.5, 1, 10, 100],
)

# NOTE(review): X_train_std was scaled with statistics from the whole training
# split, so every validation fold has already influenced the scaler; wrapping
# scaler + SVC in a Pipeline would avoid that — confirm if it matters here.
grid_cv = GridSearchCV(
    estimator=svc,
    param_grid=pram_grid,
    scoring='accuracy',
    cv=kfold,
)
grid_cv.fit(X_train_std, y_train)

# Inspect the raw grid-search results (dict of arrays)
grid_cv.cv_results_

{'mean_fit_time': array([0.00295229, 0.00332789, 0.00159631, 0.00179663, 0.00085492,
        0.        , 0.        , 0.00312443]),
 'std_fit_time': array([0.00420493, 0.00075894, 0.00048947, 0.00039979, 0.00077757,
        0.        , 0.        , 0.00624886]),
 'mean_score_time': array([0.00291224, 0.00179186, 0.00099688, 0.00099578, 0.00019951,
        0.        , 0.        , 0.        ]),
 'std_score_time': array([0.00533947, 0.00074257, 0.00063037, 0.00063098, 0.00039902,
        0.        , 0.        , 0.        ]),
 'param_C': masked_array(data=[0.5, 0.5, 1, 1, 10, 10, 100, 100],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf',
                    'linear', 'rbf'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.5, 'k

In [6]:
# Inspect the grid-search results as a DataFrame, transposed so that each
# column is one parameter combination and each row is one recorded metric.
pd.DataFrame(grid_cv.cv_results_).T

Unnamed: 0,0,1,2,3,4,5,6,7
mean_fit_time,0.002952,0.003328,0.001596,0.001797,0.000855,0.0,0.0,0.003124
std_fit_time,0.004205,0.000759,0.000489,0.0004,0.000778,0.0,0.0,0.006249
mean_score_time,0.002912,0.001792,0.000997,0.000996,0.0002,0.0,0.0,0.0
std_score_time,0.005339,0.000743,0.00063,0.000631,0.000399,0.0,0.0,0.0
param_C,0.5,0.5,1,1,10,10,100,100
param_kernel,linear,rbf,linear,rbf,linear,rbf,linear,rbf
params,"{'C': 0.5, 'kernel': 'linear'}","{'C': 0.5, 'kernel': 'rbf'}","{'C': 1, 'kernel': 'linear'}","{'C': 1, 'kernel': 'rbf'}","{'C': 10, 'kernel': 'linear'}","{'C': 10, 'kernel': 'rbf'}","{'C': 100, 'kernel': 'linear'}","{'C': 100, 'kernel': 'rbf'}"
split0_test_score,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
split1_test_score,0.962963,1.0,0.925926,1.0,0.925926,1.0,0.925926,1.0
split2_test_score,0.962963,0.962963,0.962963,0.962963,0.962963,0.962963,0.962963,0.962963


In [7]:
# Final model: the estimator exposed by the grid search as its best one.
clf = grid_cv.best_estimator_

# Report, in order: best mean CV score, best hyper-parameters, final model.
print(grid_cv.best_score_)
print(grid_cv.best_params_)
print(clf)

0.9849002849002849
{'C': 1, 'kernel': 'rbf'}
SVC(C=1, random_state=808)


In [11]:
# Check cross-validation scores (1): several metrics at once.
# Uses the same `kfold` splitter and training data as the grid search.
metrics = [
    'accuracy',
    'precision_macro',
    'recall_macro',
    'f1_macro',
]

cv_scores = cross_validate(
    estimator=clf,
    X=X_train_std,
    y=y_train,
    scoring=metrics,
    cv=kfold,
)
cv_scores

{'fit_time': array([0.00299144, 0.        , 0.01154709, 0.        , 0.0029943 ]),
 'score_time': array([0.00940537, 0.01635551, 0.00923228, 0.01643515, 0.00698137]),
 'test_accuracy': array([1.        , 1.        , 0.96296296, 1.        , 0.96153846]),
 'test_precision_macro': array([1.        , 1.        , 0.97222222, 1.        , 0.96296296]),
 'test_recall_macro': array([1.        , 1.        , 0.95238095, 1.        , 0.96666667]),
 'test_f1_macro': array([1.        , 1.        , 0.95986622, 1.        , 0.9628483 ])}

In [12]:
# Check cross-validation scores (2): accuracy only, with summary statistics.
cv_score = cross_val_score(
    estimator=clf,
    X=X_train_std,
    y=y_train,
    scoring='accuracy',
    cv=kfold,
)

# One value per line: per-fold scores, their mean, their standard deviation.
print(cv_score, cv_score.mean(), cv_score.std(), sep='\n')

[1.         1.         0.96296296 1.         0.96153846]
0.9849002849002849
0.01849878413760448


In [14]:
# Predict on the standardized hold-out split.
pred_svm = clf.predict(X_test_std)

# Build both evaluation artefacts from the same (truth, prediction) pair.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=pred_svm)
class_report = classification_report(y_true=y_test, y_pred=pred_svm)

# Confusion matrix first, then the classification report.
print(conf_matrix)
print(class_report)

[[15  0  0]
 [ 0 18  0]
 [ 0  0 12]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        18
           2       1.00      1.00      1.00        12

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

