서포트 벡터 머신

In [1]:
import pandas as pd

# Load the wine dataset from the notebook-relative data directory.
df = pd.read_csv(filepath_or_buffer='./data/wine_data.csv')

In [2]:
# Show the column labels of the loaded frame (bare last expression, so Jupyter displays it).
df.columns

Index(['Alcohol', 'Malic', 'Ash', 'Alcalinity', 'Magesium', 'Phenols',
       'Flavanoids', 'Nonflavanoids', 'Proanthocyanins', 'Color', 'Hue',
       'Dilution', 'Proline', 'class'],
      dtype='object')

In [3]:
# Predictor columns: every column of df except the 'class' label.
# ('Magesium' is spelled exactly as it appears in the data's column index.)
features = [
    'Alcohol', 'Malic', 'Ash', 'Alcalinity', 'Magesium',
    'Phenols', 'Flavanoids', 'Nonflavanoids', 'Proanthocyanins',
    'Color', 'Hue', 'Dilution', 'Proline',
]

# Feature matrix (X) and target labels (y).
X, y = df[features], df['class']

In [4]:
from sklearn.model_selection import train_test_split

# Split into training and test sets; the fixed seed keeps the split repeatable.
# test_size=0.25 spells out the value scikit-learn falls back to when neither
# test_size nor train_size is given.
X_tn, X_te, y_tn, y_te = train_test_split(
    X, y,
    test_size=0.25,
    random_state=0,
)

In [5]:
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler is fitted on the training split only,
# and that same fitted transform is then applied to both splits.
std_scale = StandardScaler().fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)

In [6]:
from sklearn import svm

# Support vector classifier with a linear kernel. (For regression use SVR
# instead; non-linear boundaries are selected through the `kernel` argument.)
clf_svm_lr = svm.SVC(kernel='linear')
# Train on the standardized training split. The bare fit(...) call is the
# cell's last expression, so the fitted estimator is displayed.
clf_svm_lr.fit(X=X_tn_std, y=y_tn)

In [7]:
# Predict class labels for the standardized test split and show them.
pred_svm = clf_svm_lr.predict(X=X_te_std)
print(pred_svm)

[0 2 1 0 1 1 0 2 1 1 2 2 0 1 2 1 0 0 1 0 1 0 0 1 1 1 1 1 1 2 0 0 1 0 0 0 2
 1 1 2 0 0 1 1 1]


In [8]:
# Score the SVM predictions against the true test labels.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Accuracy: share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_te, y_pred=pred_svm)
print("accuracy:", accuracy)

# Precision, macro-averaged (unweighted mean of the per-class scores).
precision = precision_score(y_true=y_te, y_pred=pred_svm, average='macro')
print("precision:", precision)

# Recall, macro-averaged.
recall = recall_score(y_true=y_te, y_pred=pred_svm, average='macro')
print("recall:", recall)

# F1 score, macro-averaged.
f1 = f1_score(y_true=y_te, y_pred=pred_svm, average='macro')
print("f1:", f1)


accuracy: 1.0
precision: 1.0
recall: 1.0
f1: 1.0


In [9]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the SVM: rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true=y_te, y_pred=pred_svm)
print(conf_matrix)

[[16  0  0]
 [ 0 21  0]
 [ 0  0  8]]


In [10]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 / support table for the SVM predictions.
class_report = classification_report(y_true=y_te, y_pred=pred_svm)
print(class_report)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00         8

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



보팅 

In [11]:
# Train a voting ensemble of three different base classifiers.
from sklearn import svm
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Base learners: logistic regression, linear SVM, Gaussian naive Bayes.
clf1 = LogisticRegression()
clf2 = svm.SVC(kernel='linear')
clf3 = GaussianNB()

# voting='hard' picks the label chosen by most base learners;
# voting='soft' would instead pick the class with the highest averaged
# predicted probability. Every learner gets the same weight here.
clf_voting = VotingClassifier(
    estimators=[('lr', clf1), ('svm', clf2), ('gnb', clf3)],
    voting='hard',
    weights=[1, 1, 1],
)

# Fit on the standardized training split (last expression, so the fitted
# ensemble is displayed).
clf_voting.fit(X=X_tn_std, y=y_tn)

In [12]:
# Predict test-set labels with the voting ensemble and show them.
pred_voting = clf_voting.predict(X=X_te_std)
print(pred_voting)

[0 2 1 0 1 1 0 2 1 1 2 2 0 1 2 1 0 0 1 0 1 0 0 1 1 1 1 1 1 2 0 0 1 0 0 0 2
 1 1 2 0 0 1 1 1]


In [13]:
# Score the voting-ensemble predictions against the true test labels.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Accuracy: share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_te, y_pred=pred_voting)
print("accuracy:", accuracy)

# Precision, macro-averaged (unweighted mean of the per-class scores).
precision = precision_score(y_true=y_te, y_pred=pred_voting, average='macro')
print("precision:", precision)

# Recall, macro-averaged.
recall = recall_score(y_true=y_te, y_pred=pred_voting, average='macro')
print("recall:", recall)

# F1 score, macro-averaged.
f1 = f1_score(y_true=y_te, y_pred=pred_voting, average='macro')
print("f1:", f1)


accuracy: 1.0
precision: 1.0
recall: 1.0
f1: 1.0


In [14]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the voting ensemble: rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true=y_te, y_pred=pred_voting)
print(conf_matrix)

[[16  0  0]
 [ 0 21  0]
 [ 0  0  8]]


In [15]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 / support table for the voting ensemble.
class_report = classification_report(y_true=y_te, y_pred=pred_voting)
print(class_report)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00         8

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



랜덤 포레스트

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of shallow trees (depth capped at 2); seeded for repeatability.
clf_rf = RandomForestClassifier(max_depth=2, random_state=0)
# Fit on the standardized training split (last expression, so the fitted
# model is displayed).
clf_rf.fit(X=X_tn_std, y=y_tn)

In [17]:
# Predict test-set labels with the random forest and show them.
pred_rf = clf_rf.predict(X=X_te_std)
print(pred_rf)

[0 2 1 0 1 1 0 2 1 1 2 2 0 1 2 1 0 0 2 0 0 0 0 1 1 1 1 1 1 2 0 0 1 0 0 0 2
 1 1 2 0 0 1 1 1]


In [19]:
# Score the random-forest predictions against the true test labels.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Accuracy: share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_te, y_pred=pred_rf)
print("accuracy:", accuracy)

# Precision, macro-averaged (unweighted mean of the per-class scores).
precision = precision_score(y_true=y_te, y_pred=pred_rf, average='macro')
print("precision:", precision)

# Recall, macro-averaged.
recall = recall_score(y_true=y_te, y_pred=pred_rf, average='macro')
print("recall:", recall)

# F1 score, macro-averaged.
f1 = f1_score(y_true=y_te, y_pred=pred_rf, average='macro')
print("f1:", f1)

accuracy: 0.9555555555555556
precision: 0.943355119825708
recall: 0.9682539682539683
f1: 0.9536244800950685


In [20]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the random forest: rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true=y_te, y_pred=pred_rf)
print(conf_matrix)

[[16  0  0]
 [ 1 19  1]
 [ 0  0  8]]


In [21]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 / support table for the random forest.
class_report = classification_report(y_true=y_te, y_pred=pred_rf)
print(class_report)

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        16
           1       1.00      0.90      0.95        21
           2       0.89      1.00      0.94         8

    accuracy                           0.96        45
   macro avg       0.94      0.97      0.95        45
weighted avg       0.96      0.96      0.96        45



배깅

In [22]:
# Train a bagging ensemble.
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB

# Ten Gaussian naive Bayes learners combined by bagging; seeded for repeatability.
clf_bagging = BaggingClassifier(
    estimator=GaussianNB(),
    n_estimators=10,
    random_state=0,
)
# Fit on the standardized training split (last expression, so the fitted
# ensemble is displayed).
clf_bagging.fit(X=X_tn_std, y=y_tn)

In [23]:
# Predict test-set labels with the bagging ensemble and show them.
pred_bagging = clf_bagging.predict(X=X_te_std)
print(pred_bagging)

[0 2 1 0 1 1 0 2 1 1 2 2 0 1 2 1 0 0 2 0 0 0 0 1 1 1 1 1 1 2 0 0 1 0 0 0 2
 1 1 2 0 0 1 1 1]


In [25]:
# Score the bagging predictions against the true test labels.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Accuracy: share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_te, y_pred=pred_bagging)
print("accuracy:", accuracy)

# Precision, macro-averaged (unweighted mean of the per-class scores).
precision = precision_score(y_true=y_te, y_pred=pred_bagging, average='macro')
print("precision:", precision)

# Recall, macro-averaged.
recall = recall_score(y_true=y_te, y_pred=pred_bagging, average='macro')
print("recall:", recall)

# F1 score, macro-averaged.
f1 = f1_score(y_true=y_te, y_pred=pred_bagging, average='macro')
print("f1:", f1)

accuracy: 0.9555555555555556
precision: 0.943355119825708
recall: 0.9682539682539683
f1: 0.9536244800950685


In [27]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the bagging ensemble: rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true=y_te, y_pred=pred_bagging)
print(conf_matrix)

[[16  0  0]
 [ 1 19  1]
 [ 0  0  8]]


In [28]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 / support table for the bagging ensemble.
class_report = classification_report(y_true=y_te, y_pred=pred_bagging)
print(class_report)

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        16
           1       1.00      0.90      0.95        21
           2       0.89      1.00      0.94         8

    accuracy                           0.96        45
   macro avg       0.94      0.97      0.95        45
weighted avg       0.96      0.96      0.96        45



에이다 부스트

In [29]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with library-default settings apart from the fixed seed.
clf_ada = AdaBoostClassifier(random_state=0)
# Fit on the standardized training split (last expression, so the fitted
# model is displayed).
clf_ada.fit(X=X_tn_std, y=y_tn)

In [30]:
# Predict test-set labels with AdaBoost and show them.
pred_ada = clf_ada.predict(X=X_te_std)
print(pred_ada)

[0 2 0 0 1 0 0 2 1 1 2 2 0 0 2 1 0 0 1 0 0 0 0 1 1 1 1 1 1 2 0 0 1 0 0 0 2
 1 0 2 1 0 1 1 1]


In [31]:
# Score the AdaBoost predictions against the true test labels.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Accuracy: share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_te, y_pred=pred_ada)
print("accuracy:", accuracy)

# Precision, macro-averaged (unweighted mean of the per-class scores).
precision = precision_score(y_true=y_te, y_pred=pred_ada, average='macro')
print("precision:", precision)

# Recall, macro-averaged.
recall = recall_score(y_true=y_te, y_pred=pred_ada, average='macro')
print("recall:", recall)

# F1 score, macro-averaged.
f1 = f1_score(y_true=y_te, y_pred=pred_ada, average='macro')
print("f1:", f1)

accuracy: 0.8666666666666667
precision: 0.8970588235294118
recall: 0.8998015873015873
f1: 0.8918128654970761


In [32]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for AdaBoost: rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true=y_te, y_pred=pred_ada)
print(conf_matrix)

[[15  1  0]
 [ 5 16  0]
 [ 0  0  8]]


In [33]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 / support table for AdaBoost.
class_report = classification_report(y_true=y_te, y_pred=pred_ada)
print(class_report)

              precision    recall  f1-score   support

           0       0.75      0.94      0.83        16
           1       0.94      0.76      0.84        21
           2       1.00      1.00      1.00         8

    accuracy                           0.87        45
   macro avg       0.90      0.90      0.89        45
weighted avg       0.88      0.87      0.87        45



그래디언트 부스트

In [35]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with depth-2 trees and a 0.1 learning rate; seeded for repeatability.
clf_gbt = GradientBoostingClassifier(
    max_depth=2,
    learning_rate=0.1,
    random_state=0,
)
# Fit on the standardized training split (last expression, so the fitted
# model is displayed).
clf_gbt.fit(X=X_tn_std, y=y_tn)

In [36]:
# Predict test-set labels with gradient boosting and show them.
pred_gbt = clf_gbt.predict(X=X_te_std)
print(pred_gbt)

[0 2 1 0 1 0 0 2 1 1 2 2 0 1 2 1 0 0 2 0 1 0 0 1 1 1 1 1 1 2 0 0 1 0 0 0 2
 1 1 2 0 0 1 1 1]


In [37]:
# Score the gradient-boosting predictions against the true test labels.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Accuracy: share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_te, y_pred=pred_gbt)
print("accuracy:", accuracy)

# Precision, macro-averaged (unweighted mean of the per-class scores).
precision = precision_score(y_true=y_te, y_pred=pred_gbt, average='macro')
print("precision:", precision)

# Recall, macro-averaged.
recall = recall_score(y_true=y_te, y_pred=pred_gbt, average='macro')
print("recall:", recall)

# F1 score, macro-averaged.
f1 = f1_score(y_true=y_te, y_pred=pred_gbt, average='macro')
print("f1:", f1)

accuracy: 0.9555555555555556
precision: 0.943355119825708
recall: 0.9682539682539683
f1: 0.9536244800950685


In [38]:
# Confusion matrix for gradient boosting: rows are true classes, columns are
# predicted classes.
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_te, pred_gbt)
print(conf_matrix)

[[16  0  0]
 [ 1 19  1]
 [ 0  0  8]]


In [39]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 / support table for gradient boosting.
class_report = classification_report(y_true=y_te, y_pred=pred_gbt)
print(class_report)

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        16
           1       1.00      0.90      0.95        21
           2       0.89      1.00      0.94         8

    accuracy                           0.96        45
   macro avg       0.94      0.97      0.95        45
weighted avg       0.96      0.96      0.96        45



스태킹

In [40]:
# Train a stacking ensemble.
from sklearn import svm
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Base learners: linear SVM and Gaussian naive Bayes.
clf1 = svm.SVC(kernel='linear', random_state=1)
clf2 = GaussianNB()

# Logistic regression is the final estimator that combines the base learners' outputs.
clf_stkg = StackingClassifier(
    estimators=[('svm', clf1), ('gnb', clf2)],
    final_estimator=LogisticRegression(),
)
# Fit on the standardized training split (last expression, so the fitted
# ensemble is displayed).
clf_stkg.fit(X=X_tn_std, y=y_tn)

In [42]:
# Predict test-set labels with the stacking ensemble and show them.
pred_stkg = clf_stkg.predict(X=X_te_std)
print(pred_stkg)

[0 2 1 0 1 1 0 2 1 1 2 2 0 1 2 1 0 0 2 0 0 0 0 1 1 1 1 1 1 2 0 0 1 0 0 0 2
 1 1 2 0 0 1 1 1]


In [43]:
# Score the stacking predictions against the true test labels.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Accuracy: share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_te, y_pred=pred_stkg)
print("accuracy:", accuracy)

# Precision, macro-averaged (unweighted mean of the per-class scores).
precision = precision_score(y_true=y_te, y_pred=pred_stkg, average='macro')
print("precision:", precision)

# Recall, macro-averaged.
recall = recall_score(y_true=y_te, y_pred=pred_stkg, average='macro')
print("recall:", recall)

# F1 score, macro-averaged.
f1 = f1_score(y_true=y_te, y_pred=pred_stkg, average='macro')
print("f1:", f1)

accuracy: 0.9555555555555556
precision: 0.943355119825708
recall: 0.9682539682539683
f1: 0.9536244800950685


In [44]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the stacking ensemble: rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true=y_te, y_pred=pred_stkg)
print(conf_matrix)

[[16  0  0]
 [ 1 19  1]
 [ 0  0  8]]


In [45]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 / support table for the stacking ensemble.
class_report = classification_report(y_true=y_te, y_pred=pred_stkg)
print(class_report)

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        16
           1       1.00      0.90      0.95        21
           2       0.89      1.00      0.94         8

    accuracy                           0.96        45
   macro avg       0.94      0.97      0.95        45
weighted avg       0.96      0.96      0.96        45



크로스 밸리데이션

In [46]:
# 그리드 서치 학습
from sklearn import svm
svm.SVC()
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

param_grid = {'kernel' : ('linear', 'rbf', 'poly'),
              'C' : [0.001, 0.5, 1, 10, 100]}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
svc = svm.SVC(random_state=0)
grid_cv = GridSearchCV(svc, param_grid, cv=kfold, scoring='accuracy')
grid_cv.fit(X_tn_std, y_tn)

In [48]:
# Inspect the grid-search results.
import numpy as np

# Transposed so each column is one parameter combination and each row one
# recorded quantity (timings, parameters, per-fold scores, ...).
pd.DataFrame(grid_cv.cv_results_).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
mean_fit_time,0.012963,0.002394,0.002201,0.001612,0.002194,0.002181,0.001803,0.002196,0.001995,0.001589,0.002394,0.002194,0.002393,0.002394,0.002195
std_fit_time,0.02144,0.00049,0.000387,0.000503,0.00096,0.001148,0.00039,0.000398,0.00063,0.000496,0.000474,0.000746,0.000488,0.00049,0.000398
mean_score_time,0.001592,0.0008,0.000994,0.000798,0.000599,0.000822,0.000985,0.000996,0.000797,0.000997,0.001994,0.001198,0.001396,0.001396,0.001395
std_score_time,0.001346,0.0004,0.00063,0.000399,0.000489,0.000412,0.000016,0.000003,0.000398,0.000632,0.000643,0.000398,0.000488,0.000489,0.00049
param_C,0.001,0.001,0.001,0.5,0.5,0.5,1,1,1,10,10,10,100,100,100
param_kernel,linear,rbf,poly,linear,rbf,poly,linear,rbf,poly,linear,rbf,poly,linear,rbf,poly
params,"{'C': 0.001, 'kernel': 'linear'}","{'C': 0.001, 'kernel': 'rbf'}","{'C': 0.001, 'kernel': 'poly'}","{'C': 0.5, 'kernel': 'linear'}","{'C': 0.5, 'kernel': 'rbf'}","{'C': 0.5, 'kernel': 'poly'}","{'C': 1, 'kernel': 'linear'}","{'C': 1, 'kernel': 'rbf'}","{'C': 1, 'kernel': 'poly'}","{'C': 10, 'kernel': 'linear'}","{'C': 10, 'kernel': 'rbf'}","{'C': 10, 'kernel': 'poly'}","{'C': 100, 'kernel': 'linear'}","{'C': 100, 'kernel': 'rbf'}","{'C': 100, 'kernel': 'poly'}"
split0_test_score,0.37037,0.37037,0.37037,0.888889,0.962963,0.888889,0.888889,0.925926,0.888889,0.888889,0.925926,0.925926,0.888889,0.925926,0.888889
split1_test_score,0.37037,0.37037,0.37037,0.962963,1.0,0.851852,0.962963,0.962963,0.962963,0.962963,0.962963,1.0,0.962963,0.962963,1.0
split2_test_score,0.37037,0.37037,0.37037,0.925926,0.962963,0.962963,0.925926,0.962963,1.0,0.925926,0.962963,0.925926,0.925926,0.962963,0.962963


In [49]:
# Final model: the estimator GridSearchCV exposes for the best parameter set.
clf = grid_cv.best_estimator_

# Best mean cross-validated score found by the search.
print("베스트 스코어:", grid_cv.best_score_)

# Hyperparameter combination that achieved that score.
print("베스트 하이퍼파라미터: ", grid_cv.best_params_)

# Show the selected estimator.
print(clf)

베스트 스코어: 0.9774928774928775
베스트 하이퍼파라미터:  {'C': 0.5, 'kernel': 'rbf'}
SVC(C=0.5, random_state=0)


In [50]:
# Cross-validation scores (1): several metrics at once via cross_validate.
from sklearn.model_selection import cross_validate

metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
# Reuses the same folds (kfold) as the grid search.
cv_scores = cross_validate(
    estimator=clf,
    X=X_tn_std,
    y=y_tn,
    cv=kfold,
    scoring=metrics,
)
# Bare last expression: display the resulting dict of per-fold arrays.
cv_scores

{'fit_time': array([0.00398946, 0.00199461, 0.00398612, 0.0029912 , 0.0019691 ]),
 'score_time': array([0.01399684, 0.01496482, 0.01097012, 0.02296734, 0.00798392]),
 'test_accuracy': array([0.96296296, 1.        , 0.96296296, 0.96153846, 1.        ]),
 'test_precision_macro': array([0.96296296, 1.        , 0.96969697, 0.96969697, 1.        ]),
 'test_recall_macro': array([0.96666667, 1.        , 0.96296296, 0.95833333, 1.        ]),
 'test_f1_macro': array([0.9628483 , 1.        , 0.96451914, 0.96190476, 1.        ])}

In [51]:
# Cross-validation scores (2): a single metric via cross_val_score.
from sklearn.model_selection import cross_val_score

# Per-fold accuracy on the same folds (kfold) as the grid search.
cv_score = cross_val_score(
    estimator=clf,
    X=X_tn_std,
    y=y_tn,
    cv=kfold,
    scoring='accuracy',
)
print(cv_score)         # per-fold scores
print(cv_score.mean())  # mean across folds
print(cv_score.std())   # spread across folds

[0.96296296 1.         0.96296296 0.96153846 1.        ]
0.9774928774928775
0.01838434849561446
