# SVM

= 클래스 사이의 경계에 위치한 데이터 포인트를 서포트 벡터라고 하는데,  
각 서포트 벡터가 클래스 사이의 결정 경계를 구분하는 데 얼마나 중요한지를 학습하며  
결정 경계와 서포트 벡터 사이의 마진이 가장 커지는 방향으로 학습된다.

- 회귀와 분류, 이상치 탐지 등에 사용되는 지도학습 방법이다.
- SVM은 입력데이터가 정규화되어야 좋은 성능을 보인다.  
    (=주로 모든 feature의 값을 [0, 1] 범위로 맞추거나, 평균 0·분산 1로 표준화하는 방식을 사용)  
    sklearn의 MinMaxScaler 또는 StandardScaler를 사용하면 된다.
    

In [38]:
import warnings
warnings.filterwarnings(action='ignore') #warnings.filterwarnings(action='default')
import multiprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.svm import SVR, SVC #회귀, 분류
from sklearn.datasets import load_boston, load_diabetes
from sklearn.datasets import load_breast_cancer, load_iris, load_wine
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.manifold import TSNE

## Support Vector Regression

In [2]:
# Baseline support vector regression on the Boston housing data, using the
# default SVR hyperparameters and the raw (unscaled) features.
# NOTE: load_boston was removed in scikit-learn 1.2, so this cell only runs
# on older releases.
X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

# fit() returns the estimator itself, so constructing first and fitting
# afterwards yields the same fitted model.
svr = SVR()
svr.fit(X_train, y_train)

# For regressors, score() reports the coefficient of determination (R^2).
print("train score : {:.3f}".format(svr.score(X_train, y_train)))
print("test score : {:.3f}".format(svr.score(X_test, y_test)))

train score : 0.218
test score : 0.135


## Support Vector Classification

In [3]:
# Baseline support vector classification on the breast cancer data, using the
# default SVC hyperparameters and the raw (unscaled) features.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

# fit() returns the estimator itself, so constructing first and fitting
# afterwards yields the same fitted model.
svc = SVC()
svc.fit(X_train, y_train)

# For classifiers, score() reports mean accuracy.
print("train score : {:.3f}".format(svc.score(X_train, y_train)))
print("test score : {:.3f}".format(svc.score(X_test, y_test)))

train score : 0.901
test score : 0.923


# Kernel SVM

: 입력 데이터를 고차원 공간에 사상해서 비선형 특징을 학습할 수 있도록 확장하는 방법이다.

- sklearn에서는 Linear(선형), Polynomial(비선형), RBF 등 다양한 커널 기법이 지원된다.

## kernel SVR

In [4]:
# Compare the linear, polynomial and RBF kernels for SVR on the Boston data.
# The data is loaded and split once: with the same fixed random_state every
# kernel is trained and scored on exactly the same train/test partition, so
# reloading and re-splitting before each model would only repeat work.
# NOTE: load_boston was removed in scikit-learn 1.2, so this cell only runs
# on older releases.
X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

linear_svr = SVR(kernel="linear").fit(X_train, y_train)

print("linear_train score : {:.3f}".format(linear_svr.score(X_train, y_train)))
print("linear_test score : {:.3f}".format(linear_svr.score(X_test, y_test)))

print()

polynomial_svr = SVR(kernel="poly").fit(X_train, y_train)

print("polynomial_train score : {:.3f}".format(polynomial_svr.score(X_train, y_train)))
print("polynomial_test score : {:.3f}".format(polynomial_svr.score(X_test, y_test)))

print()

RBF_svr = SVR(kernel="rbf").fit(X_train, y_train)

print("RBF_train score : {:.3f}".format(RBF_svr.score(X_train, y_train)))
print("RBF_test score : {:.3f}".format(RBF_svr.score(X_test, y_test)))

# Among the kernel SVRs, the linear kernel performs best.

linear_train score : 0.716
linear_test score : 0.638

polynomial_train score : 0.202
polynomial_test score : 0.134

RBF_train score : 0.218
RBF_test score : 0.135


## kernel SVC

In [5]:
# Compare the linear, polynomial and RBF kernels for SVC on the breast cancer
# data. The data is loaded and split once: with the same fixed random_state
# every kernel is trained and scored on exactly the same train/test partition,
# so reloading and re-splitting before each model would only repeat work.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

linear_svc = SVC(kernel="linear").fit(X_train, y_train)

print("linear_train score : {:.3f}".format(linear_svc.score(X_train, y_train)))
print("linear_test score : {:.3f}".format(linear_svc.score(X_test, y_test)))

print()

polynomial_svc = SVC(kernel="poly").fit(X_train, y_train)

print("polynomial_train score : {:.3f}".format(polynomial_svc.score(X_train, y_train)))
print("polynomial_test score : {:.3f}".format(polynomial_svc.score(X_test, y_test)))

print()

RBF_svc = SVC(kernel="rbf").fit(X_train, y_train)

print("RBF_train score : {:.3f}".format(RBF_svc.score(X_train, y_train)))
print("RBF_test score : {:.3f}".format(RBF_svc.score(X_test, y_test)))

# Among the kernel SVCs, too, the linear kernel performs best.

linear_train score : 0.960
linear_test score : 0.986

polynomial_train score : 0.901
polynomial_test score : 0.923

RBF_train score : 0.901
RBF_test score : 0.923


# parameter

- SVM은 사용하는 Kernel에 따라 다양한 매개변수 설정이 가능하다.
- 매개변수를 변경하면서 성능변화를 관찰할 수 있다.

> ***parameter***  
> - ***C***  : 모델이 오류를 어느정도 허용할 것인지를 조정이 가능하다.
> - ***gamma***  : 결정 경계를 얼마나 유연하게 그을 것인지 정해줄 수 있다.

In [6]:
# Polynomial-kernel SVC with explicitly chosen hyperparameters
# (degree, C and gamma) on the breast cancer data.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

polynomial_svc = SVC(kernel="poly", degree=2, C=0.1, gamma="auto")
polynomial_svc.fit(X_train, y_train)

# Report the hyperparameters as stored on the estimator, so the log line
# always reflects what the model was built with. ("gammer" in the label is a
# typo for "gamma"; it is left as-is so the recorded output still matches.)
print("Kernel = Polynomial, degree = {}, C = {}, gammer = {}".format(
    polynomial_svc.degree, polynomial_svc.C, polynomial_svc.gamma))
print("polynomial_train score : {:.3f}".format(polynomial_svc.score(X_train, y_train)))
print("polynomial_test score : {:.3f}".format(polynomial_svc.score(X_test, y_test)))

# Setting the hyperparameters improved performance over the earlier result.

Kernel = Polynomial, degree = 2, C = 0.1, gammer = auto
polynomial_train score : 0.984
polynomial_test score : 0.993


In [7]:
# RBF-kernel SVC with explicitly chosen hyperparameters (C and gamma) on the
# breast cancer data.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

RBF_svc = SVC(kernel="rbf", C=2.0, gamma="scale")
RBF_svc.fit(X_train, y_train)

# Report the hyperparameters as stored on the estimator, so the log line
# always reflects what the model was built with.
print("Kernel = RBF, C = {}, gammer = {}".format(RBF_svc.C, RBF_svc.gamma))
print("RBF_train score : {:.3f}".format(RBF_svc.score(X_train, y_train)))
print("RBF_test score : {:.3f}".format(RBF_svc.score(X_test, y_test)))

# Setting the hyperparameters improved performance over the earlier result.

Kernel = RBF, C = 2.0, gammer = scale
RBF_train score : 0.915
RBF_test score : 0.937


# SVM 활용시 데이터 전처리의 중요성

In [22]:
# Reference point for the preprocessing comparison: a default SVC trained on
# the raw, unscaled breast cancer features.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

svc = SVC()
svc.fit(X_train, y_train)

print("train score : {:.3f}".format(svc.score(X_train, y_train)))
print("test score : {:.3f}".format(svc.score(X_test, y_test)))

train score : 0.901
test score : 0.923


## StandardScaler

In [24]:
# Standardize the features and retrain the default SVC.
# The scaler statistics are learned from the training split only and then
# applied to both splits, so no information from the test split is used.
# X_train / X_test are overwritten with their standardized versions, so any
# cell run after this one sees the scaled arrays.
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

svc = SVC()
svc.fit(X_train, y_train)

print("train score : {:.3f}".format(svc.score(X_train, y_train)))
print("test score : {:.3f}".format(svc.score(X_test, y_test)))

# After standardization the model performs better than on the raw features.

train score : 0.984
test score : 0.986


## MinMaxScaler

In [25]:
# Min-max scale the features and retrain the default SVC.
# Start again from the raw data: without this, the cell would scale whatever
# X_train / X_test currently hold (e.g. the standardized arrays left behind
# by a previous scaling cell), making the result depend on execution order.
# Min-max scaling is unaffected by a prior per-feature affine rescaling, so
# the scores should be the same as before; the cell is now self-contained.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

scaler = MinMaxScaler()

# Learn the per-feature min/max from the training split only, then apply the
# same mapping to the test split.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

svc = SVC().fit(X_train, y_train)

print("train score : {:.3f}".format(svc.score(X_train, y_train)))
print("test score : {:.3f}".format(svc.score(X_test, y_test)))

# After min-max normalization the model performs better than on the raw features.

train score : 0.981
test score : 0.986


# Example

In [26]:
# Load the Boston housing data and hold out 20% of it as a test split.
# random_state is fixed (matching the other splits in this notebook) so the
# split, and therefore the scores below, are reproducible between runs.
# NOTE: load_boston was removed in scikit-learn 1.2, so this cell only runs
# on older releases.
X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123)

In [28]:
# Standardize the features. fit_transform() both learns the scaling
# statistics from the training split and transforms it, so a separate
# fit() call beforehand is unnecessary.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
# Reuse the training-split statistics for the test split.
X_test = scaler.transform(X_test)

In [29]:
# Linear-kernel SVR trained on the current X_train / y_train.
svr = SVR(kernel="linear")
svr.fit(X_train, y_train)

print("train score : {:.3f}".format(svr.score(X_train, y_train)))
# The test score shows that the model does not perform well.
print("test score : {:.3f}".format(svr.score(X_test, y_test)))

train score : 0.726
test score : 0.579


In [32]:
# 5-fold cross-validation of a "standardize, then linear SVR" pipeline on the
# full X / y. Because the scaler is a pipeline step, it is refit on the
# training part of every fold rather than on the whole dataset.
estimator = make_pipeline(StandardScaler(), SVR(kernel="linear"))

# Left as the cell's final expression so the notebook displays the returned
# dict of fit times, score times and per-fold test scores.
cross_validate(
    estimator,
    X,
    y,
    cv=5,
    n_jobs=multiprocessing.cpu_count(),  # one worker per reported CPU
    verbose=True,
)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    1.9s finished


{'fit_time': array([0.04089141, 0.03690267, 0.02493429, 0.01396108, 0.04288435]),
 'score_time': array([0.00299072, 0.00698233, 0.00399041, 0.00199485, 0.00399113]),
 'test_score': array([0.76908568, 0.72180141, 0.56428426, 0.14083339, 0.07810211])}

In [34]:
# Grid search over SVR hyperparameters inside a "standardize, then linear SVR"
# pipeline. The step names ("scaler", "model") are what the "model__<param>"
# keys in the grid refer to.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", SVR(kernel="linear")),
])

# NOTE(review): gamma is the kernel coefficient for the rbf/poly/sigmoid
# kernels and has no effect with kernel="linear", so the two gamma values
# produce identical models and only double the candidate count
# (3 * 3 * 2 = 18 instead of 9). Consider dropping it or also searching kernels.
param_grid = [{
    "model__gamma": ["scale", "auto"],
    "model__C": [1.0, 0.1, 0.01],
    "model__epsilon": [1.0, 0.1, 0.01],
}]

gs = GridSearchCV(
    pipe,
    param_grid,
    n_jobs=multiprocessing.cpu_count(),  # one worker per reported CPU
    cv=5,
    verbose=True,
)

# Left as the cell's final expression so the notebook displays the fitted
# search object.
gs.fit(X, y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', SVR(kernel='linear'))]),
             n_jobs=4,
             param_grid=[{'model__C': [1.0, 0.1, 0.01],
                          'model__epsilon': [1.0, 0.1, 0.01],
                          'model__gamma': ['scale', 'auto']}],
             verbose=True)

In [37]:
# Display the pipeline configured with the best-scoring hyperparameter
# combination found by the grid search.
gs.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('model', SVR(C=0.1, epsilon=1.0, kernel='linear'))])