In [1]:
"""
Pipeline & GridSearch
 1. SVM Model
 2. Pipeline : model workflow(dataset 전처리 -> model -> test)
 3. GridSearch : Model turning  
  - 가능한 모든 조합을 시도하여 최적의 매개변수를 찾는 방법
  - 매개변수를 튜닝하여 일반화 성능을 개선
"""
from sklearn.datasets import load_breast_cancer # dataset 
from sklearn.svm import SVC # svm model 
from sklearn.model_selection import train_test_split # dataset split
from sklearn.preprocessing import MinMaxScaler#, StandardScaler # scaling
from sklearn.pipeline import Pipeline # pipeline model
from sklearn.model_selection import GridSearchCV # gridSearch model(model tuning)

###############################
## 1. SVM Model
###############################
# 1) data load
cancer = load_breast_cancer()
X = cancer.data
print(X.shape,"\n") # (569, 30)

print(X.mean(axis=0) ,"\n")#  열 단위 평균 -> 정규화 필요 


y = cancer.target

(569, 30) 

[1.41272917e+01 1.92896485e+01 9.19690334e+01 6.54889104e+02
 9.63602812e-02 1.04340984e-01 8.87993158e-02 4.89191459e-02
 1.81161863e-01 6.27976098e-02 4.05172056e-01 1.21685343e+00
 2.86605923e+00 4.03370791e+01 7.04097891e-03 2.54781388e-02
 3.18937163e-02 1.17961371e-02 2.05422988e-02 3.79490387e-03
 1.62691898e+01 2.56772232e+01 1.07261213e+02 8.80583128e+02
 1.32368594e-01 2.54265044e-01 2.72188483e-01 1.14606223e-01
 2.90075571e-01 8.39458172e-02] 



In [2]:
# 2) train/test split
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [5]:
# 3) x변수 scaling 
scaler = MinMaxScaler().fit(X)
x_train_nor = scaler.transform(x_train)
print(x_train_nor.mean(axis=0))
print()
x_test_nor = scaler.transform(x_test)
print(x_test_nor.mean(axis=0))

[0.33973077 0.32204953 0.33414344 0.2184157  0.39483602 0.2585412
 0.20770883 0.24425478 0.37612747 0.2665753  0.10620861 0.18830163
 0.09819438 0.06330557 0.179273   0.17143324 0.08004879 0.22166197
 0.17665835 0.09735464 0.29835706 0.36295259 0.28412337 0.17264252
 0.40502809 0.21882619 0.21524051 0.39614895 0.26246658 0.18693597]

[0.33372718 0.32967168 0.3293353  0.21246461 0.3946311  0.26673532
 0.20909967 0.23980675 0.38996609 0.28171155 0.1067518  0.19236982
 0.10289651 0.06064053 0.18661846 0.18339128 0.08200208 0.22879254
 0.18256759 0.10864818 0.29161536 0.36711423 0.28020125 0.16573365
 0.40148579 0.22434166 0.22384487 0.38694495 0.26581008 0.19752057]


In [6]:
# 4) svm model
svc=SVC(gamma = "auto", random_state = 1) # gamma 생략 시 warning
model=svc.fit(x_train_nor,y_train) # x변수는 정규화한걸로

In [7]:
# 5) model 평가 
acc = model.score(x_test_nor, y_test)
print('Accuracy(tuning before) =', acc)

Accuracy(tuning before) = 0.958041958041958


In [11]:
###############################
# 2. Pipeline model
###############################

# 1) pipeline object : [ ('object', class), ('object', class) ] - [ tuple ]
pipe_svc = Pipeline([ ('scaler', MinMaxScaler()), ('svc', SVC(gamma='auto')) ])
#MinMaxScaler() or StandardScaler()
# 2) pipeline model
pipe_model = pipe_svc.fit(x_train, y_train) # raw data(none scaling)

# 3) pipeline model test
acc = pipe_model.score(x_test, y_test) # raw data(none scaling)

print('Accuracy=', acc)
# [해설] SVM 모델 결과와 동일함

Accuracy= 0.951048951048951


In [12]:
#############################################
# 3. pipeling model + GridSearch -> Turning
#############################################

# SVM 매개변수 튜닝 
'''
 SVC(C=1.0, kernel='rbf', degree=3, gamma='auto_deprecated'
 SVC parameter : PPT 참고 
 C : 오분류 조절(값이 클 수록 오분류 적고, 과적합 증가)
 kernel : rbf:radial basis func(비선형), linear(선형)
 gamma : 비선형 결정경계 조절(값이 클 수록 원 크기 작고, 찌그러짐)  
'''


# 1) C, gamma 파라미터값 범위 : 10e-3 ~ 10e+3  
params = [0.001, 0.01, 0.1, 1.0, 100.0, 1000.0] # 주의 : 0 제외 

# dict 형식 : {'pipelineOjbect__C' : params_range}
param_grid = [
    {'svc__C': params, 'svc__kernel': ['linear']}, # 선형
    {'svc__C': params, 'svc__gamma': params, 'svc__kernel': ['rbf']}] # 비선형 

# 2) GridSearch model : Pipeline model 이용 
# estimator = pipeline object, param_grid = params,     
gs = GridSearchCV(estimator = pipe_svc, param_grid=param_grid, 
             scoring='accuracy', cv = 10, n_jobs=-1) 
# scoring='평가방법', cv=교차검정, n_jobs = CPU 코어 수

grid_model = gs.fit(x_train, y_train)

# 3) best params, score
print(grid_model.best_params_)
# {'svm__C': 1.0, 'svm__gamma': 1.0, 'svm__kernel': 'rbf'}

acc = grid_model.score(x_test, y_test)
print('Accuracy(tuning afterr)', acc)

{'svc__C': 1.0, 'svc__gamma': 1.0, 'svc__kernel': 'rbf'}
Accuracy(tuning afterr) 0.972027972027972
