# 設定

In [13]:
# Fix the seeds of the standard-library and NumPy global random generators
# so that stochastic steps later in the notebook are repeatable.
import random
import numpy as np

SEED = 1234  # one value shared by both generators
random.seed(SEED)
np.random.seed(SEED)

In [14]:
# Breast cancer dataset for binary classification
import pandas as pd
from sklearn.datasets import load_breast_cancer
import time

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.svm import SVC

In [15]:
# Bayesian Optimization
!pip install scikit-optimize

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer



# データの読み込み

In [16]:
# Load the breast-cancer dataset, then split it into a feature frame X
# and a target series y; preview the first three rows of both together.
dataset = load_breast_cancer()
X = pd.DataFrame(data = dataset['data'], columns = dataset['feature_names'])
y = pd.Series(data = dataset['target'], name = 'y')
pd.concat([X, y], axis = 1).head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,y
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0


# Grid Search

In [17]:
# Define the hyperparameter grid for the SVC grid search:
# C covers 0.1, 1, 10 and then 50..500 in steps of 50; gamma spans 1e-8 to 2.0.
param_grid_svc = dict(
    C = [0.1, 1, 10, *range(50, 501, 50)],
    gamma = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1.0, 2.0],
)
print('探索空間: ',  param_grid_svc)

探索空間:  {'C': [0.1, 1, 10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500], 'gamma': [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1.0, 2.0]}


In [18]:
# Grid Search
# Exhaustively evaluate every (C, gamma) pair in param_grid_svc with 5-fold
# cross-validation, scoring each candidate by F1, and time the whole fit.
# time.perf_counter() is used instead of time.time() because it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals.
t1 = time.perf_counter()   # start timing
gs_svc = GridSearchCV(estimator = SVC(random_state = 1),
                      param_grid = param_grid_svc,
                      scoring = 'f1',
                      cv = 5,
                      return_train_score = False
                      )
gs_svc.fit(X, y)
t2 = time.perf_counter()   # stop timing
# Report the elapsed time in seconds, to two decimals
print('{:.2f}秒かかった'.format(t2 - t1))

10.53秒かかった


In [19]:
# Inspect the cross-validated test scores of the grid search,
# best-ranked candidates first, and show the top five.
df_gs_tmp = pd.DataFrame(data = gs_svc.cv_results_)
df_gs = df_gs_tmp.sort_values(by = 'rank_test_score')
df_gs.head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
103,0.008792,0.001978,0.002275,0.000255,400,1e-05,"{'C': 400, 'gamma': 1e-05}",0.972222,0.945205,0.979592,0.972603,0.964539,0.966832,0.011816,1
113,0.008066,0.000244,0.002267,0.000104,450,1e-05,"{'C': 450, 'gamma': 1e-05}",0.972222,0.945205,0.979592,0.972603,0.964539,0.966832,0.011816,1
123,0.00812,0.000725,0.002155,7.5e-05,500,1e-05,"{'C': 500, 'gamma': 1e-05}",0.972222,0.945205,0.979592,0.972603,0.964539,0.966832,0.011816,1
93,0.007641,0.000751,0.002197,4.1e-05,350,1e-05,"{'C': 350, 'gamma': 1e-05}",0.952381,0.945205,0.979592,0.972603,0.964539,0.962864,0.012643,4
83,0.007527,0.000555,0.002114,5.6e-05,300,1e-05,"{'C': 300, 'gamma': 1e-05}",0.952381,0.945205,0.979592,0.972603,0.964539,0.962864,0.012643,4


In [20]:
# Best hyperparameter combination found by the grid search
gs_svc.best_params_

{'C': 400, 'gamma': 1e-05}

# Bayesian Optimization

In [21]:
# Define the continuous search space for the Bayesian search:
# both C and gamma are sampled on a log-uniform scale between their bounds.
param_bayes_svc = dict(
    C = Real(low = 0.1, high = 500.0, prior = 'log-uniform'),
    gamma = Real(low = 1e-8, high = 2.0, prior = 'log-uniform'),
)
print('探索空間: ',  param_bayes_svc)

探索空間:  {'C': Real(low=0.1, high=500.0, prior='log-uniform', transform='identity'), 'gamma': Real(low=1e-08, high=2.0, prior='log-uniform', transform='identity')}


In [22]:
# Bayes Search
# Run 30 iterations of Bayesian optimisation over param_bayes_svc with 5-fold
# cross-validation, scoring each candidate by F1, and time the whole fit.
# random_state is pinned on the search itself so the sampled points do not
# depend on whatever global NumPy RNG state earlier cells left behind.
# time.perf_counter() is used instead of time.time() because it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals.
t1 = time.perf_counter()   # start timing
bo_svc = BayesSearchCV(estimator = SVC(random_state = 1),
                      search_spaces = param_bayes_svc,
                      scoring = 'f1',
                      cv = 5,
                      n_iter = 30,
                      return_train_score = False,
                      random_state = 1234
                      )
bo_svc.fit(X, y)
t2 = time.perf_counter()   # stop timing
# Report the elapsed time in seconds, to two decimals
print('{:.2f}秒かかった'.format(t2 - t1))

24.83秒かかった


In [23]:
# Inspect the cross-validated test scores of the Bayesian search,
# best-ranked candidates first, and show the top five.
df_bo_tmp = pd.DataFrame(data = bo_svc.cv_results_)
df_bo = df_bo_tmp.sort_values(by = 'rank_test_score')
df_bo.head(5)

Unnamed: 0,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,params
14,0.972222,0.945205,0.979592,0.972603,0.964539,0.966836,0.011826,1,0.010814,0.002254,0.002532,0.000509,500.0,9e-06,"{'C': 499.99999999999994, 'gamma': 9.368957868..."
29,0.959459,0.945205,0.986111,0.958904,0.971429,0.964209,0.013747,2,0.011919,0.001192,0.002955,0.000446,500.0,5e-06,"{'C': 499.99999999999994, 'gamma': 4.727958710..."
15,0.958904,0.945205,0.979592,0.972603,0.964539,0.964168,0.011813,3,0.010856,0.002357,0.002577,0.00055,500.0,9e-06,"{'C': 499.99999999999994, 'gamma': 8.872734832..."
22,0.959459,0.945205,0.97931,0.958904,0.971429,0.962847,0.011692,4,0.009924,0.002186,0.002389,0.000435,500.0,5e-06,"{'C': 499.99999999999994, 'gamma': 5.063814348..."
25,0.965986,0.945205,0.972973,0.965517,0.964539,0.962841,0.00932,5,0.009941,0.00193,0.002548,0.000554,500.0,7e-06,"{'C': 499.99999999999994, 'gamma': 7.018848832..."


In [24]:
# Best hyperparameter combination found by the Bayesian search
bo_svc.best_params_

OrderedDict([('C', 499.99999999999994), ('gamma', 9.368957868762131e-06)])