# チューニング

## 0. 設定

In [2]:
# Do not show warnings
# NOTE(review): the 'ignore' action is installed without a category or module
# restriction, so warnings emitted by the library calls in the cells below
# are hidden as well.
import warnings
warnings.filterwarnings('ignore')

# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# matplotlib for japanese support
import japanize_matplotlib

# functions for data frame display
from IPython.display import display

# Display settings: NumPy prints 4 decimals without scientific notation,
# pandas formats floats with 4 decimals and shows every column, and the
# matplotlib font size is set to 14.
np.set_printoptions(precision=4, suppress=True)
pd.set_option("display.float_format", '{:.4f}'.format)
pd.set_option("display.max_columns", None)
plt.rcParams.update({"font.size": 14})

# Seed passed as random_state to the data split and to every estimator below.
random_seed = 123

## 1. アルゴリズム選択

In [4]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast-cancer dataset and separate the features from the labels.
cancer = load_breast_cancer()
x, y = cancer.data, cancer.target

# Fraction of the samples held out as the test split.
test_size = 0.1

# Stratify on y so the class ratio is kept in both splits; the fixed seed
# makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=test_size, random_state=random_seed, stratify=y
)

# Shapes of the full, training and test feature matrices, one per line.
print(x.shape, x_train.shape, x_test.shape, sep='\n')

# Candidate classifiers to compare. Every one receives the same seed.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

algorithm1 = LogisticRegression(random_state=random_seed)       # Logistic Regression
algorithm2 = SVC(kernel='rbf', random_state=random_seed)         # SVM (RBF kernel)
algorithm3 = DecisionTreeClassifier(random_state=random_seed)   # Decision Tree
algorithm4 = RandomForestClassifier(random_state=random_seed)   # Random Forest
algorithm5 = XGBClassifier(random_state=random_seed)            # XGBoost

# Collected in a list so the cells below can loop over them.
algorithms = [
    algorithm1,
    algorithm2,
    algorithm3,
    algorithm4,
    algorithm5,
]

# Fit each candidate on the training split and print its score on the
# held-out test split next to the estimator's class name.
for algorithm in algorithms:
    score = algorithm.fit(x_train, y_train).score(x_test, y_test)
    name = type(algorithm).__name__
    print(f'score: {score:.4f} {name}')

(569, 30)
(512, 30)
(57, 30)
score: 0.9649 LogisticRegression
score: 0.8947 SVC
score: 0.9474 DecisionTreeClassifier
score: 0.9298 RandomForestClassifier
score: 0.9825 XGBClassifier


## 2. ハイパーパラメータの最適化
SVM のパラメータである gamma と C について最適値を求める。

In [5]:
# Build an RBF-kernel SVM with only the kernel and seed specified, then print
# the estimator so the remaining (default) hyperparameters can be inspected.
algorithm = SVC(kernel='rbf', random_state=random_seed)
print(algorithm)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=123, shrinking=True, tol=0.001,
    verbose=False)


In [8]:
# Sweep the RBF kernel's gamma while C is left unspecified.
# NOTE(review): every candidate is scored on the test split, so the value
# picked here has been selected using test data; the later cells switch to
# cross-validation on the training split.
gammas = [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]

for gamma in gammas:
    algorithm = SVC(kernel='rbf', gamma=gamma, random_state=random_seed)
    score = algorithm.fit(x_train, y_train).score(x_test, y_test)
    print(f'score: {score:.4f} gamma: {gamma}')

score: 0.6316 gamma: 1
score: 0.6316 gamma: 0.1
score: 0.6316 gamma: 0.01
score: 0.9474 gamma: 0.001
score: 0.9474 gamma: 0.0001
score: 0.9474 gamma: 1e-05


In [9]:
# Sweep the regularisation parameter C with gamma held fixed at 0.001.
# NOTE(review): as with the gamma sweep, candidates are compared on the
# test split rather than on a separate validation set.
Cs = [1, 10, 100, 1000, 10000]

for C in Cs:
    algorithm = SVC(kernel='rbf', gamma=0.001, C=C, random_state=random_seed)
    score = algorithm.fit(x_train, y_train).score(x_test, y_test)
    print(f'score: {score:.4f} C: {C}')

score: 0.9474 C: 1
score: 0.9298 C: 10
score: 0.9298 C: 100
score: 0.9298 C: 1000
score: 0.9298 C: 10000


## 3. 交差検定法

In [10]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# RBF-kernel SVM with gamma=0.001 and C=1.
algorithm = SVC(kernel='rbf', random_state=random_seed, gamma=0.001, C=1)

# Three stratified folds over the training split; the test split is not used.
stratifiedkfold = StratifiedKFold(n_splits=3)
scores = cross_val_score(algorithm, x_train, y_train, cv=stratifiedkfold)

# Report the mean score across folds together with the per-fold scores.
mean = scores.mean()
print(f'平均スコア:{mean:.4f} 個別スコア:{scores}')

平均スコア:0.9141 個別スコア:[0.8889 0.9181 0.9353]


In [13]:
# Use cross validation to select algorithm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Same five candidates as before; the SVM now uses gamma=0.001 and C=1.
algorithm1 = LogisticRegression(random_state=random_seed)
algorithm2 = SVC(kernel='rbf', random_state=random_seed, gamma=0.001, C=1)
algorithm3 = DecisionTreeClassifier(random_state=random_seed)
algorithm4 = RandomForestClassifier(random_state=random_seed)
algorithm5 = XGBClassifier(random_state=random_seed)
algorithms = [
    algorithm1,
    algorithm2,
    algorithm3,
    algorithm4,
    algorithm5,
]

# Three stratified folds over the training split, shared by every candidate.
stratifiedkfold = StratifiedKFold(n_splits=3)

# Print the mean and per-fold scores for each candidate with its class name.
for algorithm in algorithms:
    scores = cross_val_score(algorithm, x_train, y_train, cv=stratifiedkfold)
    score = scores.mean()
    name = type(algorithm).__name__
    print(f'平均スコア:{score:.4f} 個別スコア:{scores} {name}')

平均スコア:0.9473 個別スコア:[0.9415 0.9474 0.9529] LogisticRegression
平均スコア:0.9141 個別スコア:[0.8889 0.9181 0.9353] SVC
平均スコア:0.9062 個別スコア:[0.8713 0.9415 0.9059] DecisionTreeClassifier
平均スコア:0.9629 個別スコア:[0.9649 0.9591 0.9647] RandomForestClassifier
平均スコア:0.9570 個別スコア:[0.9474 0.9649 0.9588] XGBClassifier


## 4. グリッドサーチ ***

In [16]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Grid of candidate values: every (C, gamma) combination is evaluated.
params = {
    'C': [1, 10, 100, 1000, 10000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
}

# Base estimator (kernel left unspecified) and the 3-fold stratified splitter.
algorithm = SVC(random_state=random_seed)
stratifiedkfold = StratifiedKFold(n_splits=3)

# Run the search on the training split only.
gs = GridSearchCV(algorithm, params, cv=stratifiedkfold)
gs.fit(x_train, y_train)

# Keep the best estimator found, predict the test split with it, and print
# the estimator to show the selected hyperparameters.
best = gs.best_estimator_
best_pred = best.predict(x_test)
print(best)


SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1e-05, kernel='rbf',
    max_iter=-1, probability=False, random_state=123, shrinking=True, tol=0.001,
    verbose=False)


In [17]:
from sklearn.metrics import confusion_matrix

# Score of the grid-search winner on the held-out test split.
score = best.score(x_test, y_test)
print(f'スコア: {score:.4f}')

# Blank line, a heading, then the confusion matrix of true test labels
# against the predictions made by the best estimator.
print()
print('混同行列')
print(confusion_matrix(y_test, best_pred))

スコア: 0.9825

混同行列
[[20  1]
 [ 0 36]]
