In [1]:
import hyperopt
hyperopt.__version__

'0.2.7'

In [2]:
import xgboost
xgboost.__version__

'1.5.0'

In [3]:
import lightgbm
lightgbm.__version__

'3.3.2'

## HyperOpt를 이용한 하이퍼파라미터 튜닝
    1. 검색 공간 설정
    2. 대체 모델을 위한 목적함수 지정
    3. 최적의 파라미터 유추

In [5]:
from hyperopt import hp

참고: 검색 공간 설정  
params = {  
    'max_depth' : 3,
    'eta' : 0.05,  
    'objective' : 'binary:logistic',  
    'eval_metric' : 'logloss'  
}

In [7]:
# 검색 공간 설정

search_space = {'x' : hp.quniform('x', -10, 10, 1), 'y' : hp.quniform('y', -15, 15, 1)}


In [8]:
# 목적함수 지정

def objective_func(search_space): 
    x = search_space['x']
    y = search_space['y']
    retval = x**2 - 20*y

    return retval

In [11]:
# 최적 입력값 유추

from hyperopt import fmin, tpe, Trials
import numpy as np

trial_val = Trials()

best_01 = fmin(fn=objective_func, space=search_space, algo=tpe.suggest, max_evals=5, 
     trials=trial_val, rstate=np.random.default_rng(seed=0))

best_01


100%|██████████| 5/5 [00:00<00:00, 384.71trial/s, best loss: -224.0]


{'x': -4.0, 'y': 12.0}

In [13]:
best_02 = fmin(fn=objective_func, space=search_space, algo=tpe.suggest, max_evals=20, 
     trials=trial_val, rstate=np.random.default_rng(seed=0))

best_02

100%|██████████| 20/20 [00:00<?, ?trial/s, best loss=?]


{'x': 2.0, 'y': 15.0}

In [15]:
trial_val.results

[{'loss': -64.0, 'status': 'ok'},
 {'loss': -184.0, 'status': 'ok'},
 {'loss': 56.0, 'status': 'ok'},
 {'loss': -224.0, 'status': 'ok'},
 {'loss': 61.0, 'status': 'ok'},
 {'loss': -64.0, 'status': 'ok'},
 {'loss': -184.0, 'status': 'ok'},
 {'loss': 56.0, 'status': 'ok'},
 {'loss': -224.0, 'status': 'ok'},
 {'loss': 61.0, 'status': 'ok'},
 {'loss': -296.0, 'status': 'ok'},
 {'loss': -40.0, 'status': 'ok'},
 {'loss': 281.0, 'status': 'ok'},
 {'loss': 64.0, 'status': 'ok'},
 {'loss': 100.0, 'status': 'ok'},
 {'loss': 60.0, 'status': 'ok'},
 {'loss': -39.0, 'status': 'ok'},
 {'loss': 1.0, 'status': 'ok'},
 {'loss': -164.0, 'status': 'ok'},
 {'loss': 21.0, 'status': 'ok'}]

In [16]:
losses = [ loss_dict['loss'] for loss_dict in trial_val.results]
losses

[-64.0,
 -184.0,
 56.0,
 -224.0,
 61.0,
 -64.0,
 -184.0,
 56.0,
 -224.0,
 61.0,
 -296.0,
 -40.0,
 281.0,
 64.0,
 100.0,
 60.0,
 -39.0,
 1.0,
 -164.0,
 21.0]

## XGBoost 하이퍼 파라미터 최적화

In [27]:
from hyperopt import fmin, tpe, Trials
from xgboost import XGBClassifier
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

dataset = load_breast_cancer()
cancer_df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
cancer_df.head(3)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [20]:
X_features = dataset.data
y_label = dataset.target

# 1단계
X_train, X_test, y_train, y_test = train_test_split(X_features, y_label, test_size=0.2, random_state=156)

# 2단계
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=156)

In [43]:
# 목적함수 만들기

from hyperopt import fmin, tpe, Trials, STATUS_OK
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

def objective_func_xgb(params):
    
    model = XGBClassifier(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        learning_rate=params['learning_rate'],
        subsample=params['subsample'],
        colsample_bytree=params['colsample_bytree'],
        random_state=42,
        eval_metric='logloss'
    )
    score_mean = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()
    return {'loss': -1*score_mean , 'status': STATUS_OK}

In [44]:
# 하이퍼파라미터 검색공간

from hyperopt.pyll.base import scope

search_space = {'n_estimators' : scope.int(hp.quniform('n_estimators', 50, 300, 10)), 
                'max_depth' : scope.int(hp.quniform('max_depth', 3, 10, 1)),
                'learning_rate' : hp.loguniform('learning_rate', np.log(0.01), np.log(0.3)),
                'subsample' : hp.uniform('subsample', 0.5, 1.0),
                'colsample_bytree' : hp.uniform('colsample_bytree', 0.5, 1.0),      
                }


In [45]:
# 파라미터 유추

trials = Trials()

best_params = fmin(
    fn=objective_func_xgb, 
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]








  2%|▏         | 1/50 [00:00<00:36,  1.36trial/s, best loss: -0.9626373626373628]







  4%|▍         | 2/50 [00:01<00:39,  1.23trial/s, best loss: -0.9626373626373628]









  6%|▌         | 3/50 [00:02<00:38,  1.23trial/s, best loss: -0.9626373626373628]








  8%|▊         | 4/50 [00:03<00:40,  1.13trial/s, best loss: -0.9626373626373628]







 10%|█         | 5/50 [00:04<00:35,  1.25trial/s, best loss: -0.9692307692307693]








 12%|█▏        | 6/50 [00:04<00:30,  1.45trial/s, best loss: -0.9692307692307693]









 14%|█▍        | 7/50 [00:05<00:29,  1.45trial/s, best loss: -0.9692307692307693]






 16%|█▌        | 8/50 [00:05<00:25,  1.62trial/s, best loss: -0.9692307692307693]










 18%|█▊        | 9/50 [00:06<00:24,  1.68trial/s, best loss: -0.9692307692307693]






 20%|██        | 10/50 [00:06<00:21,  1.86trial/s, best loss: -0.9692307692307693]










 22%|██▏       | 11/50 [00:07<00:21,  1.77trial/s, best loss: -0.9692307692307693]







 24%|██▍       | 12/50 [00:07<00:21,  1.75trial/s, best loss: -0.9692307692307693]








 26%|██▌       | 13/50 [00:08<00:20,  1.77trial/s, best loss: -0.9692307692307693]









 28%|██▊       | 14/50 [00:09<00:20,  1.74trial/s, best loss: -0.9692307692307693]







 30%|███       | 15/50 [00:09<00:20,  1.71trial/s, best loss: -0.9692307692307693]






 32%|███▏      | 16/50 [00:09<00:17,  1.98trial/s, best loss: -0.9692307692307693]










 34%|███▍      | 17/50 [00:10<00:16,  1.99trial/s, best loss: -0.9692307692307693]









 36%|███▌      | 18/50 [00:11<00:17,  1.81trial/s, best loss: -0.9692307692307693]






 38%|███▊      | 19/50 [00:11<00:16,  1.91trial/s, best loss: -0.9692307692307693]










 40%|████      | 20/50 [00:12<00:17,  1.69trial/s, best loss: -0.9692307692307693]







 42%|████▏     | 21/50 [00:13<00:20,  1.40trial/s, best loss: -0.9692307692307693]








 44%|████▍     | 22/50 [00:13<00:19,  1.43trial/s, best loss: -0.9692307692307693]









 46%|████▌     | 23/50 [00:14<00:19,  1.40trial/s, best loss: -0.9692307692307693]







 48%|████▊     | 24/50 [00:15<00:19,  1.34trial/s, best loss: -0.9692307692307693]






 50%|█████     | 25/50 [00:15<00:14,  1.69trial/s, best loss: -0.9692307692307693]











 52%|█████▏    | 26/50 [00:16<00:15,  1.60trial/s, best loss: -0.9692307692307693]







 54%|█████▍    | 27/50 [00:17<00:14,  1.57trial/s, best loss: -0.9692307692307693]








 56%|█████▌    | 28/50 [00:17<00:12,  1.75trial/s, best loss: -0.9692307692307693]









 58%|█████▊    | 29/50 [00:18<00:13,  1.51trial/s, best loss: -0.9692307692307693]







 60%|██████    | 30/50 [00:19<00:13,  1.53trial/s, best loss: -0.9692307692307693]






 62%|██████▏   | 31/50 [00:19<00:10,  1.83trial/s, best loss: -0.9692307692307693]











 64%|██████▍   | 32/50 [00:20<00:10,  1.74trial/s, best loss: -0.9692307692307693]








 66%|██████▌   | 33/50 [00:21<00:12,  1.37trial/s, best loss: -0.9692307692307693]







 68%|██████▊   | 34/50 [00:21<00:11,  1.34trial/s, best loss: -0.9692307692307693]









 70%|███████   | 35/50 [00:22<00:10,  1.41trial/s, best loss: -0.9692307692307693]






 72%|███████▏  | 36/50 [00:22<00:08,  1.66trial/s, best loss: -0.9692307692307693]










 74%|███████▍  | 37/50 [00:23<00:06,  1.87trial/s, best loss: -0.9692307692307693]






 76%|███████▌  | 38/50 [00:23<00:06,  2.00trial/s, best loss: -0.9692307692307693]









 78%|███████▊  | 39/50 [00:24<00:05,  2.08trial/s, best loss: -0.9692307692307693]









 80%|████████  | 40/50 [00:24<00:04,  2.11trial/s, best loss: -0.9692307692307693]






 82%|████████▏ | 41/50 [00:24<00:03,  2.32trial/s, best loss: -0.9692307692307693]










 84%|████████▍ | 42/50 [00:25<00:03,  2.40trial/s, best loss: -0.9692307692307693]







 86%|████████▌ | 43/50 [00:25<00:03,  2.05trial/s, best loss: -0.9692307692307693]








 88%|████████▊ | 44/50 [00:26<00:02,  2.02trial/s, best loss: -0.9692307692307693]








 90%|█████████ | 45/50 [00:27<00:02,  1.85trial/s, best loss: -0.9692307692307693]








 92%|█████████▏| 46/50 [00:27<00:02,  1.85trial/s, best loss: -0.9692307692307693]









 94%|█████████▍| 47/50 [00:28<00:01,  1.77trial/s, best loss: -0.9692307692307693]






 96%|█████████▌| 48/50 [00:28<00:01,  1.82trial/s, best loss: -0.9692307692307693]








 98%|█████████▊| 49/50 [00:29<00:00,  1.94trial/s, best loss: -0.9736263736263737]










100%|██████████| 50/50 [00:29<00:00,  1.69trial/s, best loss: -0.9736263736263737]


In [46]:
best_params

{'colsample_bytree': 0.7361840526677788,
 'learning_rate': 0.10968994289760835,
 'max_depth': 7.0,
 'n_estimators': 140.0,
 'subsample': 0.5422426149464379}

In [49]:
# 최적의 파라미터를 적용한 모델 생성

best_model = XGBClassifier(n_estimators=int(best_params['n_estimators']),
                           max_depth=int(best_params['max_depth']),
                           subsample=best_params['subsample'],
                           colsample_bytree=best_params['colsample_bytree'],
                           random_state=42,
                           eval_metric='logloss'
                           )

In [50]:
best_model.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7361840526677788,
              enable_categorical=False, eval_metric='logloss', gamma=0,
              gpu_id=-1, importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=140, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.5422426149464379, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [51]:
from sklearn.metrics import accuracy_score

pred = best_model.predict(X_test)
accuracy_score(y_test, pred)


0.9824561403508771