In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import warnings

# Load the scikit-learn breast cancer dataset and wrap it in a DataFrame whose
# last column ('target') holds the class label.
dataset = load_breast_cancer()
cancer_df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
cancer_df['target'] = dataset.target

# Everything except the trailing 'target' column is a feature; the trailing column is the label.
y_label = cancer_df.iloc[:, -1]
X_features = cancer_df.iloc[:, :-1]

# Hold out 20% of the rows as the final test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_label, test_size=0.2, random_state=156
)

# Split a further 10% off the training portion as a validation set
# (used later as the early-stopping eval set).
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=156
)

In [10]:
from hyperopt import hp

# Hyperopt search space for the XGBoost classifier.
# quniform(label, low, high, q) draws values quantized to multiples of q (returned as floats);
# uniform(label, low, high) draws a continuous value in [low, high].
xgb_search_space = dict(
    max_depth=hp.quniform('max_depth', 5, 20, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 2, 1),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.2),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
)

In [11]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from hyperopt import STATUS_OK

def objective_func(search_space):
    """Hyperopt objective: negated mean 3-fold CV accuracy of an XGBClassifier.

    Args:
        search_space: Mapping with the sampled values for 'max_depth',
            'min_child_weight', 'learning_rate' and 'colsample_bytree'.

    Returns:
        dict with 'loss' (mean cross-validated accuracy multiplied by -1) and
        'status' (STATUS_OK).

    Note:
        Reads the notebook-level X_train / y_train rather than taking them as arguments.
    """
    # The quniform-sampled values arrive as floats, so cast them to int
    # before handing them to XGBoost.
    depth = int(search_space['max_depth'])
    child_weight = int(search_space['min_child_weight'])

    xgb_clf = XGBClassifier(
        n_estimators=100,
        max_depth=depth,
        min_child_weight=child_weight,
        learning_rate=search_space['learning_rate'],
        colsample_bytree=search_space['colsample_bytree'],
        eval_metric='logloss',
    )

    # Score the candidate with 3-fold cross-validation on the training split.
    fold_scores = cross_val_score(xgb_clf, X_train, y_train, scoring='accuracy', cv=3)

    # Accuracy is better when larger, so it is negated to serve as a loss.
    return {'loss': -1 * np.mean(fold_scores), 'status': STATUS_OK}

In [13]:
from hyperopt import fmin, tpe, Trials

# Trials object handed to fmin so the per-evaluation results are kept.
trial_val = Trials()

# Run 50 TPE-guided evaluations of objective_func over xgb_search_space.
best = fmin(
    fn=objective_func,
    space=xgb_search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trial_val,
)
print('best:', best)

  2%|▉                                               | 1/50 [00:00<00:09,  5.35trial/s, best loss: -0.9428807947019867]







  4%|█▉                                              | 2/50 [00:00<00:08,  5.69trial/s, best loss: -0.9560822586266992]







  8%|███▊                                            | 4/50 [00:00<00:07,  6.11trial/s, best loss: -0.9626466829324968]







 10%|████▊                                           | 5/50 [00:00<00:07,  6.38trial/s, best loss: -0.9626612059951203]








 12%|█████▊                                          | 6/50 [00:00<00:06,  6.48trial/s, best loss: -0.9626612059951203]







 14%|██████▋                                         | 7/50 [00:01<00:07,  6.12trial/s, best loss: -0.9626612059951203]






 18%|████████▋                                       | 9/50 [00:01<00:06,  6.43trial/s, best loss: -0.9626757290577438]







 20%|█████████▍                                     | 10/50 [00:01<00:06,  6.41trial/s, best loss: -0.9626757290577438]








 22%|██████████▎                                    | 11/50 [00:01<00:06,  6.37trial/s, best loss: -0.9626757290577438]






 26%|████████████▏                                  | 13/50 [00:02<00:06,  6.10trial/s, best loss: -0.9626757290577438]







 28%|█████████████▍                                  | 14/50 [00:02<00:05,  6.22trial/s, best loss: -0.964868711513884]







 32%|███████████████▎                                | 16/50 [00:02<00:05,  6.12trial/s, best loss: -0.964868711513884]







 34%|████████████████▎                               | 17/50 [00:02<00:05,  6.29trial/s, best loss: -0.964868711513884]







 36%|█████████████████▎                              | 18/50 [00:02<00:05,  5.91trial/s, best loss: -0.964868711513884]







 38%|██████████████████▏                             | 19/50 [00:03<00:05,  6.06trial/s, best loss: -0.964868711513884]







 42%|████████████████████▏                           | 21/50 [00:03<00:04,  6.16trial/s, best loss: -0.964868711513884]







 44%|█████████████████████                           | 22/50 [00:03<00:04,  6.32trial/s, best loss: -0.964868711513884]







 48%|██████████████████████▌                        | 24/50 [00:03<00:04,  6.36trial/s, best loss: -0.9670616939700244]







 50%|███████████████████████▌                       | 25/50 [00:04<00:03,  6.47trial/s, best loss: -0.9692546764261647]







 52%|████████████████████████▍                      | 26/50 [00:04<00:03,  6.52trial/s, best loss: -0.9692546764261647]








 56%|██████████████████████████▎                    | 28/50 [00:04<00:03,  6.53trial/s, best loss: -0.9692546764261647]







 58%|███████████████████████████▎                   | 29/50 [00:04<00:03,  6.36trial/s, best loss: -0.9692546764261647]







 60%|████████████████████████████▏                  | 30/50 [00:04<00:03,  6.28trial/s, best loss: -0.9692546764261647]







 64%|██████████████████████████████                 | 32/50 [00:05<00:03,  5.66trial/s, best loss: -0.9692546764261647]







 66%|███████████████████████████████                | 33/50 [00:05<00:03,  5.54trial/s, best loss: -0.9692546764261647]







 70%|████████████████████████████████▉              | 35/50 [00:05<00:02,  5.94trial/s, best loss: -0.9692546764261647]







 72%|█████████████████████████████████▊             | 36/50 [00:05<00:02,  6.02trial/s, best loss: -0.9692546764261647]







 74%|██████████████████████████████████▊            | 37/50 [00:06<00:02,  5.95trial/s, best loss: -0.9692546764261647]







 78%|████████████████████████████████████▋          | 39/50 [00:06<00:01,  6.07trial/s, best loss: -0.9692546764261647]







 80%|█████████████████████████████████████▌         | 40/50 [00:06<00:01,  5.91trial/s, best loss: -0.9692546764261647]







 82%|██████████████████████████████████████▌        | 41/50 [00:06<00:01,  5.89trial/s, best loss: -0.9692546764261647]







 86%|████████████████████████████████████████▍      | 43/50 [00:07<00:01,  6.15trial/s, best loss: -0.9692546764261647]







 88%|█████████████████████████████████████████▎     | 44/50 [00:07<00:01,  5.76trial/s, best loss: -0.9692546764261647]






 90%|██████████████████████████████████████████▎    | 45/50 [00:07<00:00,  5.79trial/s, best loss: -0.9692546764261647]








 92%|███████████████████████████████████████████▏   | 46/50 [00:07<00:00,  5.94trial/s, best loss: -0.9692546764261647]







 96%|█████████████████████████████████████████████  | 48/50 [00:07<00:00,  6.02trial/s, best loss: -0.9692546764261647]







 98%|██████████████████████████████████████████████ | 49/50 [00:08<00:00,  5.81trial/s, best loss: -0.9692546764261647]







100%|███████████████████████████████████████████████| 50/50 [00:08<00:00,  6.06trial/s, best loss: -0.9692546764261647]
best: {'colsample_bytree': 0.5468796020576306, 'learning_rate': 0.1967281124299193, 'max_depth': 14.0, 'min_child_weight': 2.0}





In [16]:
# Report the tuned values: continuous parameters rounded to 5 decimals,
# quantized parameters cast back to int.
print('colsample_bytree:{0}, learning_rate:{1}, max_depth:{2},min_child_weight:{3}'.format(
    *(round(best[key], 5) for key in ('colsample_bytree', 'learning_rate')),
    *(int(best[key]) for key in ('max_depth', 'min_child_weight'))
))

colsample_bytree:0.54688, learning_rate:0.19673, max_depth:14,min_child_weight:2


In [17]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics for a binary classifier.

    Args:
        y_test: True labels.
        pred: Predicted class labels. Despite the None default this is required in
            practice, because it is passed straight to the label-based metrics.
        pred_proba: Scores passed to roc_auc_score (the caller in this notebook
            supplies the second column of predict_proba). When left as None the
            AUC is omitted from the report instead of raising.

    Returns:
        None; the report is written to stdout.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    # Labels: accuracy, precision, recall, f1 (kept in Korean to preserve the output text).
    summary = '정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, f1: {3:.4f}'.format(
        accuracy, precision, recall, f1)

    # AUC needs scores, not hard labels; only compute it when scores were supplied so
    # that the pred_proba=None default is actually usable.
    if pred_proba is not None:
        roc_auc = roc_auc_score(y_test, pred_proba)
        summary += ', AUC:{0:.4f}'.format(roc_auc)

    print('오차 행렬')  # "confusion matrix"
    print(confusion)

    print(summary)
    

In [19]:
# Refit with the hyperparameters found by hyperopt, using more boosting rounds
# (400) and relying on early stopping to pick the effective number of trees.
#
# early_stopping_rounds / eval_metric are configured on the estimator rather than
# passed to fit(): the fit() keyword form was deprecated in XGBoost 1.6 and removed
# in 2.0, where it raises TypeError. This form requires XGBoost >= 1.6.
xgb_wrapper = XGBClassifier(n_estimators=400,
                            max_depth=int(best['max_depth']),
                            # hyperopt returns the quantized values as floats; cast to int.
                            min_child_weight=int(best['min_child_weight']),
                            learning_rate=round(best['learning_rate'], 5),
                            colsample_bytree=round(best['colsample_bytree'], 5),
                            early_stopping_rounds=50,
                            eval_metric='logloss')

# Per the XGBoost docs, the last entry of eval_set (the validation split here)
# is the one monitored for early stopping.
evals = [(X_tr, y_tr), (X_val, y_val)]
xgb_wrapper.fit(X_tr, y_tr, eval_set=evals, verbose=True)

# Evaluate on the held-out test split; column 1 of predict_proba is passed as the AUC score.
preds = xgb_wrapper.predict(X_test)
pred_proba = xgb_wrapper.predict_proba(X_test)[:, 1]

get_clf_eval(y_test, preds, pred_proba)

[0]	validation_0-logloss:0.53702	validation_1-logloss:0.58490
[1]	validation_0-logloss:0.42918	validation_1-logloss:0.50536
[2]	validation_0-logloss:0.34964	validation_1-logloss:0.44700
[3]	validation_0-logloss:0.29039	validation_1-logloss:0.41077
[4]	validation_0-logloss:0.24321	validation_1-logloss:0.38232
[5]	validation_0-logloss:0.20751	validation_1-logloss:0.36475
[6]	validation_0-logloss:0.17725	validation_1-logloss:0.34085
[7]	validation_0-logloss:0.15396	validation_1-logloss:0.32800
[8]	validation_0-logloss:0.13486	validation_1-logloss:0.31164
[9]	validation_0-logloss:0.11747	validation_1-logloss:0.30094
[10]	validation_0-logloss:0.10362	validation_1-logloss:0.29411
[11]	validation_0-logloss:0.09125	validation_1-logloss:0.28013
[12]	validation_0-logloss:0.08287	validation_1-logloss:0.27751
[13]	validation_0-logloss:0.07354	validation_1-logloss:0.26616
[14]	validation_0-logloss:0.06797	validation_1-logloss:0.26737
[15]	validation_0-logloss:0.06314	validation_1-logloss:0.26376
[1



[116]	validation_0-logloss:0.01423	validation_1-logloss:0.22110
[117]	validation_0-logloss:0.01419	validation_1-logloss:0.22083
[118]	validation_0-logloss:0.01414	validation_1-logloss:0.21883
[119]	validation_0-logloss:0.01410	validation_1-logloss:0.21909
[120]	validation_0-logloss:0.01406	validation_1-logloss:0.21725
[121]	validation_0-logloss:0.01402	validation_1-logloss:0.21815
[122]	validation_0-logloss:0.01398	validation_1-logloss:0.21783
[123]	validation_0-logloss:0.01394	validation_1-logloss:0.21683
[124]	validation_0-logloss:0.01390	validation_1-logloss:0.21656
[125]	validation_0-logloss:0.01386	validation_1-logloss:0.21684
[126]	validation_0-logloss:0.01382	validation_1-logloss:0.21846
[127]	validation_0-logloss:0.01379	validation_1-logloss:0.21752
[128]	validation_0-logloss:0.01375	validation_1-logloss:0.21774
[129]	validation_0-logloss:0.01371	validation_1-logloss:0.21748
[130]	validation_0-logloss:0.01368	validation_1-logloss:0.21776
[131]	validation_0-logloss:0.01365	valid