<a href="https://colab.research.google.com/github/Mongsel8/Data-Anaysis/blob/main/Hyperopt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# HyperOpt



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
 
from sklearn.datasets import load_iris
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
import pandas as pd

from sklearn.metrics import confusion_matrix

# Iris data set used for the voting-ensemble demo.
iris = load_iris()

# Candidate base estimators.
# NOTE(review): gnb_clf is created here but is not part of the ensemble below.
lr_clf = LogisticRegression(solver='liblinear')
knn_clf = KNeighborsClassifier(n_neighbors=7)
gnb_clf = GaussianNB()

# Ensemble of the logistic-regression and k-NN models, combined with voting='soft'.
vo_clf = VotingClassifier(
    estimators=[('LR', lr_clf), ('KNN', knn_clf)],
    voting='soft',
)

# Hold out 20% of the samples for testing (no random_state, so the split
# differs between runs).
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=.2
)

# Fit on the training split, predict the test split and report accuracy.
pred = vo_clf.fit(X_train, y_train).predict(X_test)
print(accuracy_score(y_test, pred))

# Last expression of the cell: the confusion matrix for the test predictions.
confusion_matrix(y_test, pred)

0.9666666666666667


array([[ 9,  0,  0],
       [ 0, 15,  1],
       [ 0,  0,  5]])

In [None]:
# Disabled snippet: the whole cell body is a single triple-quoted string
# literal, so none of the statements inside it are executed; the cell merely
# evaluates to that string.
# NOTE(review): the snippet refers to `lgbm_clf`, which is not defined in the
# visible part of this file — confirm before re-enabling.
'''
from xgboost import plot_importance
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 12))
# 사이킷런 래퍼 클래스를 입력해도 무방. 
plot_importance(lgbm_clf, ax=ax)
'''

'\nfrom xgboost import plot_importance\nimport matplotlib.pyplot as plt\n\nfig, ax = plt.subplots(figsize=(10, 12))\n# 사이킷런 래퍼 클래스를 입력해도 무방. \nplot_importance(lgbm_clf, ax=ax)\n'

In [None]:
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and a one-line metric summary.

    Args:
        y_test: true labels.
        pred: predicted class labels, compared against ``y_test``.
        pred_proba: scores handed to ``roc_auc_score`` together with ``y_test``.

    All metrics are computed first (with the scikit-learn default arguments)
    and only then printed, so nothing is printed if any metric call raises.
    """
    matrix = confusion_matrix(y_test, pred)
    # Order matches the placeholders {0}..{4} of the summary line below;
    # ROC-AUC is the last entry.
    scores = (
        accuracy_score(y_test, pred),
        precision_score(y_test, pred),
        recall_score(y_test, pred),
        f1_score(y_test, pred),
        roc_auc_score(y_test, pred_proba),
    )

    print('오차 행렬')
    print(matrix)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},F1: {3:.4f}, AUC:{4:.4f}'.format(*scores))

In [None]:
from hyperopt import hp

# Search space for the toy problem: 'x' is drawn with hp.quniform(-10, 10, 1)
# and 'y' with hp.quniform(-15, 15, 1).
search_space = {
    'x': hp.quniform('x', -10, 10, 1),
    'y': hp.quniform('y', -15, 15, 1),
}

In [None]:
from hyperopt import STATUS_OK

## Objective function for the toy problem.
## It receives the sampled point (a mapping with keys 'x' and 'y') and
## returns a single number.
def objective_func(search_space):
    """Return ``x ** 2 - 20 * y`` for the sampled point.

    Args:
        search_space: mapping holding the sampled values under 'x' and 'y'.

    Returns:
        The value of the expression for that point.
    """
    x_val, y_val = search_space['x'], search_space['y']
    return x_val ** 2 - 20 * y_val

In [None]:
from hyperopt import fmin, tpe, Trials
import numpy as np

## Trials object that stores every sampled input and its result.
trial_val = Trials()

## Search for the inputs that minimise objective_func, using 20 evaluations.
best_01 = fmin(
    objective_func,        ## fn: objective function
    search_space,          ## space: search space
    algo=tpe.suggest,      ## algorithm used for the Bayesian optimisation
    max_evals=20,          ## number of evaluations
    trials=trial_val,      ## records the tried inputs and their results
    # rstate=np.random.default_rng(seed=0)   ## random seed so repeated fmin() runs give the same result
)

100%|██████████| 20/20 [00:00<00:00, 452.43it/s, best loss: -259.0]


In [None]:
# Show the best point returned by fmin().
print('best:', best_01)
# NOTE: bare expression — its value is neither assigned nor printed here.
trial_val.results
# Last expression of the cell; the captured output below it is a dict of the
# tried 'x' and 'y' values.
trial_val.vals

best: {'x': 1.0, 'y': 13.0}


{'x': [4.0,
  6.0,
  7.0,
  1.0,
  -7.0,
  -0.0,
  9.0,
  2.0,
  -9.0,
  -0.0,
  1.0,
  -9.0,
  5.0,
  6.0,
  -9.0,
  -5.0,
  -6.0,
  5.0,
  -6.0,
  7.0],
 'y': [-7.0,
  12.0,
  0.0,
  13.0,
  -3.0,
  -10.0,
  6.0,
  1.0,
  -8.0,
  10.0,
  -11.0,
  -8.0,
  -5.0,
  -1.0,
  12.0,
  6.0,
  0.0,
  -15.0,
  -14.0,
  13.0]}

# 하이퍼파라미터 튜닝


In [None]:
# Load the breast-cancer data set and pull out its feature matrix and labels.
dataset = load_breast_cancer()
features, labels = dataset.data, dataset.target

In [None]:
## Load the features into a pandas DataFrame and append the labels as the
## last column, 'target'.
cancer_df = pd.DataFrame(features, columns=dataset.feature_names).assign(target=labels)
cancer_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [None]:
## Separate the feature columns (all but the last) from the label column
## (the last one).
X_features = cancer_df[cancer_df.columns[:-1]]
y_label = cancer_df[cancer_df.columns[-1]]

In [None]:
## Train / test split at a ratio of 8 : 2.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_label,
    test_size=0.2,
    random_state=156,
)

## Split X_train / y_train again at 9 : 1 so that a separate validation part
## is available for XGBoost evaluation and early stopping.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=156,
)

In [None]:
## 모델 성능 평가 함수 선언
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and a one-line metric summary.

    Args:
        y_test: true labels.
        pred: predicted class labels, compared against ``y_test``.
        pred_proba: optional scores handed to ``roc_auc_score``. When it is
            ``None`` the AUC is left out of the summary instead of the call
            failing; when given, the printed line is the same as before.

    Precision, recall and F1 are computed with the scikit-learn default
    arguments. All metrics are computed before anything is printed, so a
    failing metric call produces no partial output.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # The signature declares pred_proba as optional, so only compute ROC-AUC
    # when scores were actually supplied.
    roc_auc = None if pred_proba is None else roc_auc_score(y_test, pred_proba)

    summary = '정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}'.format(
        accuracy, precision, recall, f1)
    if roc_auc is not None:
        summary += ', AUC: {0:.4f}'.format(roc_auc)

    print('오차 행렬')
    print(confusion)
    print(summary)

In [None]:
from hyperopt import hp

# Search space for the XGBoost hyper-parameters.
# max_depth and min_child_weight are integer-valued, so they are sampled with
# hp.quniform (step 1); the other two are sampled with hp.uniform.
xgb_search_space = dict(
    max_depth=hp.quniform('max_depth', 5, 20, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 2, 1),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.2),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
)

In [None]:
## 2. 목적 함수 설정
## 검색 공간에서 설정한 하이퍼 파라미터들을 입력 받아서 XGBoost를 학습시키고, 평가 지표를 반환하도록 구성되어야 함

from sklearn.model_selection import cross_val_score  ## 교차 검증
from xgboost import XGBClassifier
from hyperopt import STATUS_OK

def objective_func(search_space):
    """Score one hyper-parameter sample with 3-fold cross-validated accuracy.

    Args:
        search_space: mapping with the sampled 'max_depth',
            'min_child_weight', 'learning_rate' and 'colsample_bytree'.

    Returns:
        A dict with 'loss' (the negated mean accuracy over the folds) and
        'status' (STATUS_OK).

    Reads the module-level X_train / y_train.
    """
    params = {
        'n_estimators': 100,
        # The sampled values are cast to int before being passed on.
        'max_depth': int(search_space['max_depth']),
        'min_child_weight': int(search_space['min_child_weight']),
        'learning_rate': search_space['learning_rate'],
        'colsample_bytree': search_space['colsample_bytree'],
        'eval_metric': 'logloss',
    }
    model = XGBClassifier(**params)

    # One accuracy value per fold (cv=3).
    fold_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=3)

    # The mean accuracy is multiplied by -1 so that a higher accuracy gives
    # a lower loss.
    mean_accuracy = np.mean(fold_scores)
    return {'loss': (-1) * mean_accuracy, 'status': STATUS_OK}

In [None]:
## 3. fmin()을 사용하여 최적 하이퍼 파라미터 찾기

from hyperopt import fmin, tpe, Trials

## Trials object that stores the result of every evaluation.
trial_val = Trials()

## Run the search with at most 50 evaluations of objective_func over
## xgb_search_space.
best = fmin(
    objective_func,
    xgb_search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trial_val,
)

100%|██████████| 50/50 [00:10<00:00,  4.77it/s, best loss: -0.9692401533635412]


In [None]:
## Declare the final model with the best hyper-parameters found by fmin().
## The two integer-valued parameters are cast with int(); the two continuous
## ones are rounded to 5 decimal places.
xgb_wrapper = XGBClassifier(
    n_estimators=400,
    max_depth=int(best['max_depth']),
    min_child_weight=int(best['min_child_weight']),
    learning_rate=round(best['learning_rate'], 5),
    colsample_bytree=round(best['colsample_bytree'], 5),
)

In [None]:
## Evaluation sets monitored during training: the training part first, the
## validation part second.
evals = [(X_tr, y_tr), (X_val, y_val)]

## Train on the training part with log-loss monitoring and
## early_stopping_rounds=50, logging every round.
xgb_wrapper.fit(
    X_tr,
    y_tr,
    eval_set=evals,
    eval_metric='logloss',
    early_stopping_rounds=50,
    verbose=True,
)

[0]	validation_0-logloss:0.551707	validation_1-logloss:0.610915
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 50 rounds.
[1]	validation_0-logloss:0.449256	validation_1-logloss:0.547035
[2]	validation_0-logloss:0.371838	validation_1-logloss:0.486076
[3]	validation_0-logloss:0.314424	validation_1-logloss:0.455644
[4]	validation_0-logloss:0.26539	validation_1-logloss:0.413263
[5]	validation_0-logloss:0.226504	validation_1-logloss:0.385173
[6]	validation_0-logloss:0.1959	validation_1-logloss:0.359044
[7]	validation_0-logloss:0.171565	validation_1-logloss:0.337337
[8]	validation_0-logloss:0.150198	validation_1-logloss:0.322163
[9]	validation_0-logloss:0.131908	validation_1-logloss:0.31126
[10]	validation_0-logloss:0.118272	validation_1-logloss:0.300637
[11]	validation_0-logloss:0.104171	validation_1-logloss:0.289713
[12]	validation_0-logloss:0.092854	validation_1-logloss:0.280076
[13]

XGBClassifier(colsample_bytree=0.72884, learning_rate=0.18175, max_depth=12,
              min_child_weight=2, n_estimators=400)

In [None]:
## Evaluate the trained model on the held-out test split.

# Column 1 of predict_proba is used as the score for the ROC-AUC computation.
pred_proba = xgb_wrapper.predict_proba(X_test)[:, 1]
preds = xgb_wrapper.predict(X_test)

get_clf_eval(y_test, pred=preds, pred_proba=pred_proba)

오차 행렬
[[33  4]
 [ 2 75]]
정확도: 0.9474, 정밀도: 0.9494, 재현율: 0.9740, F1: 0.9615, AUC: 0.9937


In [None]:
from hyperopt import hp
import numpy as np
# Search space for the XGBoost hyper-parameters: two integer-valued ones
# sampled with hp.quniform (step 1) and two continuous ones with hp.uniform.
xgb_search_space = {
    'max_depth': hp.quniform('max_depth', 5, 15, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 0.95),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
}
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier  
# Objective function.
# fmin() later calls it with a sampled search_space; it trains an
# XGBClassifier with 3-fold cross-validation and returns -1 * mean ROC-AUC.
def objective_func(search_space):
    """Score one hyper-parameter sample by mean ROC-AUC over 3 K-folds.

    Args:
        search_space: mapping with the sampled 'max_depth',
            'min_child_weight', 'colsample_bytree' and 'learning_rate'.

    Returns:
        The mean of the per-fold ROC-AUC values multiplied by -1, because
        HyperOpt looks for the input that minimises the objective.

    Reads the module-level X_train / y_train. Per-round training logs are
    suppressed (verbose=False): with 3 folds per trial and many trials they
    flood the cell output without affecting the returned score.
    """
    xgb_clf = XGBClassifier(
        n_estimators=100,
        # The sampled values are cast to int before being passed on.
        max_depth=int(search_space['max_depth']),
        min_child_weight=int(search_space['min_child_weight']),
        colsample_bytree=search_space['colsample_bytree'],
        learning_rate=search_space['learning_rate'],
    )

    # ROC-AUC of each of the 3 folds.
    roc_auc_list = []

    # Split X_train again into per-fold training and validation parts.
    kf = KFold(n_splits=3)
    for tr_index, val_index in kf.split(X_train):
        fold_X_tr, fold_y_tr = X_train.iloc[tr_index], y_train.iloc[tr_index]
        fold_X_val, fold_y_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Train on the fold with early_stopping_rounds=30 and AUC monitoring.
        xgb_clf.fit(
            fold_X_tr,
            fold_y_tr,
            early_stopping_rounds=30,
            eval_metric='auc',
            eval_set=[(fold_X_tr, fold_y_tr), (fold_X_val, fold_y_val)],
            verbose=False,
        )

        # ROC-AUC from column 1 of predict_proba on the fold's validation part.
        score = roc_auc_score(fold_y_val, xgb_clf.predict_proba(fold_X_val)[:, 1])
        roc_auc_list.append(score)

    return -1 * np.mean(roc_auc_list)
from hyperopt import fmin, tpe, Trials

# NOTE(review): SEED is assigned but not passed to fmin() below, so it has no
# effect on this search — confirm whether an rstate argument was intended.
SEED = 30
trials = Trials()

# Call fmin(): evaluate objective_func max_evals times and return the input
# with the smallest objective value found.
best = fmin(
    objective_func,
    xgb_search_space,
    algo=tpe.suggest,
    max_evals=50,  # maximum number of evaluations
    trials=trials,
)

print('best:', best)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[10]	validation_0-auc:0.995699	validation_1-auc:0.986812

[11]	validation_0-auc:0.996307	validation_1-auc:0.992203

[12]	validation_0-auc:0.996712	validation_1-auc:0.992684

[13]	validation_0-auc:0.996892	validation_1-auc:0.993454

[14]	validation_0-auc:0.997028	validation_1-auc:0.994032

[15]	validation_0-auc:0.997478	validation_1-auc:0.993261

[16]	validation_0-auc:0.997658	validation_1-auc:0.993069

[17]	validation_0-auc:0.997568	validation_1-auc:0.992876

[18]	validation_0-auc:0.997658	validation_1-auc:0.992491

[19]	validation_0-auc:0.997883	validation_1-auc:0.991529

[20]	validation_0-auc:0.997838	validation_1-auc:0.991529

[21]	validation_0-auc:0.997883	validation_1-auc:0.991914

[22]	validation_0-auc:0.998199	validation_1-auc:0.991914

[23]	validation_0-auc:0.998154	validation_1-auc:0.991914

[24]	validation_0-auc:0.998424	validation_1-auc:0.991914

[25]	validation_0-auc:0.998604	validation_1-auc:0.992106

[26]	validation_0-auc: