## VotingClassifier

In [1]:
import pandas as pd

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
cancer = load_breast_cancer()

data_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
data_df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


> VotingClassifier 클래스는 주요 생성 인자로 estimators와 voting 값을 입력받음. 
> estimators는 리스트 값으로 보팅에 사용될 여러 개의 Classifier 객체들을 튜플 형식으로 입력받으며 voting은 'hard'(하드보팅), 'soft'(소프트보팅) 중 보팅 방식을 적용하라는 의미(기본값 = hard)

In [4]:
# 개별 모델은 로지스틱 회귀와 KNN임.
lr_clf = LogisticRegression(solver='liblinear')
knn_clf = KNeighborsClassifier(n_neighbors=8)

# 개별 모델을 소프트 보팅 기반의 앙상블 모델로 구현한 분류기
vo_clf = VotingClassifier(estimators=[('LR', lr_clf),('KNN', knn_clf)], voting='soft')
x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.2, random_state=156)

# VotingClassifier 학습/예측/평가.
vo_clf.fit(x_train, y_train)
pred = vo_clf.predict(x_test)
print('Voting 분류기 정확도: {0:.4f}'.format(accuracy_score(y_test, pred)))

# 개별 모델의 학습/예측/평가
classifiers = [lr_clf, knn_clf]
for classifier in classifiers:
    classifier.fit(x_train, y_train)
    pred = classifier.predict(x_test)
    class_name = classifier.__class__.__name__
    print('{0}정확도: {1:.4f}'.format(class_name, accuracy_score(y_test, pred)))

Voting 분류기 정확도: 0.9561
LogisticRegression정확도: 0.9474
KNeighborsClassifier정확도: 0.9386


> LogisticRegression parameters : solver

|구분|설명|
|--|--|
|||

> 보팅 분류기가 정확도가 조금 높게 나왔지만, 보팅으로 여러 개의 분류기를 결합한다고 해서 무조건 기반 분류기 보다 예측 성능이 향상되지는 않는다. 데이터의 특성과 분포 등 다양한 요건에 따라 오히려 기반 분류기 중 가장 좋은 분류기의 성능이 보팅했을 때보다 나을 수도 있다. 


## RandomForestClassifier

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import get

> n_estimators : 랜덤포레스트에서 결정트리의 개수 지정. 디폴트는 10개. 많이 설정할수록 좋은 성능을 기대할 수 있지만 계속 증가시킨다고 성능이 무조건 향상되는 것은 아니다. 또한 늘릴 수록 학습 수행 시간이 오래 걸림.

> max_features : 결정트리에 사용된 max_features파라미터와 같음. 하지만 RandomForestClassifier의 기본 max_features는 'None'이 아니라 'auto'즉, 'sqrt'와 같다. 따라서 랜덤포레스트의 트리를 분할하는 피처를 참조할 때 전체 피처가 아니라 sqrt(전체피처개수)만큼 참조

> max_depth, min_samples_leaf, min_samples_split와 같이 결정트리에서 과적합을 개선하기 위해 사용되는 파라미터가 랜덤포레스트에 똑같이 적용. 

#### 하이퍼 파라미터 

In [6]:
from sklearn.model_selection import GridSearchCV

In [None]:
# params = { 'max_depth': [8,16,24],
#           'min_samples_split':[2,8,16]
#           'min_samples_leaf':[1,6,12]}

# # RandomForestClassifier객체 생성 후 GridSearchCV 수행
# rf_clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
# grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=2, n_jobs=-1)
# grid_cv.fit(x_train, x_test)

# print('최적 하이퍼 파라미터:', grid_cv.best_params_)
# print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

#### 최적의 하이퍼 파라미터를 가지고 RandomForestClassifier 실시

In [None]:
# rf_clf1 = RandomForestClassifier(n_estimators=100, min_samples_leaf=6, max_depth=16, min_samples_split=2, random_state=0)
# rf_clf1.fit(x_train, y_train)
# pred = rf_clf1.predict(x_test)
# print('예측정확도:{0:.4f}'.format(accuracy_score(y_test, pred)))

#### feature_importances_ 속성을 이용해 피처의 중요도 확인가능

> RandomForestClassifier 역시 DecisionTreeClassifier와 같이 feature_importances_속성을 이용해 알고리즘이 선택한 피처의 정확도 확인 가능.

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sb
# %matplotlib inline

# ftr_importance_values = rf_clf1.feature_importances_
# ftr_importances = pd.Series(ftr_importance_values, index=x_train.columns)
# ftr_top20 = ftr_importances.sort_values(ascending=False)[:20]

# plt.figure(figsize=(8,6))
# plt.title('Feature importances Top 20')
# sb.barplot(x=ftr_top20, y=ftr_top20.index)
# plt.show()

## GBM (GradientBoostingClassifier)

In [None]:
# from sklearn.ensemble import GradientBoostingClassifier
# import time
# import warnings
# warnings.filterwarnings('ignore')

# x_train, x_test, y_train, y_test = get_human_dataset()

# # GBM 수행시간 측정을 위함. 시작 시간 설정.
# start_time = time.time()

# gb_clf = GradientBoostingClassifier(random_state=0)
# gb_clf.fit(x_train, y_train)
# gb_pred = gb_clf.predict(x_test)
# gb_accuracy = accuracy_score(y_test, pred)

# print('GBM 정확도: {0:.4f}'.format(gb_accuracy))
# print('GBM 수행시간:{0.1f}초'.format(time.time()-start_time))

|파라미터|설명|
|---|---|
|loss|경사하강법에서 사용할 비용함수를 지정. 특별한 이유가 없으면 기본값인 'deviance'사용|
|learning_rate|GBM이 학습을 진행할 때마다 적용하는 학습률. 약한 분류기가 순차적으로 오류값을 보정해 나가는데 적용하는 계수. |

## XGBoost(eXtra Gradient Boost)

In [7]:
import xgboost as xgb
from xgboost import plot_importance
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [8]:
dataset = load_breast_cancer()
features = dataset.data
labels = dataset.target

cancer_df = pd.DataFrame(data=features, columns=dataset.feature_names)
cancer_df['target'] = labels
cancer_df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0


> 악성('malignant'):0, 양성('benign'):1

In [9]:
print(dataset.target_names)
print(cancer_df['target'].value_counts())

['malignant' 'benign']
target
1    357
0    212
Name: count, dtype: int64


In [10]:
# cancer_df 에서 feature용 DataFrame과 Label용 Series 객체 추출
# 맨 마지막 칼럼이 Label임. Feature용 DataFrame은 cancer_df의 첫번째 칼럼에서 맨 마지막 두번째 칼럼까지를 :-1슬라이싱으로 추출.
x_features = cancer_df.iloc[:, :-1]
y_label = cancer_df.iloc[:,-1]

# 전체 데이터 중 80%는 학습용 데이터, 20%는 테스트용 데이터 추출
x_train, x_test, y_train, y_test = train_test_split(x_features, y_label, test_size=0.2, random_state=156)

# 위에서 만든 x_train, y_train을 다시 쪼개서 90%는 학습과 10%는 검증용 데이터로 분리
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=156)

print(x_train.shape, x_test.shape)
print(x_tr.shape, x_val.shape)

(455, 30) (114, 30)
(409, 30) (46, 30)


In [11]:
# 학습데이터의 학습용
dtr = xgb.DMatrix(data=x_tr, label=y_tr)
# 학습데이터의 검증용
dval = xgb.DMatrix(data=x_val, label=y_val)
# 검증데이터
dtest = xgb.DMatrix(data=x_test, label=y_test)

In [12]:
# 하이퍼 파라미터 설정
params = {'max_depth':3, 
          'eta':0.05, # =learning_rate
          'objective':'binary:logistic', # 종속변수가 이진분류이므로 binary logistic
          'eval_metric':'logloss'} # 오류함수 평가 성능 지표는 logloss

num_rounds = 400 # 부스팅반복횟수

|xgboost|XGBoostClassifier|
|--|--|
|'eta'|'learning_rate'|
|'sub_sample'|subsample|
|lambda|reg_lambda|
|alpha|reg_alpha|

In [13]:
# 학습 데이터 셋은 'train' 평가데이터 셋은 'eval'로 명기
eval_list = [(dtr, 'train'),(dval, 'eval')]

# 하이퍼 파라미터와 early stopping파라미터를 train()함수의 파라미터로 전달
xgb_model = xgb.train(params=params, dtrain=dtr, num_boost_round=num_rounds, early_stopping_rounds=50, evals=eval_list)

[0]	train-logloss:0.62480	eval-logloss:0.63104
[1]	train-logloss:0.58674	eval-logloss:0.60478
[2]	train-logloss:0.55226	eval-logloss:0.58223
[3]	train-logloss:0.52086	eval-logloss:0.56184
[4]	train-logloss:0.49192	eval-logloss:0.54118
[5]	train-logloss:0.46537	eval-logloss:0.52223
[6]	train-logloss:0.44029	eval-logloss:0.50287
[7]	train-logloss:0.41666	eval-logloss:0.48620
[8]	train-logloss:0.39525	eval-logloss:0.46974
[9]	train-logloss:0.37542	eval-logloss:0.45497
[10]	train-logloss:0.35701	eval-logloss:0.44131
[11]	train-logloss:0.33982	eval-logloss:0.43134
[12]	train-logloss:0.32297	eval-logloss:0.41972
[13]	train-logloss:0.30725	eval-logloss:0.40902
[14]	train-logloss:0.29327	eval-logloss:0.39883
[15]	train-logloss:0.27946	eval-logloss:0.38968
[16]	train-logloss:0.26691	eval-logloss:0.38150
[17]	train-logloss:0.25473	eval-logloss:0.37368
[18]	train-logloss:0.24385	eval-logloss:0.36666
[19]	train-logloss:0.23338	eval-logloss:0.35994
[20]	train-logloss:0.22320	eval-logloss:0.35374
[2

In [14]:
pred_probs = xgb_model.predict(dtest)
print('predict() 수행 결과값을 10개만 표시, 예측 확률 값으로 표시됨')
print(np.round(pred_probs[:10],3))

# 예측 확률이 0.5보다 크면 1, 그렇지 않으면 0으로 예측값 결정하여 List 객체인 preds에 저장
preds = [1 if x>0.5 else 0 for x in pred_probs]
print('예측값 10개만 표시:', preds[:10])

predict() 수행 결과값을 10개만 표시, 예측 확률 값으로 표시됨
[0.938 0.004 0.75  0.049 0.98  1.    0.999 0.999 0.998 0.001]
예측값 10개만 표시: [1, 0, 1, 0, 1, 1, 1, 1, 1, 0]


## LightGBM(LGBMClassifier)

|파라미터|설명|
|--|--|
|num_leaves|개별 트리가 가질 수 있는 최대 리프의 개수|
|min_child_samples|min_data_in_leaf|
|max_depth||


In [1]:
from lightgbm import LGBMClassifier

import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

In [2]:
dataset = load_breast_cancer()

cancer_df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)

In [5]:
cancer_df['target'] = dataset.target
x_features = cancer_df.iloc[:,:-1]
y_label = cancer_df.iloc[:,-1]

# 전체 데이터 중 80%는 학습용 데이터, 20%sms 테스트용 데이터 추출
x_train, x_test, y_train, y_test = train_test_split(x_features, y_label, test_size=0.2, random_state=156)

# 위에서 만든 x_train, y_train을 다시 쪼개서 90%는 학습과 10%는 검증용 데이터로 분리
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=156)

# 앞서 XGBoost와 동일하게 n_estimators는 400설정
lgbm_wrapper = LGBMClassifier(n_estimators=400, learning_rate=0.05)

# LightGBM도 XGBoost와 동일하게 조기 중단 수행 가능.
evals = [(x_tr, y_tr), (x_val, y_val)]
lgbm_wrapper.fit(x_tr, y_tr,  eval_metric='logloss', eval_set=evals)
preds = lgbm_wrapper.predict(x_test)
pred_proba = lgbm_wrapper.predict_proba(x_test)[:,1]

[LightGBM] [Info] Number of positive: 251, number of negative: 158
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000882 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4092
[LightGBM] [Info] Number of data points in the train set: 409, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.613692 -> initscore=0.462858
[LightGBM] [Info] Start training from score 0.462858


## 베이지안 최적화 기반의 HyperOpt를 이용한 하이퍼 파라미터 튜닝

In [7]:
!pip install hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
     ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
     -------- ------------------------------- 0.3/1.6 MB 9.6 MB/s eta 0:00:01
     -------------------------------------- - 1.5/1.6 MB 16.0 MB/s eta 0:00:01
     ---------------------------------------- 1.6/1.6 MB 14.3 MB/s eta 0:00:00
Collecting networkx>=2.2 (from hyperopt)
  Downloading networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)
Collecting future (from hyperopt)
  Downloading future-0.18.3.tar.gz (840 kB)
     ---------------------------------------- 0.0/840.9 kB ? eta -:--:--
     ----------------- ------------------- 399.4/840.9 kB 12.6 MB/s eta 0:00:01
     ------------------------------------  839.7/840.9 kB 10.6 MB/s eta 0:00:01
     -------------------------------------- 840.9/840.9 kB 8.9 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting cloud

In [8]:
from hyperopt import hp

#### 입력값의 검색 공간 설정

|파라미터|설명|
|--|--|
|hp.quniform(label, low, high, q)|검색공간 설정 low최솟값, high최댓값, q간격|
|hp.uniform(label, low, high)|최솟값low에서 최댓값high까지 정규 분포 형태의 검색 공간 설정|
|hp.randint(label, upper)|0부터 최댓값upper까지 random한 정숫값으로 검색 공간 설정|
|hp.loguniform(label, low, high)|exp(uniform(low, high))값을 반환하며 반환값의 log변환된 값은 정규 분포 형태를 가지는 검색 공간 설정|
|hp.choice(label, options)|검색값이 문자열 또는 문자열과 숫자값이 섞여 있을 경우 설정. Options는 리스트나 튜플 형태로 제공되며 hp.choice('tree_criterion'.['gini','entropy])과 같이 설정하면 입력변수 tree_criterion의 값을 'gini'와 'entropy'로 설정하여 입력함|

In [9]:
# -10~ 10까지 1간격을 가지는 입력변수 x와 -15~15까지 1간격으로 입력변수 y 설정
search_space = {'x':hp.quniform('x', -10, 10, 1), 'y':hp.quniform('y', -15, 15, 1)}

#### 목적함수 설정

In [10]:
from hyperopt import STATUS_OK

In [11]:
# 목적함수를 생성. 변숫값과 변수 검색 공간을 가지는 딕셔너리를 인자로 받고, 특정 값을 반환
def objective_func(search_space):
    x = search_space['x']
    y = search_space['y']
    retval = x**2 - 20*y

    return retval

#### 베이지안 최적화 기법 수행

|파라미터|설명|
|--|--|
|fn|목적함수|
|space|검색 공간 딕셔너리|
|algo|베이지안 최적화 적용 알고리즘(default=tpe.suggest)|
|max_evals|최적 입려값을 찾기 위한 입력값 시도횟수|
|trials|최적 입력값을 찾기 위해 시도한 입력값 및 해당 입력값의 목적 함수 반환값 결과를 저장하는데 사용. |
|rstate|fmin()을 수행할 때마다 동일한 결과값을 가질 수 있도록 설정하는 랜덤 시드(seed)값(일반적으로 적용하지 않는다.)|

In [12]:
from hyperopt import fmin, tpe, Trials

In [13]:
# 입력 결괏값을 저장한 Trials 객체값 생성.
trial_val = Trials()

# 목적 함수의 최솟값을 반환하는 최적 입력 변숫값을 5번의 입력값 시도(max_evals=5)로 찾아냄.
best_01 = fmin(fn=objective_func, space=search_space, algo=tpe.suggest, max_evals=5, trials=trial_val, rstate=np.random.default_rng(seed=0))

100%|██████████| 5/5 [00:00<00:00, 413.16trial/s, best loss: -224.0]


In [14]:
print('best', best_01)

best {'x': -4.0, 'y': 12.0}


In [15]:
# 입력 결괏값을 저장한 Trials 객체값 생성.
trial_val = Trials()

# 목적 함수의 최솟값을 반환하는 최적 입력 변숫값을 5번의 입력값 시도(max_evals=5)로 찾아냄.
best_02 = fmin(fn=objective_func, space=search_space, algo=tpe.suggest, max_evals=20, trials=trial_val, rstate=np.random.default_rng(seed=0))

100%|██████████| 20/20 [00:00<00:00, 1278.28trial/s, best loss: -296.0]


In [16]:
print('best', best_02)

best {'x': 2.0, 'y': 15.0}


#### 함수 반환값 확인해보기 (loss)

In [17]:
# fmin()에 인자로 들어가는 Trials객체의 result 속성에 파이썬 리스트로 목적 함수 반환값들이 저장됨.
# 리스트 내부의 개별 원소는 {'loss':함수 반환값, 'status':반환 상태값}와 같은 딕셔너리임. 
print(trial_val.results)

[{'loss': -64.0, 'status': 'ok'}, {'loss': -184.0, 'status': 'ok'}, {'loss': 56.0, 'status': 'ok'}, {'loss': -224.0, 'status': 'ok'}, {'loss': 61.0, 'status': 'ok'}, {'loss': -296.0, 'status': 'ok'}, {'loss': -40.0, 'status': 'ok'}, {'loss': 281.0, 'status': 'ok'}, {'loss': 64.0, 'status': 'ok'}, {'loss': 100.0, 'status': 'ok'}, {'loss': 60.0, 'status': 'ok'}, {'loss': -39.0, 'status': 'ok'}, {'loss': 1.0, 'status': 'ok'}, {'loss': -164.0, 'status': 'ok'}, {'loss': 21.0, 'status': 'ok'}, {'loss': -56.0, 'status': 'ok'}, {'loss': 284.0, 'status': 'ok'}, {'loss': 176.0, 'status': 'ok'}, {'loss': -171.0, 'status': 'ok'}, {'loss': 0.0, 'status': 'ok'}]


#### 입력변수명 확인해보기 (x, y)

In [18]:
# Trials 객체의 vals 속성에 {'입력변수명':개별 수행 시마다 입력되 값 리스트} 형태로 저장됨.
print(trial_val.vals)

{'x': [-6.0, -4.0, 4.0, -4.0, 9.0, 2.0, 10.0, -9.0, -8.0, -0.0, -0.0, 1.0, 9.0, 6.0, 9.0, 2.0, -2.0, -4.0, 7.0, -0.0], 'y': [5.0, 10.0, -2.0, 12.0, 1.0, 15.0, 7.0, -10.0, 0.0, -5.0, -3.0, 2.0, 4.0, 10.0, 3.0, 3.0, -14.0, -8.0, 11.0, -0.0]}


#### 데이터프레임으로 만들어서 직관적으로 확인해보기

In [19]:
import pandas as pd

In [20]:
# results에서 loss 키값에 해당하는 밸류들을 추출하여 list로 생성.
losses = [loss_dict['loss'] for loss_dict in trial_val.results]

# DataFrame 생성.
result_df = pd.DataFrame({'x':trial_val.vals['x'], 'y':trial_val.vals['y'], 'losses':losses})
result_df

Unnamed: 0,x,y,losses
0,-6.0,5.0,-64.0
1,-4.0,10.0,-184.0
2,4.0,-2.0,56.0
3,-4.0,12.0,-224.0
4,9.0,1.0,61.0
5,2.0,15.0,-296.0
6,10.0,7.0,-40.0
7,-9.0,-10.0,281.0
8,-8.0,0.0,64.0
9,-0.0,-5.0,100.0


#### 실습 - HyperOpt를 이용한 XGBoost 하이퍼 파라미터 최적화

In [21]:
from sklearn.datasets import load_breast_cancer

In [22]:
dataset = load_breast_cancer()

cancer_df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)

In [23]:
cancer_df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,1.0950,0.9053,8.589,153.40,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.01860,0.01340,0.01389,0.003532,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.006150,0.04006,0.03832,0.02058,0.02250,0.004571,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,0.4956,1.1560,3.445,27.23,0.009110,0.07458,0.05661,0.01867,0.05963,0.009208,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.011490,0.02461,0.05688,0.01885,0.01756,0.005115,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,1.1760,1.2560,7.673,158.70,0.010300,0.02891,0.05198,0.02454,0.01114,0.004239,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,0.7655,2.4630,5.203,99.04,0.005769,0.02423,0.03950,0.01678,0.01898,0.002498,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,0.4564,1.0750,3.425,48.55,0.005903,0.03731,0.04730,0.01557,0.01318,0.003892,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,0.7260,1.5950,5.772,86.22,0.006522,0.06158,0.07117,0.01664,0.02324,0.006185,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [28]:
cancer_df.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [30]:
cancer_df['target'] = dataset.target
x_features = cancer_df.iloc[:,:-1]
y_label = cancer_df.iloc[:,-1]

# 전체 데이터 중 80%는 학습용 데이터, 20%sms 테스트용 데이터 추출
x_train, x_test, y_train, y_test = train_test_split(x_features, y_label, test_size=0.2, random_state=156)

# 위에서 만든 x_train, y_train을 다시 쪼개서 90%는 학습과 10%는 검증용 데이터로 분리
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=156)

#### 공간설정

In [31]:
from hyperopt import hp

In [37]:
# max_depth는 5에서 20까지 1간격으로, min_child_weight는 1에서 2까지 1간격으로
# colsample_bytree는 0.5에서 1사이, learning_rate는 0.01에서 0.2사이 정규 분포된 값으로 검색.
xgb_search_space = {'max_depth':hp.quniform('max_depth', 5, 20, 1),
                    'min_child_weight':hp.quniform('min_child_weight', 1,2,1),
                     'learning_rate':hp.uniform('learning_rate', 0.01, 0.2),
                      'colsample_bytree':hp.uniform('colsample_bytree', 0.5,1) }

#### 입력변수 설정

In [38]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from hyperopt import STATUS_OK

In [39]:
# fmin()에서 입력된 search_space 값으로 입력된 모든 값은 실수형임.
# XGBClassifier의 정수형 하이퍼 파라미터는 정수형 변환을 해줘야 함.
# 정확도가 높을수록 더 좋은 수치임. -1 * 정확도를 곱해서 큰 정확도일수록 최소가 되도록 변환.
def objective_func(search_space):
    # 수행시간 절약을 위해 n_estimators는 100으로 축소
    xgb_clf = XGBClassifier(n_estimators=100, max_depth=int(search_space['max_depth']),
                            min_child_weight=int(search_space['min_child_weight']),
                            learning_rate = search_space['learning_rate'],
                            colsample_bytree=search_space['colsample_bytree'],
                            eval_metric='logloss')
    accuracy = cross_val_score(xgb_clf, x_train, y_train, scoring='accuracy', cv=3)

    # accuracy는 cv=3개수만큼 roc_auc결과를 리스트로 가짐. 이를 평균해서 반환하되 -1을 곱함.
    return {'loss':-1 * np.mean(accuracy), 'status':STATUS_OK}

#### fmin()을 이용해 최적 하이퍼 파라미터 도출

In [40]:
from hyperopt import fmin, tpe, Trials

In [41]:
trial_val = Trials()
best = fmin(fn=objective_func, space=xgb_search_space, algo=tpe.suggest, max_evals=50, 
            trials=trial_val, rstate=np.random.default_rng(seed=9))

print('best:', best)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 50/50 [00:09<00:00,  5.20trial/s, best loss: -0.9670616939700244]
best: {'colsample_bytree': 0.684441779397407, 'learning_rate': 0.1475201153968472, 'max_depth': 9.0, 'min_child_weight': 2.0}


#### 최적 하이퍼파라미터를 이용해 XGBClassifier 재학습 후 성능 평가

In [47]:
xgb_wrapper = XGBClassifier(n_estimators=400,
                            learning_rate=round(best['learning_rate'],5),
                            max_depth=int(best['max_depth']),
                            min_child_weight=int(best['min_child_weight']),
                            colsample_bytree=round(best['colsample_bytree'],5))

evals = [(x_tr, y_tr),(x_val, y_val)]
xgb_wrapper.fit(x_tr, y_tr, eval_metric='logloss', eval_set=evals)

preds = xgb_wrapper.predict(x_test)
pred_proba = xgb_wrapper.predict(x_test)[:,1]

[0]	validation_0-logloss:0.55271	validation_1-logloss:0.58669
[1]	validation_0-logloss:0.46532	validation_1-logloss:0.52479
[2]	validation_0-logloss:0.39616	validation_1-logloss:0.46923
[3]	validation_0-logloss:0.34165	validation_1-logloss:0.42858
[4]	validation_0-logloss:0.29745	validation_1-logloss:0.39483
[5]	validation_0-logloss:0.25934	validation_1-logloss:0.36657
[6]	validation_0-logloss:0.22862	validation_1-logloss:0.35072
[7]	validation_0-logloss:0.20367	validation_1-logloss:0.33159
[8]	validation_0-logloss:0.18239	validation_1-logloss:0.32347
[9]	validation_0-logloss:0.16291	validation_1-logloss:0.30890
[10]	validation_0-logloss:0.14780	validation_1-logloss:0.30568
[11]	validation_0-logloss:0.13390	validation_1-logloss:0.29906
[12]	validation_0-logloss:0.12276	validation_1-logloss:0.28876
[13]	validation_0-logloss:0.11289	validation_1-logloss:0.28343
[14]	validation_0-logloss:0.10346	validation_1-logloss:0.27987
[15]	validation_0-logloss:0.09554	validation_1-logloss:0.27622
[1

[19]	validation_0-logloss:0.07293	validation_1-logloss:0.26352
[20]	validation_0-logloss:0.06751	validation_1-logloss:0.26310
[21]	validation_0-logloss:0.06306	validation_1-logloss:0.25711
[22]	validation_0-logloss:0.05846	validation_1-logloss:0.25678
[23]	validation_0-logloss:0.05452	validation_1-logloss:0.25732
[24]	validation_0-logloss:0.05132	validation_1-logloss:0.25525
[25]	validation_0-logloss:0.04834	validation_1-logloss:0.25395
[26]	validation_0-logloss:0.04550	validation_1-logloss:0.25433
[27]	validation_0-logloss:0.04313	validation_1-logloss:0.25181
[28]	validation_0-logloss:0.04134	validation_1-logloss:0.25446
[29]	validation_0-logloss:0.03934	validation_1-logloss:0.25551
[30]	validation_0-logloss:0.03736	validation_1-logloss:0.25798
[31]	validation_0-logloss:0.03579	validation_1-logloss:0.25839
[32]	validation_0-logloss:0.03441	validation_1-logloss:0.25869
[33]	validation_0-logloss:0.03305	validation_1-logloss:0.26211
[34]	validation_0-logloss:0.03201	validation_1-logloss:



[70]	validation_0-logloss:0.01977	validation_1-logloss:0.25877
[71]	validation_0-logloss:0.01966	validation_1-logloss:0.26048
[72]	validation_0-logloss:0.01956	validation_1-logloss:0.26022
[73]	validation_0-logloss:0.01945	validation_1-logloss:0.25763
[74]	validation_0-logloss:0.01935	validation_1-logloss:0.25927
[75]	validation_0-logloss:0.01926	validation_1-logloss:0.25970
[76]	validation_0-logloss:0.01915	validation_1-logloss:0.25861
[77]	validation_0-logloss:0.01906	validation_1-logloss:0.25966
[78]	validation_0-logloss:0.01896	validation_1-logloss:0.25713
[79]	validation_0-logloss:0.01888	validation_1-logloss:0.25692
[80]	validation_0-logloss:0.01878	validation_1-logloss:0.25861
[81]	validation_0-logloss:0.01869	validation_1-logloss:0.25765
[82]	validation_0-logloss:0.01860	validation_1-logloss:0.25761
[83]	validation_0-logloss:0.01851	validation_1-logloss:0.25923
[84]	validation_0-logloss:0.01843	validation_1-logloss:0.25832
[85]	validation_0-logloss:0.01835	validation_1-logloss:

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed