#1. 데이터 읽기

In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

In [None]:
df_data = pd.DataFrame(cancer.data, columns = cancer.feature_names)
df_labels = pd.DataFrame(cancer.target)
df_data.head(2)

#2. 모델 만들기

##2.1 데이터 분류

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_data, df_labels, test_size=0.3, random_state=62)
print(X_train.shape, X_test.shape)

(398, 30) (171, 30)


##2.2 기본 모델 만들기

In [8]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

xgb = XGBClassifier()
xgb.fit(X_train, y_train.values.reshape(-1))
pred = xgb.predict(X_test)

print('정확도 : ', accuracy_score(y_test, pred))

정확도 :  0.9415204678362573


##2.3 옵션 적용

In [17]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# 검증을 위한 데이터 셋
evals = [(X_test, y_test)]

# n_estimators는 생성할 트리의 개수로 높을 수록 성능이 향상되지만 시간이 오래 걸림
# learning_rate는 학습을 진행할 때마다 적용되는 학습률로 작을수록 오류를 많이 검출할 수 있지만 시간이 오래 걸림
# max_depth는 트리의 최대 깊이, 보통 3~10정도 설정함
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)

# early_stopping_rounds는 n_estimators가 진행되는 동안 학습 오류가 지정된 횟수 만큼 감소되지 않으면 정지시킴
# eval_set검증 세트를 지정함
# eval_metric은 검증 함수를 지정함.
# verbose는 동작 결과를 확인함
xgb.fit(X_train, y_train.values.reshape(-1), early_stopping_rounds=100, eval_set=evals, eval_metric='logloss', verbose=True)
pred = xgb.predict(X_test)

accuracy_score(y_test, pred)

[0]	validation_0-logloss:0.620074
Will train until validation_0-logloss hasn't improved in 100 rounds.
[1]	validation_0-logloss:0.562113
[2]	validation_0-logloss:0.513277
[3]	validation_0-logloss:0.471775
[4]	validation_0-logloss:0.436905
[5]	validation_0-logloss:0.407757
[6]	validation_0-logloss:0.381916
[7]	validation_0-logloss:0.36067
[8]	validation_0-logloss:0.341639
[9]	validation_0-logloss:0.323146
[10]	validation_0-logloss:0.307869
[11]	validation_0-logloss:0.295114
[12]	validation_0-logloss:0.283002
[13]	validation_0-logloss:0.272086
[14]	validation_0-logloss:0.263828
[15]	validation_0-logloss:0.253032
[16]	validation_0-logloss:0.243995
[17]	validation_0-logloss:0.236733
[18]	validation_0-logloss:0.228323
[19]	validation_0-logloss:0.224546
[20]	validation_0-logloss:0.22032
[21]	validation_0-logloss:0.215285
[22]	validation_0-logloss:0.209625
[23]	validation_0-logloss:0.205906
[24]	validation_0-logloss:0.20526
[25]	validation_0-logloss:0.202863
[26]	validation_0-logloss:0.202433

  y = column_or_1d(y, warn=True)


[37]	validation_0-logloss:0.195531
[38]	validation_0-logloss:0.194515
[39]	validation_0-logloss:0.194884
[40]	validation_0-logloss:0.195307
[41]	validation_0-logloss:0.196374
[42]	validation_0-logloss:0.19611
[43]	validation_0-logloss:0.196598
[44]	validation_0-logloss:0.197133
[45]	validation_0-logloss:0.196711
[46]	validation_0-logloss:0.197179
[47]	validation_0-logloss:0.197804
[48]	validation_0-logloss:0.198023
[49]	validation_0-logloss:0.198345
[50]	validation_0-logloss:0.198867
[51]	validation_0-logloss:0.20034
[52]	validation_0-logloss:0.200916
[53]	validation_0-logloss:0.200997
[54]	validation_0-logloss:0.200619
[55]	validation_0-logloss:0.202142
[56]	validation_0-logloss:0.202374
[57]	validation_0-logloss:0.203034
[58]	validation_0-logloss:0.203128
[59]	validation_0-logloss:0.203168
[60]	validation_0-logloss:0.203298
[61]	validation_0-logloss:0.203186
[62]	validation_0-logloss:0.203422
[63]	validation_0-logloss:0.204757
[64]	validation_0-logloss:0.205343
[65]	validation_0-logl

0.9298245614035088

#3. 하이퍼 파라미터

In [13]:
from sklearn.model_selection import GridSearchCV

xgb = XGBClassifier()
evals = [(X_test, y_test)]

def get_best_hyper_paramerter():
  params =  {
     'n_estimators':n_estimators_list, 
     'learning_rate':learning_rate_list,
     'max_depth':max_depth_list,
     'subsample':subsample_list
  }

  grid_cv = GridSearchCV(xgb, param_grid=params, scoring='accuracy', cv=2)
  grid_cv.fit(X_train, y_train.values.reshape(-1), early_stopping_rounds=50, eval_set=evals, eval_metric='logloss')
  print('Best Score: ', grid_cv.best_score_)
  print('Best Hyper Parameter: ', grid_cv.best_params_)

In [14]:
n_estimators_list = range(100, 500, 100)
learning_rate_list = [0.1, 0.4, 0.7,1]
max_depth_list = range(3, 9, 3)
subsample_list = [0.1, 0.4, 0.7,1]

get_best_hyper_paramerter()

[0]	validation_0-logloss:0.642224
Will train until validation_0-logloss hasn't improved in 50 rounds.
[1]	validation_0-logloss:0.599763
[2]	validation_0-logloss:0.556612
[3]	validation_0-logloss:0.523392
[4]	validation_0-logloss:0.496886
[5]	validation_0-logloss:0.470913
[6]	validation_0-logloss:0.452196
[7]	validation_0-logloss:0.428381
[8]	validation_0-logloss:0.416068
[9]	validation_0-logloss:0.40312
[10]	validation_0-logloss:0.383101
[11]	validation_0-logloss:0.381725
[12]	validation_0-logloss:0.366985
[13]	validation_0-logloss:0.355311
[14]	validation_0-logloss:0.348409
[15]	validation_0-logloss:0.335367
[16]	validation_0-logloss:0.3321
[17]	validation_0-logloss:0.321518
[18]	validation_0-logloss:0.316434
[19]	validation_0-logloss:0.305147
[20]	validation_0-logloss:0.300938
[21]	validation_0-logloss:0.290628
[22]	validation_0-logloss:0.286128
[23]	validation_0-logloss:0.276323
[24]	validation_0-logloss:0.276443
[25]	validation_0-logloss:0.271099
[26]	validation_0-logloss:0.271222


  y = column_or_1d(y, warn=True)


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[9]	validation_0-logloss:0.199853
[10]	validation_0-logloss:0.19456
[11]	validation_0-logloss:0.193758
[12]	validation_0-logloss:0.196353
[13]	validation_0-logloss:0.195141
[14]	validation_0-logloss:0.195168
[15]	validation_0-logloss:0.197577
[16]	validation_0-logloss:0.19576
[17]	validation_0-logloss:0.193375
[18]	validation_0-logloss:0.198658
[19]	validation_0-logloss:0.19931
[20]	validation_0-logloss:0.202763
[21]	validation_0-logloss:0.201659
[22]	validation_0-logloss:0.201341
[23]	validation_0-logloss:0.199971
[24]	validation_0-logloss:0.197991
[25]	validation_0-logloss:0.198054
[26]	validation_0-logloss:0.196963
[27]	validation_0-logloss:0.195832
[28]	validation_0-logloss:0.195434
[29]	validation_0-logloss:0.196511
[30]	validation_0-logloss:0.197369
[31]	validation_0-logloss:0.197237
[32]	validation_0-logloss:0.196304
[33]	validation_0-logloss:0.199743
[34]	validation_0-logloss:0.197369
[35]	validation_0-logloss:0.198461
[36]	vali

In [15]:
n_estimators_list = range(150, 300, 50)
learning_rate_list = [0.05, 0.1, 0.15]
max_depth_list = range(3, 5, 1)
subsample_list = [0.5, 0.6, 0.7, 0.8]

get_best_hyper_paramerter()

[0]	validation_0-logloss:0.659933
Will train until validation_0-logloss hasn't improved in 50 rounds.
[1]	validation_0-logloss:0.628899
[2]	validation_0-logloss:0.60121
[3]	validation_0-logloss:0.577375
[4]	validation_0-logloss:0.554797
[5]	validation_0-logloss:0.533477
[6]	validation_0-logloss:0.514403
[7]	validation_0-logloss:0.496094
[8]	validation_0-logloss:0.480354
[9]	validation_0-logloss:0.463148
[10]	validation_0-logloss:0.447723
[11]	validation_0-logloss:0.43413
[12]	validation_0-logloss:0.4216
[13]	validation_0-logloss:0.410181
[14]	validation_0-logloss:0.398876
[15]	validation_0-logloss:0.388849
[16]	validation_0-logloss:0.379454
[17]	validation_0-logloss:0.371891
[18]	validation_0-logloss:0.363418
[19]	validation_0-logloss:0.353637
[20]	validation_0-logloss:0.346475
[21]	validation_0-logloss:0.338805
[22]	validation_0-logloss:0.333803
[23]	validation_0-logloss:0.325903
[24]	validation_0-logloss:0.317318
[25]	validation_0-logloss:0.312239
[26]	validation_0-logloss:0.305103
[

  y = column_or_1d(y, warn=True)


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[179]	validation_0-logloss:0.170457
[180]	validation_0-logloss:0.169767
[181]	validation_0-logloss:0.169844
[182]	validation_0-logloss:0.169564
[183]	validation_0-logloss:0.169715
[184]	validation_0-logloss:0.170334
[185]	validation_0-logloss:0.17006
[186]	validation_0-logloss:0.170321
[187]	validation_0-logloss:0.169928
[188]	validation_0-logloss:0.169603
[189]	validation_0-logloss:0.169589
[190]	validation_0-logloss:0.169335
[191]	validation_0-logloss:0.16942
[192]	validation_0-logloss:0.16929
[193]	validation_0-logloss:0.16924
[194]	validation_0-logloss:0.169338
[195]	validation_0-logloss:0.168982
[196]	validation_0-logloss:0.169223
[197]	validation_0-logloss:0.168873
[198]	validation_0-logloss:0.16908
[199]	validation_0-logloss:0.168826
[200]	validation_0-logloss:0.168898
[201]	validation_0-logloss:0.169166
[202]	validation_0-logloss:0.169172
[203]	validation_0-logloss:0.168853
[204]	validation_0-logloss:0.169638
[205]	validation_0-

In [16]:
n_estimators_list = [250]
learning_rate_list = [0.03, 0.04, 0.05, 0.06, 0.7]
max_depth_list = [3]
subsample_list = [0.6]

get_best_hyper_paramerter()

[0]	validation_0-logloss:0.672608
Will train until validation_0-logloss hasn't improved in 50 rounds.
[1]	validation_0-logloss:0.652799
[2]	validation_0-logloss:0.634266
[3]	validation_0-logloss:0.617467
[4]	validation_0-logloss:0.600813
[5]	validation_0-logloss:0.585589
[6]	validation_0-logloss:0.57142
[7]	validation_0-logloss:0.557272
[8]	validation_0-logloss:0.543956
[9]	validation_0-logloss:0.531381
[10]	validation_0-logloss:0.519118
[11]	validation_0-logloss:0.507695
[12]	validation_0-logloss:0.496626
[13]	validation_0-logloss:0.487246
[14]	validation_0-logloss:0.476832
[15]	validation_0-logloss:0.467575
[16]	validation_0-logloss:0.458248
[17]	validation_0-logloss:0.4503
[18]	validation_0-logloss:0.44138
[19]	validation_0-logloss:0.431639
[20]	validation_0-logloss:0.423977
[21]	validation_0-logloss:0.416957
[22]	validation_0-logloss:0.410004
[23]	validation_0-logloss:0.403076
[24]	validation_0-logloss:0.39678
[25]	validation_0-logloss:0.390695
[26]	validation_0-logloss:0.385126
[2

  y = column_or_1d(y, warn=True)


[47]	validation_0-logloss:0.288038
[48]	validation_0-logloss:0.284181
[49]	validation_0-logloss:0.281661
[50]	validation_0-logloss:0.278482
[51]	validation_0-logloss:0.275363
[52]	validation_0-logloss:0.272292
[53]	validation_0-logloss:0.27047
[54]	validation_0-logloss:0.268548
[55]	validation_0-logloss:0.266182
[56]	validation_0-logloss:0.264348
[57]	validation_0-logloss:0.262093
[58]	validation_0-logloss:0.260579
[59]	validation_0-logloss:0.257821
[60]	validation_0-logloss:0.256173
[61]	validation_0-logloss:0.253701
[62]	validation_0-logloss:0.252254
[63]	validation_0-logloss:0.249579
[64]	validation_0-logloss:0.248919
[65]	validation_0-logloss:0.246453
[66]	validation_0-logloss:0.244758
[67]	validation_0-logloss:0.242723
[68]	validation_0-logloss:0.240894
[69]	validation_0-logloss:0.239108
[70]	validation_0-logloss:0.237967
[71]	validation_0-logloss:0.236904
[72]	validation_0-logloss:0.236082
[73]	validation_0-logloss:0.23513
[74]	validation_0-logloss:0.233397
[75]	validation_0-logl

#4. 하이퍼 파라미터 적용

In [18]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

evals = [(X_test, y_test)]
xgb = XGBClassifier(n_estimators=250, learning_rate=0.05, max_depth=3, subsample= 0.6)

xgb.fit(X_train, y_train.values.reshape(-1), early_stopping_rounds=100, eval_set=evals, eval_metric='logloss', verbose=True)
pred = xgb.predict(X_test)

accuracy_score(y_test, pred)

[0]	validation_0-logloss:0.65749
Will train until validation_0-logloss hasn't improved in 100 rounds.
[1]	validation_0-logloss:0.624492
[2]	validation_0-logloss:0.596138
[3]	validation_0-logloss:0.569356
[4]	validation_0-logloss:0.544033
[5]	validation_0-logloss:0.521017
[6]	validation_0-logloss:0.500194
[7]	validation_0-logloss:0.48196
[8]	validation_0-logloss:0.464693
[9]	validation_0-logloss:0.448295
[10]	validation_0-logloss:0.432536
[11]	validation_0-logloss:0.418143
[12]	validation_0-logloss:0.405139
[13]	validation_0-logloss:0.391089
[14]	validation_0-logloss:0.377231
[15]	validation_0-logloss:0.366129
[16]	validation_0-logloss:0.356593
[17]	validation_0-logloss:0.346849
[18]	validation_0-logloss:0.336792
[19]	validation_0-logloss:0.328108
[20]	validation_0-logloss:0.32107
[21]	validation_0-logloss:0.314772
[22]	validation_0-logloss:0.307943
[23]	validation_0-logloss:0.302048
[24]	validation_0-logloss:0.295589
[25]	validation_0-logloss:0.288883
[26]	validation_0-logloss:0.281564

  y = column_or_1d(y, warn=True)


[42]	validation_0-logloss:0.220504
[43]	validation_0-logloss:0.21872
[44]	validation_0-logloss:0.217825
[45]	validation_0-logloss:0.216148
[46]	validation_0-logloss:0.214603
[47]	validation_0-logloss:0.212972
[48]	validation_0-logloss:0.210456
[49]	validation_0-logloss:0.208768
[50]	validation_0-logloss:0.206856
[51]	validation_0-logloss:0.206004
[52]	validation_0-logloss:0.203362
[53]	validation_0-logloss:0.201203
[54]	validation_0-logloss:0.200037
[55]	validation_0-logloss:0.198128
[56]	validation_0-logloss:0.196472
[57]	validation_0-logloss:0.195209
[58]	validation_0-logloss:0.194002
[59]	validation_0-logloss:0.192807
[60]	validation_0-logloss:0.192153
[61]	validation_0-logloss:0.191867
[62]	validation_0-logloss:0.190337
[63]	validation_0-logloss:0.189545
[64]	validation_0-logloss:0.188355
[65]	validation_0-logloss:0.187514
[66]	validation_0-logloss:0.186729
[67]	validation_0-logloss:0.185868
[68]	validation_0-logloss:0.185272
[69]	validation_0-logloss:0.185286
[70]	validation_0-log

0.9473684210526315