### 모델 생성, 학습, 평가

### 학습 내용
 * 샘플 데이터 셋에 교차 검증용 KFold 분할을 적용하고, 그중 첫 번째 폴드로 학습/검증 데이터를 나누어 본다.
 * 샘플 데이터 셋을 활용하여 XGBoost 모델을 지정하여 학습과 평가를 수행해 본다.

### 데이터 및 라이브러리 불러오기

In [7]:
# ---------------------------------
# Setup: import the libraries used by the cells in this notebook
# ----------------------------------
import numpy as np
import pandas as pd
import xgboost as xgb

# Print the library versions
print(np.__version__)  
print(pd.__version__)  
print(xgb.__version__)  

1.23.5
1.4.4
1.7.5


In [2]:
# train_x: training features, train_y: target variable, test_x: test data
# Held as pandas DataFrame / Series (values are sometimes stored as numpy arrays instead)
train = pd.read_csv('../input/sample-data/train_preprocessed.csv')
train_x = train.drop(['target'], axis=1)
train_y = train['target']
test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv')

### 데이터 셋 나누기

In [3]:
from sklearn.model_selection import KFold

# Shuffled 4-fold splitter with a fixed seed so the split is reproducible
kf = KFold(n_splits=4, shuffle=True, random_state=71)
# Only the first fold is used as a hold-out split, so take it straight from
# the split generator instead of materializing all four folds in a list
tr_idx, va_idx = next(kf.split(train_x))

# Split the training data into a training part and a validation part
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

In [4]:
# -----------------------------------
# Example of a custom evaluation metric and a custom objective function in xgboost
# (Reference) https://github.com/dmlc/xgboost/blob/master/demo/guide-python/custom_objective.py
# -----------------------------------
import xgboost as xgb
from sklearn.metrics import log_loss

# Convert the features and target variable into xgboost's DMatrix structure
# Training features / target: tr_x, tr_y
# Validation features / target: va_x, va_y
dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)

In [8]:
# Hyperparameter settings
# On older xgboost versions, replace 'verbosity': 0 with 'silent': 1 before running.
# params = {'silent': 1, 'random_state': 71}
params = {'verbosity': 0, 'random_state': 71}   # written for xgboost 1.3.3 (NOTE(review): the version cell output shows 1.7.5)
num_round = 50
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

# Train the model with the custom objective (logregobj) and custom metric (evalerror).
# Both functions are defined in the reference-code cell at the end of this file,
# so that cell must be executed before this one.
# NOTE(review): newer xgboost releases may deprecate `feval` in favor of
# `custom_metric` -- confirm against the installed version.
bst = xgb.train(params, dtrain, num_round, watchlist, obj=logregobj, feval=evalerror)


[0]	train-rmse:0.40041	train-custom-error:0.16947	eval-rmse:0.42362	eval-custom-error:0.19080
[1]	train-rmse:0.70228	train-custom-error:0.11547	eval-rmse:0.72145	eval-custom-error:0.14920
[2]	train-rmse:0.98133	train-custom-error:0.10280	eval-rmse:0.99697	eval-custom-error:0.13520
[3]	train-rmse:1.22320	train-custom-error:0.09920	eval-rmse:1.23609	eval-custom-error:0.13680
[4]	train-rmse:1.43864	train-custom-error:0.09453	eval-rmse:1.44949	eval-custom-error:0.13720
[5]	train-rmse:1.63033	train-custom-error:0.08947	eval-rmse:1.63831	eval-custom-error:0.12920
[6]	train-rmse:1.79480	train-custom-error:0.08453	eval-rmse:1.80122	eval-custom-error:0.12920
[7]	train-rmse:1.94509	train-custom-error:0.07920	eval-rmse:1.94581	eval-custom-error:0.12640
[8]	train-rmse:2.06040	train-custom-error:0.07680	eval-rmse:2.06055	eval-custom-error:0.12840
[9]	train-rmse:2.16885	train-custom-error:0.07160	eval-rmse:2.16838	eval-custom-error:0.12400
[10]	train-rmse:2.27799	train-custom-error:0.06853	eval-rmse



[17]	train-rmse:2.81005	train-custom-error:0.04613	eval-rmse:2.78479	eval-custom-error:0.11320
[18]	train-rmse:2.87602	train-custom-error:0.04280	eval-rmse:2.84483	eval-custom-error:0.11040
[19]	train-rmse:2.93112	train-custom-error:0.04187	eval-rmse:2.89821	eval-custom-error:0.10800
[20]	train-rmse:2.98633	train-custom-error:0.03853	eval-rmse:2.95071	eval-custom-error:0.10600
[21]	train-rmse:3.03141	train-custom-error:0.03747	eval-rmse:2.99213	eval-custom-error:0.10520
[22]	train-rmse:3.08452	train-custom-error:0.03573	eval-rmse:3.04040	eval-custom-error:0.10320
[23]	train-rmse:3.14076	train-custom-error:0.03213	eval-rmse:3.09457	eval-custom-error:0.10000
[24]	train-rmse:3.19359	train-custom-error:0.03160	eval-rmse:3.14409	eval-custom-error:0.10160
[25]	train-rmse:3.22294	train-custom-error:0.03160	eval-rmse:3.17373	eval-custom-error:0.10200
[26]	train-rmse:3.25593	train-custom-error:0.02920	eval-rmse:3.20251	eval-custom-error:0.10160
[27]	train-rmse:3.31172	train-custom-error:0.02560

In [9]:
# Unlike training with the objective set to binary:logistic, the predictions
# here are values before conversion to probabilities, so convert them with a
# sigmoid before computing the log loss
pred_val = bst.predict(dvalid)
pred = 1.0 / (1.0 + np.exp(-pred_val))
logloss = log_loss(va_y, pred)
print(logloss)

0.22561102085065796


In [10]:
# (Reference) Training the usual way, with the built-in objective
params = {'verbosity': 0, 'random_state': 71, 'objective': 'binary:logistic'}   # xgboost 1.7.5 (23/06)

# Reuses dtrain, num_round and watchlist from the earlier cells; rebinds bst
bst = xgb.train(params, dtrain, num_round, watchlist)

# Predictions are passed to log_loss directly, without a manual sigmoid
pred = bst.predict(dvalid)
logloss = log_loss(va_y, pred)
print(logloss)

[0]	train-logloss:0.54088	eval-logloss:0.55003
[1]	train-logloss:0.45269	eval-logloss:0.47182
[2]	train-logloss:0.39482	eval-logloss:0.42026
[3]	train-logloss:0.35198	eval-logloss:0.38520
[4]	train-logloss:0.32021	eval-logloss:0.36150
[5]	train-logloss:0.29673	eval-logloss:0.34463
[6]	train-logloss:0.27610	eval-logloss:0.32900
[7]	train-logloss:0.25886	eval-logloss:0.31670
[8]	train-logloss:0.24363	eval-logloss:0.30775
[9]	train-logloss:0.23153	eval-logloss:0.30093
[10]	train-logloss:0.22016	eval-logloss:0.29413
[11]	train-logloss:0.20963	eval-logloss:0.28528
[12]	train-logloss:0.19951	eval-logloss:0.27912
[13]	train-logloss:0.19324	eval-logloss:0.27642
[14]	train-logloss:0.18547	eval-logloss:0.27154
[15]	train-logloss:0.17474	eval-logloss:0.26516
[16]	train-logloss:0.16900	eval-logloss:0.26089
[17]	train-logloss:0.16323	eval-logloss:0.25849
[18]	train-logloss:0.15950	eval-logloss:0.25691
[19]	train-logloss:0.15637	eval-logloss:0.25511
[20]	train-logloss:0.14722	eval-logloss:0.25035




[21]	train-logloss:0.14290	eval-logloss:0.24734
[22]	train-logloss:0.13782	eval-logloss:0.24612
[23]	train-logloss:0.13362	eval-logloss:0.24387
[24]	train-logloss:0.13047	eval-logloss:0.24251
[25]	train-logloss:0.12654	eval-logloss:0.24094
[26]	train-logloss:0.12268	eval-logloss:0.24005
[27]	train-logloss:0.11966	eval-logloss:0.23803
[28]	train-logloss:0.11506	eval-logloss:0.23699
[29]	train-logloss:0.11027	eval-logloss:0.23626
[30]	train-logloss:0.10827	eval-logloss:0.23621
[31]	train-logloss:0.10262	eval-logloss:0.23269
[32]	train-logloss:0.10062	eval-logloss:0.23212
[33]	train-logloss:0.09913	eval-logloss:0.23180
[34]	train-logloss:0.09582	eval-logloss:0.23184
[35]	train-logloss:0.09378	eval-logloss:0.22998
[36]	train-logloss:0.09243	eval-logloss:0.22980
[37]	train-logloss:0.08952	eval-logloss:0.22913
[38]	train-logloss:0.08732	eval-logloss:0.22870
[39]	train-logloss:0.08576	eval-logloss:0.22786
[40]	train-logloss:0.08340	eval-logloss:0.22857
[41]	train-logloss:0.08125	eval-logloss:

### 참조 코드 
 * 위 In [8] 셀의 xgb.train에서 obj, feval 인자로 사용되는 사용자 정의 함수(logregobj, evalerror)이므로, 해당 셀보다 먼저 실행해야 한다.

In [None]:
# Custom objective function (log loss here; equivalent to xgboost's 'binary:logistic')
def logregobj(preds, dtrain):
    """Return the gradient and Hessian of the log loss w.r.t. the raw scores.

    Args:
        preds: Array of raw (pre-sigmoid) prediction scores.
        dtrain: Object whose ``get_label()`` returns the true labels
            (the training ``DMatrix`` when passed as ``obj`` to ``xgb.train``).

    Returns:
        Tuple ``(grad, hess)`` with one entry per prediction.
    """
    labels = dtrain.get_label()           # true labels
    preds = np.asarray(preds)
    # Numerically stable sigmoid: only a non-positive number is ever
    # exponentiated, so large-magnitude scores cannot overflow np.exp
    # (the naive 1 / (1 + exp(-x)) overflows for strongly negative x).
    decay = np.exp(-np.abs(preds))
    probs = np.where(preds >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    grad = probs - labels                 # first derivative of the log loss
    hess = probs * (1.0 - probs)          # second derivative (sigmoid derivative)
    return grad, hess

# Custom evaluation metric (error rate here)
def evalerror(preds, dtrain):
    """Return the fraction of predictions whose class disagrees with the label.

    Args:
        preds: Array of prediction scores; a score above 0.0 counts as the
            positive class.
        dtrain: Object whose ``get_label()`` returns the true labels.

    Returns:
        Tuple ``('custom-error', error_rate)`` where ``error_rate`` is a float.

    Raises:
        ZeroDivisionError: If there are no labels.
    """
    labels = dtrain.get_label()           # true labels
    # Count mismatches in one vectorized pass instead of summing the boolean
    # array element by element with the builtin sum()
    mismatches = np.count_nonzero(labels != (preds > 0.0))
    return 'custom-error', float(mismatches) / len(labels)