# GBDT(勾配ブースティング木)

GBDTによる学習は以下のように行われ、それぞれの決定木の分岐および葉のウェイトが定められる。
  - 目的変数と予測値から計算される目的関数を改善するように、決定木を作成してモデルに追加する。
  - 上記の処理をハイパーパラメータで定めた決定木の本数の分だけ繰り返す。

## 必要なライブラリのインポート

In [29]:
import numpy as np
import pandas as pd

# GBDT(勾配ブースティング木)を構成するライブラリ
import xgboost as xgb
import lightgbm as lgb

from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

## データセットの取得

データセットはこちらのURLにアップロードされているものを使用\
https://github.com/ghmagazine/kagglebook/tree/master/input/sample-data

In [30]:
# Load the preprocessed training and test sets into DataFrames.
# `train` still contains the `target` column at this point; the split into
# train_x (features), train_y (target) and test_x is done in a later cell.
train = pd.read_csv('train_preprocessed.csv')
test = pd.read_csv('test_preprocessed.csv')

In [31]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,age,sex,height,weight,product,amount,medical_info_a1,medical_info_a2,medical_info_a3,medical_info_b1,...,medical_keyword_6,medical_keyword_7,medical_keyword_8,medical_keyword_9,medical_keyword_10,year,month,day,yearmonth,target
0,50,1,166.445608,65.016732,9,7000000,134,202,1,11,...,1,0,1,0,0,2015,2,3,24182,0
1,68,0,164.334615,56.544217,0,7000000,438,263,3,14,...,0,1,1,0,0,2015,5,9,24185,0
2,77,1,167.462917,54.242267,2,6000000,313,325,1,18,...,1,0,1,0,0,2016,2,13,24194,1
3,17,1,177.097725,71.147762,3,8000000,342,213,2,11,...,0,0,1,0,0,2015,7,6,24187,0
4,62,0,158.165788,65.240697,1,9000000,327,102,0,14,...,0,1,1,1,0,2016,9,17,24201,1


In [32]:
# Check the (rows, columns) dimensions of the training data.
train.shape

(10000, 29)

In [33]:
# Show the feature columns only, i.e. everything except the `target` column.
# The result is only displayed; `train` itself is left unchanged.
train.drop(columns=['target'])

Unnamed: 0,age,sex,height,weight,product,amount,medical_info_a1,medical_info_a2,medical_info_a3,medical_info_b1,...,medical_keyword_5,medical_keyword_6,medical_keyword_7,medical_keyword_8,medical_keyword_9,medical_keyword_10,year,month,day,yearmonth
0,50,1,166.445608,65.016732,9,7000000,134,202,1,11,...,0,1,0,1,0,0,2015,2,3,24182
1,68,0,164.334615,56.544217,0,7000000,438,263,3,14,...,0,0,1,1,0,0,2015,5,9,24185
2,77,1,167.462917,54.242267,2,6000000,313,325,1,18,...,0,1,0,1,0,0,2016,2,13,24194
3,17,1,177.097725,71.147762,3,8000000,342,213,2,11,...,0,0,0,1,0,0,2015,7,6,24187
4,62,0,158.165788,65.240697,1,9000000,327,102,0,14,...,0,0,1,1,1,0,2016,9,17,24201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,61,1,182.729800,73.393777,1,2000000,189,232,7,17,...,0,0,0,1,1,0,2015,10,21,24190
9996,33,0,167.701136,75.006529,8,9000,426,202,3,19,...,0,0,0,1,1,0,2015,5,28,24185
9997,44,0,145.609998,47.739397,8,1000,370,274,1,11,...,0,0,0,1,0,1,2016,2,29,24194
9998,34,0,165.796017,57.567695,6,5000,291,105,1,13,...,0,1,1,1,1,0,2016,2,27,24194


In [34]:
# First five rows restricted to the first 28 columns (positional slice).
train.iloc[:5, :28]

Unnamed: 0,age,sex,height,weight,product,amount,medical_info_a1,medical_info_a2,medical_info_a3,medical_info_b1,...,medical_keyword_5,medical_keyword_6,medical_keyword_7,medical_keyword_8,medical_keyword_9,medical_keyword_10,year,month,day,yearmonth
0,50,1,166.445608,65.016732,9,7000000,134,202,1,11,...,0,1,0,1,0,0,2015,2,3,24182
1,68,0,164.334615,56.544217,0,7000000,438,263,3,14,...,0,0,1,1,0,0,2015,5,9,24185
2,77,1,167.462917,54.242267,2,6000000,313,325,1,18,...,0,1,0,1,0,0,2016,2,13,24194
3,17,1,177.097725,71.147762,3,8000000,342,213,2,11,...,0,0,0,1,0,0,2015,7,6,24187
4,62,0,158.165788,65.240697,1,9000000,327,102,0,14,...,0,0,1,1,1,0,2016,9,17,24201


In [35]:
# Preview the first rows of the test data (it has no `target` column).
test.head()

Unnamed: 0,age,sex,height,weight,product,amount,medical_info_a1,medical_info_a2,medical_info_a3,medical_info_b1,...,medical_keyword_5,medical_keyword_6,medical_keyword_7,medical_keyword_8,medical_keyword_9,medical_keyword_10,year,month,day,yearmonth
0,49,1,187.431987,81.008363,1,1000000,302,212,1,10,...,0,1,0,1,0,0,2016,12,6,24204
1,79,1,171.63263,71.067812,6,2000,197,469,0,14,...,0,0,0,0,1,1,2016,9,3,24201
2,78,0,163.543983,64.032098,0,4000000,247,225,2,17,...,0,1,0,1,0,0,2015,4,10,24184
3,26,1,150.391858,52.32291,2,1000000,108,228,0,15,...,0,0,1,0,0,0,2016,4,17,24196
4,14,1,165.835167,67.008154,2,4000000,181,90,2,11,...,0,0,0,1,0,0,2015,1,26,24181


In [36]:
# Select rows by a list of positional indices with iloc.
# The KFold split further down selects rows the same way.
list_idx = [0, 1, 2]
train.iloc[list_idx]

Unnamed: 0,age,sex,height,weight,product,amount,medical_info_a1,medical_info_a2,medical_info_a3,medical_info_b1,...,medical_keyword_6,medical_keyword_7,medical_keyword_8,medical_keyword_9,medical_keyword_10,year,month,day,yearmonth,target
0,50,1,166.445608,65.016732,9,7000000,134,202,1,11,...,1,0,1,0,0,2015,2,3,24182,0
1,68,0,164.334615,56.544217,0,7000000,438,263,3,14,...,0,1,1,0,0,2015,5,9,24185,0
2,77,1,167.462917,54.242267,2,6000000,313,325,1,18,...,1,0,1,0,0,2016,2,13,24194,1


In [37]:
# Separate the features from the target.
# Drop `target` by name instead of relying on it sitting after the first 28
# columns, so the split stays correct if columns are added, removed or reordered.
train_x = train.drop(columns=['target'])
train_y = train['target']
# Work on a copy so later changes to test_x do not touch the loaded `test` frame.
test_x = test.copy()

In [38]:
# Hold-out split: take only the first fold of a shuffled 4-fold KFold and use it
# to divide the training data into a training part and a validation part.
kf = KFold(n_splits=4, shuffle=True, random_state=71)
# Positional row indices assigned to the training part and the validation part.
tr_idx, va_idx = next(kf.split(train_x))
tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]

In [39]:
# Inspect the (training indices, validation indices) pair of the first fold.
list(kf.split(train_x))[0]

(array([   0,    1,    2, ..., 9996, 9998, 9999]),
 array([   6,   10,   11, ..., 9994, 9995, 9997]))

## xgboostライブラリを使用

In [40]:
# Convert the features and the target into xgboost's DMatrix data structure.
# The test matrix is built without a label.
dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)
dtest = xgb.DMatrix(test_x)

In [50]:
# Monitor logloss on the evaluation sets and stop early once the validation
# logloss has not improved for 20 consecutive rounds.
# NOTE(review): `silent` is deprecated in newer XGBoost releases in favour of
# `verbosity` — confirm against the installed version.
params = {
    'objective': 'binary:logistic',
    'silent': 1,
    'random_state': 71,
    'eval_metric': 'logloss',
}
num_round = 500

# Run the training.
# The validation data is passed to the model as well, so that the change in
# score can be monitored as training progresses.
# watchlist holds the training data and the validation data, each paired with
# the name shown in the training log.
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=20,
)

[0]	train-logloss:0.540877	eval-logloss:0.550038
Multiple eval metrics have been passed: 'eval-logloss' will be used for early stopping.

Will train until eval-logloss hasn't improved in 20 rounds.
[1]	train-logloss:0.452701	eval-logloss:0.471823
[2]	train-logloss:0.394812	eval-logloss:0.420262
[3]	train-logloss:0.351976	eval-logloss:0.385207
[4]	train-logloss:0.320214	eval-logloss:0.361498
[5]	train-logloss:0.296737	eval-logloss:0.34463
[6]	train-logloss:0.27611	eval-logloss:0.329004
[7]	train-logloss:0.258854	eval-logloss:0.316698
[8]	train-logloss:0.243628	eval-logloss:0.30775
[9]	train-logloss:0.231526	eval-logloss:0.300926
[10]	train-logloss:0.220166	eval-logloss:0.294131
[11]	train-logloss:0.209626	eval-logloss:0.28528
[12]	train-logloss:0.199507	eval-logloss:0.279123
[13]	train-logloss:0.193238	eval-logloss:0.276415
[14]	train-logloss:0.185473	eval-logloss:0.271543
[15]	train-logloss:0.174737	eval-logloss:0.265163
[16]	train-logloss:0.168997	eval-logloss:0.260891
[17]	train-logl

In [51]:
# Check the score on the validation data.
# NOTE(review): the model was trained with early stopping; whether predict()
# uses the best iteration or every boosted round depends on the XGBoost
# version — confirm against the installed version.
va_pred = model.predict(dvalid)
score = log_loss(va_y, va_pred)
print(f'logloss: {score:.4f}')

logloss: 0.2196


In [52]:
# Predict on the test data (outputs the probability of class 1 rather than a
# binary label).
pred = model.predict(dtest)

In [53]:
# Display the xgboost predictions for the test data.
pred

array([2.3402779e-01, 1.6670238e-02, 1.0728571e-03, ..., 9.1822338e-01,
       3.6742925e-04, 2.2936371e-01], dtype=float32)

## lightgbmライブラリを使用

In [54]:
# Wrap the features and the target in lightgbm's Dataset structure,
# one Dataset for the training part and one for the validation part.
lgb_train = lgb.Dataset(tr_x, label=tr_y)
lgb_eval = lgb.Dataset(va_x, label=va_y)

In [55]:
# Hyperparameter settings: binary objective, monitored with binary logloss.
params2 = {'objective': 'binary', 'seed': 71, 'verbose': 0, 'metrics': 'binary_logloss'}
# Upper bound on the number of boosting rounds (passed as num_boost_round).
num_round2 = 500

In [63]:
# Run the training.
# NOTE: no categorical_feature argument is passed in this call.
# The validation data is passed to the model as well, so that the change in
# score can be monitored as training progresses; training stops early when the
# validation score has not improved for 20 rounds.
model2 = lgb.train(
    params2,
    lgb_train,
    num_boost_round=num_round2,
    valid_sets=[lgb_train, lgb_eval],
    valid_names=['train', 'valid'],
    early_stopping_rounds=20,
)

[1]	train's binary_logloss: 0.454308	valid's binary_logloss: 0.465515
Training until validation scores don't improve for 20 rounds
[2]	train's binary_logloss: 0.429565	valid's binary_logloss: 0.443444
[3]	train's binary_logloss: 0.410077	valid's binary_logloss: 0.425543
[4]	train's binary_logloss: 0.39358	valid's binary_logloss: 0.410625
[5]	train's binary_logloss: 0.379354	valid's binary_logloss: 0.397666
[6]	train's binary_logloss: 0.365913	valid's binary_logloss: 0.387422
[7]	train's binary_logloss: 0.354309	valid's binary_logloss: 0.376037
[8]	train's binary_logloss: 0.344354	valid's binary_logloss: 0.366734
[9]	train's binary_logloss: 0.334834	valid's binary_logloss: 0.35898
[10]	train's binary_logloss: 0.326209	valid's binary_logloss: 0.351612
[11]	train's binary_logloss: 0.317809	valid's binary_logloss: 0.34563
[12]	train's binary_logloss: 0.310845	valid's binary_logloss: 0.340564
[13]	train's binary_logloss: 0.30401	valid's binary_logloss: 0.334274
[14]	train's binary_logloss: 

[136]	train's binary_logloss: 0.0714022	valid's binary_logloss: 0.211088
[137]	train's binary_logloss: 0.0708807	valid's binary_logloss: 0.211445
[138]	train's binary_logloss: 0.0700744	valid's binary_logloss: 0.210563
[139]	train's binary_logloss: 0.0695329	valid's binary_logloss: 0.210665
[140]	train's binary_logloss: 0.0690487	valid's binary_logloss: 0.210766
[141]	train's binary_logloss: 0.0684624	valid's binary_logloss: 0.210647
[142]	train's binary_logloss: 0.0679562	valid's binary_logloss: 0.210728
[143]	train's binary_logloss: 0.067303	valid's binary_logloss: 0.210724
[144]	train's binary_logloss: 0.0667676	valid's binary_logloss: 0.211084
[145]	train's binary_logloss: 0.0662361	valid's binary_logloss: 0.211097
[146]	train's binary_logloss: 0.065744	valid's binary_logloss: 0.211218
[147]	train's binary_logloss: 0.0650035	valid's binary_logloss: 0.21058
[148]	train's binary_logloss: 0.0644377	valid's binary_logloss: 0.21052
[149]	train's binary_logloss: 0.0639	valid's binary_log

In [64]:
# Check the score of the lightgbm model on the validation data.
va_pred2 = model2.predict(va_x)
score2 = log_loss(va_y, va_pred2)
print(f'logloss: {score2:.4f}')

logloss: 0.2089


In [65]:
# Predict on the test data with the lightgbm model.
pred2 = model2.predict(test_x)

In [66]:
# Display the lightgbm predictions for the test data.
pred2

array([8.12544115e-02, 4.79846791e-02, 2.90712975e-03, ...,
       8.56305912e-01, 6.00376883e-04, 3.65623218e-01])

### 二つのGBDTライブラリの精度があまり変わらない場合は速度が速い方のライブラリを使用する

In [None]:
今回のバリデーションデータではlightgbmの方がloglossが小さい（lightgbm: 0.2089、xgboost: 0.2196）ため、lightgbmを使用する。