# 分析モデルの永続化

In [1]:
import pickle
import os

import numpy as np
import pandas as pd

# GBDT(勾配ブースティング木)を構成するライブラリ
import xgboost as xgb
import lightgbm as lgb

from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


## 永続化した分析モデルを保管するディレクトリの作成

In [2]:
# Directory that holds the pickled model (relative to the working directory).
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of os.path.exists() followed by os.makedirs().
dest = 'pkl_objects'
os.makedirs(dest, exist_ok=True)

## 分析モデルの作成

In [3]:
# Load the preprocessed training and test tables from the working directory.
# The split into train_x / train_y / test_x happens in a later cell.
train = pd.read_csv('train_preprocessed.csv')
test = pd.read_csv('test_preprocessed.csv')

In [4]:
# Preview the first rows to sanity-check the loaded columns.
train.head()

Unnamed: 0,age,sex,height,weight,product,amount,medical_info_a1,medical_info_a2,medical_info_a3,medical_info_b1,...,medical_keyword_6,medical_keyword_7,medical_keyword_8,medical_keyword_9,medical_keyword_10,year,month,day,yearmonth,target
0,50,1,166.445608,65.016732,9,7000000,134,202,1,11,...,1,0,1,0,0,2015,2,3,24182,0
1,68,0,164.334615,56.544217,0,7000000,438,263,3,14,...,0,1,1,0,0,2015,5,9,24185,0
2,77,1,167.462917,54.242267,2,6000000,313,325,1,18,...,1,0,1,0,0,2016,2,13,24194,1
3,17,1,177.097725,71.147762,3,8000000,342,213,2,11,...,0,0,1,0,0,2015,7,6,24187,0
4,62,0,158.165788,65.240697,1,9000000,327,102,0,14,...,0,1,1,1,0,2016,9,17,24201,1


In [5]:
# Features are the first 28 columns by position; the target is selected by name.
# NOTE(review): the head() preview lists 'Unnamed: 0' as the first column, so
# this positional slice passes it on as a feature. It looks like a saved row
# index -- confirm whether that is intended.
# NOTE(review): the slice assumes 'target' sits at position 28 or later --
# verify against the CSV's column order.
train_x = train.iloc[:, :28]
train_y = train['target']
# Work on a copy so later changes to test_x do not touch `test`.
test_x = test.copy()

In [6]:
# Hold-out split: build a shuffled 4-fold KFold and keep only its first fold,
# which yields one training part and one validation part.
kf = KFold(n_splits=4, shuffle=True, random_state=71)
# Row positions assigned to the training part and to the validation part.
tr_idx, va_idx = next(iter(kf.split(train_x)))
tr_x = train_x.iloc[tr_idx]
va_x = train_x.iloc[va_idx]
tr_y = train_y.iloc[tr_idx]
va_y = train_y.iloc[va_idx]

In [7]:
# Wrap each (features, labels) pair in LightGBM's Dataset container.
lgb_train = lgb.Dataset(data=tr_x, label=tr_y)
lgb_eval = lgb.Dataset(data=va_x, label=va_y)

In [8]:
# Training configuration: binary objective evaluated with binary log loss,
# and an upper bound of 500 boosting rounds.
params = {
    'objective': 'binary',
    'metrics': 'binary_logloss',
}
num_round = 500

In [9]:
# Train the model.
# Both the training fold and the validation fold are passed as valid_sets so
# the log loss on each can be monitored as boosting proceeds.
# No categorical features are passed to LightGBM at the moment; the list below
# is kept only as a placeholder.
categorical_features = []

# num_boost_round: upper bound on the number of boosting iterations.
# Early stopping is requested through the lgb.early_stopping callback instead
# of the early_stopping_rounds keyword: the keyword was removed from
# lgb.train() in LightGBM 4.0, while the callback is accepted by older and
# newer releases alike. Training stops once the validation score has not
# improved for 10 consecutive rounds.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=num_round,
    valid_names=['train', 'valid'],
    valid_sets=[lgb_train, lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

[1]	train's binary_logloss: 0.454308	valid's binary_logloss: 0.465515
Training until validation scores don't improve for 10 rounds
[2]	train's binary_logloss: 0.429565	valid's binary_logloss: 0.443444
[3]	train's binary_logloss: 0.410077	valid's binary_logloss: 0.425543
[4]	train's binary_logloss: 0.39358	valid's binary_logloss: 0.410625
[5]	train's binary_logloss: 0.379354	valid's binary_logloss: 0.397666
[6]	train's binary_logloss: 0.365913	valid's binary_logloss: 0.387422
[7]	train's binary_logloss: 0.354309	valid's binary_logloss: 0.376037
[8]	train's binary_logloss: 0.344354	valid's binary_logloss: 0.366734
[9]	train's binary_logloss: 0.334834	valid's binary_logloss: 0.35898
[10]	train's binary_logloss: 0.326209	valid's binary_logloss: 0.351612
[11]	train's binary_logloss: 0.317809	valid's binary_logloss: 0.34563
[12]	train's binary_logloss: 0.310845	valid's binary_logloss: 0.340564
[13]	train's binary_logloss: 0.30401	valid's binary_logloss: 0.334274
[14]	train's binary_logloss: 

[131]	train's binary_logloss: 0.074473	valid's binary_logloss: 0.212178
[132]	train's binary_logloss: 0.0738179	valid's binary_logloss: 0.211958
[133]	train's binary_logloss: 0.0732225	valid's binary_logloss: 0.211775
[134]	train's binary_logloss: 0.0725873	valid's binary_logloss: 0.211546
[135]	train's binary_logloss: 0.071945	valid's binary_logloss: 0.211122
[136]	train's binary_logloss: 0.0714022	valid's binary_logloss: 0.211088
[137]	train's binary_logloss: 0.0708807	valid's binary_logloss: 0.211445
[138]	train's binary_logloss: 0.0700744	valid's binary_logloss: 0.210563
[139]	train's binary_logloss: 0.0695329	valid's binary_logloss: 0.210665
[140]	train's binary_logloss: 0.0690487	valid's binary_logloss: 0.210766
[141]	train's binary_logloss: 0.0684624	valid's binary_logloss: 0.210647
[142]	train's binary_logloss: 0.0679562	valid's binary_logloss: 0.210728
[143]	train's binary_logloss: 0.067303	valid's binary_logloss: 0.210724
[144]	train's binary_logloss: 0.0667676	valid's binary

In [10]:
# Score the in-memory model on the validation fold (log loss, 4 decimals).
va_pred = model.predict(va_x)
score = log_loss(va_y, va_pred)
print(f'logloss: {score:.4f}')

logloss: 0.2103


In [11]:
# Predict on the test data with the in-memory model.
pred = model.predict(test_x)

In [12]:
# Display the predictions of the in-memory model.
pred

array([9.64145464e-02, 4.82369617e-02, 3.77019674e-03, ...,
       8.68562307e-01, 7.84682997e-04, 3.91142663e-01])

## 分析モデルの永続化

- pickleのdump関数は学習させたモデルをシリアライズする。

In [13]:
# Serialize the trained model into `dest` using pickle protocol 4.
# The context manager closes the file handle (flushing the pickle to disk)
# even if dump() raises, instead of leaving it to the garbage collector.
with open(os.path.join(dest, 'lightgbm_classifier.pkl'), 'wb') as model_file:
    pickle.dump(model, model_file, protocol=4)

## 永続化した分析モデルを使用する

In [14]:
# Reload the pickled model from <working directory>/pkl_objects.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# files that were produced by a trusted source such as this notebook.
current_dir = os.getcwd()
# The context manager guarantees the file handle is closed after loading.
with open(os.path.join(current_dir, 'pkl_objects', 'lightgbm_classifier.pkl'), 'rb') as model_file:
    lightgbm_classifier = pickle.load(model_file)

### 永続化した分析モデルが永続化する前と同じバリデーションスコアになるか確認

In [15]:
# Score the reloaded model on the validation fold (log loss, 4 decimals).
va_pred_persistence = lightgbm_classifier.predict(va_x)
score_persistence = log_loss(va_y, va_pred_persistence)
print(f'logloss: {score_persistence:.4f}')
# Enforce the check instead of relying on comparing two printed numbers by
# eye: the reloaded model must reproduce the score computed before pickling.
np.testing.assert_allclose(score_persistence, score, rtol=1e-6)

logloss: 0.2103


### 永続化した分析モデルが永続化する前と同じ予測を行うかを確認

In [16]:
# Predict on the test data with the reloaded model.
pred_persistence = lightgbm_classifier.predict(test_x)
# Enforce the check instead of comparing two truncated array displays by eye:
# the reloaded model must reproduce the predictions made before pickling.
np.testing.assert_allclose(pred_persistence, pred, rtol=1e-6)

In [17]:
# Display the predictions of the reloaded model.
pred_persistence

array([9.64145464e-02, 4.82369617e-02, 3.77019674e-03, ...,
       8.68562307e-01, 7.84682997e-04, 3.91142663e-01])

In [18]:
# predict_proba関数を使用