In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from random import shuffle
from sklearn.model_selection import StratifiedKFold
import numpy as np
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import os
import pickle
import gc

以下、モデルの学習と評価のための操作を関数化しておく

In [2]:
# LightGBM hyperparameters handed to lgb.LGBMClassifier(**params) in train_cv.
# n_estimators is deliberately huge: train_cv fits with early stopping, so this
# value only acts as an upper bound on the number of boosting rounds.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    learning_rate=0.02,
    num_leaves=16,
    n_estimators=100000,
    random_state=123,
    importance_type='gain',
)

In [6]:
def train_cv(x_train, y_train, id_train, params, list_nfold=None, n_splits=5):
    """Train one LightGBM classifier per stratified CV fold and summarise the run.

    For every requested fold the model is fitted with early stopping, pickled to
    ``model_lgb_fold{nfold}.pickle`` in the current working directory, and scored
    with ROC AUC on both the training and the validation part of the fold.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Feature matrix; its columns are used as the feature names of the
        importance table.
    y_train : pandas.DataFrame
        Target frame with the same number of rows as ``x_train``. It must
        contain a column named ``'y'`` (used for the class-balance printout).
    id_train : object
        Not used by this function; kept so the signature mirrors ``predict_lgb``.
    params : dict
        Keyword arguments forwarded to ``lgb.LGBMClassifier``.
    list_nfold : iterable of int, optional
        Fold indices (``0 <= fold < n_splits``) to train. ``None`` (default)
        trains every fold.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    imp : pandas.DataFrame
        One row per feature with columns ``col``, ``imp_mean`` and ``imp_std``
        (mean / standard deviation of the importance across the trained folds).
    metrics : numpy.ndarray
        Array of shape ``(n_trained_folds, 3)`` with rows
        ``[nfold, auc_train, auc_valid]``.

    Raises
    ------
    ValueError
        If ``list_nfold`` is empty or contains an index outside
        ``[0, n_splits)``.
    """
    # Resolve which folds to run. The default covers all folds, so a plain
    # train_cv(x, y, ids, params) call behaves the same for any n_splits.
    if list_nfold is None:
        folds = list(range(n_splits))
    else:
        folds = [int(fold) for fold in list_nfold]
    if not folds:
        raise ValueError('list_nfold must contain at least one fold index')
    out_of_range = [fold for fold in folds if not 0 <= fold < n_splits]
    if out_of_range:
        raise ValueError(
            'fold indices {} are outside the valid range 0..{}'.format(out_of_range, n_splits - 1)
        )

    metrics = []
    imp_per_fold = []

    # Fixed random_state so the same rows land in the same fold on every run.
    cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123).split(x_train, y_train))
    for nfold in folds:
        print('-'*20, nfold, '-'*20)

        # split() yields positional row numbers, so they must be used with
        # .iloc; .loc would only be correct for a default RangeIndex.
        idx_tr, idx_va = cv[nfold]
        x_tr, y_tr = x_train.iloc[idx_tr, :], y_train.iloc[idx_tr, :]
        x_va, y_va = x_train.iloc[idx_va, :], y_train.iloc[idx_va, :]

        print('x_tr.shape:{}, y_tr.shape:{} '.format(x_tr.shape, y_tr.shape))
        print('x_va.shape:{}, y_va.shape:{} '.format(x_va.shape, y_va.shape))
        print('y_train:{:.3f}, y_tr:{:.3f}, y_va:{:.3f} '.format(
            y_train['y'].mean(), y_tr['y'].mean(), y_va['y'].mean()
            ))

        # train
        model = lgb.LGBMClassifier(**params)
        # Early stopping / periodic logging are passed as callbacks: the
        # `early_stopping_rounds=` and `verbose=` keywords of fit() were removed
        # in LightGBM 4.0. The callbacks require LightGBM >= 3.3.
        model.fit(
            x_tr, y_tr,
            eval_set=[(x_tr, y_tr), (x_va, y_va)],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100),
                lgb.log_evaluation(period=100),
            ],
        )
        # Persist the fold model so predict_lgb can reload it by fold number.
        fname_lgb = 'model_lgb_fold{}.pickle'.format(nfold)
        with open(fname_lgb, 'wb') as f:
            pickle.dump(model, f, protocol=4)

        # evaluate
        y_tr_pred = model.predict_proba(x_tr)[:, 1]
        y_va_pred = model.predict_proba(x_va)[:, 1]

        metric_tr = roc_auc_score(y_tr, y_tr_pred)
        metric_va = roc_auc_score(y_va, y_va_pred)
        print('[auc] tr:{:.4f}, va:{:.4f} '.format(metric_tr, metric_va))
        metrics.append([nfold, metric_tr, metric_va])

        # Collect per-fold importances; they are concatenated once after the loop.
        imp_per_fold.append(
            pd.DataFrame({'col': x_train.columns, 'imp': model.feature_importances_, 'nfold': nfold})
        )

    # ---- end of the per-fold loop ----
    print('-'*20, 'result', '-'*20)
    metrics = np.array(metrics)
    print(metrics)
    print('[cv] tr:{:.2f}+-{:.2f}, va:{:.2f}+-{:.2f} '.format(
        metrics[:, 1].mean(), metrics[:, 1].std(), metrics[:, 2].mean(), metrics[:, 2].std()))

    # Aggregate the raw 'imp' column across folds (std is NaN when only one
    # fold was trained), then expose it as imp_mean / imp_std.
    imp = pd.concat(imp_per_fold, axis=0, ignore_index=True)
    imp = imp.groupby('col')['imp'].agg(['mean', 'std'])
    imp.columns = ['imp_mean', 'imp_std']
    imp = imp.reset_index(drop=False)

    return imp, metrics

以下、保存済みの各foldのモデルを読み込み、予測値を平均して返す推論用の関数

In [5]:
# Inference helper: average the predictions of the saved per-fold models.
def predict_lgb(x_train, y_train, id_train, list_nfold=(0, 1, 2, 3, 4)):
    """Predict with the pickled fold models and average their probabilities.

    For each fold number in ``list_nfold`` the file
    ``model_lgb_fold{nfold}.pickle`` is loaded from the current working
    directory and ``predict_proba(x_train)[:, 1]`` is computed; the per-fold
    values are then averaged row-wise.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Feature matrix to score.
    y_train : object
        Not used by this function; kept for signature compatibility.
    id_train : pandas.DataFrame or pandas.Series
        Identifier column(s) concatenated next to the features in the result.
    list_nfold : iterable of int, default (0, 1, 2, 3, 4)
        Fold numbers whose saved models are averaged. They do not need to be
        contiguous or start at zero.

    Returns
    -------
    pandas.DataFrame
        ``x_train`` and ``id_train`` side by side, plus a ``pred`` column
        holding the mean predicted probability of the second class
        (column 1 of ``predict_proba``).

    Raises
    ------
    ValueError
        If ``list_nfold`` is empty.
    FileNotFoundError
        If a requested fold model file does not exist.
    """
    folds = list(list_nfold)
    if not folds:
        raise ValueError('list_nfold must contain at least one fold index')

    pred = np.zeros((len(x_train), len(folds)))
    # Fill columns by position, not by fold number, so that a subset such as
    # [2, 4] fits into the (n_rows, 2) buffer.
    for col, nfold in enumerate(folds):
        print("-"*20, nfold, "-"*20)
        fname_lgb = 'model_lgb_fold{}.pickle'.format(nfold)
        # NOTE: pickle.load can execute arbitrary code; only load model files
        # that were produced by a trusted training run.
        with open(fname_lgb, 'rb') as f:
            model = pickle.load(f)
        pred[:, col] = model.predict_proba(x_train)[:, 1]

    # Reuse x_train's index so concat aligns row-for-row even when x_train
    # does not carry a default RangeIndex.
    pred_mean = pd.DataFrame({'pred': pred.mean(axis=1)}, index=x_train.index)

    return pd.concat([x_train, id_train, pred_mean], axis=1)