In [3]:
# coding: utf-8
import json
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

try:
    import cPickle as pickle
except BaseException:
    import pickle

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [9]:
!ls

[34mData[m[m                            Lightgbm_advanced_example.ipynb


In [10]:
print('Loading data...')
# load or create your dataset
df_train = pd.read_csv('https://raw.githubusercontent.com/microsoft/LightGBM/master/examples/binary_classification/binary.train', header=None, sep='\t')
df_test = pd.read_csv('https://raw.githubusercontent.com/microsoft/LightGBM/master/examples/binary_classification/binary.test', header=None, sep='\t')
W_train = pd.read_csv('Data/binary.train.weight', header=None)[0]
W_test = pd.read_csv('Data/binary.test.weight', header=None)[0]

y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

num_train, num_feature = X_train.shape

Loading data...


In [11]:
print(df_train.shape, df_test.shape, W_train.shape, W_test.shape)
print(df_train.head())
print(df_test.head())
print(W_train.head())
print(W_test.head())

(7000, 29) (500, 29) (7000,) (500,)
   0      1      2      3      4      5      6      7      8      9   ...  \
0   1  0.869 -0.635  0.226  0.327 -0.690  0.754 -0.249 -1.092  0.000  ...   
1   1  0.908  0.329  0.359  1.498 -0.313  1.096 -0.558 -1.588  2.173  ...   
2   1  0.799  1.471 -1.636  0.454  0.426  1.105  1.282  1.382  0.000  ...   
3   0  1.344 -0.877  0.936  1.992  0.882  1.786 -1.647 -0.942  0.000  ...   
4   1  1.105  0.321  1.522  0.883 -1.205  0.681 -1.070 -0.922  0.000  ...   

      19     20     21     22     23     24     25     26     27     28  
0 -0.010 -0.046  3.102  1.354  0.980  0.978  0.920  0.722  0.989  0.877  
1 -1.139 -0.001  0.000  0.302  0.833  0.986  0.978  0.780  0.992  0.798  
2  1.129  0.900  0.000  0.910  1.108  0.986  0.951  0.803  0.866  0.780  
3 -0.678 -1.360  0.000  0.947  1.029  0.999  0.728  0.869  1.027  0.958  
4 -0.374  0.113  0.000  0.756  1.361  0.987  0.838  1.133  0.872  0.808  

[5 rows x 29 columns]
   0      1      2      3      4  

In [12]:
# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
lgb_train = lgb.Dataset(X_train, y_train,
                        weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
                       weight=W_test, free_raw_data=False)

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# generate feature names
feature_name = ['feature_' + str(col) for col in range(num_feature)]

print('Starting training...')
# feature_name and categorical_feature
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,  # eval training data
                feature_name=feature_name,
                categorical_feature=[21])

print('Finished first 10 rounds...')
# check feature name
print('7th feature name is:', lgb_train.feature_name[6])

print('Saving model...')
# save model to file
gbm.save_model('model.txt')

print('Dumping model to JSON...')
# dump model to JSON (and save to file)
model_json = gbm.dump_model()

with open('model.json', 'w+') as f:
    json.dump(model_json, f, indent=4)

# feature names
print('Feature names:', gbm.feature_name())

# feature importances
print('Feature importances:', list(gbm.feature_importance()))

print('Loading model to predict...')
# load model to predict
bst = lgb.Booster(model_file='model.txt')
# can only predict with the best iteration (or the saving iteration)
y_pred = bst.predict(X_test)
# eval with loaded model
print("The rmse of loaded model's prediction is:", mean_squared_error(y_test, y_pred) ** 0.5)

print('Dumping and loading model with pickle...')
# dump model with pickle
with open('model.pkl', 'wb') as fout:
    pickle.dump(gbm, fout)
# load model with pickle to predict
with open('model.pkl', 'rb') as fin:
    pkl_bst = pickle.load(fin)
# can predict with any iteration when loaded in pickle way
y_pred = pkl_bst.predict(X_test, num_iteration=7)
# eval with loaded model
print("The rmse of pickled model's prediction is:", mean_squared_error(y_test, y_pred) ** 0.5)


Starting training...
[1]	training's binary_logloss: 0.680295
[2]	training's binary_logloss: 0.672016
[3]	training's binary_logloss: 0.664438
[4]	training's binary_logloss: 0.655529
[5]	training's binary_logloss: 0.647367
[6]	training's binary_logloss: 0.640943
[7]	training's binary_logloss: 0.635131
[8]	training's binary_logloss: 0.628759
[9]	training's binary_logloss: 0.622764
[10]	training's binary_logloss: 0.616886
Finished first 10 rounds...
7th feature name is: feature_6
Saving model...
Dumping model to JSON...
Feature names: ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27']
Feature importances: [9, 6, 1, 15, 5, 40, 3, 0, 0, 8, 2, 1, 0, 9, 2, 0, 0, 6, 2,

New categorical_feature is [21]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


In [13]:
# continue training
# init_model accepts:
# 1. model file name
# 2. Booster()
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model='model.txt',
                valid_sets=lgb_eval)

print('Finished 10 - 20 rounds with model file...')

# decay learning rates
# learning_rates accepts:
# 1. list/tuple with length = num_boost_round
# 2. function(curr_iter)
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                learning_rates=lambda iter: 0.05 * (0.99 ** iter),
                valid_sets=lgb_eval)

print('Finished 20 - 30 rounds with decay learning rates...')

# change other parameters during training
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])

print('Finished 30 - 40 rounds with changing bagging_fraction...')


[11]	valid_0's binary_logloss: 0.614212
[12]	valid_0's binary_logloss: 0.609782
[13]	valid_0's binary_logloss: 0.605261
[14]	valid_0's binary_logloss: 0.601549
[15]	valid_0's binary_logloss: 0.598284
[16]	valid_0's binary_logloss: 0.59599
[17]	valid_0's binary_logloss: 0.591792
[18]	valid_0's binary_logloss: 0.588347
[19]	valid_0's binary_logloss: 0.585768
[20]	valid_0's binary_logloss: 0.582936
Finished 10 - 20 rounds with model file...
[21]	valid_0's binary_logloss: 0.579478
[22]	valid_0's binary_logloss: 0.57816
[23]	valid_0's binary_logloss: 0.575105
[24]	valid_0's binary_logloss: 0.572562
[25]	valid_0's binary_logloss: 0.570356
[26]	valid_0's binary_logloss: 0.569028
[27]	valid_0's binary_logloss: 0.56685
[28]	valid_0's binary_logloss: 0.565815
[29]	valid_0's binary_logloss: 0.564377
[30]	valid_0's binary_logloss: 0.562884
Finished 20 - 30 rounds with decay learning rates...
[31]	valid_0's binary_logloss: 0.560326
[32]	valid_0's binary_logloss: 0.559062
[33]	valid_0's binary_loglo

New categorical_feature is [21]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


In [24]:
# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelihood(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1. - preds)
    return grad, hess


# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, eval_result: float, is_higher_better: bool
# binary error
# NOTE: when you do customized loss function, the default prediction value is margin
# This may make built-in evalution metric calculate wrong results
# For example, we are doing log likelihood loss, the prediction is score before logistic transformation
# Keep this in mind when you use the customization
def binary_error(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    return 'error', np.mean(labels != (preds > 0.5)), False


gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                fobj=loglikelihood,
                feval=binary_error,
                valid_sets=lgb_eval)

print('Finished 40 - 50 rounds with self-defined objective function and eval metric...')


[71]	valid_0's binary_logloss: 5.6909	valid_0's error: 0.262
[72]	valid_0's binary_logloss: 5.62967	valid_0's error: 0.258
[73]	valid_0's binary_logloss: 5.68806	valid_0's error: 0.258
[74]	valid_0's binary_logloss: 5.68529	valid_0's error: 0.256
[75]	valid_0's binary_logloss: 5.63089	valid_0's error: 0.256
[76]	valid_0's binary_logloss: 5.6256	valid_0's error: 0.256
[77]	valid_0's binary_logloss: 5.68221	valid_0's error: 0.256
[78]	valid_0's binary_logloss: 5.67482	valid_0's error: 0.258
[79]	valid_0's binary_logloss: 5.66856	valid_0's error: 0.258
[80]	valid_0's binary_logloss: 5.60627	valid_0's error: 0.258
Finished 40 - 50 rounds with self-defined objective function and eval metric...


In [26]:
# another self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, eval_result: float, is_higher_better: bool
# accuracy
# NOTE: when you do customized loss function, the default prediction value is margin
# This may make built-in evalution metric calculate wrong results
# For example, we are doing log likelihood loss, the prediction is score before logistic transformation
# Keep this in mind when you use the customization
def accuracy(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    return 'accuracy', np.mean(labels == (preds > 0.5)), True


gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                fobj=loglikelihood,
                feval= accuracy,
                valid_sets=lgb_eval)

print('Finished 50 - 60 rounds with self-defined objective function '
      'and multiple self-defined eval metrics...')

print('Starting a new training job...')


# callback
def reset_metrics():
    def callback(env):
        lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
        if env.iteration - env.begin_iteration == 5:
            print('Add a new valid dataset at iteration 5...')
            env.model.add_valid(lgb_eval_new, 'new_valid')
    callback.before_iteration = True
    callback.order = 0
    return callback


gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,
                callbacks=[reset_metrics()])

print('Finished first 10 rounds with callback function...')


[81]	valid_0's binary_logloss: 5.60831	valid_0's accuracy: 0.742
[82]	valid_0's binary_logloss: 5.60753	valid_0's accuracy: 0.742
[83]	valid_0's binary_logloss: 5.60459	valid_0's accuracy: 0.742
[84]	valid_0's binary_logloss: 5.7262	valid_0's accuracy: 0.736
[85]	valid_0's binary_logloss: 5.66634	valid_0's accuracy: 0.736
[86]	valid_0's binary_logloss: 5.66577	valid_0's accuracy: 0.736
[87]	valid_0's binary_logloss: 5.72674	valid_0's accuracy: 0.738
[88]	valid_0's binary_logloss: 5.72827	valid_0's accuracy: 0.738
[89]	valid_0's binary_logloss: 5.73348	valid_0's accuracy: 0.738
[90]	valid_0's binary_logloss: 5.72232	valid_0's accuracy: 0.74
Finished 50 - 60 rounds with self-defined objective function and multiple self-defined eval metrics...
Starting a new training job...
[1]	training's binary_logloss: 0.455912
[2]	training's binary_logloss: 0.454606
[3]	training's binary_logloss: 0.453423
[4]	training's binary_logloss: 0.452032
[5]	training's binary_logloss: 0.450779
Add a new valid da