In [4]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.1.1-py2.py3-none-win_amd64.whl (754 kB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.1.1
Note: you may need to restart the kernel to use updated packages.


In [5]:
# check lightgbm version
import lightgbm

# importing alone shows nothing; print the version so the check is visible
print(lightgbm.__version__)

LightGBM Ensemble for Classification

In [13]:
# evaluate lightgbm algorithm for classification
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from lightgbm import LGBMClassifier


We are going to use the make_classification() function to create a synthetic binary classification problem with 1,000 examples and 20 input features.

In [14]:
# Preview of the generator's return value only: no arguments are passed, so the
# library defaults apply rather than the 1,000-example setup described in the
# text (the label array displayed below holds 100 entries). The returned (X, y)
# tuple is not assigned to anything; the dataset that is actually evaluated is
# created in a later cell with explicit parameters.
# NOTE(review): no random_state is given, so the values shown are presumably
# different on every run.
make_classification()

(array([[-0.55764551, -0.03237545,  0.094302  , ...,  2.09834233,
          0.78118829,  0.29187808],
        [ 2.7826008 , -0.46094413,  1.89464976, ...,  1.05395455,
          0.64310924, -0.17471091],
        [ 1.39410728,  1.44814788,  1.29221814, ...,  0.0950894 ,
         -1.18508446,  0.76180816],
        ...,
        [-0.63093694,  0.46552145,  0.29444364, ..., -0.02910069,
          0.46439182, -0.20458422],
        [-0.2861078 ,  0.21790097,  0.4410491 , ...,  0.16181877,
          0.68909249, -2.92675125],
        [ 1.16447426,  0.10671813,  1.69623679, ..., -0.46522293,
         -1.7552467 ,  0.34599637]]),
 array([0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0,
        0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0]))

We will evaluate the model using repeated stratified k-fold cross-validation with three repeats and 10 folds. We will report the mean and standard deviation of the accuracy of the model across all repeats and folds.


In [6]:

# build the synthetic binary classification dataset (seeded, so repeatable)
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)

# LightGBM classifier left at its default hyperparameters
model = LGBMClassifier()

# stratified 10-fold cross-validation repeated 3 times -> 30 accuracy scores
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

# summarise as "mean (standard deviation)" over all folds and repeats
print(
    'Accuracy: %.3f (%.3f)'
    % (mean(n_scores), std(n_scores))
)

Accuracy: 0.930 (0.027)


The LightGBM ensemble with default hyperparameters achieves a cross-validated classification accuracy of about 93% on this synthetic dataset.

Next, the LightGBM ensemble is fit on all available data, and the predict() function is then used to make predictions on new data.

In [7]:
# use the LightGBM classifier to label one new example

# train on every row of the dataset (X, y and model come from the previous cell)
model.fit(X, y)

# one new sample with 20 feature values, matching the training data's width
row = [
    0.2929949, -4.21223056, -1.288332, -2.17849815, -0.64527665,
    2.58097719, 0.28422388, -7.1827928, -1.91211104, 2.73729512,
    0.81395695, 3.96973717, -2.66939799, 3.34692332, 4.19791821,
    0.99990998, -0.30201875, -4.43170633, -2.82646737, 0.44916808,
]

# predict() receives a list of rows, so the single sample is wrapped in a list
yhat = model.predict([row])
print('Predicted Class: %d' % yhat[0])

Predicted Class: 1


LightGBM Ensemble for Regression

We are going to use the make_regression() function to create a synthetic regression problem with 1,000 examples and 20 input features.

In [16]:
# evaluate lightgbm ensemble for regression
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from lightgbm import LGBMRegressor


In [17]:
# Preview of the generator's return value only: no arguments are passed, so the
# library defaults apply rather than the 1,000-example setup described in the
# text. The returned (X, y) tuple is not assigned to anything; the dataset that
# is actually evaluated is created in a later cell with explicit parameters.
# NOTE(review): no random_state is given, so the values shown are presumably
# different on every run.
make_regression()

(array([[-1.27240351,  0.0101501 ,  0.45526745, ...,  0.38990757,
          1.35324361, -0.40573697],
        [ 0.20816425,  1.91728137, -0.64118619, ..., -0.23562832,
          1.0154021 ,  1.37725085],
        [-1.86090142,  0.01082649, -1.06225793, ...,  0.10700337,
          1.04161986,  0.58396282],
        ...,
        [ 0.49991999,  0.79966908,  1.31496227, ...,  0.57171019,
          0.27228525,  1.30772921],
        [ 1.84172684, -0.82230325, -0.32849745, ..., -2.33236965,
          0.88887065,  1.1064923 ],
        [-0.78794418,  1.1980293 , -0.81050291, ...,  1.68322589,
         -0.7963971 , -0.72801595]]),
 array([  79.64243606,  180.95560533, -184.4263625 , -143.39072724,
         123.97389664, -200.74316751, -218.96361356,   91.40022894,
        -420.46603403,  252.9528697 ,  425.45620995, -171.4533072 ,
          18.95943215, -166.57111778,  139.54578934, -103.06891809,
         385.61086256, -292.69095779,  391.78525543, -117.34014409,
        -252.78962973, -162.64135

We will evaluate the model using repeated k-fold cross-validation, with three repeats and 10 folds. We will report the mean absolute error (MAE) of the model across all repeats and folds. The scikit-learn library negates the MAE so that the score is maximized instead of minimized. This means that negative MAE values closer to zero are better, and a perfect model has an MAE of 0.

In [18]:
# generate the synthetic regression dataset (fixed seed, so repeatable)
X, y = make_regression(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    noise=0.1,
    random_state=7,
)

# LightGBM regressor left at its default hyperparameters
model = LGBMRegressor()

# 10-fold cross-validation repeated 3 times -> 30 scores in total
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# each score is a negated MAE; error_score='raise' asks for fit errors to be raised
n_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)

# summarise as "mean (standard deviation)"; the mean is printed as-is, i.e. negative
print(
    'MAE: %.3f (%.3f)'
    % (mean(n_scores), std(n_scores))
)

MAE: -60.004 (2.887)


We can see that the LightGBM ensemble with default hyperparameters achieves an MAE of about 60 (printed as -60.004 because scikit-learn reports the negated score).

In [19]:
# train the regressor on every row (X, y and model come from the previous cell)
model.fit(X, y)

# one new sample with 20 feature values, matching the training data's width
row = [
    0.20543991, -0.97049844, -0.81403429, -0.23842689, -0.60704084,
    -0.48541492, 0.53113006, 2.01834338, -0.90745243, -1.85859731,
    -1.02334791, -0.6877744, 0.60984819, -0.70630121, -1.29161497,
    1.32385441, 1.42150747, 1.26567231, 2.56569098, -0.11154792,
]

# predict() receives a list of rows, so the single sample is wrapped in a list
yhat = model.predict([row])
# NOTE: %d shows only the integer part of the predicted value
print('Prediction: %d' % yhat[0])

Prediction: 52


As shown above, the LightGBM ensemble is fit on all available data, and the predict() function can then be called to make predictions on new data.