### LightGBM. Пример решения задачи.

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# Load the Boston housing dataset and separate the feature matrix from the target.
# NOTE(review): load_boston is not shipped by recent scikit-learn releases —
# confirm the pinned version before re-running this notebook.
data = load_boston()
X_full, y_full = data.data, data.target

# An integer test_size asks for that many held-out samples (100 here);
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    y_full,
    test_size=100,
    random_state=241,
)

Обучение

In [2]:
print('Starting training...')

# Small gradient-boosted regressor: 20 boosting rounds at most.
gbm = lgb.LGBMRegressor(num_leaves=31, learning_rate=0.05, n_estimators=20)

# The held-out split is passed as the evaluation set, so the L1 metric printed
# per round (and the early-stopping decision) is computed on X_test / y_test.
# NOTE(review): newer LightGBM releases configure early stopping through
# callbacks rather than this fit() keyword — confirm the installed version.
# The fit call stays the last expression so the fitted estimator is displayed.
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='l1',
    early_stopping_rounds=5,
)

Starting training...
[1]	valid_0's l1: 6.55152
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l1: 6.29855
[3]	valid_0's l1: 6.08526
[4]	valid_0's l1: 5.88219
[5]	valid_0's l1: 5.66549
[6]	valid_0's l1: 5.45931
[7]	valid_0's l1: 5.29116
[8]	valid_0's l1: 5.10806
[9]	valid_0's l1: 4.93583
[10]	valid_0's l1: 4.79662
[11]	valid_0's l1: 4.62366
[12]	valid_0's l1: 4.48798
[13]	valid_0's l1: 4.32989
[14]	valid_0's l1: 4.18449
[15]	valid_0's l1: 4.07735
[16]	valid_0's l1: 3.96772
[17]	valid_0's l1: 3.8743
[18]	valid_0's l1: 3.77696
[19]	valid_0's l1: 3.68361
[20]	valid_0's l1: 3.5872
Did not meet early stopping. Best iteration is:
[20]	valid_0's l1: 3.5872


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.05, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=20,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0)

Предсказание и оценка качества

In [3]:
print('Starting predicting...')

# Predict with the boosting round that fit() recorded as best.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)

# RMSE is the square root of the mean squared error on the held-out split.
test_rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('The rmse of prediction is:', test_rmse)

# feature importances, one entry per input column
importances = list(gbm.feature_importances_)
print('Feature importances:', importances)

Starting predicting...
The rmse of prediction is: 5.259625962258064
Feature importances: [35, 0, 6, 0, 20, 70, 21, 31, 3, 7, 15, 4, 83]


### Можно задать любую метрику качества — для этого нужно написать свою функцию.

In [None]:
# self-defined eval metric
# f(y_true: array, y_pred: array) -> name: string, eval_result: float, is_higher_better: bool
# Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    """Root Mean Squared Logarithmic Error, shaped as a custom eval metric.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values. Negative predictions are clipped to 0 before the
        logarithm is taken.

    Returns
    -------
    tuple
        ``('RMSLE', value, False)`` — metric name, metric value, and the
        "is higher better" flag (lower RMSLE is better).
    """
    y_true = np.asarray(y_true, dtype=float)
    # A regressor may emit negative values; log1p is NaN below -1 and -inf at
    # -1, which would turn the whole metric into NaN/inf. Clipping at 0 keeps
    # the metric finite and leaves non-negative predictions untouched.
    y_pred = np.maximum(np.asarray(y_pred, dtype=float), 0.0)
    log_diff = np.log1p(y_pred) - np.log1p(y_true)
    return 'RMSLE', np.sqrt(np.mean(np.power(log_diff, 2))), False


print('Starting training with custom eval function...')

# Refit the same estimator, this time monitoring the user-defined RMSLE
# metric on the held-out split.
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=rmsle,
    early_stopping_rounds=5,
)

print('Starting predicting...')

# Predict with the boosting round that fit() recorded as best.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)

# rmsle returns (name, value, is_higher_better); only the value is reported.
rmsle_score = rmsle(y_test, y_pred)[1]
print('The rmsle of prediction is:', rmsle_score)

### Подбор гиперпараметров.

In [None]:
# The LightGBM estimator plugs into scikit-learn model selection utilities.
estimator = lgb.LGBMRegressor(num_leaves=31)

# Candidate values: 3 learning rates x 2 ensemble sizes = 6 combinations.
param_grid = dict(
    learning_rate=[0.01, 0.1, 1],
    n_estimators=[20, 40],
)

# 3-fold cross-validated search on the training split.
# Note: this rebinds `gbm` from the regressor to the GridSearchCV object.
gbm = GridSearchCV(estimator, param_grid, cv=3)
gbm.fit(X_train, y_train)

print('Best parameters found by grid search are:', gbm.best_params_)

Предсказание и оценка качества

In [None]:
# `gbm` is the fitted GridSearchCV object at this point, not the plain
# LGBMRegressor from the earlier cells; predict on the held-out split.
pred = gbm.predict(X_test)

In [None]:
# This import repeats the one in the first cell; it is redundant when the
# notebook is run top to bottom.
from sklearn.metrics import mean_squared_error
# Reports plain MSE (no square root), unlike the RMSE printed for the first
# model, so the two numbers are not directly comparable.
mean_squared_error(y_test, pred)