# Импорт

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics.pairwise import distance_metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

from joblib import dump


# Загружаем датасет и разбиваем её на выборку.

In [6]:
data = pd.read_csv('../../Data/DataLaba1/energy_task_moded.csv')
X_train, X_test, y_train, y_test = train_test_split(data.drop('Appliances', axis=1), data['Appliances'], test_size=0.5, random_state=42)
X_train.shape, y_train.shape


((9867, 29), (9867,))

In [7]:
data

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,day,month,year,time
0,60.0,30.0,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,45.566667,...,6.60,733.5,92.000000,7.000000,63.000000,5.3,11.0,1.0,2016.0,1020.0
1,60.0,30.0,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,45.992500,...,6.48,733.6,92.000000,6.666667,59.166667,5.2,11.0,1.0,2016.0,1030.0
2,50.0,30.0,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,45.890000,...,6.37,733.7,92.000000,6.333333,55.333333,5.1,11.0,1.0,2016.0,1040.0
3,50.0,40.0,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,45.723333,...,6.25,733.8,92.000000,6.000000,51.500000,5.0,11.0,1.0,2016.0,1050.0
4,60.0,40.0,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,45.530000,...,6.13,733.9,92.000000,5.666667,47.666667,4.9,11.0,1.0,2016.0,1060.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,100.0,0.0,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.700000,45.590000,...,22.70,755.2,55.666667,3.333333,23.666667,13.3,27.0,5.0,2016.0,1040.0
19731,90.0,0.0,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.700000,45.590000,...,22.60,755.2,56.000000,3.500000,24.500000,13.3,27.0,5.0,2016.0,1050.0
19732,270.0,10.0,25.500000,46.596667,25.628571,42.768571,27.050000,41.690000,24.700000,45.730000,...,22.50,755.2,56.333333,3.666667,25.333333,13.3,27.0,5.0,2016.0,1060.0
19733,420.0,10.0,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.700000,45.790000,...,22.30,755.2,56.666667,3.833333,26.166667,13.2,27.0,5.0,2016.0,1070.0


# Подбираем гиппер-параметры.

In [28]:
gradient_boosting_regressor_optimal_params = RandomizedSearchCV(GradientBoostingRegressor(),
                                                                 {
                                                                    'loss': ['squared_error', 'huber', 'absolute_error', 'quantile'],
                                                                    'n_estimators': np.arange(50, 200),
                                                                    'criterion': ['friedman_mse', 'squared_error'],
                                                                    'min_samples_split': np.arange(2, 10),
                                                                    'min_samples_leaf': np.arange(1, 9),
                                                                    'min_weight_fraction_leaf': np.arange(0, 0.5, 0.01),
                                                                    'max_depth': np.arange(3, 4),
                                                                    'alpha': np.arange(0, 1, 0.01)
                                                                 },
                                                                 n_iter=4,
                                                                 n_jobs=4,
                                                                 scoring='neg_mean_squared_error',
                                                                 random_state=71
                                                                 ).fit(X_train, y_train)
gradient_boosting_regressor_optimal_params.best_params_

{'n_estimators': 80,
 'min_weight_fraction_leaf': 0.43,
 'min_samples_split': 4,
 'min_samples_leaf': 7,
 'max_depth': 3,
 'loss': 'squared_error',
 'criterion': 'squared_error',
 'alpha': 0.91}

In [23]:
from sklearn.metrics import get_scorer_names
print(*(i for i in get_scorer_names() if 'error' in i), sep='\n')

max_error
neg_mean_absolute_error
neg_mean_absolute_percentage_error
neg_mean_squared_error
neg_mean_squared_log_error
neg_median_absolute_error
neg_root_mean_squared_error


# Обучение

In [31]:
gradient_boosting_regressor = GradientBoostingRegressor(loss=gradient_boosting_regressor_optimal_params.best_params_['loss'] if 'loss' in gradient_boosting_regressor_optimal_params.best_params_ else 'log_loss',
                                                          n_estimators=gradient_boosting_regressor_optimal_params.best_params_['n_estimators'] if 'n_estimators' in gradient_boosting_regressor_optimal_params.best_params_ else 100,
                                                          criterion=gradient_boosting_regressor_optimal_params.best_params_['criterion'] if 'criterion' in gradient_boosting_regressor_optimal_params.best_params_ else 'friedman_mse',
                                                          min_weight_fraction_leaf=gradient_boosting_regressor_optimal_params.best_params_['min_weight_fraction_leaf'] if 'min_weight_fraction_leaf' in gradient_boosting_regressor_optimal_params.best_params_ else 0,
                                                          min_samples_split=gradient_boosting_regressor_optimal_params.best_params_['min_samples_split'] if 'min_samples_split' in gradient_boosting_regressor_optimal_params.best_params_ else 2,
                                                          min_samples_leaf=gradient_boosting_regressor_optimal_params.best_params_['min_samples_leaf'] if 'min_samples_leaf' in gradient_boosting_regressor_optimal_params.best_params_ else 1,
                                                          max_depth=gradient_boosting_regressor_optimal_params.best_params_['max_depth'] if 'max_depth' in gradient_boosting_regressor_optimal_params.best_params_ else 3,
                                                          alpha=gradient_boosting_regressor_optimal_params.best_params_['alpha'] if 'alpha' in gradient_boosting_regressor_optimal_params.best_params_ else 0.9,
                                                          )
gradient_boosting_regressor.fit(X_train, y_train)

# Предсказание

In [32]:
y_predict = gradient_boosting_regressor.predict(X_test)
y_predict

array([ 64.77539958, 114.6335319 ,  56.85387133, ..., 105.89474985,
        65.36937735,  94.64379806])

# Оценка

In [33]:
errors = {'MAE': mean_absolute_error(y_predict, y_test), 
          'RMSE': mean_squared_error(y_predict, y_test),
          'MSE': mean_squared_error(y_predict, y_test)**0.5,
          'MAPE': mean_absolute_percentage_error(y_predict, y_test),
          'R^2': gradient_boosting_regressor.score(X_test, y_test)}
errors

{'MAE': 55.49189056378847,
 'RMSE': 9877.382022125608,
 'MSE': 99.38501910311035,
 'MAPE': 0.5437413749371844,
 'R^2': 0.08228175952348937}