# Импорт

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from itertools import combinations, chain

from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

from joblib import dump


# Загружаем датасет и разбиваем её на выборку.

In [3]:
data = pd.read_csv('../../Data/DataLaba1/energy_task_moded.csv')
X_train, X_test, y_train, y_test = train_test_split(data.drop('Appliances', axis=1), data['Appliances'], test_size=0.5, random_state=42)
X_train.shape, y_train.shape


((9867, 29), (9867,))

In [4]:
data

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,day,month,year,time
0,60.0,30.0,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,45.566667,...,6.60,733.5,92.000000,7.000000,63.000000,5.3,11.0,1.0,2016.0,1020.0
1,60.0,30.0,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,45.992500,...,6.48,733.6,92.000000,6.666667,59.166667,5.2,11.0,1.0,2016.0,1030.0
2,50.0,30.0,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,45.890000,...,6.37,733.7,92.000000,6.333333,55.333333,5.1,11.0,1.0,2016.0,1040.0
3,50.0,40.0,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,45.723333,...,6.25,733.8,92.000000,6.000000,51.500000,5.0,11.0,1.0,2016.0,1050.0
4,60.0,40.0,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,45.530000,...,6.13,733.9,92.000000,5.666667,47.666667,4.9,11.0,1.0,2016.0,1060.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,100.0,0.0,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.700000,45.590000,...,22.70,755.2,55.666667,3.333333,23.666667,13.3,27.0,5.0,2016.0,1040.0
19731,90.0,0.0,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.700000,45.590000,...,22.60,755.2,56.000000,3.500000,24.500000,13.3,27.0,5.0,2016.0,1050.0
19732,270.0,10.0,25.500000,46.596667,25.628571,42.768571,27.050000,41.690000,24.700000,45.730000,...,22.50,755.2,56.333333,3.666667,25.333333,13.3,27.0,5.0,2016.0,1060.0
19733,420.0,10.0,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.700000,45.790000,...,22.30,755.2,56.666667,3.833333,26.166667,13.2,27.0,5.0,2016.0,1070.0


# Создаём регрессор.

In [28]:
cbr = CatBoostRegressor(learning_rate=0.05, 
                        task_type="GPU",
                        devices='0:4',
                        logging_level='Silent'
                        )

# Поиск гиппер-параметров.

In [29]:
params = {'depth': np.arange(3, 10), 
          'l2_leaf_reg': np.arange(1, 20),
          'bootstrap_type': ['Bayesian', 'Bernoulli', 'Poisson'],
          # 'leaf_estimation_method': [],
          'num_trees': np.arange(1, 20),
        #   'n_estimators': np.arange(3, 20),
        #   'num_boost_round': np.arange(1, 20),
          }
catboost_regressor_best_params = cbr.randomized_search(params, X=X_train[:1000], y=y_train[:1000], n_iter=3)

catboost_regressor_best_params['params']

0:	loss: 142.5652438	best: 142.5652438 (0)	total: 122ms	remaining: 243ms
1:	loss: 146.1670106	best: 142.5652438 (0)	total: 245ms	remaining: 123ms
2:	loss: 144.1712133	best: 142.5652438 (0)	total: 385ms	remaining: 0us
Estimating final quality...


{'depth': 6, 'l2_leaf_reg': 13, 'iterations': 7, 'bootstrap_type': 'Bayesian'}

In [32]:
cbr.get_all_params()

{'nan_mode': 'Min',
 'gpu_ram_part': 0.95,
 'eval_metric': 'RMSE',
 'iterations': 7,
 'leaf_estimation_method': 'Newton',
 'observations_to_bootstrap': 'TestOnly',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'devices': '0:4',
 'eval_fraction': 0,
 'pinned_memory_bytes': '104857600',
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 13,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': True,
 'gpu_cat_features_storage': 'GpuRam',
 'fold_size_loss_normalization': False,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'use_best_model': False,
 'meta_l2_frequency': 0,
 'random_seed': 0,
 'depth': 6,
 'border_count': 128,
 'min_fold_size': 100,
 'data_partition': 'DocParallel',
 'bagging_temperature': 1,
 'classes_count': 0,
 'auto_class_weights': 'None',
 'leaf_estimation_backtracking': 'AnyImprovement',
 'best_model_min_trees': 1,

# Обучение.

In [33]:
cbr.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x22537b2f550>

# Предсказание.

In [34]:
cbr_predict = cbr.predict(X_test)
cbr_predict

array([ 83.67728782, 125.67610189,  83.82920682, ..., 106.11612123,
       100.18805799,  89.2088818 ])

# Оценка

In [35]:

errors = {'MAE': mean_absolute_error(cbr_predict, y_test), 
          'RMSE': mean_squared_error(cbr_predict, y_test),
          'MSE': mean_squared_error(cbr_predict, y_test)**0.5,
          'MAPE': mean_absolute_percentage_error(cbr_predict, y_test),
          'R^2': cbr.score(X_test, y_test)}
errors

{'MAE': 56.054504028570555,
 'RMSE': 9825.988769386377,
 'MSE': 99.1261255642849,
 'MAPE': 0.5595888776703415,
 'R^2': 0.08705676218822}