## Энергетический оракул
Ноутбук команды #12

Работа выполнена на основе модели LightGBM


### 1. Подготовка данных

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

import re

from tqdm import tqdm

from data_preprocess import DataTransformer
random_state = 12345
NUM_ITERATIONS = 5000

In [2]:
# Функция для вычисления метрики mae по дням из почасовых массивов данных

def mae_day(y_true, y_pred):
    y_true_copy = pd.DataFrame(y_true).reset_index(drop=True)
    y_true_copy['day'] = y_true_copy.index // 24
    y_true_grouped = y_true_copy.groupby(by='day').sum()   
    y_pred_copy = pd.DataFrame(y_pred).reset_index(drop=True)
    y_pred_copy['day'] = y_pred_copy.index // 24
    y_pred_grouped = y_pred_copy.groupby(by='day').sum()
    
    return mean_absolute_error(y_true_grouped, y_pred_grouped)
# Функция для вычисления метрик по дням из почасовых массивов данных

def metrics_day(y_true, y_pred):
    y_true_copy = pd.DataFrame(y_true).reset_index(drop=True)
    y_true_copy['day'] = y_true_copy.index // 24
    y_true_grouped = y_true_copy.groupby(by='day').sum()   
    y_pred_copy = pd.DataFrame(y_pred).reset_index(drop=True)
    y_pred_copy['day'] = y_pred_copy.index // 24
    y_pred_grouped = y_pred_copy.groupby(by='day').sum()
    
    mae = mean_absolute_error(y_true_grouped, y_pred_grouped)
    mape = mean_absolute_percentage_error(y_true_grouped, y_pred_grouped)
    r2 = r2_score(y_true_grouped, y_pred_grouped)
    return mae, mape, r2

#### 1.5 Чтение файлов с данными
Данные объединяются в один датасет

In [3]:
transformer = DataTransformer() #инициализируем трансформер

In [4]:
all_ds, test_begin, test_end = transformer.open_file() #оставляем поле пустым что бы использовать открытый датасет

all_ds = transformer.transform(all_ds)

начало открытого теста: 2023-04-01 00:00:00     конец открытого теста: 2023-08-01 00:00:00


#### 1.10 Демонстрация сформированного датасета

In [5]:
# Итоговый набор колонок
all_ds.columns

Index(['date', 'time', 'target', 'temp', 'temp_pred', 'weather_pred',
       'weather_fact', 'cloudy', 'rainy', 'windy', 'clear', 'rain_probability',
       'has_rain_probability', 'holidays', 'preholidays', 'temp_last_day',
       'target_lag_24', 'target_lag_48', 'target_lag_72', 'target_lag_168',
       'target_lag_336', 'VVP', 'P', 'U', 'WW', 'Td', 'N', 'S', 'W', 'E'],
      dtype='object')

In [6]:
all_ds.head()

Unnamed: 0,date,time,target,temp,temp_pred,weather_pred,weather_fact,cloudy,rainy,windy,...,target_lag_336,VVP,P,U,WW,Td,N,S,W,E
0,2019-01-01,0,481.51,2.9,2.0,"пасм, ветер",ветер,2,0,1,...,0.0,1.8,763.5,100.0,1.0,2.0,1.0,0.0,0.0,0.0
1,2019-01-01,1,462.872,2.9,2.0,"пасм, ветер",ветер,2,0,1,...,0.0,1.8,764.3,93.0,1.0,1.0,1.0,0.0,0.0,0.0
2,2019-01-01,2,449.718,2.9,2.0,"пасм, ветер",ветер,2,0,1,...,0.0,1.8,765.0,93.0,0.0,2.0,1.0,0.0,0.0,0.0
3,2019-01-01,3,430.908,4.3,2.0,"пасм, ветер","ветер, пасм",2,0,1,...,0.0,1.8,765.8,87.0,0.0,1.0,1.0,0.0,0.0,0.0
4,2019-01-01,4,415.163,4.3,2.0,"пасм, ветер","ветер, пасм",2,0,1,...,0.0,1.8,766.6,87.0,0.0,1.0,1.0,0.0,0.0,0.0


#### 1.11 Исключение лишних колонок

In [7]:
# Отбираем признаки. Все лишние колонки здесь отбрасываем, кроме 'date', которую уберем позже 

feature_cols = list(all_ds.columns)

# выбрасываем взгляд в прошлое и расшифрованную погоду
drop_list = ['target', 'weather_pred', 'weather_fact', 'temp']

# выбрасываем признаки, найденные процедурно в процессе оптимизации
# КОМАНДЕ: здесь можно добавлять признаки на выброс с целью оптимизации
drop_list = drop_list + ['target_lag_48', 'target_lag_168', 'target_lag_336',
                        'target_lag_24', 'windy', 'clear',
                        'target_lag_72','has_rain_probability', #'temp_last_day',
                        'N', 'S', 'W', 'E', 'P','U', 'WW', 'Td', 'preholidays',  'cloudy',
 'rainy',
 'rain_probability','temp_pred', 'holidays', 'VVP'] 

for name in drop_list:
    feature_cols.remove(name)

# Итоговый список признаков
feature_cols

['date', 'time', 'temp_last_day']

#### 1.12 Выделение наборов данных для обучения, валидации и тестирования

Выделялось два набора данных для обучения и валидации:
1. Обучение на данных с 2019 по 2021 с валидацией на 2022
2. Обучение на данных с 2019 по 2022 с валидацией на первом квартале 2023

Первый набор позволяет оценить влияние сезонности на обучение и предсказания, второй позволяет обучить модель на большем объеме данных и на более актуальных данных.

In [8]:
# Формируем набор датасетов для обучения и проверки

features = all_ds[feature_cols]
target = all_ds['target']

# Функция для выделения временных интервалов из таблиц признаков и целей
# на этом этапе отбрасываем колонку 'date'
def features_interval(features, target, date1, date2):
    features_interval = features[ (features['date']>=date1) & (features['date']<date2) ]
    target_interval = target[features_interval.index]
    

    features_interval.loc[:, 'date'] = pd.to_datetime(features_interval['date'])

    # Преобразование столбца 'time' в timedelta
    features_interval.loc[:, 'time'] = pd.to_timedelta(features_interval['time'], unit='h')

    # Создание нового столбца 'datetime', объединяющего 'date' и 'time'
    features_interval.loc[:, 'datetime'] = features_interval['date'] + features_interval['time']

    # Установка столбца 'datetime' в качестве индекса
    features_interval.set_index('datetime', inplace=True)

    features_interval = features_interval.drop('date', axis=1)
    features_interval = features_interval.drop('time', axis=1)
    target_interval.index = features_interval.index

    return features_interval, target_interval


# для проверки на тестовой выборке будем учиться на всем тренировочном датасете
features_all_train, target_all_train = features_interval(features, target, '2019-01-01', test_begin)
features_open_test, target_open_test = features_interval(features, target, test_begin, test_end )


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_interval.loc[:, 'date'] = pd.to_datetime(features_interval['date'])
  features_interval.loc[:, 'date'] = pd.to_datetime(features_interval['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_interval.loc[:, 'time'] = pd.to_timedelta(features_interval['time'], unit='h')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

In [9]:
features_all_train.head(5)

Unnamed: 0_level_0,temp_last_day
datetime,Unnamed: 1_level_1
2019-01-01 00:00:00,2.0
2019-01-01 01:00:00,2.0
2019-01-01 02:00:00,2.0
2019-01-01 03:00:00,2.0
2019-01-01 04:00:00,2.0


In [10]:
features_open_test.head(5)

Unnamed: 0_level_0,temp_last_day
datetime,Unnamed: 1_level_1
2023-04-01 00:00:00,7.2
2023-04-01 01:00:00,7.2
2023-04-01 02:00:00,7.2
2023-04-01 03:00:00,7.9
2023-04-01 04:00:00,7.9


## AUTO TS

In [11]:
!pip install autots
!pip install scikit-learn
!pip install scipy
!pip install arch
!pip install pytorch-forecasting

!pip install gluonts
!pip install mxnet
!pip install prophet
!pip install statsmodels




In [12]:
!pip install tensorflow



In [13]:
!pip install holidays==0.35



In [14]:
!pip install git+https://github.com/ourownstory/neural_prophet.git

Collecting git+https://github.com/ourownstory/neural_prophet.git
  Cloning https://github.com/ourownstory/neural_prophet.git to /private/var/folders/4y/zccbjjq17fgd73999h5g3ltr0000gn/T/pip-req-build-j3079nhx
  Running command git clone --filter=blob:none --quiet https://github.com/ourownstory/neural_prophet.git /private/var/folders/4y/zccbjjq17fgd73999h5g3ltr0000gn/T/pip-req-build-j3079nhx
  Resolved https://github.com/ourownstory/neural_prophet.git to commit f08dcf814034527577abe21ad0bdf28e27d4eb66
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [15]:
!pip install torch==2.0.0




In [16]:
from autots import AutoTS, load_daily, load_hourly
from autots.models.model_list import model_lists
import tensorflow as tf


In [17]:
!pip install pytorch-lightning==1.9.4



In [18]:
!pip install torchvision==0.15



In [19]:
from pytorch_lightning.core.module import LightningModule


In [20]:
!pip install gluonts



from sklearn.metrics import pairwise
#import neuralprophet
import scipy
from arch import arch_model
import pytorch_forecasting 
from neuralprophet import NeuralProphet
from scipy.spatial.distance import cdist
from sklearn.preprocessing import StandardScaler
import gluonts.model
from prophet import Prophet
import statsmodels.api as sm

In [21]:
print(model_lists.keys())

dict_keys(['all', 'default', 'fast', 'superfast', 'parallel', 'fast_parallel', 'fast_parallel_no_arima', 'probabilistic', 'multivariate', 'univariate', 'no_params', 'recombination_approved', 'no_shared', 'no_shared_fast', 'experimental', 'slow', 'gpu', 'regressor', 'best', 'motifs', 'all_result_path', 'regressions', 'all_pragmatic', 'update_fit'])


In [22]:
df_train_autots = pd.concat([target_all_train, features_all_train ], axis=1)

In [23]:
print(model_lists['fast'])

{'ConstantNaive': 1, 'LastValueNaive': 1.5, 'AverageValueNaive': 1, 'GLS': 1, 'SeasonalNaive': 1, 'GLM': 1, 'ETS': 1, 'VAR': 0.8, 'VECM': 1, 'WindowRegression': 0.5, 'DatepartRegression': 0.8, 'UnivariateMotif': 1, 'MultivariateMotif': 0.8, 'SectionalMotif': 1, 'NVAR': 1, 'MAR': 1, 'RRVAR': 1, 'KalmanStateSpace': 1, 'MetricMotif': 1, 'Cassandra': 1, 'SeasonalityMotif': 1}


In [24]:
model_lists['regressor']

['GLM',
 'ARIMA',
 'FBProphet',
 'RollingRegression',
 'UnobservedComponents',
 'VECM',
 'DynamicFactor',
 'WindowRegression',
 'VAR',
 'DatepartRegression',
 'GluonTS',
 'UnivariateRegression',
 'MultivariateRegression',
 'SectionalMotif',
 'ARDL',
 'NeuralProphet',
 'ARCH',
 'Cassandra',
 'PreprocessingRegression']

In [25]:
model_list = model_lists['regressor']
drop_models = ['WindowRegression']
for name in drop_models:
    model_list.remove(name)
model_list

['GLM',
 'ARIMA',
 'FBProphet',
 'RollingRegression',
 'UnobservedComponents',
 'VECM',
 'DynamicFactor',
 'VAR',
 'DatepartRegression',
 'GluonTS',
 'UnivariateRegression',
 'MultivariateRegression',
 'SectionalMotif',
 'ARDL',
 'NeuralProphet',
 'ARCH',
 'Cassandra',
 'PreprocessingRegression']

In [26]:
model_list_2 = model_lists['fast']


model_list_2 = ['ARIMA', 'NeuralProphet', 'DatepartRegression','FBProphet' , 'Cassandra', 'GLM'] 
model_list_2 = ['ARIMA', 'FBProphet'] 


In [27]:
metric_weighting = {
    'mae_weighting': 5,
    'mape_weighting': 3,
    'rmse_weighting': 2,
#    'made_weighting': 0.5,
#    'mage_weighting': 1,
#    'mle_weighting': 0,
    'imle_weighting': 0,
#    'spl_weighting': 3,
    'containment_weighting': 0,
    'contour_weighting': 1,
    'runtime_weighting': 0.05,
}
model = AutoTS(
    forecast_length=24,
    frequency='infer',
    prediction_interval=0.9,
    ensemble='simple',
    model_list='fast', #model_list_2, #"regressor",  "superfast", "default", "fast_parallel"
    transformer_list="fast",  # "superfast",
    metric_weighting=metric_weighting,
    drop_most_recent=1,
    max_generations=2,
    n_jobs='auto',
    num_validations=2,
    validation_method="backwards"
)

Using 7 cpus for n_jobs.


In [28]:

model = model.fit(
    df_train_autots,
    weights={'target': 1}
)

Data frequency is: H, used frequency is: H
Model Number: 1 with model AverageValueNaive in generation 0 of 2
Model Number: 2 with model AverageValueNaive in generation 0 of 2
Model Number: 3 with model AverageValueNaive in generation 0 of 2
Model Number: 4 with model DatepartRegression in generation 0 of 2
Model Number: 5 with model DatepartRegression in generation 0 of 2




Model Number: 6 with model DatepartRegression in generation 0 of 2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Model Number: 7 with model DatepartRegression in generation 0 of 2
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Model Number: 8 with model ETS in generation 0 of 2
Model Number: 9 with model ETS in generation 0 of 2
Model Number: 10 with model GLM in generation 0 of 2
Model Number: 11 with model GLM in generation 0 of 2
Model Number: 12 with model GLS in generation 0 of 2
Model Number: 13 with model GLS in generation 0 of 2
Model Number: 14 with mode

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Model Number: 29 with model DatepartRegression in generation 0 of 2
Model Number: 30 with model ETS in generation 0 of 2
Model Number: 31 with model VECM in generation 0 of 2
Model Number: 32 with model MultivariateMotif in generation 0 of 2
Model Number: 33 with model MultivariateMotif in generation 0 of 2
Model Number: 34 with model UnivariateMotif in generation 0 of 2
Model Number: 35 with model UnivariateMotif in generation 0 of 2
Model Number: 36 with model SectionalMotif in generation 0 of 2
Model Number: 37 with model SectionalMotif in generation 0 of 2
Model Number: 38 with model SeasonalNaive in generation 0 of 2
Model Number: 39 with model DatepartRegression in generation 0 of 2
Model Number: 40 with model NVAR in generation 0 of 2
Model Number: 41 with model Cassandra in generation 0 of 2
Model Number: 42 with model SeasonalityMotif in generation 0 of 2
Model Number: 43 with model ConstantNaive in generation 0 of 2
Model Number: 44 with model LastValueNaive in generation 0 o

  return 1. / (self.link.deriv(mu)**2 * self.variance(mu))
  return np.power(z, 1. / self.power)
  resid_dev = -np.log(endog_mu) + (endog - mu) / mu
  return np.sum(resid / self.family.variance(mu)) / self.df_resid
  return 1. / (self.link.deriv(mu)**2 * self.variance(mu))
  wlsendog = (lin_pred + self.family.link.deriv(mu) * (self.endog-mu)


Template Eval Error: ValueError('NaN, inf or invalid value detected in weights, estimation infeasible.') in model 48 in generation 0: GLM
Model Number: 49 with model ETS in generation 0 of 2
Model Number: 50 with model VAR in generation 0 of 2
Model Number: 51 with model VECM in generation 0 of 2
Model Number: 52 with model WindowRegression in generation 0 of 2
Template Eval Error: AttributeError("'NoneType' object has no attribute 'split'") in model 52 in generation 0: WindowRegression
Model Number: 53 with model DatepartRegression in generation 0 of 2
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Model Number: 54 with model UnivariateMotif in generation 0 of 2
Model Number: 55 with model MultivariateMotif in

In [None]:
# Рисуем графическое представление предсказания на 2022 год

y_true_copy = pd.DataFrame(target_valid).reset_index(drop=True)
y_true_copy['day'] = y_true_copy.index // 24
y_true_grouped = y_true_copy.groupby(by='day').sum()   
y_true_grouped
y_pred_copy = pd.DataFrame(y_pred).reset_index(drop=True)
y_pred_copy['day'] = y_pred_copy.index // 24
y_pred_grouped = y_pred_copy.groupby(by='day').sum()

#pd.DataFrame(date_valid)
tmpdf = pd.DataFrame(train_ds.loc[features_valid.index,:]['date']).groupby(by='date').count().reset_index().join(y_true_grouped)
tmpdf.plot(x='date', y='target', figsize=(18,5))
ax=plt.gca()
tmpdf = pd.DataFrame(train_ds.loc[features_valid.index,:]['date']).groupby(by='date').count().reset_index().join(y_pred_grouped)
tmpdf.plot(ax=ax, x='date', y=0)

In [None]:
# Предсказываем той же моделью (19-21) тренировочный кусок 2023 (первый квартал)
mae = mae_day(target_2023, lgbm_model.predict(features_2023))
print(f'mae = {mae}')