# Привет, это блокнот команды 27 хакатона GlowHack2023.
Перед нами стояла задача прогнозирования энергопотребления Калининградской области на сутки вперед. После обсуждения вариантов решения задачи мы решили разделиться на подкоманды для тестирования разных подходов.

**Команда проекта:**


*   Артем Скрипчак (@daemonic_timmy)
*   Асылхан Кулжанов (@akulzhanov)
*   Илья Бледных (@IlkaXd)

*   Дмитрий (@Da_Vi_Mi)
*   Марат Гасанов (@ggassannovv)


*   Вячеслав (@watchslav)


*   Евгений Довбуш (@edovbysh)
*   Андрей Марченко (@we_r_1110)


*   Александр Яночкин (@AlError)






# Модель №1: LightGBM - Основная модель

### Импорт библиотек

In [1]:
!pip install statsmodels
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [56]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import re
# from ydata_profiling import ProfileReport
from datetime import datetime, time

from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from catboost import CatBoostRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from tqdm import notebook

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

STATE = 12345

### Импорт данных

In [101]:
df_train = pd.read_csv('/content/train_dataset.csv')
df_test = pd.read_csv('/content/test_dataset.csv')

### Предобработка данных

In [102]:
df = df_train.append(df_test)

In [103]:
def index_date(data):
    data['time'] = pd.to_datetime(data['time'], format='%H').dt.time
    data['date_time'] = pd.to_datetime(
        data['date'] + 'T' + data['time'].astype(str),
        format='%Y-%m-%dT%H:%M:%S'
    )
    data = data.set_index('date_time')
    print(['Хронологический порядок индекса отсутствует',
           'Индекс соответствует хронологическому порядку'][data.index.is_monotonic])
    data.drop(columns=['date', 'time'], axis=1, inplace=True)

    return data

In [104]:
df = index_date(df)

Индекс соответствует хронологическому порядку


In [105]:
df = df.ffill(axis=0)
df.isnull().sum()

target          0
temp            0
temp_pred       0
weather_pred    0
weather_fact    0
dtype: int64

### Прогноз погоды

In [106]:
def rainsnow(x):
    if len(re.findall(r'\b\d+\b', x)) != 0:
        temp = re.findall(r'\b\d+\b', x)
        result = int(temp[0])
    elif 'дожд' in x or 'снег' in x or 'ливень' in x or 'снегопад' in x:
        result = 100
    else:
        result = 0
    return result


In [107]:
df['rain_snow'] = df['weather_pred'].apply(lambda x: rainsnow(x)) / 2400

#### Wing - ветер; Summer  - ясно/солнечно; Cloudy - пасмурно

In [108]:
df['wing'] = df['weather_pred'].apply(lambda x: [0, 1/24]['ветер' in x])
df['summer'] = df['weather_pred'].apply(lambda x: [0, 1/24]['ясно' in x or 'солнечно' in x])
df['cloudy'] = df['weather_pred'].apply(lambda x: [0, 1/24]['пас' in x])

### Удаление неинформативных столбцов

In [109]:
df = df.drop(columns=['temp', 'weather_pred', 'weather_fact'], axis=1)

Прогноз температуры

In [110]:
df.temp_pred = ((df.temp_pred + 20) / 53) / 24

In [111]:
k = 0
for index, row in df.iterrows():
    try:
        float(row['target'])
    except ValueError:
        print(index, k, row['target'])
    k += 1

### Обучение

In [112]:
df_proba = df.copy()

### Feature Extraction

Для создания признаков напишем функцию make_features(), куда передадим датасет, количество отстающих значений и скользящее среднее

In [113]:
def make_features(data, max_lag, rolling_mean_size):
    data['date'] = data.index.day
    data['dayofweek'] = data.index.dayofweek
    data['month'] = data.index.month
    data['year'] = data.index.year
    data['hour'] = data.index.hour

    for lag in range(1, max_lag + 1):
        data['lag_{}'.format(lag)] = data['target'].shift(lag)

    data['rolling_mean'] = data['target'].shift().rolling(rolling_mean_size).mean()

Количество отстающих значений примем 96, скользящее среднее 10.

In [114]:
make_features(df_proba, 10, 10)

### Обучающие и тестовые выборки

Разобъем датасет на обучающую и тестовую выборки

In [115]:
train = df_proba[:'2023-03-31']
test = df_proba['2023-04-01':]

In [116]:
for i in [train, test]:
    print(i.index.min(), i.index.max())

2019-01-01 00:00:00 2023-03-31 23:00:00
2023-04-01 00:00:00 2023-07-31 23:00:00


In [117]:
train = train.dropna()

In [118]:
X_train = train.drop('target', axis=1)
y_train = train['target']

X_test = test.drop('target', axis=1)
y_test = test['target']

### LightGBM

Произведем замену значений категориальных признаков на 'category' и сохраним тренировочные и тестовую выборку под новыми переменными X_train_cat, X_test_cat.

In [119]:
category_features = ['rain_snow', 'wing', 'summer', 'cloudy', 'date', 'dayofweek', 'month', 'year', 'hour']
X_train_cat = X_train.copy()
X_test_cat = X_test.copy()

for i in (X_train_cat, X_test_cat):
    for j in category_features:
        i[j] = i[j].astype('category')
    print(i.info())
    print()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 37214 entries, 2019-01-01 10:00:00 to 2023-03-31 23:00:00
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   temp_pred     37214 non-null  float64 
 1   rain_snow     37214 non-null  category
 2   wing          37214 non-null  category
 3   summer        37214 non-null  category
 4   cloudy        37214 non-null  category
 5   date          37214 non-null  category
 6   dayofweek     37214 non-null  category
 7   month         37214 non-null  category
 8   year          37214 non-null  category
 9   hour          37214 non-null  category
 10  lag_1         37214 non-null  float64 
 11  lag_2         37214 non-null  float64 
 12  lag_3         37214 non-null  float64 
 13  lag_4         37214 non-null  float64 
 14  lag_5         37214 non-null  float64 
 15  lag_6         37214 non-null  float64 
 16  lag_7         37214 non-null  float64 
 17  lag_8         3

Подбор гиперпараметров

In [120]:
%%time

params_cat = {
    'depth': [3, 6, 9],
    'learning_rate': [0.01, 0.001]
}

tscv = TimeSeriesSplit()

booster = lgb.LGBMRegressor(
    objective='regression_l2',
    n_estimators=1000,
    categorical_feature = category_features,
    random_state=STATE)

grid_lgb = GridSearchCV(
    estimator=booster,
    param_grid=params_cat,
    scoring='neg_mean_absolute_error',
    cv=tscv,
    n_jobs=-1,
    verbose=0)

grid_lgb.fit(X_train_cat, y_train)

Please use categorical_feature argument of the Dataset constructor to pass this parameter.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3052
[LightGBM] [Info] Number of data points in the train set: 37214, number of used features: 21
[LightGBM] [Info] Start training from score 483.746796
CPU times: user 7.22 s, sys: 247 ms, total: 7.47 s
Wall time: 2min 22s


In [121]:
print('Лучшие параметры:', grid_lgb.best_params_)
print('MAE train LightGBM:', -round(grid_lgb.best_score_, 2))

Лучшие параметры: {'depth': 3, 'learning_rate': 0.01}
MAE train LightGBM: 4.99


Прогноз с метриками

In [122]:
predict_lgb = grid_lgb.predict(X_test_cat)
df_predict_lgb = pd.DataFrame(data=predict_lgb, index=X_test.index, columns=['predict'])
df_res_test = pd.DataFrame(data=y_test, index=y_test.index, columns=['target'])
df_res_test = df_res_test.merge(df_predict_lgb, left_index=True, right_index=True)
df_res_test_day = df_res_test.resample('1D').sum()
print(f'MAE: {round(mean_absolute_error(df_res_test_day.target, df_res_test_day.predict), 3)}')

MAE: 23.048


# Модель №2: Prophet

### Импорт библиотек

In [None]:
!pip install prophet



In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from matplotlib import pyplot
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

### Импорт данных

In [None]:
df_train = pd.read_csv('/content/train_dataset.csv', delimiter=',')
df_test = pd.read_csv('/content/test_dataset.csv', delimiter=',')

df_train['datetime'] = df_train['date'] + ' ' + df_train['time'].astype(str) + ':00'
df_train['datetime'] = pd.to_datetime(df_train['datetime'], format="%Y-%m-%d %H:%M")
df_test['datetime'] = df_test['date'] + ' ' + df_test['time'].astype(str) + ':00'
df_test['datetime'] = pd.to_datetime(df_test['datetime'], format="%Y-%m-%d %H:%M")


### Предобработка данных

In [None]:
dataset_train = df_train.copy()
dataset_test = df_test.copy()


dataset_train = dataset_train.drop(columns=["date", "time",  "weather_pred", "weather_fact", "temp_pred"])#
dataset_train.rename(columns = {'target':'y', 'datetime':'ds'}, inplace = True )
dataset_test = dataset_test.drop(columns=["date", "time",  "weather_pred", "weather_fact", "temp_pred"]) #
dataset_test.rename(columns = {'target':'y', 'datetime':'ds'}, inplace = True )

predictions_len = len(dataset_test['y'].to_list())

### Подбор гиперпараметров (запускать не надо)

Создание сетки гиперпараметров

In [None]:
from sklearn.model_selection import ParameterGrid
params_grid = {
    'seasonality_mode': ['additive'],
    'weekly_prior_scale': [0.001, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.9],
    'yearly_prior_scale': [0.001, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.9],
    'changepoint_prior_scale':[0.001, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.9],
    'n_changepoints' : [1, 5, 10, 25, 50, 100]
}

grid = ParameterGrid(params_grid)
cnt = 0
for p in grid:
    cnt = cnt+1

print('Total Possible Models', cnt)

Подбор наиболее подходящих

In [None]:
import random

model_parameters = pd.DataFrame(columns = ['MAPE', 'MAE', 'MSE', 'Parameters'])
for p in grid:
    test = pd.DataFrame()
    print(p)
    random.seed(42)
    train_model = Prophet(
        changepoint_prior_scale = p['changepoint_prior_scale'],
        n_changepoints = p['n_changepoints'],
        seasonality_mode = p['seasonality_mode'],
        weekly_seasonality=False,
        daily_seasonality = True,
        yearly_seasonality = False
        )

    train_model.add_seasonality(
        name='weekly',
        period=7,
        fourier_order=5,
        prior_scale=p['weekly_prior_scale'],
        mode='additive'
    )

    train_model.add_seasonality(
        name='yearly',
        period=365.25,
        fourier_order=5,
        prior_scale=p['yearly_prior_scale'],
        mode='additive'
    )
    train_model.fit(dataset_train)
    future = train_model.make_future_dataframe(predictions_len, freq='H')
    forecast = train_model.predict(future)

    MAPE = mean_absolute_percentage_error(dataset_test['y'].to_list(), forecast.yhat[-predictions_len:].to_list()) * 100
    MAE = mean_absolute_error(dataset_test['y'].to_list(), forecast.yhat[-predictions_len:].to_list())
    MSE = mean_squared_error(dataset_test['y'].to_list(), forecast.yhat[-predictions_len:].to_list())

    print('Mean Absolute Error(MAE)----------------------------------------',MAE)
    print('Mean Absolute Percentage Error(MAPE)----------------------------',MAPE)
    print('Mean Squared Error(MSE)-----------------------------------------',MSE)
    model_parameters = model_parameters.append({'MAPE': MAPE, 'MAE': MAE, 'MSE': MSE, 'Parameters': p}, ignore_index=True)



### Инициализация модели с подобранными гиперпараметрами. Модель строит ежечасовой прогноз.

In [None]:
m = Prophet(
    yearly_seasonality=False,
    weekly_seasonality=False,
    daily_seasonality=True,
    seasonality_mode='additive',
    changepoint_prior_scale=0.2,
    n_changepoints=25
)

m.add_seasonality(
    name='weekly',
    period=7,
    fourier_order=5,
    prior_scale=0.2,
    mode='additive'
)
m.add_seasonality(
    name='yearly',
    period=365.25,
    fourier_order=5,
    prior_scale=0.05,
    mode='additive'
)

m.fit(dataset_train)
future = m.make_future_dataframe(predictions_len, freq='H')
forecast = m.predict(future)

DEBUG:cmdstanpy:input tempfile: /tmp/tmppjf6dnyi/iairod0q.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmppjf6dnyi/toblyth7.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=77756', 'data', 'file=/tmp/tmppjf6dnyi/iairod0q.json', 'init=/tmp/tmppjf6dnyi/toblyth7.json', 'output', 'file=/tmp/tmppjf6dnyi/prophet_modelj05yp34i/prophet_model-20231023112127.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
11:21:27 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
11:22:21 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


### Группировка результатов прогноза по дням и расчет метрик

In [None]:
forecast_df = forecast.copy()
forecast_df['date'] = forecast_df['ds'].dt.date
forecast_df = pd.DataFrame(forecast_df.groupby('date').sum())
forecast_df = forecast_df.loc[:, forecast_df.columns.intersection(['date','yhat'])]
forecast_df.reset_index(inplace=True)

dataset_test['date'] = dataset_test['ds'].dt.date
dataset_test = pd.DataFrame(dataset_test.groupby('date').sum())
dataset_test.reset_index(inplace=True)

predictions_len = len(dataset_test)

print("MAE:", mean_absolute_error(dataset_test['y'].to_list(), forecast_df.yhat[-predictions_len:].to_list()))
print("MAPE:", mean_absolute_percentage_error(dataset_test['y'].to_list(), forecast_df.yhat[-predictions_len:].to_list()) * 100, "%")

MAE: 280.1792423457857
MAPE: 2.711239857701512 %


  forecast_df = pd.DataFrame(forecast_df.groupby('date').sum())
  dataset_test = pd.DataFrame(dataset_test.groupby('date').sum())


# Модель №3: CatBoost

### Импорт библиотек

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.feature_extraction.text import TfidfVectorizer
import datetime

### Импорт данных

In [None]:
train_df = pd.read_csv('/content/train_dataset.csv')
test_df = pd.read_csv('/content/test_dataset.csv')

### Предобработка данных

In [None]:
df = train_df.groupby('date').agg({'target': 'sum', 'temp': 'mean', 'temp_pred': 'std', 'time': 'std', 'weather_fact': 'last', 'weather_pred': 'last'})
df_t = test_df.groupby('date').agg({'target': 'sum', 'temp': 'mean', 'temp_pred': 'std', 'time': 'std', 'weather_fact': 'last', 'weather_pred': 'last'})

In [None]:
time_list = train_df.groupby('date').agg({'target': 'std'}).iloc[:]['target'].apply(lambda x: np.sin(x) + 1).values
df['time_noise'] = time_list

In [None]:
df = df.drop('time', axis=1)
test_time_list = test_df.groupby('date').agg({'target': 'std'}).iloc[:]['target'].apply(lambda x: np.sin(x) + 1).values
df_t['time_noise'] = test_time_list

In [None]:
df_t = df_t.drop('time', axis=1)
df_t = df_t.reset_index()

In [None]:
df.reset_index(inplace=True)
df['date'] = pd.to_datetime(df['date'])
df_t['date'] = pd.to_datetime(df_t['date'])


### Разбиение на обучающую и тестовую выборки

In [None]:
X_test, X_train =  df_t.drop('target', axis=1), df.drop('target', axis=1)
Y_test, Y_train = df_t['target'], df['target']

### CatBoost

In [None]:
model = CatBoostRegressor(text_features=['weather_fact', 'weather_pred'], learning_rate=0.01, max_depth=13, l2_leaf_reg=0.1, task_type='GPU', n_estimators=1000, min_child_samples=3)
model.fit(X_train, Y_train)

0:	learn: 1822.8361914	total: 470ms	remaining: 7m 49s
1:	learn: 1807.8749721	total: 769ms	remaining: 6m 23s
2:	learn: 1792.8398143	total: 1.06s	remaining: 5m 54s
3:	learn: 1777.7542814	total: 1.08s	remaining: 4m 29s
4:	learn: 1762.8231738	total: 1.38s	remaining: 4m 33s
5:	learn: 1747.9716988	total: 1.68s	remaining: 4m 39s
6:	learn: 1733.9441665	total: 1.7s	remaining: 4m 1s
7:	learn: 1720.4356247	total: 1.76s	remaining: 3m 37s
8:	learn: 1706.1840388	total: 1.96s	remaining: 3m 35s
9:	learn: 1693.3755834	total: 1.97s	remaining: 3m 15s
10:	learn: 1679.3526738	total: 2.27s	remaining: 3m 24s
11:	learn: 1665.6075159	total: 2.3s	remaining: 3m 9s
12:	learn: 1651.8469557	total: 2.62s	remaining: 3m 18s
13:	learn: 1638.2292968	total: 2.91s	remaining: 3m 25s
14:	learn: 1624.6100622	total: 3.21s	remaining: 3m 30s
15:	learn: 1611.6057228	total: 3.52s	remaining: 3m 36s
16:	learn: 1598.8356305	total: 3.56s	remaining: 3m 25s
17:	learn: 1585.6864333	total: 3.85s	remaining: 3m 30s
18:	learn: 1572.6165280	

<catboost.core.CatBoostRegressor at 0x7cc16c248910>

Проверка метрик

In [None]:
MAE = mean_absolute_error(Y_test, model.predict(X_test))
MAPE = mean_absolute_percentage_error(Y_test, model.predict(X_test)) * 100
print(f"MAE: {MAE}\nMAPE: {MAPE} %")

MAE: 969.4987976869545
MAPE: 9.900422556015258 %


### Предобработка данных №2

In [None]:
train_df['weather_pred'] = train_df['weather_pred'].fillna(method='backfill')
train_df['weather_fact'] = train_df['weather_fact'].fillna(method='backfill')

In [None]:
text_wf = train_df['weather_fact'].unique()
tfidf = TfidfVectorizer()
tfidf.fit(text_wf)

In [None]:
df_for_tf = train_df[['weather_fact', 'date', 'target']].drop_duplicates()
index = list(df_for_tf.index)

In [None]:
mean_list = []
for i in index:
    f = pd.DataFrame(tfidf.transform([df_for_tf['weather_fact'][i]]).T.todense(),
                  index=tfidf.get_feature_names_out(),
                  columns=['tfidf'])
    mean_list.append(100*f['tfidf'].mean())
mean_list

[0.7352941176470588,
 0.7352941176470588,
 0.7352941176470588,
 1.0395234771633404,
 1.0395234771633404,
 1.0395234771633404,
 1.2725296589060697,
 1.2725296589060697,
 1.2725296589060697,
 1.2725296589060697,
 1.2725296589060697,
 1.2725296589060697,
 1.2725296589060697,
 1.2725296589060697,
 1.2725296589060697,
 1.2725296589060697,
 1.2725296589060697,
 1.2725296589060697,
 1.0245154174194735,
 1.0245154174194735,
 1.0245154174194735,
 1.0245154174194735,
 1.0245154174194735,
 1.0245154174194735,
 1.2725296589060697,
 1.2725296589060697,
 1.2725296589060697,
 1.2725296589060697,
 1.2725296589060697,
 1.2725296589060697,
 1.0395234771633404,
 1.0395234771633404,
 1.0395234771633404,
 1.0395234771633404,
 1.0395234771633404,
 1.0395234771633404,
 1.3783302285554555,
 1.3783302285554555,
 1.3783302285554555,
 1.3783302285554555,
 1.3783302285554555,
 1.3783302285554555,
 1.3783302285554555,
 1.3783302285554555,
 1.3783302285554555,
 1.3783302285554555,
 1.3783302285554555,
 1.3783302285

In [None]:
df_for_tf['tfidf_fact'] = mean_list
df_for_tf = df_for_tf.drop(columns=["weather_fact"])
df_for_tf

Unnamed: 0,date,target,tfidf_fact
0,2019-01-01,481.510,0.735294
1,2019-01-01,462.872,0.735294
2,2019-01-01,449.718,0.735294
3,2019-01-01,430.908,1.039523
4,2019-01-01,415.163,1.039523
...,...,...,...
37219,2023-03-31,552.960,1.018757
37220,2023-03-31,563.985,1.018757
37221,2023-03-31,560.191,1.018757
37222,2023-03-31,538.796,1.018757


In [None]:
text_wp = train_df['weather_pred'].unique()
tfidf = TfidfVectorizer()
tfidf.fit(text_wp)

In [None]:
df_for_tfwp = train_df[['weather_pred', 'date', 'target']].drop_duplicates()
index = list(df_for_tfwp.index)

In [None]:
mean_list2 = []
for i in index:
    f = pd.DataFrame(tfidf.transform([df_for_tfwp['weather_pred'][i]]).T.todense(),
                  index=tfidf.get_feature_names_out(),
                  columns=['tfidf'])
    mean_list2.append(100*f['tfidf'].mean())
mean_list2

[0.7878906556213314,
 0.7878906556213314,
 0.7878906556213314,
 0.7878906556213314,
 0.7878906556213314,
 0.7878906556213314,
 0.7878906556213314,
 0.7878906556213314,
 0.7878906556213314,
 0.852280290184671,
 0.852280290184671,
 0.852280290184671,
 0.852280290184671,
 0.852280290184671,
 0.852280290184671,
 0.852280290184671,
 0.852280290184671,
 0.852280290184671,
 0.852280290184671,
 0.852280290184671,
 0.852280290184671,
 0.852280290184671,
 0.852280290184671,
 0.852280290184671,
 0.9854167006869199,
 0.9854167006869199,
 0.9854167006869199,
 0.9854167006869199,
 0.9854167006869199,
 0.9854167006869199,
 0.7878906556213314,
 0.7878906556213314,
 0.7878906556213314,
 0.938255728107616,
 0.938255728107616,
 0.938255728107616,
 0.938255728107616,
 0.938255728107616,
 0.938255728107616,
 0.938255728107616,
 0.938255728107616,
 0.938255728107616,
 0.938255728107616,
 0.938255728107616,
 0.938255728107616,
 0.938255728107616,
 0.938255728107616,
 0.938255728107616,
 0.8613820028476924,
 

In [None]:
df_for_tfwp['tfidf_pred'] = mean_list2
df_for_tfwp = df_for_tfwp.drop(columns=['weather_pred'])
df_for_tfwp

Unnamed: 0,date,target,tfidf_pred
0,2019-01-01,481.510,0.787891
1,2019-01-01,462.872,0.787891
2,2019-01-01,449.718,0.787891
3,2019-01-01,430.908,0.787891
4,2019-01-01,415.163,0.787891
...,...,...,...
37219,2023-03-31,552.960,0.701693
37220,2023-03-31,563.985,0.701693
37221,2023-03-31,560.191,0.701693
37222,2023-03-31,538.796,0.701693


In [None]:
tf = pd.concat([df_for_tf, df_for_tfwp.drop(['target', 'date'], axis=1)], axis=1)
tf = tf.groupby('date').agg({'tfidf_fact': 'mean', 'tfidf_pred': 'std'}).reset_index()

In [None]:
train_df

Unnamed: 0,date,time,target,temp,temp_pred,weather_pred,weather_fact
0,2019-01-01,0,481.510,2.9,2.0,"пасм, ветер",ветер
1,2019-01-01,1,462.872,2.9,2.0,"пасм, ветер",ветер
2,2019-01-01,2,449.718,2.9,2.0,"пасм, ветер",ветер
3,2019-01-01,3,430.908,4.3,2.0,"пасм, ветер","ветер, пасм"
4,2019-01-01,4,415.163,4.3,2.0,"пасм, ветер","ветер, пасм"
...,...,...,...,...,...,...,...
37219,2023-03-31,19,552.960,7.9,6.0,"пасм, 61%","пасм, дымка"
37220,2023-03-31,20,563.985,7.9,6.0,"пасм, 61%","пасм, дымка"
37221,2023-03-31,21,560.191,6.3,5.0,"пасм, 61%","пасм, дымка"
37222,2023-03-31,22,538.796,6.3,5.0,"пасм, 61%","пасм, дымка"


In [None]:
new_train = pd.merge(train_df, tf, how='inner', on=['date'])

In [None]:
new_train['date'] = pd.to_datetime(new_train['date'])

In [None]:
test_df['weather_pred'] = test_df['weather_pred'].fillna(method='backfill')
test_df['weather_fact'] = test_df['weather_fact'].fillna(method='backfill')


In [None]:
test_wf = test_df['weather_fact'].unique()
tfidf = TfidfVectorizer()
tfidf.fit(test_wf)
test_for_tf = test_df[['weather_fact', 'date', 'target']].drop_duplicates()
index = list(test_for_tf.index)
mean_list = []
for i in index:
    f = pd.DataFrame(tfidf.transform([test_for_tf['weather_fact'][i]]).T.todense(),
                  index=tfidf.get_feature_names_out(),
                  columns=['tfidf'])
    mean_list.append(100*f['tfidf'].mean())
mean_list
test_for_tf['tfidf_fact'] = mean_list
test_for_tf = test_for_tf.drop('weather_fact', axis=1)

In [None]:
test_wp = test_df['weather_pred'].unique()
tfidf = TfidfVectorizer()
tfidf.fit(test_wp)
test_for_tfwp = test_df[['weather_pred', 'date', 'target']].drop_duplicates()
index = list(test_for_tfwp.index)
mean_list2 = []
for i in index:
    f = pd.DataFrame(tfidf.transform([test_for_tfwp['weather_pred'][i]]).T.todense(),
                  index=tfidf.get_feature_names_out(),
                  columns=['tfidf'])
    mean_list2.append(100*f['tfidf'].mean())
mean_list2
test_for_tfwp['tfidf_pred'] = mean_list2
test_for_tfwp = test_for_tfwp.drop('weather_pred', axis=1)

In [None]:
tf_test = pd.concat([test_for_tf, test_for_tfwp.drop(['target', 'date'], axis=1)], axis=1)
tf_test = tf_test.groupby('date').agg({'tfidf_fact': 'mean', 'tfidf_pred': 'std'}).reset_index()


In [None]:
new_test = pd.merge(test_df, tf_test, how='inner', on=['date'])


In [None]:
new_test['date'] = pd.to_datetime(new_test['date'])

In [None]:
new_train

Unnamed: 0,date,time,target,temp,temp_pred,weather_pred,weather_fact,tfidf_fact,tfidf_pred
0,2019-01-01,0,481.510,2.9,2.0,"пасм, ветер",ветер,1.114246,0.031843
1,2019-01-01,1,462.872,2.9,2.0,"пасм, ветер",ветер,1.114246,0.031843
2,2019-01-01,2,449.718,2.9,2.0,"пасм, ветер",ветер,1.114246,0.031843
3,2019-01-01,3,430.908,4.3,2.0,"пасм, ветер","ветер, пасм",1.114246,0.031843
4,2019-01-01,4,415.163,4.3,2.0,"пасм, ветер","ветер, пасм",1.114246,0.031843
...,...,...,...,...,...,...,...,...,...
37219,2023-03-31,19,552.960,7.9,6.0,"пасм, 61%","пасм, дымка",0.958296,0.088839
37220,2023-03-31,20,563.985,7.9,6.0,"пасм, 61%","пасм, дымка",0.958296,0.088839
37221,2023-03-31,21,560.191,6.3,5.0,"пасм, 61%","пасм, дымка",0.958296,0.088839
37222,2023-03-31,22,538.796,6.3,5.0,"пасм, 61%","пасм, дымка",0.958296,0.088839


In [None]:
new_train['month'] = new_train.reset_index()['date'].iloc[:].apply(lambda x: np.datetime64(x).astype(datetime.date).month)
new_test['month'] = new_test.reset_index()['date'].iloc[:].apply(lambda x: np.datetime64(x).astype(datetime.date).month)

In [None]:
new_train['iswinter'] = new_train.iloc[:]['month'].apply(lambda x: 1 if x in [1, 2, 3, 11, 12] else 0)
new_test['iswinter'] = new_test.iloc[:]['month'].apply(lambda x: 1 if x in [1, 2, 3, 11, 12] else 0)


In [None]:
new_train['iscorona'] = new_train.reset_index().iloc[:]['date'].apply(lambda x: 1 if ((x >= datetime.datetime(2020, 1, 1)) & (x <= datetime.datetime(2020, 12, 31))) else 0)
new_test['iscorona'] = new_test.reset_index().iloc[:]['date'].apply(lambda x: 1 if ((x >= datetime.datetime(2020, 1, 1)) & (x <= datetime.datetime(2020, 12, 31))) else 0)


In [None]:
df_new_train = new_train.groupby('date').agg({'tfidf_fact': 'mean', 'tfidf_pred': 'median', 'target': 'sum', 'temp': 'mean', 'temp_pred': 'std', 'time': 'std', 'weather_fact': 'last', 'weather_pred': 'last', 'month': 'last', 'iswinter': 'first', 'iscorona': 'first'})

In [None]:
df_new_test = new_test.groupby('date').agg({'tfidf_fact': 'first', 'tfidf_pred': 'first', 'target': 'sum', 'temp': 'mean', 'temp_pred': 'std', 'time': 'std', 'weather_fact': 'last', 'weather_pred': 'last',  'month': 'last', 'iswinter': 'first', 'iscorona': 'first'})

Разбиение на обучающую и тестовую выборку

In [None]:
X_test, X_train =  df_new_test.reset_index().drop('target', axis=1), df_new_train.reset_index().drop('target', axis=1)
Y_test, Y_train = df_new_test.reset_index()['target'],  df_new_train.reset_index()['target']

CatBoost №2

In [None]:
model = CatBoostRegressor(text_features=['weather_fact', 'weather_pred'], learning_rate=0.1, max_depth=15, l2_leaf_reg=0.1, task_type='GPU', n_estimators=100, min_child_samples=2)

In [None]:
model.fit(X_train.set_index('date').drop('time', axis=1), Y_train)


0:	learn: 1678.3439690	total: 203ms	remaining: 20.1s
1:	learn: 1530.2549987	total: 348ms	remaining: 17s
2:	learn: 1396.2118336	total: 439ms	remaining: 14.2s
3:	learn: 1277.3915395	total: 536ms	remaining: 12.9s
4:	learn: 1174.7530945	total: 625ms	remaining: 11.9s
5:	learn: 1079.8555865	total: 714ms	remaining: 11.2s
6:	learn: 995.4031599	total: 809ms	remaining: 10.8s
7:	learn: 917.3999196	total: 898ms	remaining: 10.3s
8:	learn: 848.5069346	total: 987ms	remaining: 9.97s
9:	learn: 790.6148396	total: 1.07s	remaining: 9.64s
10:	learn: 736.9240968	total: 1.17s	remaining: 9.44s
11:	learn: 686.3329781	total: 1.25s	remaining: 9.17s
12:	learn: 643.5143357	total: 1.34s	remaining: 8.97s
13:	learn: 606.5224164	total: 1.43s	remaining: 8.77s
14:	learn: 570.9643770	total: 1.52s	remaining: 8.6s
15:	learn: 539.9346332	total: 1.6s	remaining: 8.41s
16:	learn: 513.6348014	total: 1.69s	remaining: 8.26s
17:	learn: 489.1066924	total: 1.82s	remaining: 8.3s
18:	learn: 469.4953050	total: 1.96s	remaining: 8.36s
19

<catboost.core.CatBoostRegressor at 0x7cc1607890f0>

In [None]:
MAE = mean_absolute_error(Y_test, model.predict(X_test.set_index('date').drop('time', axis=1)))
MAPE = mean_absolute_percentage_error(Y_test, model.predict(X_test.set_index('date').drop('time', axis=1))) * 100
print(f"MAE: {MAE}; MAPE: {MAPE} %")

MAE: 388.56975273410285; MAPE: 3.890435129107722 %


---



# Модель №4: XGBoost
### Сделано на скорую руку

### Импорт библиотек

In [None]:
import xgboost as xgb
import numpy as np
import pandas as pd
import plotly.express as px
from matplotlib import pyplot
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

### Импорт данных

In [None]:
df_train = pd.read_csv('/content/train_dataset.csv', delimiter=',')
df_test = pd.read_csv('/content/test_dataset.csv', delimiter=',')

df_train['datetime'] = df_train['date'] + ' ' + df_train['time'].astype(str) + ':00'
df_train['datetime'] = pd.to_datetime(df_train['datetime'], format="%Y-%m-%d %H:%M")
df_test['datetime'] = df_test['date'] + ' ' + df_test['time'].astype(str) + ':00'
df_test['datetime'] = pd.to_datetime(df_test['datetime'], format="%Y-%m-%d %H:%M")


Предобработка данных

In [None]:
df_train = pd.read_csv('/content/train_dataset.csv', delimiter=',')
df_test = pd.read_csv('/content/test_dataset.csv', delimiter=',')

df_train['datetime'] = df_train['date'] + ' ' + df_train['time'].astype(str) + ':00'
df_train['datetime'] = pd.to_datetime(df_train['datetime'], format="%Y-%m-%d %H:%M")
df_test['datetime'] = df_test['date'] + ' ' + df_test['time'].astype(str) + ':00'
df_test['datetime'] = pd.to_datetime(df_test['datetime'], format="%Y-%m-%d %H:%M")

df_train2 = pd.DataFrame(df_train.groupby('date').sum()).drop(columns=['time'])
df_test2 =  pd.DataFrame(df_test.groupby('date').sum()).drop(columns=['time'])

df_train2.reset_index(inplace=True)
df_test2.reset_index(inplace=True)

fig = px.line(df_train2, x="date", y="target")
fig.show()

In [None]:
def code_mean(data, cat_feature, real_feature):
    """
    Возвращает словарь, где ключами являются уникальные категории признака cat_feature,
    а значениями - средние по real_feature
    """
    return dict(data.groupby(cat_feature)[real_feature].mean())

In [None]:
dataset = df_train.copy()

dat = pd.DataFrame(dataset)
dat['weekday'] = pd.to_datetime(dat['datetime']).dt.dayofweek  # monday = 0, sunday = 6
dat["hour"] = pd.to_datetime(dat['datetime']).dt.hour
dat['is_weekend'] = 0          # Initialize the column with default value of 0
dat.loc[dat['weekday'].isin([5, 6]), 'is_weekend'] = 1  # 5 and 6 correspond to Sat and Sun
dat

Unnamed: 0,date,time,target,temp,temp_pred,weather_pred,weather_fact,datetime,weekday,hour,is_weekend
0,2019-01-01,0,481.510,2.9,2.0,"пасм, ветер",ветер,2019-01-01 00:00:00,1,0,0
1,2019-01-01,1,462.872,2.9,2.0,"пасм, ветер",ветер,2019-01-01 01:00:00,1,1,0
2,2019-01-01,2,449.718,2.9,2.0,"пасм, ветер",ветер,2019-01-01 02:00:00,1,2,0
3,2019-01-01,3,430.908,4.3,2.0,"пасм, ветер","ветер, пасм",2019-01-01 03:00:00,1,3,0
4,2019-01-01,4,415.163,4.3,2.0,"пасм, ветер","ветер, пасм",2019-01-01 04:00:00,1,4,0
...,...,...,...,...,...,...,...,...,...,...,...
37219,2023-03-31,19,552.960,7.9,6.0,"пасм, 61%","пасм, дымка",2023-03-31 19:00:00,4,19,0
37220,2023-03-31,20,563.985,7.9,6.0,"пасм, 61%","пасм, дымка",2023-03-31 20:00:00,4,20,0
37221,2023-03-31,21,560.191,6.3,5.0,"пасм, 61%","пасм, дымка",2023-03-31 21:00:00,4,21,0
37222,2023-03-31,22,538.796,6.3,5.0,"пасм, 61%","пасм, дымка",2023-03-31 22:00:00,4,22,0


In [None]:
dataset = df_test.copy()

dat2 = pd.DataFrame(dataset)
dat2['weekday'] = pd.to_datetime(dat2['datetime']).dt.dayofweek  # monday = 0, sunday = 6
dat2["hour"] = pd.to_datetime(dat2['datetime']).dt.hour
dat2['is_weekend'] = 0          # Initialize the column with default value of 0
dat2.loc[dat['weekday'].isin([5, 6]), 'is_weekend'] = 1  # 5 and 6 correspond to Sat and Sun
dat2

Unnamed: 0,date,time,target,temp,temp_pred,weather_pred,weather_fact,datetime,weekday,hour,is_weekend
0,2023-04-01,0,479.282,5.7,5.0,"пасм, 58%","пасм, морось",2023-04-01 00:00:00,5,0,0
1,2023-04-01,1,445.182,5.7,5.0,"пасм, 58%","пасм, морось",2023-04-01 01:00:00,5,1,0
2,2023-04-01,2,424.225,5.7,5.0,"пасм, 58%","пасм, морось",2023-04-01 02:00:00,5,2,0
3,2023-04-01,3,413.866,5.0,4.0,"пасм, 71% дождь","пасм, дымка",2023-04-01 03:00:00,5,3,0
4,2023-04-01,4,408.146,5.0,4.0,"пасм, 71% дождь","пасм, дымка",2023-04-01 04:00:00,5,4,0
...,...,...,...,...,...,...,...,...,...,...,...
2923,2023-07-31,19,453.173,17.2,18.0,"пасм, 24% дождь",п/обл,2023-07-31 19:00:00,0,19,0
2924,2023-07-31,20,446.287,17.2,18.0,"пасм, 24% дождь",п/обл,2023-07-31 20:00:00,0,20,0
2925,2023-07-31,21,444.373,15.8,17.0,"пасм, 24% дождь",п/обл,2023-07-31 21:00:00,0,21,0
2926,2023-07-31,22,449.078,15.8,17.0,"пасм, 24% дождь",п/обл,2023-07-31 22:00:00,0,22,0


Преобразование данных для xgboost

In [None]:
def prepareData(data, lag_start=5, lag_end=20, test_size=0.15):

    data = pd.DataFrame(data.copy())

    # добавляем лаги исходного ряда в качестве признаков
    for i in range(lag_start, lag_end):
        data["lag_{}".format(i)] = data["target"].shift(i)

    # считаем средние только по тренировочной части, чтобы избежать лика
    data['weekday_average'] = pd.Series(map(code_mean(data, 'weekday', "target").get, data["weekday"]))
    data['weekend_average'] = pd.Series(map(code_mean(data, 'is_weekend', "target").get, data["is_weekend"]))
    data['hour_average'] = pd.Series(map(code_mean(data, 'hour', "target").get, data["hour"]))
    data['datetime'] = data['datetime'].values.astype("float64")
    data['temp_average'] = pd.Series(map(code_mean(data, 'hour', "temp").get, data["temp"]))

    # выкидываем закодированные средними признаки
    data.drop(["weekday", "hour", "is_weekend", "weather_pred", "weather_fact", "temp_pred", "date", "time"], axis=1, inplace=True) # , "Date", "Time"

    data = data.dropna()
    data = data.reset_index(drop=True)

    X_train = data.drop(["target"], axis=1)
    y_train = data["target"]



    return X_train, y_train

X_train, y_train  = prepareData(dat, lag_start=7, lag_end=196) # y_train, y_test
X_test, y_test = prepareData(dat2, lag_start=7, lag_end=196)


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

### XGBoost

In [None]:

# fit the model
my_xgb = xgb.XGBRegressor()
my_xgb.fit(X_train, y_train)

# predict on the same period
prediction = my_xgb.predict(X_test)

In [None]:
boost_forecast_df = pd.DataFrame(prediction, y_test)
boost_forecast_df.reset_index(inplace=True)
boost_forecast_df = boost_forecast_df.groupby(boost_forecast_df.index // 24).sum()
boost_forecast_df.rename(columns = {0:'predict'}, inplace = True )

MAE = mean_absolute_error(boost_forecast_df['target'].to_list(), boost_forecast_df['predict'].to_list())
MAPE = mean_absolute_percentage_error(boost_forecast_df['target'].to_list(), boost_forecast_df['predict'].to_list()) * 100

print("MAE:", MAE)
print("MAPE:", MAPE , "%")

MAE: 110.87035876464863
MAPE: 1.1812301412152952 %
