In [1]:
# импортируем необходимые библиотеки, классы и функции
import pandas as pd
import numpy as np
import pathlib
from tqdm.notebook import tqdm

from etna.datasets import TSDataset
from etna.pipeline import Pipeline
from etna.metrics import MSE
from etna.transforms import (LagTransform,
                             MinMaxScalerTransform,
                             MeanTransform,
                             DateFlagsTransform)

from etna_utils import LGBMMultiSegmentModel

# отключаем предупреждения
import warnings
warnings.filterwarnings('ignore')



# Загрузка данных

In [2]:
# задаем константную часть пути
DATA_PREFIX = pathlib.PurePath('/Users/artemgruzdev/Documents/GitHub/'
                               'Time_Series/Code/Data/store_sales')
# задаем горизонт
HORIZON = 16
# задаем стартовую дату
start_date = '2015-01-01'

In [3]:
# загружаем исторический набор
train = pd.read_csv(
    DATA_PREFIX.joinpath('train.csv'),
    parse_dates=['date'], 
    infer_datetime_format=True,
)
train.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [4]:
# загружаем набор новых данных
test = pd.read_csv(
    DATA_PREFIX.joinpath('test.csv'), 
    parse_dates=['date'], 
    infer_datetime_format=True,
)
test.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


# Создание объекта `TSDataset`

In [5]:
# функция подготовки исторического набора эндогенных
# переменных и набора с экзогенными переменными
# (исторический период + прогнозируемый период)
def prepare_df(df, start_date=None, num_segments=None):
    df = df.drop(columns=['onpromotion', 'id'])
    # создаем сегменты
    df['segment'] = [f'{x}::{y}' for x, y in zip(df['store_nbr'], 
                                                 df['family'])]
    # переименовываем столбцы в соответствии с требованиями ETNA
    df.rename(columns={
        'date': 'timestamp', 
        'sales': 'target', 
    }, inplace=True)
    
    # если стартовая дата не задана, используем ранее
    # созданную стартовую дату
    if start_date is not None:
        df = df[df['timestamp'] >= start_date]
    
    # если количество сегментов задано, формируем сегменты на основе 
    # первых num_segments уникальных значений столбца segments
    if num_segments is not None:
        segments = df['segment'].unique()[:num_segments]
        df = df[df['segment'].isin(segments)]
    
    # формируем датафрейм из 3 обязательных столбцов
    df = df[['timestamp', 'segment', 'target']]
    # выполняем логарифмирование зависимой переменной
    df['target'] = np.log1p(df['target'])  
    # стартовая дата
    start_date = df['timestamp'].min()
    # последняя дата: последняя дата исторического набора + горизонт
    end_date = df['timestamp'].max() + pd.Timedelta(HORIZON, 'D')
    # создаем индекс дат
    timestamp = pd.date_range(start=start_date, end=end_date, freq='D')
    # создаем пустой список
    df_exog_list = []
    # создаем датафрейм со столбцом 
    # timestamp на основе индекса дат
    df_exog_segment_template = pd.DataFrame({'timestamp': timestamp})
    # формируем датафрейм с экзогенными переменными
    for segment in tqdm(df['segment'].unique()):
        df_exog_segment = df_exog_segment_template.copy()
        df_exog_segment['segment'] = segment
        df_exog_segment['regressor_store_nbr'] = segment.split('::')[0]
        df_exog_segment['regressor_family'] = segment.split('::')[1]
        df_exog_list.append(df_exog_segment)
    df_exog = pd.concat(df_exog_list, ignore_index=True)
    # признакам, которые хотим обрабатывать как
    # категориальные, присваиваем тип category
    df_exog['regressor_store_nbr'] = df_exog['regressor_store_nbr'].astype(
        'category')
    df_exog['regressor_family'] = df_exog['regressor_family'].astype(
        'category')
    
    return df, df_exog

In [6]:
# создаем исторический набор с эндогенными переменными
# и набор с экзогенными переменными (исторический 
# период + прогнозируемый период)
train, train_exog = prepare_df(train, start_date=start_date)

  0%|          | 0/1782 [00:00<?, ?it/s]

In [7]:
# смотрим набор c эндогенными переменными
train

Unnamed: 0,timestamp,segment,target
1297296,2015-01-01,1::AUTOMOTIVE,0.000000
1297297,2015-01-01,1::BABY CARE,0.000000
1297298,2015-01-01,1::BEAUTY,0.000000
1297299,2015-01-01,1::BEVERAGES,0.000000
1297300,2015-01-01,1::BOOKS,0.000000
...,...,...,...
3000883,2017-08-15,9::POULTRY,6.084802
3000884,2017-08-15,9::PREPARED FOODS,5.046987
3000885,2017-08-15,9::PRODUCE,7.791824
3000886,2017-08-15,9::SCHOOL AND OFFICE SUPPLIES,4.804021


In [8]:
# смотрим набор с экзогенными переменными
train_exog

Unnamed: 0,timestamp,segment,regressor_store_nbr,regressor_family
0,2015-01-01,1::AUTOMOTIVE,1,AUTOMOTIVE
1,2015-01-02,1::AUTOMOTIVE,1,AUTOMOTIVE
2,2015-01-03,1::AUTOMOTIVE,1,AUTOMOTIVE
3,2015-01-04,1::AUTOMOTIVE,1,AUTOMOTIVE
4,2015-01-05,1::AUTOMOTIVE,1,AUTOMOTIVE
...,...,...,...,...
1735663,2017-08-27,9::SEAFOOD,9,SEAFOOD
1735664,2017-08-28,9::SEAFOOD,9,SEAFOOD
1735665,2017-08-29,9::SEAFOOD,9,SEAFOOD
1735666,2017-08-30,9::SEAFOOD,9,SEAFOOD


In [9]:
# создаем объединенный набор
ts = TSDataset(
    df=TSDataset.to_dataset(train), 
    df_exog=TSDataset.to_dataset(train_exog), 
    freq='D', known_future='all'
)

In [10]:
# смотрим объединенный набор
ts

segment,10::AUTOMOTIVE,10::AUTOMOTIVE,10::AUTOMOTIVE,10::BABY CARE,10::BABY CARE,10::BABY CARE,10::BEAUTY,10::BEAUTY,10::BEAUTY,10::BEVERAGES,...,9::PREPARED FOODS,9::PRODUCE,9::PRODUCE,9::PRODUCE,9::SCHOOL AND OFFICE SUPPLIES,9::SCHOOL AND OFFICE SUPPLIES,9::SCHOOL AND OFFICE SUPPLIES,9::SEAFOOD,9::SEAFOOD,9::SEAFOOD
feature,regressor_family,regressor_store_nbr,target,regressor_family,regressor_store_nbr,target,regressor_family,regressor_store_nbr,target,regressor_family,...,target,regressor_family,regressor_store_nbr,target,regressor_family,regressor_store_nbr,target,regressor_family,regressor_store_nbr,target
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-01,AUTOMOTIVE,10,0.000000,BABY CARE,10,0.0,BEAUTY,10,0.000000,BEVERAGES,...,0.000000,PRODUCE,9,0.000000,SCHOOL AND OFFICE SUPPLIES,9,0.000000,SEAFOOD,9,0.000000
2015-01-02,AUTOMOTIVE,10,1.386294,BABY CARE,10,0.0,BEAUTY,10,1.098612,BEVERAGES,...,4.821571,PRODUCE,9,0.000000,SCHOOL AND OFFICE SUPPLIES,9,0.000000,SEAFOOD,9,3.178054
2015-01-03,AUTOMOTIVE,10,1.609438,BABY CARE,10,0.0,BEAUTY,10,0.693147,BEVERAGES,...,4.110874,PRODUCE,9,0.000000,SCHOOL AND OFFICE SUPPLIES,9,0.000000,SEAFOOD,9,2.944439
2015-01-04,AUTOMOTIVE,10,0.693147,BABY CARE,10,0.0,BEAUTY,10,1.791759,BEVERAGES,...,3.920209,PRODUCE,9,0.000000,SCHOOL AND OFFICE SUPPLIES,9,0.000000,SEAFOOD,9,3.465736
2015-01-05,AUTOMOTIVE,10,1.945910,BABY CARE,10,0.0,BEAUTY,10,1.098612,BEVERAGES,...,3.608916,PRODUCE,9,0.000000,SCHOOL AND OFFICE SUPPLIES,9,0.000000,SEAFOOD,9,2.833213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,AUTOMOTIVE,10,1.098612,BABY CARE,10,0.0,BEAUTY,10,1.098612,BEVERAGES,...,4.728272,PRODUCE,9,7.282127,SCHOOL AND OFFICE SUPPLIES,9,4.948760,SEAFOOD,9,3.212093
2017-08-12,AUTOMOTIVE,10,1.098612,BABY CARE,10,0.0,BEAUTY,10,1.386294,BEVERAGES,...,4.874464,PRODUCE,9,7.258598,SCHOOL AND OFFICE SUPPLIES,9,4.934474,SEAFOOD,9,2.882508
2017-08-13,AUTOMOTIVE,10,0.000000,BABY CARE,10,0.0,BEAUTY,10,1.791759,BEVERAGES,...,4.665032,PRODUCE,9,7.435206,SCHOOL AND OFFICE SUPPLIES,9,5.303305,SEAFOOD,9,3.044522
2017-08-14,AUTOMOTIVE,10,0.693147,BABY CARE,10,0.0,BEAUTY,10,1.945910,BEVERAGES,...,4.745975,PRODUCE,9,7.207434,SCHOOL AND OFFICE SUPPLIES,9,5.209486,SEAFOOD,9,2.890372


In [11]:
# смотрим регрессоры
ts.regressors

['regressor_family', 'regressor_store_nbr']

In [12]:
num_lags = 50

# задаем список преобразований/признаков
transforms = [
    MinMaxScalerTransform(in_column='target'),
    LagTransform(in_column='target', 
                 lags=[HORIZON + i for i in range(num_lags)],
                 out_column='lag'),
    DateFlagsTransform(
        day_number_in_week=True,
        day_number_in_month=True,
        is_weekend=True,
        out_column='datetime'),
    MeanTransform(in_column='target', window=32, out_column='mean32')
]

# Перекрестная проверка расширяющимся окном

In [13]:
# создаем модель
model = LGBMMultiSegmentModel(n_estimators=400,
                              learning_rate=0.08,
                              min_data_in_leaf=80,
                              subsample=0.6)
# передаем в конвейер модель, горизонт 
# и набор преобразований/признаков
pipeline = Pipeline(model=model,
                    horizon=HORIZON, 
                    transforms=transforms)
# запускаем перекрестную проверку 
# расширяющимся окном
metrics_df, forecast_df, fold_info_df = pipeline.backtest(
    ts=ts, metrics=[MSE()], n_folds=3
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  3.0min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  4.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  4.6min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   31.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.6min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   46.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.1m

In [14]:
# смотрим метрики по 3 тестовым выборках 
# для первых 2 сегментов
n_folds = 3
n_segments = 2
n = n_folds * n_segments
metrics_df.head(n)

Unnamed: 0,segment,MSE,fold_number
0,10::AUTOMOTIVE,0.335921,0
0,10::AUTOMOTIVE,0.469482,1
0,10::AUTOMOTIVE,0.404746,2
1,10::BABY CARE,4.4e-05,0
1,10::BABY CARE,1.4e-05,1
1,10::BABY CARE,9.6e-05,2


In [15]:
# смотрим значение RMSE, усредненное по сегментам
metric = metrics_df['MSE'].mean()**(1/2)
print(f'RMSE: {metric:.4f}')

RMSE: 0.4309


# Обучение на всем историческом наборе

In [16]:
# создаем модель
model = LGBMMultiSegmentModel(n_estimators=400,
                              learning_rate=0.08, 
                              min_data_in_leaf=80,
                              subsample=0.6)
# передаем в конвейер модель, горизонт 
# и набор преобразований/признаков
pipeline = Pipeline(model=model, 
                    horizon=HORIZON, 
                    transforms=transforms)

In [17]:
# обучаем конвейер на всем историческом наборе
pipeline.fit(ts=ts);



In [18]:
# получаем набор с прогнозами
forecasted = pipeline.forecast()
forecasted

segment,10::AUTOMOTIVE,10::AUTOMOTIVE,10::AUTOMOTIVE,10::AUTOMOTIVE,10::AUTOMOTIVE,10::AUTOMOTIVE,10::AUTOMOTIVE,10::AUTOMOTIVE,10::AUTOMOTIVE,10::AUTOMOTIVE,...,9::SEAFOOD,9::SEAFOOD,9::SEAFOOD,9::SEAFOOD,9::SEAFOOD,9::SEAFOOD,9::SEAFOOD,9::SEAFOOD,9::SEAFOOD,9::SEAFOOD
feature,datetime_day_number_in_month,datetime_day_number_in_week,datetime_is_weekend,lag_16,lag_17,lag_18,lag_19,lag_20,lag_21,lag_22,...,lag_60,lag_61,lag_62,lag_63,lag_64,lag_65,mean32,regressor_family,regressor_store_nbr,target
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2017-08-16,16,2,False,0.355418,0.0,0.0,0.224244,0.0,0.224244,0.0,...,0.787679,0.719993,0.704803,0.604633,0.82326,0.67347,0.696034,SEAFOOD,9,2.815512
2017-08-17,17,3,False,0.224244,0.355418,0.0,0.0,0.224244,0.0,0.224244,...,0.810536,0.787679,0.719993,0.704803,0.604633,0.82326,0.695239,SEAFOOD,9,2.795479
2017-08-18,18,4,False,0.0,0.224244,0.355418,0.0,0.0,0.224244,0.0,...,0.676614,0.810536,0.787679,0.719993,0.704803,0.604633,0.699005,SEAFOOD,9,2.769263
2017-08-19,19,5,True,0.355418,0.0,0.224244,0.355418,0.0,0.0,0.224244,...,0.656358,0.676614,0.810536,0.787679,0.719993,0.704803,0.702999,SEAFOOD,9,3.162933
2017-08-20,20,6,True,0.520678,0.355418,0.0,0.224244,0.355418,0.0,0.0,...,0.697126,0.656358,0.676614,0.810536,0.787679,0.719993,0.704971,SEAFOOD,9,3.206512
2017-08-21,21,0,False,0.355418,0.520678,0.355418,0.0,0.224244,0.355418,0.0,...,0.661814,0.697126,0.656358,0.676614,0.810536,0.787679,0.710023,SEAFOOD,9,2.816202
2017-08-22,22,1,False,0.355418,0.355418,0.520678,0.355418,0.0,0.224244,0.355418,...,0.537138,0.661814,0.697126,0.656358,0.676614,0.810536,0.710067,SEAFOOD,9,2.712336
2017-08-23,23,2,False,0.448488,0.355418,0.355418,0.520678,0.355418,0.0,0.224244,...,0.801038,0.537138,0.661814,0.697126,0.656358,0.676614,0.704422,SEAFOOD,9,2.770114
2017-08-24,24,3,False,0.0,0.448488,0.355418,0.355418,0.520678,0.355418,0.0,...,0.784231,0.801038,0.537138,0.661814,0.697126,0.656358,0.704008,SEAFOOD,9,2.700248
2017-08-25,25,4,False,0.448488,0.0,0.448488,0.355418,0.355418,0.520678,0.355418,...,0.625484,0.784231,0.801038,0.537138,0.661814,0.697126,0.70467,SEAFOOD,9,2.782664


# Формирование файла посылки

In [19]:
# загружаем файл посылки
sample_submission = pd.read_csv(
    DATA_PREFIX.joinpath('sample_submission.csv')
)
# выводим первые 5 наблюдений посылки
sample_submission.head()

Unnamed: 0,id,sales
0,3000888,0.0
1,3000889,0.0
2,3000890,0.0
3,3000891,0.0
4,3000892,0.0


In [20]:
# выводим первые 5 наблюдений набора новых данных
test.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


In [21]:
# берем прогнозы
forecasted_target = forecasted[:, :, 'target']
# создаем серию из столбца sales файла посылки
sales = sample_submission['sales'].copy()
# записываем прогнозы в только что созданную серию
for i, row in tqdm(test.iterrows(), total=test.shape[0]):
    date = row['date']
    segment = f"{row['store_nbr']}::{row['family']}"
    try:
        target = forecasted_target.loc[date, segment].item()
        sales.iloc[i] = target
    except IndexError:
        pass

  0%|          | 0/28512 [00:00<?, ?it/s]

In [22]:
# смотрим статистики серии, выясняем,
# есть ли отрицательные прогнозы
sales.describe()

count    28512.000000
mean         3.619754
std          2.492622
min         -0.040792
25%          1.538671
50%          3.386431
75%          5.578472
max          9.723729
Name: sales, dtype: float64

In [23]:
# серия с прогнозами становится столбцом sales файла посылки
# если есть отрицательные продажи, берем 0, и выполняем 
# экспоненцирование (поскольку ранее логарифмировали
# зависимую переменную)
sample_submission['sales'] = np.expm1(np.maximum(sales, 0))

In [24]:
# смотрим первые 5 наблюдений файла посылки
sample_submission.head()

Unnamed: 0,id,sales
0,3000888,3.948688
1,3000889,0.007215
2,3000890,3.289083
3,3000891,2385.625169
4,3000892,0.12647


In [25]:
# записываем посылку в виде CSV-файла
sample_submission.to_csv('etna_store_sales_regressors.csv', 
                         index=False)