В данных 54 магазина и 33 семейства продуктов.                     
Временной ряд тренировочных данных с 01.01.2013 по 15.08.2017.              
Временной ряд тестовых данных для submition составляет 16 дней после последней даты тренировочных данных: с 16.08.2017 по 31.08.2017.                   
                                 
Нужно составить прогноз продаж для каждого из семейств продуктов в каждом из магазинов.     
                             
В отдельном ноутбуке проведен Feature engineering и сформированы необходимые датасеты.
                                
**Текущие датасеты**                   
1. featured_data - объединенные данные test и train с новыми признаками.
2. zero_prediction - данные тех товаров, которые не продавались в конкретном магазине с начала 2013 года, исходя из чего можно предположить, что данные товары не будут продаваться в ближайшие 16 дней. Этот датасет мы будем объединять с предсказанными данными перед отправкой в submit. 

In [196]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import pickle
import datetime as dt

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
pd.set_option('display.max_columns', None)

# Import Data

In [197]:
# DATA_DIR = 'DATA/'
DATA_DIR = '../input/f-data/'

In [198]:
# Import
df = pd.read_csv(DATA_DIR + "final_featured_data.csv")

zero_prediction = pd.read_csv("../input/zero-prediction/zero_prediction.csv")
zero_prediction = zero_prediction.set_index(['store_nbr', 'family', 'date']).sort_index()

# Datetime
df["date"] = pd.to_datetime(df.date)

# Метрики

In [199]:
def compute_metrics(real, forecast):
    result = {}
    real=np.array(real)
    forecast=np.array(forecast)
    result['MSE'] = round(((real-forecast)**2).mean(),4)
    result['RMSE'] = round((((real-forecast)**2)**(1/2)).mean(),4)
    mape_list = []
    smap_list = []
    for i in range(len(real)):
        if real[i] == 0:
            mape_list.append(0)
            smap_list.append(0)
        else:
            mape_list.append(abs(real[i]-forecast[i])/real[i])
            smap_list.append(2.0 * np.mean(np.abs(forecast[i] - real[i]) / (np.abs(forecast[i]) + np.abs(real[i]))))
        
    result['MAPE'] = round(np.mean(mape_list),4)
    result['SMAP'] = round(np.mean(smap_list),4)                   
    #result['SMAP'] = round(2.0 * np.mean(np.abs(forecast - real) / (np.abs(forecast) + np.abs(real))),4)
    return pd.Series(result)

# Признаки из лагов

In [200]:
# Генерируем новые признаки, сдвигая целевой ряд.
for lag in tqdm(range(1,15)):
    df[f"lag{lag}"] = df.groupby(['store_nbr', 'family']).sales.transform(lambda x: x.shift(lag))
df["mean_sales"] = df.groupby(['store_nbr', 'family']).sales.transform(lambda x: x.mean())
df["std_sales"] = df.groupby(['store_nbr', 'family']).sales.transform(lambda x: x.std())

  0%|          | 0/14 [00:00<?, ?it/s]

# Validation

In [201]:
df['is_active_family'] = df.is_active_family.astype(int)
df = df[df.date > '2016-01-15'].copy()
train = df[df.date<'2017-08-16'].copy()
test = df[df.date>='2017-08-16'].copy()

In [202]:
# Удаляем первые 14 строк каждого из временных рядов, поскольку лаги заполнялись со смещением
train['n_row'] = train.groupby(['store_nbr', 'family']).sales.transform(lambda x: np.arange(1,len(x)+1))
train = train[train.n_row>14].drop('n_row', axis=1)

In [203]:
train_df = train[train.date<'2017-08-01'].copy()
val_df = train[train.date>='2017-08-01'].copy()

# CatBoost

In [157]:
from catboost import CatBoostRegressor

In [158]:
# # Загрузка сохраненной модели
# model_cat = pickle.load(open('CBReg_model', 'rb'))

In [159]:
RANDOM_SEED = 42

In [204]:
train_df = train_df.set_index('date')
val_df = val_df.set_index('date')

In [205]:
X_train = train_df.drop(['year','id','sales'], axis = 1)
y_train = train_df['sales']
X_test = val_df.drop(['year','id','sales'], axis = 1)
y_test = val_df['sales']

In [206]:
cat_features = ['family', 'store_nbr', 'events_Terremoto_Manabi',
       'is_month_end', 'is_month_start', 'holiday_national_binary', 'workday',
       'is_active_location', 'is_wknd', 'oil_over_70', 'is_active_cluster',
       'season', 'quarter', 'is_active_type', 'week_of_month', 'day_of_week',
       'month', 'is_active_family', 'day_of_month',
       'week_of_year', 'day_of_year']

In [163]:
model_cat = CatBoostRegressor(iterations = 2000,
                              #depth=10,
                              #learning_rate = 0.1,
                              random_seed = RANDOM_SEED,
                              eval_metric='MAPE',
                              custom_metric=['RMSE', 'SMAPE'],
                              od_wait=500,
                              #task_type='GPU',
                              cat_features=cat_features
                         )
model_cat.fit(X_train, y_train,
              cat_features=cat_features,
             eval_set=(X_test, y_test),
             verbose_eval=500,
             use_best_model=True,
             #plot=True
             )

Learning rate set to 0.149
0:	learn: 101.1920658	test: 91.8406723	best: 91.8406723 (0)	total: 1.2s	remaining: 20m
500:	learn: 1.5518911	test: 0.8885851	best: 0.7411072 (115)	total: 9m 12s	remaining: 9m 10s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.7411071857
bestIteration = 115

Shrink model to first 116 iterations.


<catboost.core.CatBoostRegressor at 0x7f07eaebafd0>

In [None]:
pickle.dump(model_cat, open('CBReg_model', 'wb')) #Saving the model

In [207]:
val_predict_catboost = model_cat.predict(X_test)

In [208]:
compute_metrics(val_df.sales, val_predict_catboost)

MSE     53851.0799
RMSE       75.2894
MAPE        0.4483
SMAP        0.4037
dtype: float64

# Submit for CatBoost

In [209]:
test.mean_sales = test.mean_sales.fillna(0)
test.std_sales = test.std_sales.fillna(0)

In [211]:
# доращиваем тестовый датасет для генерации лагов
val_df = val_df.reset_index()
lag_edge = val_df.date.max() - dt.timedelta(days=14)
to_test_lag = val_df[val_df.date > lag_edge]
d = to_test_lag.append(test).copy()

In [212]:
days = test.date.unique()
for day in days:
    day_predict = model_cat.predict(d.loc[d.date == day].drop(['year','id','sales'], axis = 1).set_index('date'))
    test.loc[test.date == day, 'sales'] = day_predict
    d.loc[d.date == day, 'sales'] = day_predict
    for lag in tqdm(range(1,15)):
        d[f"lag{lag}"] = d.groupby(['store_nbr', 'family']).sales.transform(lambda x: x.shift(lag))

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

In [214]:
test = test.set_index(['store_nbr', 'family', 'date']).sort_index()
for i in zero_prediction.index:
    test.loc[i,'Cat_Boost']=0

In [215]:
test=test.reset_index()
y_Cat_Boost = test[['id','sales']].copy()
y_Cat_Boost['sales'] = y_Cat_Boost.sales.clip(0)
y_Cat_Boost.to_csv('submission_Cat_Boost.csv', index=False)

In [216]:
y_Cat_Boost.tail()

Unnamed: 0,id,sales
28507,3022139,2.685914
28508,3023921,2.212267
28509,3025703,3.705171
28510,3027485,1.486935
28511,3029267,9.189767


**Результат на kaggle 0.80817**