## Zaimportowanie bibliotek i deklaracja używanych później funkcji

In [2]:
from warnings import simplefilter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.signal import periodogram
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
simplefilter("ignore")

# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True, figsize=(20, 7))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
)
%config InlineBackend.figure_format = 'retina'


def lagplot(x, y=None, lag=1, standardize=False, ax=None, **kwargs):
    """Scatter a series against a lagged copy of ``x`` with a LOWESS curve.

    Args:
        x: pandas-Series-like object; it is shifted by ``lag`` to form the
            horizontal axis.
        y: Optional series for the vertical axis. When omitted, ``x`` itself
            is used (and is left unstandardized even if ``standardize`` is
            true).
        lag: Number of periods passed to ``x.shift``.
        standardize: When true, the lagged ``x`` (and ``y`` if given) are
            centred on their mean and divided by their standard deviation.
        ax: Matplotlib axes to draw on; a new figure is created when ``None``.
        **kwargs: Forwarded to ``seaborn.regplot``.

    Returns:
        The axes returned by ``seaborn.regplot``, annotated with the
        correlation between the two plotted series and titled ``Lag {lag}``.
    """
    from matplotlib.offsetbox import AnchoredText

    lagged = x.shift(lag)
    if standardize:
        lagged = (lagged - lagged.mean()) / lagged.std()

    if y is None:
        # No separate target supplied: compare x with its own lagged copy.
        target = x
    elif standardize:
        target = (y - y.mean()) / y.std()
    else:
        target = y

    correlation = target.corr(lagged)

    if ax is None:
        _, ax = plt.subplots()

    ax = sns.regplot(
        x=lagged,
        y=target,
        scatter_kws={"alpha": 0.75, "s": 3},
        line_kws={"color": "C3"},
        lowess=True,
        ax=ax,
        **kwargs,
    )

    # Show the correlation coefficient in a small box in the corner.
    corr_box = AnchoredText(
        f"{correlation:.2f}",
        prop={"size": "large"},
        frameon=True,
        loc="upper left",
    )
    corr_box.patch.set_boxstyle("square, pad=0.0")
    ax.add_artist(corr_box)

    ax.set(title=f"Lag {lag}", xlabel=lagged.name, ylabel=target.name)
    return ax


def plot_lags(x, y=None, lags=6, nrows=1, lagplot_kwargs=None, **kwargs):
    """Draw a grid of lag plots for lags ``1..lags``.

    Args:
        x: Series whose lagged copies are plotted on the horizontal axes.
        y: Optional series for the vertical axes; ``x`` is used when omitted.
        lags: Number of lag plots to draw.
        nrows: Number of rows in the subplot grid; the number of columns
            defaults to ``ceil(lags / nrows)``.
        lagplot_kwargs: Optional dict of extra keyword arguments forwarded to
            every ``lagplot`` call. ``None`` means no extra arguments.
        **kwargs: Forwarded to ``plt.subplots`` (``ncols`` and ``figsize``
            get defaults when not supplied).

    Returns:
        The created matplotlib figure.
    """
    import math

    # Use a fresh dict per call rather than a shared mutable default.
    if lagplot_kwargs is None:
        lagplot_kwargs = {}

    kwargs.setdefault('nrows', nrows)
    kwargs.setdefault('ncols', math.ceil(lags / nrows))
    kwargs.setdefault('figsize', (kwargs['ncols'] * 2, nrows * 2 + 0.5))
    fig, axs = plt.subplots(sharex=True, sharey=True, squeeze=False, **kwargs)

    for lag, ax in enumerate(fig.get_axes(), start=1):
        if lag <= lags:
            ax = lagplot(x, y, lag=lag, ax=ax, **lagplot_kwargs)
            ax.set_title(f"Lag {lag}", fontdict=dict(fontsize=14))
            # Per-panel labels are cleared; shared labels are set below.
            ax.set(xlabel="", ylabel="")
        else:
            # The grid has more cells than requested lags: hide the spares.
            ax.axis('off')

    # Label only the bottom row and the first column of the grid.
    plt.setp(axs[-1, :], xlabel=x.name)
    plt.setp(axs[:, 0], ylabel=y.name if y is not None else x.name)
    fig.tight_layout(w_pad=0.1, h_pad=0.1)
    return fig


def make_lags(ts, lags):
    """Build lag features by shifting ``ts`` forward 1 to ``lags`` periods.

    Returns a frame whose column keys are ``y_lag_1`` ... ``y_lag_<lags>``;
    positions with no earlier observation hold missing values.
    """
    shifted = {}
    for step in range(1, lags + 1):
        shifted[f'y_lag_{step}'] = ts.shift(step)
    return pd.concat(shifted, axis=1)

def make_multistep_target(ts, steps):
    """Build a multi-step target by shifting ``ts`` backward 0 to ``steps - 1``.

    Returns a frame whose column keys are ``y_step_1`` ... ``y_step_<steps>``;
    ``y_step_1`` is the unshifted series, and trailing positions with no later
    observation hold missing values.
    """
    horizons = {}
    for offset in range(steps):
        horizons[f'y_step_{offset + 1}'] = ts.shift(-offset)
    return pd.concat(horizons, axis=1)

## Wczytanie głównego zbioru treningowego

In [3]:
# Read only the columns used below, with explicit dtypes and a parsed date.
store_sales = pd.read_csv(
    'train.csv',
    parse_dates=['date'],
    infer_datetime_format=True,
    usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
    dtype=dict(
        store_nbr='category',
        family='category',
        sales='float32',
        onpromotion='uint32',
    ),
)

# Convert timestamps to daily periods, then index by (store, family, date)
# and sort so each series is contiguous.
store_sales = (
    store_sales
    .assign(date=store_sales['date'].dt.to_period('D'))
    .set_index(['store_nbr', 'family', 'date'])
    .sort_index()
)
store_sales

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales,onpromotion
store_nbr,family,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,AUTOMOTIVE,2013-01-01,0.000000,0
1,AUTOMOTIVE,2013-01-02,2.000000,0
1,AUTOMOTIVE,2013-01-03,3.000000,0
1,AUTOMOTIVE,2013-01-04,3.000000,0
1,AUTOMOTIVE,2013-01-05,5.000000,0
...,...,...,...,...
9,SEAFOOD,2017-08-11,23.830999,0
9,SEAFOOD,2017-08-12,16.859001,4
9,SEAFOOD,2017-08-13,20.000000,0
9,SEAFOOD,2017-08-14,17.000000,0


## Zamodelowanie trendów

In [4]:
# Wide layout: one row per date, one column per value/store/family series.
y = store_sales.unstack(['store_nbr', 'family'])

# Deterministic design matrix with a constant and a first-order time trend.
dp = DeterministicProcess(
    index=y.index,
    order=1,
    constant=True,
    drop=True,
)
X = dp.in_sample()

# The constant column in X stands in for the intercept.
model = LinearRegression(fit_intercept=False)
model.fit(X, y)
y_pred = pd.DataFrame(
    model.predict(X),
    columns=y.columns,
    index=X.index,
)

## Zamodelowanie sezonowości

In [5]:
# Wide layout: one row per date, one column per value/store/family series.
y = store_sales.unstack(['store_nbr', 'family'])

# Four sin/cos pairs at monthly frequency on top of the seasonal indicators.
fourier = CalendarFourier(freq="m", order=4)
dp = DeterministicProcess(
    index=y.index,
    order=1,
    constant=True,
    seasonal=True,
    additional_terms=[fourier],
    drop=True,
)
X_time = dp.in_sample()
X_time

Unnamed: 0_level_0,const,trend,"s(2,7)","s(3,7)","s(4,7)","s(5,7)","s(6,7)","s(7,7)","sin(1,freq=M)","cos(1,freq=M)","sin(2,freq=M)","cos(2,freq=M)","sin(3,freq=M)","cos(3,freq=M)","sin(4,freq=M)","cos(4,freq=M)"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2013-01-01,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000
2013-01-02,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.201299,0.979530,0.394356,0.918958,0.571268,0.820763,0.724793,0.688967
2013-01-03,1.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.394356,0.918958,0.724793,0.688967,0.937752,0.347305,0.998717,-0.050649
2013-01-04,1.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.571268,0.820763,0.937752,0.347305,0.968077,-0.250653,0.651372,-0.758758
2013-01-05,1.0,5.0,0.0,0.0,0.0,1.0,0.0,0.0,0.724793,0.688967,0.998717,-0.050649,0.651372,-0.758758,-0.101168,-0.994869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,1.0,1680.0,0.0,0.0,0.0,0.0,0.0,1.0,0.897805,-0.440394,-0.790776,-0.612106,-0.201299,0.979530,0.968077,-0.250653
2017-08-12,1.0,1681.0,0.0,0.0,0.0,0.0,0.0,0.0,0.790776,-0.612106,-0.968077,-0.250653,0.394356,0.918958,0.485302,-0.874347
2017-08-13,1.0,1682.0,1.0,0.0,0.0,0.0,0.0,0.0,0.651372,-0.758758,-0.988468,0.151428,0.848644,0.528964,-0.299363,-0.954139
2017-08-14,1.0,1683.0,0.0,1.0,0.0,0.0,0.0,0.0,0.485302,-0.874347,-0.848644,0.528964,0.998717,-0.050649,-0.897805,-0.440394


In [8]:
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=30, shuffle=False)

# model = LinearRegression(fit_intercept=False).fit(X_train, y_train)
# y_fit = pd.DataFrame(model.predict(X_train), index=X_train.index, columns = y_train.columns).clip(0.0)
# y_pred = pd.DataFrame(model.predict(X_valid), index=X_valid.index, columns = y_valid.columns).clip(0.0)
# rmsle_train = mean_squared_log_error(y_train, y_fit) ** 0.5
# rmsle_valid = mean_squared_log_error(y_valid, y_pred) ** 0.5
# print(f'Training RMSLE: {rmsle_train:.5f}')
# print(f'Validation RMSLE: {rmsle_valid:.5f}')
# y_avg = (y.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())
# y_fit_avg = (y_fit.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())
# y_pred_avg = (y_pred.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())

In [9]:
# ax = y_avg.plot(**plot_params, alpha=0.5, title="Average Sales", ylabel="items sold")
# ax = y_fit_avg.plot(ax=ax, label="Fitted", color='C0')
# ax = y_pred_avg.plot(ax=ax, label="Forecast", color='C3')
# ax.legend();

In [10]:
# df_test = pd.read_csv(
#     'test.csv',
#     dtype={
#         'store_nbr': 'category',
#         'family': 'category',
#         'onpromotion': 'uint32',
#     },
#     parse_dates=['date'],
#     infer_datetime_format=True,
# )
# df_test['date'] = df_test.date.dt.to_period('D')
# df_test = df_test.set_index(['store_nbr', 'family', 'date']).sort_index()

In [11]:
# Deterministic terms produced by `dp` for every date in its index.
X = dp.in_sample()


# Unstack once and reuse the result: the wide 2017 frame provides both the
# sales target and the promotion features. (Previously the same unstack was
# recomputed three times.)
wide_2017 = store_sales.unstack(['store_nbr', 'family']).loc['2017']
y_whole = wide_2017['sales']
all_promotion = wide_2017['onpromotion']

X_lags = make_lags(y_whole, lags=4)

X_promo = pd.concat([
    make_lags(all_promotion, lags=3),
    all_promotion,
], axis=1)

# Combine the deterministic terms, the sales lags, and the promotion features
# (three promotion lags plus the same-day value). Rows with any missing value
# are dropped.
X_whole = pd.concat([X, X_lags, X_promo], axis=1).dropna()

# 16-step-ahead targets, kept only for dates that also have complete features.
y = make_multistep_target(y_whole, steps=16).dropna()
y, X = y.align(X_whole, join='inner', axis = 0)
X_fore = X_whole.loc['2017-08']

X_fore

Unnamed: 0_level_0,const,trend,"s(2,7)","s(3,7)","s(4,7)","s(5,7)","s(6,7)","s(7,7)","sin(1,freq=M)","cos(1,freq=M)",...,"(9, MAGAZINES)","(9, MEATS)","(9, PERSONAL CARE)","(9, PET SUPPLIES)","(9, PLAYERS AND ELECTRONICS)","(9, POULTRY)","(9, PREPARED FOODS)","(9, PRODUCE)","(9, SCHOOL AND OFFICE SUPPLIES)","(9, SEAFOOD)"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-01,1.0,1670.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,9.0,0.0,1.0,0.0,0.0,151.0,9.0,0.0
2017-08-02,1.0,1671.0,0.0,0.0,0.0,1.0,0.0,0.0,0.201299,0.97953,...,0.0,0.0,9.0,0.0,0.0,0.0,1.0,8.0,7.0,0.0
2017-08-03,1.0,1672.0,0.0,0.0,0.0,0.0,1.0,0.0,0.394356,0.918958,...,0.0,21.0,10.0,0.0,0.0,1.0,0.0,8.0,6.0,0.0
2017-08-04,1.0,1673.0,0.0,0.0,0.0,0.0,0.0,1.0,0.571268,0.820763,...,0.0,0.0,11.0,0.0,0.0,23.0,3.0,8.0,7.0,0.0
2017-08-05,1.0,1674.0,0.0,0.0,0.0,0.0,0.0,0.0,0.724793,0.688967,...,0.0,0.0,12.0,0.0,0.0,1.0,1.0,6.0,9.0,3.0
2017-08-06,1.0,1675.0,1.0,0.0,0.0,0.0,0.0,0.0,0.848644,0.528964,...,0.0,0.0,12.0,0.0,0.0,0.0,1.0,7.0,9.0,0.0
2017-08-07,1.0,1676.0,0.0,1.0,0.0,0.0,0.0,0.0,0.937752,0.347305,...,0.0,0.0,12.0,0.0,0.0,0.0,1.0,6.0,10.0,0.0
2017-08-08,1.0,1677.0,0.0,0.0,1.0,0.0,0.0,0.0,0.988468,0.151428,...,0.0,0.0,11.0,0.0,0.0,0.0,1.0,144.0,7.0,0.0
2017-08-09,1.0,1678.0,0.0,0.0,0.0,1.0,0.0,0.0,0.998717,-0.050649,...,0.0,0.0,10.0,0.0,0.0,1.0,1.0,6.0,8.0,0.0
2017-08-10,1.0,1679.0,0.0,0.0,0.0,0.0,1.0,0.0,0.968077,-0.250653,...,0.0,20.0,8.0,0.0,0.0,0.0,1.0,6.0,10.0,0.0


In [13]:
# Display the current feature matrix for inspection.
X

Unnamed: 0_level_0,const,trend,"s(2,7)","s(3,7)","s(4,7)","s(5,7)","s(6,7)","s(7,7)","sin(1,freq=M)","cos(1,freq=M)",...,"(9, MAGAZINES)","(9, MEATS)","(9, PERSONAL CARE)","(9, PET SUPPLIES)","(9, PLAYERS AND ELECTRONICS)","(9, POULTRY)","(9, PREPARED FOODS)","(9, PRODUCE)","(9, SCHOOL AND OFFICE SUPPLIES)","(9, SEAFOOD)"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-05,1.0,1462.0,0.0,0.0,0.0,0.0,1.0,0.0,0.724793,0.688967,...,0.0,21.0,8.0,0.0,0.0,1.0,1.0,5.0,0.0,1.0
2017-01-06,1.0,1463.0,0.0,0.0,0.0,0.0,0.0,1.0,0.848644,0.528964,...,0.0,0.0,12.0,0.0,0.0,22.0,2.0,7.0,0.0,0.0
2017-01-07,1.0,1464.0,0.0,0.0,0.0,0.0,0.0,0.0,0.937752,0.347305,...,0.0,0.0,13.0,0.0,0.0,2.0,1.0,7.0,0.0,2.0
2017-01-08,1.0,1465.0,1.0,0.0,0.0,0.0,0.0,0.0,0.988468,0.151428,...,0.0,0.0,14.0,0.0,0.0,2.0,1.0,6.0,0.0,0.0
2017-01-09,1.0,1466.0,0.0,1.0,0.0,0.0,0.0,0.0,0.998717,-0.050649,...,0.0,0.0,12.0,0.0,0.0,1.0,1.0,6.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-07-27,1.0,1665.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.848644,0.528964,...,0.0,21.0,9.0,0.0,0.0,0.0,1.0,9.0,0.0,0.0
2017-07-28,1.0,1666.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.724793,0.688967,...,0.0,0.0,9.0,0.0,0.0,22.0,3.0,8.0,4.0,0.0
2017-07-29,1.0,1667.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.571268,0.820763,...,0.0,0.0,9.0,0.0,0.0,0.0,1.0,8.0,7.0,4.0
2017-07-30,1.0,1668.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.394356,0.918958,...,0.0,0.0,9.0,0.0,0.0,1.0,1.0,7.0,8.0,0.0


In [12]:
# Display the current target frame for inspection.
y

Unnamed: 0_level_0,y_step_1,y_step_1,y_step_1,y_step_1,y_step_1,y_step_1,y_step_1,y_step_1,y_step_1,y_step_1,...,y_step_16,y_step_16,y_step_16,y_step_16,y_step_16,y_step_16,y_step_16,y_step_16,y_step_16,y_step_16
store_nbr,1,1,1,1,1,1,1,1,1,1,...,9,9,9,9,9,9,9,9,9,9
family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2017-01-05,2.0,0.0,3.0,2617.0,0.0,533.479980,40.0,918.0,853.0,137.005997,...,1.0,285.218994,275.0,6.0,13.0,408.058990,136.686996,1239.003052,1.0,40.586998
2017-01-06,2.0,0.0,10.0,2761.0,0.0,442.910004,9.0,799.0,927.0,162.621994,...,5.0,479.380005,580.0,11.0,9.0,667.434021,213.098999,2178.283936,2.0,18.346001
2017-01-07,5.0,0.0,6.0,2503.0,0.0,428.220001,14.0,767.0,836.0,136.477005,...,7.0,393.569000,686.0,8.0,20.0,676.979980,208.341995,2683.158936,4.0,31.000000
2017-01-08,0.0,0.0,1.0,1356.0,1.0,165.123001,1.0,345.0,447.0,61.691002,...,4.0,372.351990,382.0,3.0,15.0,424.250000,78.189003,1452.160034,0.0,19.000000
2017-01-09,2.0,0.0,6.0,4100.0,0.0,443.518005,21.0,878.0,853.0,170.764999,...,4.0,380.153015,353.0,5.0,14.0,375.291016,116.255997,2330.061035,4.0,17.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-07-27,5.0,0.0,1.0,2002.0,0.0,321.000000,21.0,660.0,722.0,116.137001,...,5.0,309.244995,373.0,11.0,2.0,525.223999,112.099998,1453.078003,140.0,23.830999
2017-07-28,7.0,0.0,2.0,2358.0,0.0,403.644989,20.0,714.0,711.0,178.408997,...,2.0,260.298004,400.0,7.0,10.0,383.386993,129.903992,1419.264038,138.0,16.859001
2017-07-29,4.0,0.0,3.0,2161.0,0.0,330.035004,6.0,667.0,676.0,122.680000,...,3.0,327.205994,510.0,2.0,9.0,412.458008,105.168999,1693.607056,200.0,20.000000
2017-07-30,1.0,0.0,2.0,1212.0,0.0,153.807999,0.0,238.0,316.0,54.296001,...,12.0,330.975006,445.0,2.0,14.0,283.428986,114.120003,1348.425049,182.0,17.000000


In [12]:
# Keep time order and hold out the last 16 rows for validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, shuffle=False, test_size=16,
)

# No separate intercept is fitted.
linear_model = LinearRegression(fit_intercept=False)
linear_model.fit(X_train, y_train)

# Wrap the raw predictions in labelled frames and floor them at zero.
linear_y_fit = pd.DataFrame(
    linear_model.predict(X_train),
    columns=y_train.columns,
    index=X_train.index,
).clip(lower=0.0)
linear_y_pred = pd.DataFrame(
    linear_model.predict(X_valid),
    columns=y_valid.columns,
    index=X_valid.index,
).clip(lower=0.0)

print("OK")

OK


In [22]:
# Per-date averages across all store/family series.
y_avg = (y_whole.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())
linear_y_fit_avg = (linear_y_fit.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())
linear_y_pred_avg = (linear_y_pred.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())

# Reorder the columns into forecast-horizon order. The labels are generated
# once instead of being typed out for each frame; 16 matches the number of
# steps used when the multistep target was built.
step_columns = [f'y_step_{i}' for i in range(1, 17)]
linear_y_fit_avg = linear_y_fit_avg.reindex(columns=step_columns)
linear_y_pred_avg = linear_y_pred_avg.reindex(columns=step_columns)

# Create the forecasted values: take the 16-step prediction made on
# 2017-07-31 and average each step across all store/family series.
check = linear_y_pred.loc['2017-07-31']
# rename_axis returns a new object, so the index metadata that may be shared
# with linear_y_pred.columns is not modified in place.
check = check.rename_axis(index=['date', 'store_nbr', 'family'])
check = check.groupby('date').mean()
# Relabel the 16 step averages with the 16 calendar days they refer to.
check.index = y_whole.loc['2017-07-31':'2017-08-15'].index

check

date
2017-07-31    442.089426
2017-08-01    483.531470
2017-08-02    414.647416
2017-08-03    368.210040
2017-08-04    479.852706
2017-08-05    593.239677
2017-08-06    644.699255
2017-08-07    443.491561
2017-08-08    430.683927
2017-08-09    453.512536
2017-08-10    338.555637
2017-08-11    489.956087
2017-08-12    569.458959
2017-08-13    546.412519
2017-08-14    348.685127
2017-08-15    292.226595
Freq: D, Name: 2017-07-31, dtype: float64

In [14]:
# Refit on every aligned row (no hold-out), without a separate intercept.
linear_model = LinearRegression(fit_intercept=False)
linear_model.fit(X, y)

# In-sample fit and forecast from the X_fore rows, both floored at zero.
linear_y_fit_full = pd.DataFrame(
    linear_model.predict(X),
    columns=y.columns,
    index=X.index,
).clip(lower=0.0)
linear_y_forecast = pd.DataFrame(
    linear_model.predict(X_fore),
    columns=y.columns,
    index=X_fore.index,
).clip(lower=0.0)


In [None]:
# Per-date averages across all store/family series.
y_avg = (y_whole.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())
linear_y_fit_full_avg = (linear_y_fit_full.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())
linear_y_forecast_avg = (linear_y_forecast.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())

# Reorder the columns into forecast-horizon order. The labels are generated
# once instead of being typed out for each frame; 16 matches the number of
# steps used when the multistep target was built.
step_columns = [f'y_step_{i}' for i in range(1, 17)]
linear_y_fit_full_avg = linear_y_fit_full_avg.reindex(columns=step_columns)
linear_y_forecast_avg = linear_y_forecast_avg.reindex(columns=step_columns)

# Create the forecasted values from the prediction row dated 2017-08-15.
# NOTE(review): this requires X_fore to contain a 2017-08-15 row - confirm.
linear_forecast = linear_y_forecast.loc['2017-08-15']
# rename_axis returns a new object, so the index metadata that may be shared
# with linear_y_forecast.columns is not modified in place.
linear_forecast = linear_forecast.rename_axis(index=['date', 'store_nbr', 'family'])
linear_forecast_avg = linear_forecast.groupby('date').mean()
# NOTE(review): df_test is not defined in the visible source (the cell that
# loads test.csv is commented out); it must be loaded before this cell runs.
linear_forecast_avg.index = df_test.unstack(['store_nbr', 'family']).index

print("OK")

In [None]:
# Build a long-format (one row per date/store/family) feature set for the
# second model.
# NOTE(review): make_leads and LabelEncoder are not defined or imported in the
# visible source - confirm they are provided before running this cell.
all_promotion = store_sales.unstack(['store_nbr', 'family']).loc(axis=1)['onpromotion'].loc['2017']
# Sales lag features, stacked to long format.
X_2_lags = make_lags(y_whole, lags=4)
X_2_lags = X_2_lags.stack(['store_nbr', 'family'])

# Promotion features: three lags, the same-day value, and the make_leads output.
X_2_promo = pd.concat([
    make_lags(all_promotion, lags=3).stack(['store_nbr', 'family']),
    all_promotion.stack(['store_nbr', 'family']),
    make_leads(all_promotion, leads=1).stack(['store_nbr', 'family']),
], axis=1)

# Combine both feature groups and drop rows with any missing value.
X_2_whole = pd.concat([X_2_lags, X_2_promo], axis=1).dropna()

# Label-encode the family level after moving it from the index to a column.
le = LabelEncoder()
X_2_whole = (X_2_whole
    .reset_index('family')  # convert index to column
    .assign(family=lambda x: le.fit_transform(x.family)))

# Stack the target to long format for the benefit of a non-linear model such as XGBoost.
y_2 = y.stack(['store_nbr', 'family'])
# Put the step columns back into forecast-horizon order.
y_2 = y_2.reindex(columns=['y_step_1', 'y_step_2', 'y_step_3',
       'y_step_4', 'y_step_5', 'y_step_6', 'y_step_7', 'y_step_8', 'y_step_9', 'y_step_10', 'y_step_11', 'y_step_12', 'y_step_13',
       'y_step_14', 'y_step_15', 'y_step_16'])
# Split the features by date: up to 2017-07-31 for fitting, August for forecasting.
X_2 = X_2_whole.loc['2017':'2017-07-31']
X_2_fore = X_2_whole.loc['2017-08':'2017-08-15']

print("OK")

In [None]:
# Fit a RegressorChain of XGBRegressor models on the long-format features and
# 16-step targets, then build zero-floored fitted and forecast frames.
# NOTE(review): RegressorChain and XGBRegressor are not imported in the visible
# source - confirm they are in scope before running this cell.
# NOTE(review): the forecast frame takes its index from the August rows of
# y_whole; confirm that matches the row count and order of X_2_fore.
DirRec_xgboost = RegressorChain(base_estimator=XGBRegressor())
DirRec_xgboost.fit(X_2, y_2)
y_2_fit = pd.DataFrame(
   DirRec_xgboost.predict(X_2),
   index=y_2.index,
   columns=y_2.columns,
).clip(0.0)
y_2_pred = pd.DataFrame(
   DirRec_xgboost.predict(X_2_fore),
    index=y_whole.stack(['store_nbr', 'family']).loc['2017-08'].index,
   columns=y_2.loc['2017-07-15':'2017-07-31'].columns,
).clip(0.0)

print("OK")

In [None]:
# Per-date averages of the actual sales and of the chained model's fitted and
# predicted values.
y_avg = (y_whole.stack(['store_nbr', 'family']).groupby('date').mean().squeeze())
y_2_fit_avg = (y_2_fit.groupby('date').mean().squeeze())
y_2_pred_avg = (y_2_pred.groupby('date').mean().squeeze())

# Create the forecasted values: averaged step predictions for 2017-08-15.
# y_2_forecast is assigned here but not used again in the visible source.
y_2_forecast = y_2_pred.loc['2017-08-15']
y_2_forecast_avg = y_2_pred.groupby('date').mean()
y_2_forecast_avg = y_2_forecast_avg.loc['2017-08-15']
# NOTE(review): df_test is not defined in the visible source (the cell that
# loads test.csv is commented out); it must be loaded before this cell runs.
y_2_forecast_avg.index = df_test.unstack(['store_nbr', 'family']).index
y_2_forecast_avg