# Model regresji liniowej wykorzystujący trendy, sezonowość, promocje oraz lagi

### Zaimportowanie bibliotek i deklaracja używanych później funkcji

In [1]:
from warnings import simplefilter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.signal import periodogram
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from pathlib import Path
simplefilter("ignore")

# Set Matplotlib defaults
# The bundled seaborn style sheets were renamed to "seaborn-v0_8-*" in
# Matplotlib 3.6 and the legacy names were removed later, so try the new
# name first and fall back to the legacy one on older Matplotlib releases.
try:
    plt.style.use("seaborn-v0_8-whitegrid")
except OSError:
    plt.style.use("seaborn-whitegrid")
# Wide default figures with automatic layout adjustment.
plt.rc("figure", autolayout=True, figsize=(20, 7))
# Bold, slightly larger axis labels and titles.
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
# Keyword arguments prepared for plotting calls (grey line, dark markers);
# not referenced by the cells visible in this notebook.
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
)
%config InlineBackend.figure_format = 'retina'


def lagplot(x, y=None, lag=1, standardize=False, ax=None, **kwargs):
    """Scatter a series against a lagged copy of ``x`` with a LOWESS fit.

    Parameters:
        x: series that is shifted by ``lag`` and drawn on the horizontal axis.
        y: optional series for the vertical axis; when omitted, the
            un-shifted ``x`` is used.
        lag: number of periods passed to ``x.shift``.
        standardize: when true, the lagged series (and ``y``, if given) is
            centred on its mean and divided by its standard deviation.
        ax: axes to draw on; a new figure is created when omitted.
        **kwargs: forwarded to ``seaborn.regplot``.

    Returns:
        The axes returned by ``seaborn.regplot``, annotated with the
        correlation between the plotted series in the upper-left corner.
    """
    from matplotlib.offsetbox import AnchoredText

    lagged = x.shift(lag)
    if standardize:
        lagged = (lagged - lagged.mean()) / lagged.std()

    if y is None:
        # NOTE(review): the un-lagged series is used as-is here, even when
        # ``standardize`` is set — confirm this is intended.
        target = x
    elif standardize:
        target = (y - y.mean()) / y.std()
    else:
        target = y

    correlation = target.corr(lagged)

    if ax is None:
        _, ax = plt.subplots()

    ax = sns.regplot(
        x=lagged,
        y=target,
        scatter_kws={"alpha": 0.75, "s": 3},
        line_kws={"color": 'C3'},
        lowess=True,
        ax=ax,
        **kwargs,
    )

    # Show the correlation coefficient in a small framed box.
    corr_box = AnchoredText(
        f"{correlation:.2f}",
        prop={"size": "large"},
        frameon=True,
        loc="upper left",
    )
    corr_box.patch.set_boxstyle("square, pad=0.0")
    ax.add_artist(corr_box)

    ax.set(title=f"Lag {lag}", xlabel=lagged.name, ylabel=target.name)
    return ax


def plot_lags(x, y=None, lags=6, nrows=1, lagplot_kwargs=None, **kwargs):
    """Draw a grid of lag plots for lags ``1..lags``.

    Parameters:
        x: series whose lagged copies are plotted (passed to ``lagplot``).
        y: optional series for the vertical axis (passed to ``lagplot``).
        lags: number of lag plots to draw.
        nrows: number of rows in the subplot grid.
        lagplot_kwargs: optional dict of extra keyword arguments forwarded
            to every ``lagplot`` call; ``None`` means no extra arguments.
        **kwargs: forwarded to ``plt.subplots``; ``ncols`` and ``figsize``
            are derived from ``lags`` and ``nrows`` when not supplied.

    Returns:
        The created matplotlib figure.
    """
    import math

    # Use None as the default to avoid sharing one mutable dict across calls.
    if lagplot_kwargs is None:
        lagplot_kwargs = {}

    kwargs.setdefault('nrows', nrows)
    kwargs.setdefault('ncols', math.ceil(lags / nrows))
    kwargs.setdefault('figsize', (kwargs['ncols'] * 2, nrows * 2 + 0.5))
    fig, axs = plt.subplots(sharex=True, sharey=True, squeeze=False, **kwargs)
    for k, ax in enumerate(fig.get_axes(), start=1):
        if k <= lags:
            ax = lagplot(x, y, lag=k, ax=ax, **lagplot_kwargs)
            ax.set_title(f"Lag {k}", fontdict=dict(fontsize=14))
            # Per-axes labels are cleared; shared labels are set below.
            ax.set(xlabel="", ylabel="")
        else:
            # Grid cells beyond the requested number of lags stay empty.
            ax.axis('off')
    # Label only the bottom row and the left-most column of the grid.
    plt.setp(axs[-1, :], xlabel=x.name)
    plt.setp(axs[:, 0], ylabel=y.name if y is not None else x.name)
    fig.tight_layout(w_pad=0.1, h_pad=0.1)
    return fig


def make_lags(ts, lags):
    """Return a frame with ``ts`` shifted by 1 up to ``lags`` periods.

    Each shifted copy is stored under the key ``y_lag_<i>``; the copies are
    concatenated column-wise with those keys as the outer column labels.
    """
    shifted = {}
    for offset in range(1, lags + 1):
        shifted[f'y_lag_{offset}'] = ts.shift(offset)
    return pd.concat(shifted, axis=1)

def make_lag(ts, lag):
    """Return a frame holding ``ts`` shifted by exactly ``lag`` periods.

    The shifted data is stored under the single key ``y_lag_<lag>``, which
    becomes the (outer) column label of the result.
    """
    label = f'y_lag_{lag}'
    shifted = {label: ts.shift(lag)}
    return pd.concat(shifted, axis=1)

def make_multistep_target(ts, steps):
    """Return a frame of ``steps`` forward-shifted copies of ``ts``.

    The copy stored under ``y_step_<k>`` is ``ts`` shifted back by ``k - 1``
    periods, so ``y_step_1`` is the unshifted series and each following
    step holds the value one more period ahead.
    """
    targets = {}
    for step in range(1, steps + 1):
        targets[f'y_step_{step}'] = ts.shift(-(step - 1))
    return pd.concat(targets, axis=1)

### Wczytanie danych

In [2]:
#load the data (kaggle)
# comp_dir = Path('../input/store-sales-time-series-forecasting')
# store_sales = pd.read_csv(
#     comp_dir / 'train.csv',
#     usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
#     dtype={
#         'store_nbr': 'category',
#         'family': 'category',
#         'sales': 'float32',
#         'onpromotion': 'uint32',
#     },
#     parse_dates=['date'],
#     infer_datetime_format=True,
# )
# df_test = pd.read_csv(
#     comp_dir / 'test.csv',
#     dtype={
#         'store_nbr': 'category',
#         'family': 'category',
#         'onpromotion': 'uint32',
#     },
#     parse_dates=['date'],
#     infer_datetime_format=True,
# )
#end load the data (kaggle) 

# Load the data (local files in the working directory).
# Training data: only the listed columns are read, using categorical and
# 32-bit dtypes, and the 'date' column is parsed as datetimes.
# NOTE(review): `infer_datetime_format` is reported as deprecated in
# pandas 2.0 — confirm against the pandas version in use.
store_sales = pd.read_csv('train.csv',
    usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
        'onpromotion': 'uint32',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)

# Test data: all columns are read (the 'id' column is used later when the
# submission file is written).
df_test = pd.read_csv('test.csv',
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'onpromotion': 'uint32',
    },
    parse_dates=['date'],
    infer_datetime_format=True,
)

# End of local data loading.

# Convert the dates to daily periods and index both frames by
# (store_nbr, family, date), sorted.
df_test['date'] = df_test.date.dt.to_period('D')
df_test = df_test.set_index(['store_nbr', 'family', 'date']).sort_index()

store_sales['date'] = store_sales.date.dt.to_period('D')
store_sales = store_sales.set_index(['store_nbr', 'family', 'date']).sort_index()
# Display the indexed training frame.
store_sales

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales,onpromotion
store_nbr,family,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,AUTOMOTIVE,2013-01-01,0.000000,0
1,AUTOMOTIVE,2013-01-02,2.000000,0
1,AUTOMOTIVE,2013-01-03,3.000000,0
1,AUTOMOTIVE,2013-01-04,3.000000,0
1,AUTOMOTIVE,2013-01-05,5.000000,0
...,...,...,...,...
9,SEAFOOD,2017-08-11,23.830999,0
9,SEAFOOD,2017-08-12,16.859001,4
9,SEAFOOD,2017-08-13,20.000000,0
9,SEAFOOD,2017-08-14,17.000000,0


### Przygotowanie trendów i sezonowości sprzedaży jako cech do wytrenowania modelu

In [3]:
# Wide layout: one row per date, one column per (value, store_nbr, family).
y = store_sales.unstack(['store_nbr', 'family'])
# Fourier terms of order 4 on the "m" calendar frequency.
fourier = CalendarFourier("m",4)
# Deterministic features on the dates of y: constant, linear trend (order=1),
# seasonal indicator columns and the Fourier terms above; drop=True lets
# statsmodels drop perfectly collinear columns.
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=1,
    seasonal=True,
    drop=True,
    additional_terms=[fourier]
)
X = dp.in_sample()
# Flag the first day of each year.
X['NewYear'] = (X.index.dayofyear == 1)
# Display the feature matrix.
X

Unnamed: 0_level_0,const,trend,"s(2,7)","s(3,7)","s(4,7)","s(5,7)","s(6,7)","s(7,7)","sin(1,freq=M)","cos(1,freq=M)","sin(2,freq=M)","cos(2,freq=M)","sin(3,freq=M)","cos(3,freq=M)","sin(4,freq=M)","cos(4,freq=M)",NewYear
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2013-01-01,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,True
2013-01-02,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.201299,0.979530,0.394356,0.918958,0.571268,0.820763,0.724793,0.688967,False
2013-01-03,1.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.394356,0.918958,0.724793,0.688967,0.937752,0.347305,0.998717,-0.050649,False
2013-01-04,1.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.571268,0.820763,0.937752,0.347305,0.968077,-0.250653,0.651372,-0.758758,False
2013-01-05,1.0,5.0,0.0,0.0,0.0,1.0,0.0,0.0,0.724793,0.688967,0.998717,-0.050649,0.651372,-0.758758,-0.101168,-0.994869,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,1.0,1680.0,0.0,0.0,0.0,0.0,0.0,1.0,0.897805,-0.440394,-0.790776,-0.612106,-0.201299,0.979530,0.968077,-0.250653,False
2017-08-12,1.0,1681.0,0.0,0.0,0.0,0.0,0.0,0.0,0.790776,-0.612106,-0.968077,-0.250653,0.394356,0.918958,0.485302,-0.874347,False
2017-08-13,1.0,1682.0,1.0,0.0,0.0,0.0,0.0,0.0,0.651372,-0.758758,-0.988468,0.151428,0.848644,0.528964,-0.299363,-0.954139,False
2017-08-14,1.0,1683.0,0.0,1.0,0.0,0.0,0.0,0.0,0.485302,-0.874347,-0.848644,0.528964,0.998717,-0.050649,-0.897805,-0.440394,False


In [4]:
# Fit one linear model for all target columns on the deterministic features;
# no separate intercept is fitted because X already has a 'const' column.
model = LinearRegression(fit_intercept=False)
model.fit(X, y)
# In-sample predictions with the same row index and column labels as y.
y_pred = pd.DataFrame(model.predict(X), index=X.index, columns=y.columns)

# Keep only the 2017 rows of the 'sales' predictions.
y_pred = y_pred.loc['2017','sales']

In [5]:
# Rebuild the wide sales table and keep only the 2017 rows of 'sales',
# so that y lines up with the 2017 slice of y_pred.
y = store_sales.unstack(['store_nbr', 'family']).loc['2017', 'sales']
# Display the 2017 sales.
y

store_nbr,1,1,1,1,1,1,1,1,1,1,...,9,9,9,9,9,9,9,9,9,9
family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2017-01-01,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000
2017-01-02,5.0,0.0,0.0,1434.0,0.0,166.819000,0.0,332.0,376.0,44.980000,...,5.0,659.570007,1243.0,11.0,41.0,843.596008,115.188995,3136.895996,1.0,23.000000
2017-01-03,4.0,0.0,4.0,3081.0,2.0,519.348022,15.0,952.0,1045.0,209.300003,...,2.0,547.364014,876.0,6.0,15.0,714.659973,133.039001,3229.558105,1.0,14.000000
2017-01-04,1.0,0.0,4.0,3039.0,2.0,543.250977,17.0,1055.0,1029.0,135.944000,...,3.0,395.287994,677.0,6.0,13.0,536.830017,75.201004,1491.416992,7.0,0.000000
2017-01-05,2.0,0.0,3.0,2617.0,0.0,533.479980,40.0,918.0,853.0,137.005997,...,2.0,470.768005,604.0,7.0,10.0,414.100006,113.698997,1566.821045,1.0,17.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,1.0,0.0,1.0,1006.0,0.0,145.606995,4.0,341.0,343.0,64.302002,...,5.0,309.244995,373.0,11.0,2.0,525.223999,112.099998,1453.078003,140.0,23.830999
2017-08-12,6.0,0.0,3.0,1659.0,0.0,243.220001,3.0,351.0,526.0,99.487999,...,2.0,260.298004,400.0,7.0,10.0,383.386993,129.903992,1419.264038,138.0,16.859001
2017-08-13,1.0,0.0,1.0,803.0,0.0,136.679001,1.0,169.0,266.0,47.770000,...,3.0,327.205994,510.0,2.0,9.0,412.458008,105.168999,1693.607056,200.0,20.000000
2017-08-14,1.0,0.0,6.0,2201.0,0.0,346.037994,4.0,571.0,699.0,154.578003,...,12.0,330.975006,445.0,2.0,14.0,283.428986,114.120003,1348.425049,182.0,17.000000


In [6]:
# Residual sales after subtracting the fitted trend/seasonality predictions.
y_deseasoned = y - y_pred

### Przygotowanie lagów z oryginalnych danych o sprzedaży i promocjach

In [7]:
# Promotion counts for 2017, one column per (store_nbr, family) pair.
all_promotion = store_sales.unstack(['store_nbr', 'family']).loc(axis=1)['onpromotion'].loc['2017']
# NOTE(review): the original call was `make_lag(, lag=1)` (a syntax error:
# the first argument was missing). `y_deseasoned` is assumed here to match
# the lag-6 feature below — confirm that the raw `y` was not intended.
X_lag1 = make_lag(y_deseasoned, lag=1)
X_lag6 = make_lag(y_deseasoned, lag=6)
# Lag-1 and lag-6 sales features side by side.
X_lags = pd.concat([X_lag1, X_lag6], axis=1)
# Promotion features: the previous day's promotions and the same-day ones.
X_promo = pd.concat([
    make_lags(all_promotion, lags=1),
    all_promotion,
], axis=1)

# Display the 2017 sales again.
y

store_nbr,1,1,1,1,1,1,1,1,1,1,...,9,9,9,9,9,9,9,9,9,9
family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2017-01-01,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000
2017-01-02,5.0,0.0,0.0,1434.0,0.0,166.819000,0.0,332.0,376.0,44.980000,...,5.0,659.570007,1243.0,11.0,41.0,843.596008,115.188995,3136.895996,1.0,23.000000
2017-01-03,4.0,0.0,4.0,3081.0,2.0,519.348022,15.0,952.0,1045.0,209.300003,...,2.0,547.364014,876.0,6.0,15.0,714.659973,133.039001,3229.558105,1.0,14.000000
2017-01-04,1.0,0.0,4.0,3039.0,2.0,543.250977,17.0,1055.0,1029.0,135.944000,...,3.0,395.287994,677.0,6.0,13.0,536.830017,75.201004,1491.416992,7.0,0.000000
2017-01-05,2.0,0.0,3.0,2617.0,0.0,533.479980,40.0,918.0,853.0,137.005997,...,2.0,470.768005,604.0,7.0,10.0,414.100006,113.698997,1566.821045,1.0,17.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,1.0,0.0,1.0,1006.0,0.0,145.606995,4.0,341.0,343.0,64.302002,...,5.0,309.244995,373.0,11.0,2.0,525.223999,112.099998,1453.078003,140.0,23.830999
2017-08-12,6.0,0.0,3.0,1659.0,0.0,243.220001,3.0,351.0,526.0,99.487999,...,2.0,260.298004,400.0,7.0,10.0,383.386993,129.903992,1419.264038,138.0,16.859001
2017-08-13,1.0,0.0,1.0,803.0,0.0,136.679001,1.0,169.0,266.0,47.770000,...,3.0,327.205994,510.0,2.0,9.0,412.458008,105.168999,1693.607056,200.0,20.000000
2017-08-14,1.0,0.0,6.0,2201.0,0.0,346.037994,4.0,571.0,699.0,154.578003,...,12.0,330.975006,445.0,2.0,14.0,283.428986,114.120003,1348.425049,182.0,17.000000


### Połączenie wszystkich cech w jedną macierz

In [8]:
# Join the deterministic, lag and promotion features column-wise and drop
# every row that contains a missing value.
X_whole = pd.concat([X, X_lags, X_promo], axis=1).dropna()
# Alternative without the promotion features:
# X_whole = pd.concat([X, X_lags], axis=1).dropna()

### Stworzenie multistep target

In [9]:
# Build a 16-step-ahead target (columns y_step_1 .. y_step_16) and drop the
# rows for which not all 16 steps are available.
y = make_multistep_target(y, steps=16).dropna()

### Połączenie tabel

In [10]:
# Keep only the dates present in both the target and the feature matrix.
# Note that X is rebound here to the aligned rows of X_whole.
y, X = y.align(X_whole, join='inner', axis = 0)

# Display the aligned target.
y

Unnamed: 0_level_0,y_step_1,y_step_1,y_step_1,y_step_1,y_step_1,y_step_1,y_step_1,y_step_1,y_step_1,y_step_1,...,y_step_16,y_step_16,y_step_16,y_step_16,y_step_16,y_step_16,y_step_16,y_step_16,y_step_16,y_step_16
store_nbr,1,1,1,1,1,1,1,1,1,1,...,9,9,9,9,9,9,9,9,9,9
family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2017-01-07,5.0,0.0,6.0,2503.0,0.0,428.220001,14.0,767.0,836.0,136.477005,...,7.0,393.569000,686.0,8.0,20.0,676.979980,208.341995,2683.158936,4.0,31.000000
2017-01-08,0.0,0.0,1.0,1356.0,1.0,165.123001,1.0,345.0,447.0,61.691002,...,4.0,372.351990,382.0,3.0,15.0,424.250000,78.189003,1452.160034,0.0,19.000000
2017-01-09,2.0,0.0,6.0,4100.0,0.0,443.518005,21.0,878.0,853.0,170.764999,...,4.0,380.153015,353.0,5.0,14.0,375.291016,116.255997,2330.061035,4.0,17.000000
2017-01-10,3.0,0.0,10.0,2370.0,3.0,402.117004,8.0,920.0,923.0,129.809006,...,3.0,318.673004,274.0,9.0,7.0,317.437012,74.239998,1241.109009,0.0,13.000000
2017-01-11,10.0,0.0,5.0,2607.0,0.0,485.901001,10.0,947.0,958.0,132.117996,...,2.0,429.313995,248.0,4.0,4.0,272.709991,49.461998,1217.786987,3.0,9.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-07-27,5.0,0.0,1.0,2002.0,0.0,321.000000,21.0,660.0,722.0,116.137001,...,5.0,309.244995,373.0,11.0,2.0,525.223999,112.099998,1453.078003,140.0,23.830999
2017-07-28,7.0,0.0,2.0,2358.0,0.0,403.644989,20.0,714.0,711.0,178.408997,...,2.0,260.298004,400.0,7.0,10.0,383.386993,129.903992,1419.264038,138.0,16.859001
2017-07-29,4.0,0.0,3.0,2161.0,0.0,330.035004,6.0,667.0,676.0,122.680000,...,3.0,327.205994,510.0,2.0,9.0,412.458008,105.168999,1693.607056,200.0,20.000000
2017-07-30,1.0,0.0,2.0,1212.0,0.0,153.807999,0.0,238.0,316.0,54.296001,...,12.0,330.975006,445.0,2.0,14.0,283.428986,114.120003,1348.425049,182.0,17.000000


In [11]:
# Display the aligned feature matrix.
X

Unnamed: 0_level_0,const,trend,"s(2,7)","s(3,7)","s(4,7)","s(5,7)","s(6,7)","s(7,7)","sin(1,freq=M)","cos(1,freq=M)",...,"(9, MAGAZINES)","(9, MEATS)","(9, PERSONAL CARE)","(9, PET SUPPLIES)","(9, PLAYERS AND ELECTRONICS)","(9, POULTRY)","(9, PREPARED FOODS)","(9, PRODUCE)","(9, SCHOOL AND OFFICE SUPPLIES)","(9, SEAFOOD)"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-07,1.0,1464.0,0.0,0.0,0.0,0.0,0.0,0.0,0.937752,0.347305,...,0.0,0.0,13.0,0.0,0.0,2.0,1.0,7.0,0.0,2.0
2017-01-08,1.0,1465.0,1.0,0.0,0.0,0.0,0.0,0.0,0.988468,0.151428,...,0.0,0.0,14.0,0.0,0.0,2.0,1.0,6.0,0.0,0.0
2017-01-09,1.0,1466.0,0.0,1.0,0.0,0.0,0.0,0.0,0.998717,-0.050649,...,0.0,0.0,12.0,0.0,0.0,1.0,1.0,6.0,0.0,0.0
2017-01-10,1.0,1467.0,0.0,0.0,1.0,0.0,0.0,0.0,0.968077,-0.250653,...,0.0,0.0,12.0,0.0,0.0,0.0,2.0,148.0,0.0,0.0
2017-01-11,1.0,1468.0,0.0,0.0,0.0,1.0,0.0,0.0,0.897805,-0.440394,...,0.0,0.0,13.0,0.0,0.0,0.0,10.0,7.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-07-27,1.0,1665.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.848644,0.528964,...,0.0,21.0,9.0,0.0,0.0,0.0,1.0,9.0,0.0,0.0
2017-07-28,1.0,1666.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.724793,0.688967,...,0.0,0.0,9.0,0.0,0.0,22.0,3.0,8.0,4.0,0.0
2017-07-29,1.0,1667.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.571268,0.820763,...,0.0,0.0,9.0,0.0,0.0,0.0,1.0,8.0,7.0,4.0
2017-07-30,1.0,1668.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.394356,0.918958,...,0.0,0.0,9.0,0.0,0.0,1.0,1.0,7.0,8.0,0.0


### Ekstrakcja cech, które będą użyte w multistep targetingu

In [12]:
# Feature rows for August 2017, used below as inputs for the forecast.
X_fore = X_whole.loc['2017-08']
# Display the forecast features.
X_fore

Unnamed: 0_level_0,const,trend,"s(2,7)","s(3,7)","s(4,7)","s(5,7)","s(6,7)","s(7,7)","sin(1,freq=M)","cos(1,freq=M)",...,"(9, MAGAZINES)","(9, MEATS)","(9, PERSONAL CARE)","(9, PET SUPPLIES)","(9, PLAYERS AND ELECTRONICS)","(9, POULTRY)","(9, PREPARED FOODS)","(9, PRODUCE)","(9, SCHOOL AND OFFICE SUPPLIES)","(9, SEAFOOD)"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-01,1.0,1670.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,9.0,0.0,1.0,0.0,0.0,151.0,9.0,0.0
2017-08-02,1.0,1671.0,0.0,0.0,0.0,1.0,0.0,0.0,0.201299,0.97953,...,0.0,0.0,9.0,0.0,0.0,0.0,1.0,8.0,7.0,0.0
2017-08-03,1.0,1672.0,0.0,0.0,0.0,0.0,1.0,0.0,0.394356,0.918958,...,0.0,21.0,10.0,0.0,0.0,1.0,0.0,8.0,6.0,0.0
2017-08-04,1.0,1673.0,0.0,0.0,0.0,0.0,0.0,1.0,0.571268,0.820763,...,0.0,0.0,11.0,0.0,0.0,23.0,3.0,8.0,7.0,0.0
2017-08-05,1.0,1674.0,0.0,0.0,0.0,0.0,0.0,0.0,0.724793,0.688967,...,0.0,0.0,12.0,0.0,0.0,1.0,1.0,6.0,9.0,3.0
2017-08-06,1.0,1675.0,1.0,0.0,0.0,0.0,0.0,0.0,0.848644,0.528964,...,0.0,0.0,12.0,0.0,0.0,0.0,1.0,7.0,9.0,0.0
2017-08-07,1.0,1676.0,0.0,1.0,0.0,0.0,0.0,0.0,0.937752,0.347305,...,0.0,0.0,12.0,0.0,0.0,0.0,1.0,6.0,10.0,0.0
2017-08-08,1.0,1677.0,0.0,0.0,1.0,0.0,0.0,0.0,0.988468,0.151428,...,0.0,0.0,11.0,0.0,0.0,0.0,1.0,144.0,7.0,0.0
2017-08-09,1.0,1678.0,0.0,0.0,0.0,1.0,0.0,0.0,0.998717,-0.050649,...,0.0,0.0,10.0,0.0,0.0,1.0,1.0,6.0,8.0,0.0
2017-08-10,1.0,1679.0,0.0,0.0,0.0,0.0,1.0,0.0,0.968077,-0.250653,...,0.0,20.0,8.0,0.0,0.0,0.0,1.0,6.0,10.0,0.0


### Wytrenowanie modelu oraz użycie go z przygotowanymi danymi

In [13]:
# Fit the multi-output linear model on the aligned features and targets.
linear_model = LinearRegression(fit_intercept=False).fit(X, y)

# Predict all 16 steps for every August 2017 row; negative predictions are
# clipped to 0.
linear_y_forecast = pd.DataFrame(linear_model.predict(X_fore), index=X_fore.index, columns = y.columns).clip(0.0)

### Uporządkowanie danych

In [14]:
# Display the 16-step forecast produced from the 2017-08-15 feature row.
linear_y_forecast.loc['2017-08-15']

           store_nbr  family                    
y_step_1   1          AUTOMOTIVE                       9.984147
                      BABY CARE                        0.000000
                      BEAUTY                           4.034978
                      BEVERAGES                     2019.764963
                      BOOKS                            0.495023
                                                       ...     
y_step_16  9          POULTRY                        265.551334
                      PREPARED FOODS                  52.514249
                      PRODUCE                       1217.769317
                      SCHOOL AND OFFICE SUPPLIES      80.274299
                      SEAFOOD                          9.511495
Name: 2017-08-15, Length: 28512, dtype: float64

### Formatowanie wyników

In [15]:
# Forecast made from the 2017-08-15 feature row, indexed by
# (step label, store_nbr, family).
linear_forecast = linear_y_forecast.loc['2017-08-15']
# The first level holds the step labels (y_step_1 .. y_step_16); it is named
# 'date' because each step stands for one forecast date.
linear_forecast.index.names = ['date', 'store_nbr', 'family']
# Move the step level to the innermost position so that rows are ordered by
# (store_nbr, family, step). The original passed 'family' as a second
# positional argument to unstack, which is `fill_value`, not a level name,
# and would have written the string 'family' into any missing cell.
forecast_formatted = linear_forecast.unstack('date').stack('date')
# Replace the index positionally with the test-set index.
# NOTE(review): this assumes both are in the same (store_nbr, family, date)
# order and have the same length — confirm.
forecast_formatted.index = df_test.index

# Display the forecast as a one-column frame.
pd.DataFrame(forecast_formatted, columns = ['sales'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales
store_nbr,family,date,Unnamed: 3_level_1
1,AUTOMOTIVE,2017-08-16,9.984147
1,AUTOMOTIVE,2017-08-17,2.943338
1,AUTOMOTIVE,2017-08-18,1.830150
1,AUTOMOTIVE,2017-08-19,8.789018
1,AUTOMOTIVE,2017-08-20,4.686842
...,...,...,...
9,SEAFOOD,2017-08-27,27.939455
9,SEAFOOD,2017-08-28,25.070167
9,SEAFOOD,2017-08-29,16.839944
9,SEAFOOD,2017-08-30,10.467455


### Zapisanie predykcji w submission.csv

In [16]:
# Wrap the forecast in a one-column 'sales' frame.
forecast_formatted = pd.DataFrame(forecast_formatted, columns = ['sales'])
# Attach the test-set ids via the shared index and order the columns as
# (id, sales).
y_submit = forecast_formatted.join(df_test.id).reindex(columns=['id', 'sales'])
# Write the submission file without the index columns.
y_submit.to_csv('submission.csv', index=False)