## Store-sales time-series forecasting Model 2

Building up from model 1 we want now to create a model with unique features for each store and/or product family. This unique feature is going to be the specific holidays in the store, which depend on ins location for the  regional and city specific holidays.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

In [2]:
dtype = {
    'id': 'uint64',
    'store_nbr': 'category',
    'family': 'category',
    'sales': 'float32',
    'onpromotion': 'uint64',
}

train = pd.read_csv('train.csv', parse_dates = ['date'], dtype=dtype)
train['date'] = train.date.dt.to_period('D')
train = train.set_index(['store_nbr', 'family']).sort_index()


test = pd.read_csv('test.csv', parse_dates = ['date'], dtype=dtype)
test['date'] = test.date.dt.to_period('D')
test = test.set_index(['store_nbr', 'family']).sort_index()

oil = pd.read_csv('oil.csv',parse_dates = True, index_col=['date']).to_period('D')
holidays_events = pd.read_csv('holidays_events.csv', parse_dates = True, index_col=['date']).to_period('D')
stores = pd.read_csv('stores.csv', dtype={'store_nbr': 'category', 'city': 'category', 'state': 'category'}, usecols=['store_nbr','city','state'])


In [6]:
start = '2017-04-15'
end = '2017-07-15'
#index = pd.date_range(start=start, end=end, freq='D')
index =train.loc[('1', 'BABY CARE'),['date']].set_index('date')[start:end].index

#fourier_1 = CalendarFourier('A', 4)
#fourier_2 = CalendarFourier('M', 2)

dp = DeterministicProcess(index = index,
#                          constant=False,
                          order = 1,
                          drop= True,
                          seasonal = True,
                          period = 7,
#                          additional_terms=[fourier_1,fourier_2]
#                          additional_terms=[fourier_2]
                         )
X_seasons = dp.in_sample()
X_seasons_f = dp.out_of_sample(steps=16)
index_f = X_seasons_f.index
y_submit = None


#sf = ('1', 'BABY CARE')
for sf in train.index.unique():
    print(f"Store Number: {sf[0].ljust(2)}", end='\r')
    train_by_sf = train.loc[sf,['date','id','sales','onpromotion']].set_index(['date']).sort_index()
#    train_by_sf = train_by_sf.resample('D').fillna(method = None).fillna(0)

    y = train_by_sf.loc[index,'sales']
    X_promo = train_by_sf.loc[index,'onpromotion']
    X = X_seasons.join(X_promo)

    if end =='2017-08-15':
        test_by_sf = test.loc[sf,['date','id','onpromotion']].set_index(['date']).sort_index()
        fore_ids = test_by_sf['id'].values
        X_promo_f = test_by_sf['onpromotion']
    else:
        fore_ids = train_by_sf.loc[index_f,'id'].values
        X_promo_f = train_by_sf.loc[index_f,'onpromotion']

    X_f = X_seasons_f.join(X_promo_f)


    city = stores.query('store_nbr  == @sf[0]')['city'].iloc[0]
    state = stores.query('store_nbr  == @sf[0]')['state'].iloc[0]
    hd_at_store = holidays_events.query('(locale == "National") | (locale == "Regional" and locale_name == @state) | (locale == "Local" and locale_name == @city)')[['description']]
    X_holidays = pd.DataFrame(
        OneHotEncoder(sparse=False, categories= [hd_at_store.description.unique()]).fit_transform(hd_at_store),
        index=hd_at_store.index,
        columns=hd_at_store.description.unique(),
    ).drop_duplicates()

    X = X.join(X_holidays).fillna(0)
    X_f = X_f.join(X_holidays).fillna(0)

    model = LinearRegression(fit_intercept=False)
    model.fit(X, y)
    y_pred = pd.DataFrame(model.predict(X), index = index, columns=['sales'])

    y_forecast = pd.DataFrame({'id': fore_ids , 'sales': model.predict(X_f)})

    y_submit = pd.concat([y_submit,y_forecast])
                         
y_submit.loc[y_submit['sales'] < 0, 'sales'] = 0

if end =='2017-08-15':
    y_submit.to_csv('submission.csv', index=False, columns =['id','sales'])
else:
    y_f_true = train.set_index('date').loc[index_f,'sales'].values
    y_test   = y_submit.set_index('id').sort_index()['sales'].values
    RMSLE = np.sqrt((np.sum(np.log((1+y_test)/(1+y_f_true))**2)/len(y_f_true)))
    print(f'   RMSLE = {RMSLE:.3f}', end='\r')

   RMSLE = 0.467

Unfortunately it does not seem to improve the forecast.