In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

from learntools.core import binder
binder.bind(globals())
from learntools.time_series.ex3 import *

# Setup notebook
from pathlib import Path
from learntools.time_series.style import *  # plot style settings
from learntools.time_series.utils import plot_periodogram, seasonal_plot

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess, Fourier
from xgboost import XGBRegressor



# Feature fetching and engineering 

In [3]:
# Folder holding the competition CSV files.
path = '/kaggle/input/store-sales-time-series-forecasting/'

# Empty daily frame from 2013-01-01 to 2017-08-31; calendar-level features
# are attached to it by later statements.
calendar = pd.DataFrame(index=pd.date_range('2013-01-01', '2017-08-31'))

# `infer_datetime_format` is intentionally not passed: it is deprecated in
# pandas 2.x and `parse_dates` alone is enough to parse the `date` column.
train = pd.read_csv(
    path + 'train.csv',
    usecols=['store_nbr', 'family', 'date', 'sales'],
    dtype={'store_nbr': 'category', 'family': 'category', 'sales': 'float32'},
    parse_dates=['date'],
)

# Daily periods as the time key, then one sorted row per
# (store_nbr, family, date).
train['date'] = train['date'].dt.to_period('D')
train = train.set_index(['store_nbr', 'family', 'date']).sort_index()

# Same layout as `train`, without the `sales` target column.
test = pd.read_csv(
    path + 'test.csv',
    usecols=['store_nbr', 'family', 'date'],
    dtype={'store_nbr': 'category', 'family': 'category'},
    parse_dates=['date'],
)

test['date'] = test['date'].dt.to_period('D')
test = test.set_index(['store_nbr', 'family', 'date']).sort_index()

# Oil prices indexed by date, plus a rolling mean over 7 consecutive rows
# as a smoothed feature.
oil = pd.read_csv(path + 'oil.csv', parse_dates=['date'], index_col='date')
oil['ma_oil'] = oil['dcoilwtico'].rolling(7).mean()

# Left-merge onto the daily calendar so every calendar day keeps a row even
# when oil.csv has no entry for it.
calendar = calendar.merge(oil, how='left', left_index=True, right_index=True)

# Forward-fill the gaps and assign the result back. The previous in-place
# fillna on a column selection is not guaranteed to write through to
# `calendar` (it is a no-op under pandas copy-on-write), and
# `fillna(method=...)` is deprecated.
calendar['ma_oil'] = calendar['ma_oil'].ffill()


# Day of week of each calendar day (0 = Monday ... 6 = Sunday).
calendar['dofw'] = calendar.index.dayofweek

stores = pd.read_csv(path + "stores.csv")

transactions = pd.read_csv(path + "transactions.csv")

# Holiday/event table. `infer_datetime_format` is deprecated in pandas 2.x and
# not needed for `parse_dates` to work.
df_hev = pd.read_csv(path + 'holidays_events.csv', parse_dates=['date'])

# Good Friday correction: move the entry dated 2013-04-29 to 2013-03-29.
# The comparison is done against a real Timestamp so it does not depend on
# string-to-datetime coercion inside `Series.replace`.
df_hev.loc[df_hev['date'] == pd.Timestamp('2013-04-29'), 'date'] = pd.Timestamp('2013-03-29')

df_hev = df_hev.set_index('date').sort_index()

# Keep national entries only, then collapse to one row per date
# (first non-null value of each column within a date).
df_hev = df_hev[df_hev.locale == 'National']
df_hev = df_hev.groupby(df_hev.index).first()

sample_submission = pd.read_csv(path + "sample_submission.csv")

# Working-day flag `wd`: start from the plain weekday rule
# (Monday-Friday -> True, Saturday/Sunday -> False).
calendar['wd'] = calendar['dofw'] <= 4

# Attach the national holiday/event columns to every calendar day.
calendar = calendar.merge(df_hev, how='left', left_index=True, right_index=True)

# Event types that decide the flag on their own.
for event_type, is_working_day in (
    ('Bridge', False),
    ('Work Day', True),
    ('Transfer', False),
):
    calendar.loc[calendar['type'] == event_type, 'wd'] = is_working_day

# Holidays: a day off unless the holiday was transferred to another date.
holiday_rows = calendar['type'] == 'Holiday'
calendar.loc[holiday_rows & (calendar['transferred'] == False), 'wd'] = False
calendar.loc[holiday_rows & (calendar['transferred'] == True), 'wd'] = True



# Set up

In [5]:
# Training window (both ends inclusive when used as a label slice).
sdate, edate = '2017-01-01', '2017-08-15'

# Target matrix: one row per date, one column per (store_nbr, family) series.
wide_sales = train.unstack(['store_nbr', 'family'])
y = wide_sales.loc[sdate:edate]

In [6]:
# Deterministic time features on the target's index: a first-order trend term
# plus weekly calendar Fourier terms of order 4 (no constant, no seasonal
# dummies).
fourier = CalendarFourier(freq='W', order=4)
dp = DeterministicProcess(
    index=y.index,
    constant=False,
    order=1,
    seasonal=False,
    additional_terms=[fourier],
    drop=True,
)
X = dp.in_sample()

# Adding calendar features to training feature matrix X.
# Values are copied by position, so the calendar slice must have exactly one
# row per row of X.
calendar_train = calendar.loc[sdate:edate]
for feature_name, calendar_column in (
    ('oil', 'ma_oil'),
    ('dofw', 'dofw'),
    ('wd', 'wd'),
    ('type', 'type'),
):
    X[feature_name] = calendar_train[calendar_column].values

# One-hot encode day of week (first level dropped) and event type (all levels
# kept).
X = pd.get_dummies(X, columns=['dofw'], drop_first=True)
X = pd.get_dummies(X, columns=['type'], drop_first=False)

# Holiday table with categorical columns. `infer_datetime_format` is
# deprecated in pandas 2.x and is not needed for `parse_dates`.
holidays_events = pd.read_csv(
    path + "holidays_events.csv",
    dtype={
        'type': 'category',
        'locale': 'category',
        'locale_name': 'category',
        'description': 'category',
        'transferred': 'bool',
    },
    parse_dates=['date'],
)
holidays_events = holidays_events.set_index('date').to_period('D')

# National and regional holiday descriptions from 2017-01-01 to 2017-08-15,
# with categories that do not occur in that window removed.
holidays = (
    holidays_events
    .query("locale in ['National', 'Regional']")
    .loc['2017':'2017-08-15', ['description']]
    .assign(description=lambda x: x.description.cat.remove_unused_categories())
)

from sklearn.preprocessing import OneHotEncoder

# Default (sparse) output converted with `.toarray()`: this avoids the
# `sparse=` keyword, which newer scikit-learn releases replaced with
# `sparse_output=`, so the cell runs on either side of that rename.
ohe = OneHotEncoder()
holiday_indicators = ohe.fit_transform(holidays).toarray()

# Label columns with the encoder's own category order. `description.unique()`
# lists values in order of appearance, which does not generally match the
# order of the encoder's output columns and mislabels them.
X_holidays = pd.DataFrame(
    holiday_indicators,
    index=holidays.index,
    columns=ohe.categories_[0],
)



# Feature matrix extended with the holiday indicators; dates without a
# holiday get 0.0 in every indicator column.
# NOTE(review): X2 is not referenced by the later cells visible here (the
# model is fit on X) - confirm whether that is intended.
X2 = X.join(X_holidays, on='date').fillna(0.0)

# Define Boosted Hybrid model

In [7]:
class BoostedHybrid:
    """Container pairing two estimators for a two-stage (boosted) fit.

    Attributes:
        model_1: First-stage estimator.
        model_2: Second-stage estimator.
        y_columns: Column labels of the training target; ``None`` until a
            ``fit`` function attached to the class sets it.
    """

    def __init__(self, model_1, model_2):
        """Store both estimators; nothing is fitted at construction time."""
        self.model_1, self.model_2 = model_1, model_2
        # Filled in during fitting so predictions can reuse the target labels.
        self.y_columns = None

In [8]:
def fit(self, X, y):
    """Fit both stages: model_1 on the target, model_2 on model_1's residuals.

    Args:
        X: Feature table; its ``.index`` labels the rows of the intermediate
            first-stage predictions.
        y: Target table; its ``.columns`` label the predictions and are
            stored on ``self.y_columns``.

    Returns:
        None. ``self.model_1`` and ``self.model_2`` are fitted in place.
    """
    first_stage, second_stage = self.model_1, self.model_2

    # Stage 1: fit on the raw target and collect its in-sample predictions.
    first_stage.fit(X, y)
    first_stage_fit = pd.DataFrame(
        first_stage.predict(X),
        index=X.index,
        columns=y.columns,
    )

    # Stage 2: learn whatever stage 1 left unexplained.
    residuals = y - first_stage_fit
    second_stage.fit(X, residuals)

    # Remember the target labels for use at prediction time.
    self.y_columns = y.columns

BoostedHybrid.fit = fit

In [9]:
def predict(self, X):
    """Return the sum of both stages' predictions for ``X``.

    Args:
        X: Feature table; its ``.index`` labels the rows of the result.

    Returns:
        A DataFrame indexed like ``X`` with ``self.y_columns`` as columns,
        holding model_1's prediction plus model_2's prediction.
    """
    # Wrap each stage's raw output with the same row/column labels
    # (model_1 is queried first, then model_2).
    stage_frames = [
        pd.DataFrame(stage.predict(X), index=X.index, columns=self.y_columns)
        for stage in (self.model_1, self.model_2)
    ]
    base_part, residual_part = stage_frames
    return base_part + residual_part

BoostedHybrid.predict = predict

# Train model 


In [86]:

# Hybrid of a linear regression and a random forest (the forest is fit on the
# linear model's residuals by BoostedHybrid.fit).
model = BoostedHybrid(
    model_1=LinearRegression(),
    # Fixed seed: the forest is stochastic, so without it every re-run gives
    # different scores and a different submission.
    model_2=RandomForestRegressor(n_estimators=300, random_state=0),
)

model.fit(X, y)

# In-sample predictions, floored at zero.
y_pred = model.predict(X).clip(0.0)

In [2]:
from sklearn.metrics import mean_squared_error

# Long-format copies under new names. The wide `y_pred` is left untouched so
# this cell can be re-run; rebinding `y_pred` to the stacked frame made a
# second run fail on `.stack(...)`.
y_pred_long = y_pred.stack(['store_nbr', 'family']).reset_index()
y_target = y.stack(['store_nbr', 'family']).reset_index().copy()

# Both frames are stacked the same way, so rows are matched on the fresh
# 0..n-1 index produced by reset_index().
y_target['sales_pred'] = y_pred_long['sales']

# Root mean squared log error per product family, computed once.
rmsle_by_family = y_target.groupby('family').apply(
    lambda a: np.sqrt(mean_squared_log_error(a['sales'], a['sales_pred']))
)

# The printed number is the SUM of the per-family RMSLE values, not an MSLE,
# so the label says so.
print('Sum of per-family RMSLE:', rmsle_by_family.sum())
rmsle_by_family

NameError: name 'y_pred' is not defined

# Prediction

In [17]:
stest = '2017-08-16'
etest = '2017-08-31'

# Calendar rows for the forecast window (one per day).
calendar_test = calendar.loc[stest:etest]

# One out-of-sample step per forecast day (16 for the window above), so the
# positional assignments below always line up.
X_test = dp.out_of_sample(steps=len(calendar_test))

# Adding calendar features to test feature matrix X_test

X_test['oil']  = calendar_test['ma_oil'].values
X_test['dofw'] = calendar_test['dofw'].values
X_test['wd']   = calendar_test['wd'].values

X_test = pd.get_dummies(X_test, columns=['dofw'], drop_first=True)

# Give X_test exactly the training columns, in the training order. Columns
# missing here (the `type_*` dummies; per the original note there are no
# national level events in this period) are filled with 0. This replaces a
# hard-coded list of dummy names that breaks, or silently misorders features,
# whenever the training window contains a different set of event types.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

# model.predict already returns a frame indexed like X_test with the target's
# columns. Floor at zero, as is done for the in-sample predictions: negative
# sales are not meaningful.
sales_pred = model.predict(X_test).clip(0.0)
sales_pred = sales_pred.stack(['store_nbr', 'family'])

In [13]:
# Match predictions to submission ids by key (date, store_nbr, family) instead
# of by row position.
# NOTE(review): `store_nbr` is loaded as a category parsed from text, so the
# stacked predictions are presumably ordered '1', '10', '11', ... while the
# submission file is ordered by id. Positional assignment would then put
# predictions on the wrong rows; keyed matching is correct either way.
test_keys = pd.read_csv(
    path + 'test.csv',
    usecols=['id', 'date', 'store_nbr', 'family'],
    dtype={'store_nbr': str, 'family': str},
    parse_dates=['date'],
)

# Long prediction table with plain (non-categorical, non-period) key columns
# so the merge keys have the same dtypes on both sides.
pred_long = sales_pred.rename_axis(['date', 'store_nbr', 'family']).reset_index()
pred_long['date'] = pred_long['date'].dt.to_timestamp()
pred_long['store_nbr'] = pred_long['store_nbr'].astype(str)
pred_long['family'] = pred_long['family'].astype(str)

# validate='one_to_one' raises if a key is duplicated on either side.
pred_by_id = test_keys.merge(
    pred_long,
    on=['date', 'store_nbr', 'family'],
    how='left',
    validate='one_to_one',
).set_index('id')['sales']

df_sub = pd.read_csv(path + 'sample_submission.csv', index_col='id')
df_sub['sales'] = pred_by_id.reindex(df_sub.index).to_numpy()

# Fail loudly rather than write a submission with unmatched rows.
assert df_sub['sales'].notna().all(), "some submission ids have no matching prediction"

df_sub.to_csv('submission.csv', index=True)