<a href="https://colab.research.google.com/github/Mikd14/Projects/blob/main/Machine-learning/New_store_sales_time_series_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import time
import os
from itertools import product
from pylab import rcParams
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess, Fourier
from google.colab import drive


In [None]:
# Mount Google Drive at /content/drive so the CSV files under MyDrive can be read.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Uncomment and run on a fresh runtime if the statsmodels import above fails:
#!pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 4.4 MB/s 
Installing collected packages: statsmodels
Successfully installed statsmodels-0.13.1


In [None]:
# Folder on the mounted Google Drive that holds the competition CSV files.
DATA_DIR = '/content/drive/MyDrive/Store Sales -Time Series'

# `infer_datetime_format` is intentionally not passed any more: it has been
# deprecated since pandas 2.0, and `parse_dates` alone converts the date column.

# Daily sales per store and product family. Categorical keys and float32 sales
# keep the memory footprint of the frame small.
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'),
                    usecols=['store_nbr', 'family', 'date', 'sales'],
                    dtype={'store_nbr': 'category', 'family': 'category', 'sales': 'float32'},
                    parse_dates=['date'])

# Holiday/event table and oil prices, both indexed by their date column.
holidays = pd.read_csv(os.path.join(DATA_DIR, 'holidays_events.csv'),
                       parse_dates=['date'], index_col='date')
oil_prices = pd.read_csv(os.path.join(DATA_DIR, 'oil.csv'),
                         parse_dates=['date'], index_col='date')

# Loaded with default dtypes.
transactions = pd.read_csv(os.path.join(DATA_DIR, 'transactions.csv'))
stores = pd.read_csv(os.path.join(DATA_DIR, 'stores.csv'))

# Same key columns as `train`, without the sales target.
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'),
                   usecols=['store_nbr', 'family', 'date'],
                   dtype={'store_nbr': 'category', 'family': 'category'},
                   parse_dates=['date'])

test.head()

Unnamed: 0,date,store_nbr,family
0,2017-08-16,1,AUTOMOTIVE
1,2017-08-16,1,BABY CARE
2,2017-08-16,1,BEAUTY
3,2017-08-16,1,BEVERAGES
4,2017-08-16,1,BOOKS


In [None]:
 # Daily calendar from 2013-01-01 to 2017-08-31, one row per day.
calendar = pd.DataFrame(index=pd.date_range('2013-01-01', '2017-08-31'))

# Rolling mean over 7 consecutive rows of the oil table, computed before the
# prices are aligned to the daily calendar.
oil_prices['ma_oil'] = oil_prices['dcoilwtico'].rolling(7).mean()

# Left merge keeps every calendar day; days without an oil row get NaN.
calendar = calendar.merge(oil_prices, how='left', left_index=True, right_index=True)

# Fill the gaps forward, then backward for the leading rows that have no
# earlier value. The result is assigned back to the column: calling
# fillna(..., inplace=True) on a column selection is deprecated and does not
# update the frame when pandas Copy-on-Write is active.
calendar['ma_oil'] = calendar['ma_oil'].ffill().bfill()



Unnamed: 0,dcoilwtico,ma_oil
2013-01-01,,93.218571
2013-01-02,93.14,93.218571
2013-01-03,92.97,93.218571
2013-01-04,93.12,93.218571
2013-01-05,,93.218571


In [None]:
# Add the day of the week (pandas convention: Monday=0 ... Sunday=6).
calendar['dofw'] = calendar.index.dayofweek


In [None]:
# Holidays: keep nationwide entries only, collapse them to a single row per
# date, and drop the descriptive columns that are no longer needed.
holidays = (
    holidays
    .loc[lambda frame: frame.locale == 'National']
    .groupby(level=0)
    .first()  # first non-null value of each column for every date
    .drop(columns=['locale_name', 'description', 'locale'])
)

In [None]:
# Inspect the national holiday table after filtering (one row per date).
holidays

In [None]:
new_holiday = holidays.copy()
new_calendar = calendar.copy()

# Work-day flag `wd`: Monday-Friday (dofw 0-4) start as work days, weekends do not.
new_calendar['wd'] = new_calendar.dofw <= 4

# Attach the holiday columns; calendar days without a holiday entry get NaN.
new_calendar = new_calendar.merge(new_holiday, how='left', left_index=True, right_index=True)

# Holiday entries override the weekday default.
day_type = new_calendar.type
is_holiday = day_type == 'Holiday'
was_moved = new_calendar.transferred == True
not_moved = new_calendar.transferred == False

# Days off: Bridge and Transfer entries, and holidays that were not transferred.
new_calendar.loc[day_type.isin(['Bridge', 'Transfer']) | (is_holiday & not_moved), 'wd'] = False
# Work days: explicit 'Work Day' entries, and holidays flagged as transferred.
new_calendar.loc[(day_type == 'Work Day') | (is_holiday & was_moved), 'wd'] = True

new_calendar.head(20)

In [None]:
# Preview the first rows of the training data as loaded.
train.head()

Unnamed: 0,date,store_nbr,family,sales
0,2013-01-01,1,AUTOMOTIVE,0.0
1,2013-01-01,1,BABY CARE,0.0
2,2013-01-01,1,BEAUTY,0.0
3,2013-01-01,1,BEVERAGES,0.0
4,2013-01-01,1,BOOKS,0.0


In [None]:
# Convert the dates to daily periods, then index both frames by
# (store_nbr, family, date) and sort on that index.
train = (
    train
    .assign(date=train['date'].dt.to_period('D'))
    .set_index(['store_nbr', 'family', 'date'])
    .sort_index()
)

test = (
    test
    .assign(date=test.date.dt.to_period('D'))
    .set_index(['store_nbr', 'family', 'date'])
    .sort_index()
)

test.head()

store_nbr,family,date
1,AUTOMOTIVE,2017-08-16
1,AUTOMOTIVE,2017-08-17
1,AUTOMOTIVE,2017-08-18
1,AUTOMOTIVE,2017-08-19
1,AUTOMOTIVE,2017-08-20


In [None]:
# Inclusive bounds of the training window used to slice the target and the calendar features.
end_date='2017-08-15'
start_date='2017-04-01'

In [None]:

# Wide target matrix: one row per day of the training window and one column
# per (store_nbr, family) pair.
y = train.unstack(['store_nbr', 'family']).loc[start_date:end_date]

# Deterministic part of the design matrix: a linear trend (order=1) plus
# weekly calendar Fourier terms; drop=True removes collinear columns.
fourier = CalendarFourier(freq='W', order=4)
dp = DeterministicProcess(
    index=y.index,
    constant=False,
    order=1,
    seasonal=False,
    additional_terms=[fourier],
    drop=True,
)

# Calendar features for the same window. They are attached by position
# (`.values`), so the calendar slice must have exactly one row per row of y.
window = new_calendar.loc[start_date:end_date]
X = dp.in_sample().assign(
    oil=window['ma_oil'].values,
    dofw=window['dofw'].values,
    wd=window['wd'].values,
    type=window['type'].values,
)

# One-hot encode the weekday (first level dropped) and the holiday type
# (all observed levels kept).
X = pd.get_dummies(X, columns=['dofw'], drop_first=True)
X = pd.get_dummies(X, columns=['type'], drop_first=False)

In [None]:
# Preview the first 20 days of the wide target matrix.
y.head(20)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Ridge regression on features scaled to unit variance (no centering),
# fitted jointly on every column of y.
model = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=0.5, fit_intercept=True, solver='auto'),
).fit(X, y)

# In-sample predictions, laid out exactly like y.
y_pred = pd.DataFrame(model.predict(X), columns=y.columns, index=X.index)

In [None]:
# Preview the first 20 rows of the in-sample Ridge predictions.
y_pred.head(20)

In [None]:
def results(X,y_pred, y):
  """Print the mean squared log error of the predictions for each product family.

  Parameters
  ----------
  X : unused; kept so existing calls keep working.
  y_pred : wide DataFrame of predictions with the same column MultiIndex as `y`
      (levels include 'store_nbr' and 'family' over a 'sales' column).
  y : wide DataFrame of observed sales.

  Returns
  -------
  None. The per-family scores are printed.
  """
  levels = ['store_nbr', 'family']

  # Long format; negative predictions are clipped to 0 because the log-based
  # metric is only defined for non-negative values.
  predicted = y_pred.stack(levels)['sales'].clip(0.)
  y_target = y.stack(levels).copy()

  # Pair predictions with targets on the shared (date, store_nbr, family)
  # index. Pairing by position after reset_index would silently shift rows
  # if stack dropped NaN entries from one frame but not the other.
  y_target['sales_pred'] = predicted
  y_target = y_target.reset_index()

  print(y_target.groupby('family').apply(lambda r: mean_squared_log_error(r['sales'], r['sales_pred'])))

# Note: the office-supplies sales are too noisy for linear regression to fit well.

In [None]:
#try randomforest instead
from sklearn.ensemble import RandomForestRegressor

# Fixed seed: the forest is built from random bootstrap samples, so without
# it the fitted model and the printed scores change on every run.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X,y)

# In-sample predictions, laid out exactly like y.
rf_y_pred = pd.DataFrame(rf_model.predict(X), index=X.index, columns=y.columns)

results(X,rf_y_pred, y)

In [None]:
end_test='2017-08-31'
start_test='2017-08-16'

# Calendar rows of the forecast horizon. Their count also sets the number of
# out-of-sample steps, so the positional assignments below cannot get out of
# sync with a hard-coded step count.
test_window = new_calendar.loc[start_test:end_test]
X_test = dp.out_of_sample(steps=len(test_window))

X_test['oil']  = test_window['ma_oil'].values
X_test['dofw'] = test_window['dofw'].values
X_test['wd']   = test_window['wd'].values
X_test['type'] = test_window['type'].values

# Encode every observed level here; the reindex below keeps exactly the
# columns the model was trained on, in the training order. Training columns
# that do not occur in the horizon (e.g. holiday types when there is no
# national event) are added as all-zero columns.
X_test = pd.get_dummies(X_test, columns=['dofw', 'type'])
X_test = X_test.reindex(columns=X.columns, fill_value=0)

sales_pred = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=y.columns)
sales_pred = sales_pred.stack(['store_nbr', 'family'])
# Sales cannot be negative.
sales_pred = sales_pred.clip(lower=0.)

In [None]:
#submission
# Fill the sample-submission template with the forecasts, save it and
# trigger a browser download from Colab.
# NOTE(review): the values are written by position, which presumes sales_pred
# is ordered exactly like the ids in sample_submission.csv - confirm.
submission_file = 'Submission.csv'

sub = pd.read_csv('/content/drive/MyDrive/Store Sales -Time Series/sample_submission.csv', index_col='id')
sub = sub.assign(sales=sales_pred.values)
sub.to_csv(submission_file, index=True)

from google.colab import files

files.download(submission_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>