This notebook takes a look at the three best models built so far by using them to make predictions for a few stores.

In [1]:
import sys
import os
import logging
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

sns.set_style("darkgrid")
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

sys.path.append(os.path.abspath(os.path.join("../scripts")))

from clean import dataCleaning

sys.path.append(os.path.abspath(os.path.join("../models"))) 

import pickle

In [2]:
# Route log records of level DEBUG and above to the exploration log file,
# opened in append mode ('a') with UTF-8 encoding.
_log_options = {
    'filename': '../exploration_logfile.log',
    'filemode': 'a',
    'encoding': 'utf-8',
    'level': logging.DEBUG,
}
logging.basicConfig(**_log_options)

## Load models

The best models are:
 - XGB
 - CatBoost
 - Random Forest

In [3]:
def _load_model(path):
    """Unpickle and return the object stored in the file at ``path``.

    The file is opened inside a ``with`` block so its handle is closed as
    soon as loading finishes, including when unpickling raises.
    """
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only use model files that come from a trusted source.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


XGBModel = _load_model('09-09-2022-17-58-55-00-XGB.pkl')
CatBoostModel = _load_model('09-09-2022-18-03-50-00-CatBoost.pkl')
RandomForest = _load_model('09-09-2022-14-44-50-00-RandomForest.pkl')

### Stores to predict

We will choose one store per store type.

In [4]:
import dvc.api
import io

# Location of the test split inside the DVC-tracked repository.
# NOTE(review): the revision 'vt.2.1' has a different form from the 'v2.1'
# revision used for the training split - confirm this is the intended tag.
path = 'data/test_data.csv'
repo = './'
version = 'vt.2.1'

# Ask DVC for the URL of that file at the requested revision.
data_url = dvc.api.get_url(path=path, repo=repo, rev=version)

# Read the CSV and run the project's state-holiday cleaning step on it.
test = dataCleaning(pd.read_csv(data_url, sep=",")).cleanStateHoliday2()

In [5]:
import dvc.api
import io

# Location of the training split inside the DVC-tracked repository.
path = 'data/train_data.csv'
repo = './'
version = 'v2.1'

# Ask DVC for the URL of that file at the requested revision.
data_url = dvc.api.get_url(path=path, repo=repo, rev=version)

# Read the CSV and run the project's state-holiday cleaning step on it.
train = dataCleaning(pd.read_csv(data_url, sep=",")).cleanStateHoliday2()

In [6]:
# Read the store 1 CSV from the data folder and show its first five rows.
store1 = pd.read_csv(filepath_or_buffer='../data/store1_pred.csv')
store1.head(n=5)

Unnamed: 0.1,Unnamed: 0,Date,Store,DayOfWeek,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,...,IsHoliday,PromoPerCompetitionDistance,Promo2PerCompetitionDistance,Year,Month,Day,WeekOfYear,BeginMonth,MidMonth,EndMonth
0,0,2015-07-31,1,4,1,0,0,c,a,1270.0,...,0,5085.78902,6727.628859,2015,7,31,31,0,0,1
1,1,2015-08-01,1,3,1,0,0,c,a,1270.0,...,0,5085.78902,6727.628859,2015,8,1,31,1,0,0
2,2,2015-08-02,1,2,1,0,0,c,a,1270.0,...,0,5085.78902,6727.628859,2015,8,2,31,1,0,0
3,3,2015-08-03,1,1,1,0,0,c,a,1270.0,...,0,5085.78902,6727.628859,2015,8,3,32,1,0,0
4,4,2015-08-04,1,6,0,0,0,c,a,1270.0,...,0,5189.642611,6727.628859,2015,8,4,32,1,0,0


In [7]:
# Preview the first five rows of the cleaned test split.
test.head(n=5)

Unnamed: 0,Store,DayOfWeek,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,WeekDays,WeekEnds,IsHoliday,PromoPerCompetitionDistance,Promo2PerCompetitionDistance
0,1,4,1,0,0,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0,1,0,0,5085.78902,6727.628859
1,1,3,1,0,0,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0,1,0,0,5085.78902,6727.628859
2,1,2,1,0,0,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0,1,0,0,5085.78902,6727.628859
3,1,1,1,0,0,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0,1,0,0,5085.78902,6727.628859
4,1,6,0,0,0,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0,0,1,0,5189.642611,6727.628859


In [8]:
def _rows_for_store(frame, store_id):
    """Return an independent copy of the rows of ``frame`` whose ``Store`` equals ``store_id``.

    The copy matters because later cells add columns to the per-store frames;
    writing to a filtered slice of ``test`` is what pandas reports as
    SettingWithCopyWarning.
    """
    return frame.loc[frame['Store'] == store_id].copy()


# One store per store type (see the section headings further down).
store703 = _rows_for_store(test, 703)
store274 = _rows_for_store(test, 274)
store1112 = _rows_for_store(test, 1112)
store13 = _rows_for_store(test, 13)

In [9]:
# Print the number of test rows of each selected store, one count per line,
# with a dashed rule between consecutive counts.
row_counts = (len(store703), len(store274), len(store1112), len(store13))
print(*row_counts, sep='\n' + '------------------------------------' + '\n')


37
------------------------------------
43
------------------------------------
41
------------------------------------
39


In [10]:
### Create date for test
from prophet import Prophet

dat = test[test['Store'] == 274]

# Number of future dates to generate: enough for the longest of the selected
# stores. This used to be the hard-coded literal 43.
n_future_days = max(len(store703), len(store274), len(store1112), len(store13))

# Store 1's history in the two-column ds/y layout handed to Prophet.
# The training frame is filtered once instead of once per column.
touse = (
    train.loc[train['Store'] == 1, ['Date', 'Sales']]
    .rename(columns={'Date': 'ds', 'Sales': 'y'})
)

# NOTE(review): within this notebook the fitted model is only used to extend
# the date axis; its forecasts are never read.
m = Prophet(interval_width=0.95, yearly_seasonality=True)
m.fit(touse)

# make_future_dataframe appends the new periods after the history dates, so
# the tail of the frame holds exactly the generated future dates.
testsdate = m.make_future_dataframe(periods=n_future_days).tail(n_future_days)

11:42:35 - cmdstanpy - INFO - Chain [1] start processing
11:42:36 - cmdstanpy - INFO - Chain [1] done processing


In [11]:
# Give every store the first len(store) generated dates. The slice lengths are
# derived from the frames themselves instead of the hard-coded 37 / 41 / 39,
# so the cell keeps working when the number of rows per store changes.
# NOTE(review): this assumes each store's rows are already in chronological
# order - confirm against the test data.
future_dates = list(testsdate['ds'])

store703['Date'] = future_dates[:len(store703)]
store274['Date'] = future_dates[:len(store274)]
store1112['Date'] = future_dates[:len(store1112)]
store13['Date'] = future_dates[:len(store13)]

def newfeatures(df):
    """Add calendar features derived from the ``Date`` column, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Date`` column holding datetimes or date strings.
        The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. ``Date`` is moved to the first
        column, the row index is reset to 0..n-1, and the columns ``Year``,
        ``Month``, ``Day``, ``WeekOfYear``, ``BeginMonth``, ``MidMonth`` and
        ``EndMonth`` are appended.

    Notes
    -----
    The earlier version ended with ``df.drop(['Date', 'Store'], axis=1)`` but
    discarded the result, so neither column was ever removed. That no-op is
    gone; both columns are kept on purpose because the plotting cells use
    ``Date`` as the x axis.
    """
    # Accept date strings as well as datetimes; the .year/.month/... accessors
    # below need a DatetimeIndex.
    df['Date'] = pd.to_datetime(df['Date'])

    df.set_index('Date', inplace=True)
    df['Year'] = df.index.year
    df['Month'] = df.index.month
    df['Day'] = df.index.day
    # DatetimeIndex.weekofyear was removed in pandas 2.0; isocalendar().week
    # gives the same ISO week number. Cast to a plain int64 array so the
    # column is an ordinary integer column.
    df['WeekOfYear'] = df.index.isocalendar().week.astype('int64').to_numpy()

    # Put Date back as a regular (first) column and reset the row index.
    df.reset_index(inplace=True)

    # 0/1 flags for the part of the month, as defined by integer division of
    # the day number: days 1-6, days 10-19, and day 21 onwards.
    df['BeginMonth'] = ((df['Day'] // 7) == 0) * 1
    df['MidMonth'] = ((df['Day'] // 10) == 1) * 1
    df['EndMonth'] = ((df['Day'] // 7) >= 3) * 1

    return df

# newfeatures() enriches each store frame in place and returns that same
# object. Taking a copy keeps the model-input frames independent of the store
# frames, so prediction columns added to a store frame later on do not end up
# in the input handed to the next model.
store703_pred = newfeatures(store703).copy()
store274_pred = newfeatures(store274).copy()
store1112_pred = newfeatures(store1112).copy()
store13_pred = newfeatures(store13).copy()

### Predictions

### Store703 (Type a)

In [12]:
store13['XGBpred'] = XGBModel.predict(store13_pred)
store13['CBpred'] = CatBoostModel.predict(store13_pred)
store13['RFpred'] = RandomForest.predict(store13_pred)

In [17]:
import plotly.express as px
import plotly
fig = px.line(store13, x='Date', y=store13.columns[27:30])
# fig.update_xaxes(rangeslider_visible=True)
fig.show()
plotly.offline.plot(fig, filename='../predictions_images/Store_a.html')

'../predictions_images/Store_a.html'

### Store274 (Type b)

In [14]:
# Store each model's predictions for store 274 in its own column,
# in the order XGB, CatBoost, Random Forest.
for pred_column, fitted_model in (
    ('XGBpred', XGBModel),
    ('CBpred', CatBoostModel),
    ('RFpred', RandomForest),
):
    store274[pred_column] = fitted_model.predict(store274_pred)

In [23]:
import plotly
import plotly.express as px

# Select the three prediction columns by name; a positional slice such as
# columns[27:30] silently picks other columns whenever the column count changes.
# `plotly` is imported here so the cell does not rely on an earlier cell's import.
fig = px.line(store274, x='Date', y=['XGBpred', 'CBpred', 'RFpred'])
fig.show()
plotly.offline.plot(fig, filename='../predictions_images/Store_b.html')

'../predictions_images/Store_b.html'

### Store1112 (Type c)

In [16]:
# Store each model's predictions for store 1112 in its own column,
# in the order XGB, CatBoost, Random Forest.
for pred_column, fitted_model in (
    ('XGBpred', XGBModel),
    ('CBpred', CatBoostModel),
    ('RFpred', RandomForest),
):
    store1112[pred_column] = fitted_model.predict(store1112_pred)

In [22]:
import plotly
import plotly.express as px

# Select the three prediction columns by name; a positional slice such as
# columns[27:30] silently picks other columns whenever the column count changes.
# `plotly` is imported here so the cell does not rely on an earlier cell's import.
fig = px.line(store1112, x='Date', y=['XGBpred', 'CBpred', 'RFpred'])
fig.show()
# html file
plotly.offline.plot(fig, filename='../predictions_images/Store_c.html')

'../predictions_images/Store_c.html'

### Store13 (Type d)

In [18]:
# Store each model's predictions for store 13 in its own column,
# in the order XGB, CatBoost, Random Forest.
for pred_column, fitted_model in (
    ('XGBpred', XGBModel),
    ('CBpred', CatBoostModel),
    ('RFpred', RandomForest),
):
    store13[pred_column] = fitted_model.predict(store13_pred)

In [28]:
import plotly
import plotly.express as px

# Select the three prediction columns by name; a positional slice such as
# columns[27:30] silently picks other columns whenever the column count changes.
# `plotly` is imported here so the cell does not rely on an earlier cell's import.
fig = px.line(store13, x='Date', y=['XGBpred', 'CBpred', 'RFpred'])
fig.show()
plotly.offline.plot(fig, filename='../predictions_images/Store_d.html')

'../predictions_images/Store_d.html'