In [1]:
import sys
import os
import logging
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

sns.set_style("darkgrid")
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

sys.path.append(os.path.abspath(os.path.join("../scripts"))) 
from clean import dataCleaning

In [2]:
logging.basicConfig(filename='../post_analysis_logfile.log', filemode='a',
                    encoding='utf-8', level=logging.DEBUG)

In [3]:
import dvc.api
import io

path = 'data/train_data.csv'
repo = './'
version='v2.1'

data_url = dvc.api.get_url(
    path=path,
	repo=repo,
	rev=version
	)

train = pd.read_csv(data_url, sep=",")
train = dataCleaning(train).cleanStateHoliday2()

In [4]:
import dvc.api
import io

path = 'data/test_data.csv'
repo = './'
version='vt.2.1'

data_url = dvc.api.get_url(
    path=path,
	repo=repo,
	rev=version
	)

test = pd.read_csv(data_url, sep=",")
test = dataCleaning(test).cleanStateHoliday2()

In [5]:
print(train.columns)
print('----------------------------------------------')
print(test.columns)

Index(['Date', 'Store', 'DayOfWeek', 'Sales', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'Year', 'Month', 'Day', 'WeekOfYear', 'StoreType',
       'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'PromoInterval', 'WeekDays', 'WeekEnds', 'IsHoliday',
       'PromoPerCompetitionDistance', 'Promo2PerCompetitionDistance',
       'BeginMonth', 'MidMonth', 'EndMonth'],
      dtype='object')
----------------------------------------------
Index(['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'WeekDays',
       'WeekEnds', 'IsHoliday', 'PromoPerCompetitionDistance',
       'Promo2PerCompetitionDistance'],
      dtype='object')


In [6]:
train.tail()

Unnamed: 0,Date,Store,DayOfWeek,Sales,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,...,Promo2SinceYear,PromoInterval,WeekDays,WeekEnds,IsHoliday,PromoPerCompetitionDistance,Promo2PerCompetitionDistance,BeginMonth,MidMonth,EndMonth
844333,2015-07-31,612,5,8161,1,0,0,2015,7,31,...,2009.0,"Jan,Apr,Jul,Oct",1,0,0,5429.265711,4096.150745,0,0,1
844334,2015-07-31,235,5,6756,1,0,1,2015,7,31,...,2009.0,"Jan,Apr,Jul,Oct",1,0,0,5429.265711,9701.437949,0,0,1
844335,2015-07-31,1078,5,9732,1,0,1,2015,7,31,...,2011.0,"Jan,Apr,Jul,Oct",1,0,0,5429.265711,2253.297998,0,0,1
844336,2015-07-31,845,5,5151,1,0,1,2015,7,31,...,2011.0,"Jan,Apr,Jul,Oct",1,0,0,5429.265711,5617.897084,0,0,1
844337,2015-07-31,1,5,5263,1,0,1,2015,7,31,...,0.0,0,1,0,0,5429.265711,6549.550665,0,0,1


In [7]:
### Create date for test
dat = test[test['Store'] == 1]
from prophet import Prophet
touse = pd.DataFrame(columns=['ds', 'y'])
touse['ds'] = train[train['Store'] == 1]['Date']
touse['y'] = train[train['Store'] == 1]['Sales']
m = Prophet(interval_width=0.95,yearly_seasonality=True)
m.fit(touse)
testsdate = m.make_future_dataframe(periods=40)
testsdate = testsdate.tail(41)

00:15:02 - cmdstanpy - INFO - Chain [1] start processing
00:15:03 - cmdstanpy - INFO - Chain [1] done processing


In [8]:
train.dtypes

Date                             object
Store                             int64
DayOfWeek                         int64
Sales                             int64
Promo                             int64
StateHoliday                     object
SchoolHoliday                     int64
Year                              int64
Month                             int64
Day                               int64
WeekOfYear                        int64
StoreType                        object
Assortment                       object
CompetitionDistance             float64
CompetitionOpenSinceMonth       float64
CompetitionOpenSinceYear        float64
Promo2                            int64
Promo2SinceWeek                 float64
Promo2SinceYear                 float64
PromoInterval                    object
WeekDays                          int64
WeekEnds                          int64
IsHoliday                         int64
PromoPerCompetitionDistance     float64
Promo2PerCompetitionDistance    float64


In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import PolynomialFeatures

from sklearn.metrics import accuracy_score, mean_squared_error

In [10]:
categorical_features = ['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval']
numerical_features = ['CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2SinceWeek',
       'Promo2SinceYear', 'PromoPerCompetitionDistance', 'Promo2PerCompetitionDistance', 'DayOfWeek', 'Year', 'Month', 'Day', 'WeekOfYear']
# catego_to_main = ['Promo', 'SchoolHoliday', 'Promo2', 'WeekDays',
#        'WeekEnds', 'IsHoliday', 'BeginMonth', 'MidMonth', 'EndMonth']

 

numerical_action = make_pipeline(StandardScaler())
categorical_action = make_pipeline(OneHotEncoder())

preprocessor = make_column_transformer((numerical_action, numerical_features), (categorical_action, categorical_features))

models = {'RandomForest': RandomForestRegressor(), 'GradientBoost': GradientBoostingRegressor(), 'XGB': XGBRegressor(), 'SGD': SGDRegressor(), 'SVR': SVR(), 'CatBoost': CatBoostRegressor(),
 'BayesianRidge': BayesianRidge(), 'KernelRidge': KernelRidge(), 'ElasticNet': ElasticNet(), 'LinearRegression': LinearRegression(), 'DecisionTree': DecisionTreeRegressor(), 'KNeighbors': KNeighborsRegressor()}

In [11]:
len(models)

12

In [12]:
X = train.drop(['Date', 'Sales', 'Store'], axis=1)
y = train['Sales']

X = X.sample(10000) 
y = y.loc[X.index]


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)


import numpy as np

In [13]:
X_train.columns

Index(['DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday', 'Year', 'Month',
       'Day', 'WeekOfYear', 'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'WeekDays',
       'WeekEnds', 'IsHoliday', 'PromoPerCompetitionDistance',
       'Promo2PerCompetitionDistance', 'BeginMonth', 'MidMonth', 'EndMonth'],
      dtype='object')

In [14]:
## Loss function
def rmsle(ytest, yhat):
    from sklearn.metrics import mean_squared_error
    return np.sqrt(mean_squared_error(ytest, yhat))


In [15]:
list(models.keys())[0]

'RandomForest'

In [16]:
import pickle

class Modeling():

    def __init__(self, X_train, y_train, X_test, y_test):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        
        self.models = {'RandomForest': RandomForestRegressor(n_estimators=200), 'GradientBoost': GradientBoostingRegressor(),
                    'XGB': XGBRegressor(), 'SGD': SGDRegressor(), 'SVR': SVR(), 'CatBoost': CatBoostRegressor(),
                    'BayesianRidge': BayesianRidge(), 'KernelRidge': KernelRidge(), 'ElasticNet': ElasticNet(), 
                    'LinearRegression': LinearRegression(), 'DecisionTree': DecisionTreeRegressor(), 'KNeighbors': KNeighborsRegressor()}

    def useModel(self, model_to_use: str, model_name: str):
        model = make_pipeline(preprocessor, self.models[model_to_use])
        model.fit(self.X_train, self.y_train)
        yhat = model.predict(self.X_test)
        error = rmsle(self.y_test, yhat)

        real_accuracy = round(model.score(self.X_test, self.y_test),3)
        
        print('Train Score =', round(model.score(self.X_train, self.y_train),3))
        print('Test Score =', real_accuracy)
        print('Root Mean Square Error = ', round(error, 3))

        #serialize
        pickle.dump(model, open(model_name, 'wb'))

        return model, real_accuracy


    def predict(self, model, test):
        predictions = model.predict(test)
        return predictions
    

In [17]:
## Test to predict: predicition for store == 1

from xml.sax.xmlreader import IncrementalParser


test_pred = test[test['Store'] == 1]
test_pred['Date'] = list(testsdate['ds'])

def newfeatures(df): 
    df.set_index('Date', inplace=True)
    df['Year'] = df.index.year
    df['Month'] = df.index.month
    df['Day'] = df.index.day
    df['WeekOfYear'] = df.index.weekofyear

    df.reset_index(inplace=True)

    df['BeginMonth'] = (((df['Day'])//7) == 0)*1
    df['MidMonth'] = (((df['Day'])//10) == 1)*1
    df['EndMonth'] = (((df['Day'])//7) >= 3)*1

    
    df.drop(['Date', 'Store'], axis=1)

    return df

test_pred = newfeatures(test_pred)

In [18]:
store1 = test_pred.drop(['Date', 'Store'], axis=1)
store1.to_csv('../data/store1.csv')
test_pred.to_csv('../data/store1_pred.csv')

### Modeling

In [154]:
Model = Modeling(X_train, y_train, X_test, y_test)

In [155]:
Model.models

{'RandomForest': RandomForestRegressor(n_estimators=200),
 'GradientBoost': GradientBoostingRegressor(),
 'XGB': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, gamma=None,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=None,
              reg_alpha=None, reg_lambda=None, ...),
 'SGD': SGDRegressor(),
 'SVR': SVR(),
 'CatBoost': <catboost.core.CatBoostRegressor at 0x7f46562621d0>,
 'BayesianRidge': BayesianRidge(),
 'Kern

### Random Forest

In [156]:
model1, accuracy1 = Model.useModel(model_to_use= 'RandomForest', model_name='09-09-2022-14-44-50-00-RandomForest.pkl')

Train Score = 0.941
Test Score = 0.585
Root Mean Square Error =  1930.926


In [157]:
RandomForestPred = Model.predict(model1, store1)

### GradientBoost

In [158]:
model2, accuracy2 = Model.useModel(model_to_use= 'GradientBoost', model_name='09-09-2022-17-57-50-00-GradientBoost.pkl')

Train Score = 0.431
Test Score = 0.422
Root Mean Square Error =  2279.732


In [159]:
GradientBoostPred = Model.predict(model2, store1)

### XGB

In [160]:
model3, accuracy3 = Model.useModel(model_to_use= 'XGB', model_name='09-09-2022-17-58-55-00-XGB.pkl')

Train Score = 0.901
Test Score = 0.749
Root Mean Square Error =  1503.285


In [161]:
XGBPred = Model.predict(model3, store1)

### SGD

In [162]:
model4, accuracy4 = Model.useModel(model_to_use= 'SGD', model_name='09-09-2022-17-59-50-00-SGD.pkl')

Train Score = 0.21
Test Score = 0.229
Root Mean Square Error =  2632.595


In [163]:
SGDPred = Model.predict(model4, store1)

### SVR

In [164]:
model5, accuracy5 = Model.useModel(model_to_use= 'SVR', model_name='09-09-2022-18-02-50-00-SVR.pkl')

Train Score = -0.008
Test Score = -0.002
Root Mean Square Error =  3001.773


In [165]:
SVRPred = Model.predict(model5, store1)

### CatBoost

In [166]:
model6, accuracy6 = Model.useModel(model_to_use= 'CatBoost', model_name='09-09-2022-18-03-50-00-CatBoost.pkl')

Learning rate set to 0.057936
0:	learn: 3115.5776699	total: 58ms	remaining: 57.9s
1:	learn: 3078.4805280	total: 62ms	remaining: 30.9s
2:	learn: 3046.3906154	total: 64.9ms	remaining: 21.6s
3:	learn: 3014.8972642	total: 67.5ms	remaining: 16.8s
4:	learn: 2989.6027385	total: 74.7ms	remaining: 14.9s
5:	learn: 2963.2395170	total: 77.8ms	remaining: 12.9s
6:	learn: 2939.8563978	total: 82.8ms	remaining: 11.8s
7:	learn: 2917.5659272	total: 87.2ms	remaining: 10.8s
8:	learn: 2898.9874452	total: 90.7ms	remaining: 9.98s
9:	learn: 2878.6082066	total: 95.1ms	remaining: 9.41s
10:	learn: 2860.7997488	total: 98ms	remaining: 8.81s
11:	learn: 2842.7268548	total: 102ms	remaining: 8.4s
12:	learn: 2828.4213816	total: 108ms	remaining: 8.23s
13:	learn: 2814.0902370	total: 111ms	remaining: 7.84s
14:	learn: 2800.5382769	total: 115ms	remaining: 7.57s
15:	learn: 2788.6166951	total: 122ms	remaining: 7.52s
16:	learn: 2778.9947170	total: 127ms	remaining: 7.33s
17:	learn: 2766.6854690	total: 136ms	remaining: 7.41s
18:	

In [167]:
CatBoostPred = Model.predict(model6, store1)

### BayesianRidge

In [168]:
model7, accuracy7 = Model.useModel(model_to_use= 'BayesianRidge', model_name='09-09-2022-18-05-50-00-BayesianRidge.pkl')

Train Score = 0.217
Test Score = 0.236
Root Mean Square Error =  2620.942


In [169]:
BayesianRidgePred = Model.predict(model7, store1)

### KernelRidge

In [170]:
model8, accuracy8 = Model.useModel(model_to_use= 'KernelRidge', model_name='09-09-2022-18-06-50-00-KernelRidge.pkl')

Train Score = 0.217
Test Score = 0.234
Root Mean Square Error =  2624.218


In [171]:
KernelRidgePred = Model.predict(model8, store1)

### ElasticNet

In [172]:
model9, accuracy9 = Model.useModel(model_to_use= 'ElasticNet', model_name='09-09-2022-18-10-50-00-ElasticNet.pkl')

Train Score = 0.159
Test Score = 0.18
Root Mean Square Error =  2715.188


In [173]:
ElasticNetPred = Model.predict(model9, store1)

### LinearRegression

In [174]:
model10, accuracy10 = Model.useModel(model_to_use= 'LinearRegression', model_name='09-09-2022-18-15-50-00-LinearRegression.pkl')

Train Score = 0.217
Test Score = 0.234
Root Mean Square Error =  2624.6


In [175]:
LinearRegressionPred = Model.predict(model10, store1)

### DecisionTree

In [176]:
model11, accuracy11 = Model.useModel(model_to_use= 'DecisionTree', model_name='09-09-2022-17-59-50-00-DecisionTree.pkl')

Train Score = 1.0
Test Score = 0.189
Root Mean Square Error =  2700.464


In [177]:
DecisionTreePred = Model.predict(model11, store1)

### KNeighbors

In [178]:
model12, accuracy12 = Model.useModel(model_to_use= 'KNeighbors', model_name='09-09-2022-17-59-50-00-KNeighbors.pkl')

Train Score = 0.463
Test Score = 0.196
Root Mean Square Error =  2688.071


In [179]:
KNeighborsPred = Model.predict(model12, store1)

### Accuracy of the models

In [180]:
accuracy = [accuracy1, accuracy2, accuracy3, accuracy4, accuracy5, accuracy6, accuracy7, accuracy8, accuracy9, accuracy10, accuracy11, accuracy12]
for i in range(len(list(models.keys()))):
    print(f'Accuracy of model {(list(models.keys()))[i]} is: ', accuracy[i])

Accuracy of model RandomForest is:  0.585
Accuracy of model GradientBoost is:  0.422
Accuracy of model XGB is:  0.749
Accuracy of model SGD is:  0.229
Accuracy of model SVR is:  -0.002
Accuracy of model CatBoost is:  0.703
Accuracy of model BayesianRidge is:  0.236
Accuracy of model KernelRidge is:  0.234
Accuracy of model ElasticNet is:  0.18
Accuracy of model LinearRegression is:  0.234
Accuracy of model DecisionTree is:  0.189
Accuracy of model KNeighbors is:  0.196
