In [1]:
import sys
import os
import missingno as msno
import logging
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

sns.set_style("darkgrid")
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

sys.path.append(os.path.abspath(os.path.join("../scripts"))) 
from clean import dataCleaning

In [2]:
# Send log records of level DEBUG and above to ../exploration_logfile.log,
# appending to the file (filemode='a') and writing it as UTF-8.
# NOTE(review): the `encoding` keyword of basicConfig is only accepted on
# Python 3.9+ — confirm the kernel version.
logging.basicConfig(filename='../exploration_logfile.log', filemode='a',
                    encoding='utf-8', level=logging.DEBUG)

In [3]:
# Load the training CSV through DVC at a pinned revision, then run the
# project-local cleaning step on it.
import dvc.api
# NOTE(review): `io` is not referenced in this cell — confirm whether a later cell needs it.
import io

# Arguments handed to dvc.api.get_url below.
path = 'data/train_data.csv'
repo = './'
version='v2.1'

# The value returned by dvc.api.get_url is passed straight to pd.read_csv.
data_url = dvc.api.get_url(
    path=path,
	repo=repo,
	rev=version
	)

train = pd.read_csv(data_url, sep=",")
# dataCleaning comes from the `clean` module imported in the first cell;
# what cleanStateHoliday2 changes is not visible from this notebook.
train = dataCleaning(train).cleanStateHoliday2()

In [4]:
# Load the test CSV through DVC at a pinned revision, then run the
# project-local cleaning step on it. This overwrites the `path`, `repo`,
# `version` and `data_url` variables set by the training-data cell.
import dvc.api
# NOTE(review): `io` is not referenced in this cell — confirm whether a later cell needs it.
import io

# Arguments handed to dvc.api.get_url below.
path = 'data/test_data.csv'
repo = './'
# NOTE(review): this tag is spelled 'vt.2.1' while the training data uses 'v2.1' —
# confirm that the revision exists under exactly this name.
version='vt.2.1'

# The value returned by dvc.api.get_url is passed straight to pd.read_csv.
data_url = dvc.api.get_url(
    path=path,
	repo=repo,
	rev=version
	)

test = pd.read_csv(data_url, sep=",")
# dataCleaning comes from the `clean` module imported in the first cell;
# what cleanStateHoliday2 changes is not visible from this notebook.
test = dataCleaning(test).cleanStateHoliday2()

In [5]:
# Show the training columns and the test columns, separated by a divider line.
print(
    train.columns,
    '----------------------------------------------',
    test.columns,
    sep='\n',
)

Index(['Date', 'Store', 'DayOfWeek', 'Sales', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'Year', 'Month', 'Day', 'WeekOfYear', 'StoreType',
       'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'PromoInterval', 'WeekDays', 'WeekEnds', 'IsHoliday',
       'PromoPerCompetitionDistance', 'Promo2PerCompetitionDistance',
       'BeginMonth', 'MidMonth', 'EndMonth'],
      dtype='object')
----------------------------------------------
Index(['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'WeekDays',
       'WeekEnds', 'IsHoliday', 'PromoPerCompetitionDistance',
       'Promo2PerCompetitionDistance'],
      dtype='object')


In [6]:
train.tail()

Unnamed: 0,Date,Store,DayOfWeek,Sales,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,...,Promo2SinceYear,PromoInterval,WeekDays,WeekEnds,IsHoliday,PromoPerCompetitionDistance,Promo2PerCompetitionDistance,BeginMonth,MidMonth,EndMonth
844333,2015-07-31,612,5,8161,1,0,0,2015,7,31,...,2009.0,"Jan,Apr,Jul,Oct",1,0,0,5429.265711,4096.150745,0,0,1
844334,2015-07-31,235,5,6756,1,0,1,2015,7,31,...,2009.0,"Jan,Apr,Jul,Oct",1,0,0,5429.265711,9701.437949,0,0,1
844335,2015-07-31,1078,5,9732,1,0,1,2015,7,31,...,2011.0,"Jan,Apr,Jul,Oct",1,0,0,5429.265711,2253.297998,0,0,1
844336,2015-07-31,845,5,5151,1,0,1,2015,7,31,...,2011.0,"Jan,Apr,Jul,Oct",1,0,0,5429.265711,5617.897084,0,0,1
844337,2015-07-31,1,5,5263,1,0,1,2015,7,31,...,0.0,0,1,0,0,5429.265711,6549.550665,0,0,1


In [7]:
### Create date for test
from prophet import Prophet

# Test rows that belong to store 1.
dat = test[test['Store'] == 1]

# Store-1 training history in the two-column (ds, y) layout that is fitted below.
touse = (
    train.loc[train['Store'] == 1, ['Date', 'Sales']]
    .rename(columns={'Date': 'ds', 'Sales': 'y'})
)

m = Prophet(interval_width=0.95, yearly_seasonality=True)
m.fit(touse)

# Extend the fitted history by 40 periods and keep only the last 41 rows.
# NOTE(review): if the future frame still contains the history rows, the first
# of these 41 dates is the last training date — confirm that is intended.
testsdate = m.make_future_dataframe(periods=40).tail(41)

15:57:49 - cmdstanpy - INFO - Chain [1] start processing
15:57:49 - cmdstanpy - INFO - Chain [1] done processing


In [8]:
train.dtypes

Date                             object
Store                             int64
DayOfWeek                         int64
Sales                             int64
Promo                             int64
StateHoliday                     object
SchoolHoliday                     int64
Year                              int64
Month                             int64
Day                               int64
WeekOfYear                        int64
StoreType                        object
Assortment                       object
CompetitionDistance             float64
CompetitionOpenSinceMonth       float64
CompetitionOpenSinceYear        float64
Promo2                            int64
Promo2SinceWeek                 float64
Promo2SinceYear                 float64
PromoInterval                    object
WeekDays                          int64
WeekEnds                          int64
IsHoliday                         int64
PromoPerCompetitionDistance     float64
Promo2PerCompetitionDistance    float64


In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import accuracy_score, mean_squared_error

In [10]:
# Columns that are one-hot encoded.
categorical_features = ['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval']

# Columns that are standardised (order kept as originally listed).
numerical_features = [
    'CompetitionDistance', 'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear', 'Promo2SinceWeek',
    'Promo2SinceYear', 'PromoPerCompetitionDistance', 'Promo2PerCompetitionDistance',
    'DayOfWeek', 'Year', 'Month', 'Day', 'WeekOfYear',
]

# Integer flag columns that are forwarded to the estimator unchanged.
# Previously this list was commented out, and because a column transformer drops
# every column it is not told about, these features never reached the models.
binary_features = ['Promo', 'SchoolHoliday', 'Promo2', 'WeekDays',
                   'WeekEnds', 'IsHoliday', 'BeginMonth', 'MidMonth', 'EndMonth']

numerical_action = make_pipeline(StandardScaler())
# handle_unknown='ignore': a category that only shows up at prediction time is
# encoded as all zeros instead of raising an error.
categorical_action = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

preprocessor = make_column_transformer(
    (numerical_action, numerical_features),
    (categorical_action, categorical_features),
    ('passthrough', binary_features),
)

# Candidate regressors keyed by a short display name, all with default settings.
models = {
    'RandomForest': RandomForestRegressor(),
    'GradientBoost': GradientBoostingRegressor(),
    'XGB': XGBRegressor(),
    'SGD': SGDRegressor(),
    'SVR': SVR(),
    'CatBoost': CatBoostRegressor(),
    'BayesianRidge': BayesianRidge(),
    'KernelRidge': KernelRidge(),
    'ElasticNet': ElasticNet(),
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(),
    'KNeighbors': KNeighborsRegressor(),
}

In [11]:
len(models)

12

In [12]:
from sklearn.model_selection import train_test_split
import numpy as np

# Target is the Sales column; the features are everything except the target
# and the Date / Store columns.
y = train['Sales']
X = train.drop(columns=['Date', 'Sales', 'Store'])

# Hold out 10% of the rows, with a fixed seed so the split is repeatable.
# NOTE(review): rows are split at random rather than by date — confirm this is
# acceptable for evaluating a sales forecast.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

In [13]:
X_train.columns

Index(['DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday', 'Year', 'Month',
       'Day', 'WeekOfYear', 'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'WeekDays',
       'WeekEnds', 'IsHoliday', 'PromoPerCompetitionDistance',
       'Promo2PerCompetitionDistance', 'BeginMonth', 'MidMonth', 'EndMonth'],
      dtype='object')

In [14]:
## Loss function
def rmsle(ytest, yhat):
    """Return the root mean squared error between targets and predictions.

    The previous body referenced the undefined names ``true`` and ``pred`` and
    therefore raised ``NameError`` on every call; it now uses its own
    parameters. Despite the function's name, no logarithm is applied: the
    value is the plain RMSE, i.e. the square root of the mean squared error.

    Parameters
    ----------
    ytest : array-like
        Observed target values.
    yhat : array-like
        Predicted values, same shape as ``ytest``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((ytest - yhat) ** 2))``.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    # Convert to plain arrays so a pandas Series is compared by position, not by index label.
    actual = np.asarray(ytest, dtype=float)
    predicted = np.asarray(yhat, dtype=float)

    if actual.shape != predicted.shape:
        raise ValueError(
            f"ytest and yhat must have the same shape, got {actual.shape} and {predicted.shape}"
        )
    if actual.size == 0:
        raise ValueError("ytest and yhat must not be empty")

    return np.sqrt(np.mean((actual - predicted) ** 2))


In [15]:
list(models.keys())[0]

'RandomForest'

In [16]:
import pickle

class Modeling():
    """Fit, score and serialise one of a fixed set of regressors.

    Each instance holds a train/validation split and its own registry of
    default-configured estimators. ``useModel`` relies on the notebook-level
    ``preprocessor`` column transformer and the ``rmsle`` loss function, so
    both must be defined before it is called.
    """

    def __init__(self, X_train, y_train, X_test, y_test):
        """Store the data split and build the estimator registry.

        Parameters
        ----------
        X_train, y_train
            Features and target used for fitting.
        X_test, y_test
            Features and target used to compute the reported error.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.models = {'RandomForest': RandomForestRegressor(), 'GradientBoost': GradientBoostingRegressor(),
                      'XGB': XGBRegressor(), 'SGD': SGDRegressor(), 'SVR': SVR(), 'CatBoost': CatBoostRegressor(),
                        'BayesianRidge': BayesianRidge(), 'KernelRidge': KernelRidge(), 'ElasticNet': ElasticNet(),
                         'LinearRegression': LinearRegression(), 'DecisionTree': DecisionTreeRegressor(), 'KNeighbors': KNeighborsRegressor()}

    def useModel(self, model_to_use: str, model_name: str):
        """Fit the chosen estimator behind the preprocessor, report scores, and pickle it.

        Parameters
        ----------
        model_to_use : str
            Key of the estimator in ``self.models``.
        model_name : str
            Path of the pickle file the fitted pipeline is written to.

        Returns
        -------
        The fitted pipeline (preprocessor followed by the estimator).

        Raises
        ------
        KeyError
            If ``model_to_use`` is not a key of ``self.models``.
        """
        # Fail early with the list of valid keys instead of a bare KeyError.
        if model_to_use not in self.models:
            raise KeyError(
                f"Unknown model {model_to_use!r}; available models: {sorted(self.models)}"
            )

        model = make_pipeline(preprocessor, self.models[model_to_use])
        model.fit(self.X_train, self.y_train)
        yhat = model.predict(self.X_test)
        error = rmsle(self.y_test, yhat)

        print('Train Score =', model.score(self.X_train, self.y_train))
        print('Root Mean Square Error = ', round(error, 3))

        # Serialise inside a context manager so the file handle is always
        # flushed and closed, even if pickling fails.
        with open(model_name, 'wb') as model_file:
            pickle.dump(model, model_file)

        return model


    def predict(self, model, test):
        """Return ``model.predict(test)`` for an already fitted model."""
        predictions = model.predict(test)
        return predictions
    

In [17]:
## Test to predict: prediction for store == 1

# NOTE(review): IncrementalParser is not used in this cell — it looks like a
# stray editor auto-import; confirm before removing it.
from xml.sax.xmlreader import IncrementalParser


# Take an explicit copy of the store-1 rows so that the Date column is added to
# an independent frame rather than to a filtered slice of `test`.
test_pred = test[test['Store'] == 1].copy()
# One date per row, assigned by position; pandas raises if the lengths differ.
test_pred['Date'] = list(testsdate['ds'])

def newfeatures(df):
    """Return a copy of ``df`` with calendar features derived from its ``Date`` column.

    Added columns: ``Year``, ``Month``, ``Day``, ``WeekOfYear`` (ISO week
    number) and the 0/1 flags ``BeginMonth`` (``Day // 7 == 0``), ``MidMonth``
    (``Day // 10 == 1``) and ``EndMonth`` (``Day // 7 >= 3``).

    Changes from the previous implementation:
    * ``DatetimeIndex.weekofyear`` no longer exists in pandas 2.x; the ISO week
      is now taken from ``isocalendar()``.
    * The trailing ``df.drop([...], axis=1)`` discarded its result and so had no
      effect; it has been removed. ``Date`` and ``Store`` are therefore still
      present in the result, exactly as before.
    * The input frame is no longer modified; the result is a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Date`` column that is datetime-like or parseable by
        ``pandas.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index, ``Date`` as its first column and
        the feature columns appended.
    """
    # Index a copy by the parsed dates; the caller's frame stays as it was.
    out = df.assign(Date=pd.to_datetime(df['Date'])).set_index('Date')

    stamps = out.index
    out['Year'] = stamps.year
    out['Month'] = stamps.month
    out['Day'] = stamps.day
    # isocalendar() yields a nullable UInt32 column; convert to plain int64 values.
    out['WeekOfYear'] = stamps.isocalendar().week.to_numpy(dtype='int64')

    out = out.reset_index()

    day_of_month = out['Day']
    out['BeginMonth'] = ((day_of_month // 7) == 0) * 1
    out['MidMonth'] = ((day_of_month // 10) == 1) * 1
    out['EndMonth'] = ((day_of_month // 7) >= 3) * 1

    return out

test_pred = newfeatures(test_pred)

In [18]:
# Keep only the model inputs for store 1 (Date and Store removed) and write them to disk.
# NOTE(review): to_csv is called with its defaults, so the row index is written
# as an extra unnamed first column — confirm downstream readers expect it.
store1 = test_pred.drop(columns=['Date', 'Store'])
store1.to_csv('../data/store1.csv')

### Modeling

In [19]:
Model = Modeling(X_train, y_train, X_test, y_test)

In [20]:
Model.models

{'RandomForest': RandomForestRegressor(),
 'GradientBoost': GradientBoostingRegressor(),
 'XGB': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, gamma=None,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=None,
              reg_alpha=None, reg_lambda=None, ...),
 'SGD': SGDRegressor(),
 'SVR': SVR(),
 'CatBoost': <catboost.core.CatBoostRegressor at 0x7fa084e77d90>,
 'BayesianRidge': BayesianRidge(),
 'KernelRidge': Kernel

### Random Forest

In [21]:
Model.useModel(model_to_use= 'RandomForest', model_name='09-09-2022-14-44-50-00-RandomForest.pkl')