In [1]:
import pandas as pd 
import scripts.functions as f
import numpy as np

from sklearn.model_selection import train_test_split

import category_encoders as ce 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler


from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.metrics import mean_squared_error

from sklearn import set_config

# Scikit-Learn estimators will be rendered as interactive diagrams
set_config(display="diagram")

import warnings
warnings.filterwarnings('ignore')

# autoreload automatically reloads the modules before executing the code, 
# allowing you to see the changes immediately
%load_ext autoreload
%autoreload 2

In [2]:
# Load the sales records that are joined with the store data further down.
data = pd.read_csv(filepath_or_buffer="./data/train.csv")

In [3]:
# --- Encoders ---------------------------------------------------------------
# Ordinal encoding with an explicit, fixed category -> integer mapping for the
# two columns listed in the mapping.
le = ce.OrdinalEncoder(mapping=[{'col': 'Assortment', 'mapping': {'a': 0, 'b': 1, 'c': 2}},
                                {'col': 'StoreType',  'mapping': {'a': 0, 'b': 1, 'c': 2, 'd': 3}}])

# One-hot encoding for the remaining categorical columns.
# category_encoders does not define an "ignore" option for handle_unknown
# (its documented options are 'error', 'return_nan', 'value' and 'indicator').
# "value" is the documented choice that encodes categories unseen during fit
# as an all-zero row, which is what scikit-learn's "ignore" would have meant.
ohe = ce.OneHotEncoder(cols=['Promo', 'Promo2', 'PromoInterval', 'StateHoliday'],
                       handle_unknown="value")


# --- Per-type pipelines -----------------------------------------------------
# Categorical columns: ordinal mapping first, then one-hot encoding.
cat_pipeline = Pipeline([("label_encoder", le), ('one_hot_encoder', ohe)])
# Numerical columns: fill missing values with the column mean, then standardise.
num_pipeline = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())


# --- Column transformer -----------------------------------------------------
cat_features = ['Assortment', 'StoreType', 'Promo', 'Promo2', 'PromoInterval', 'StateHoliday']
num_features = ['Customers', 'CompetitionDistance', 'Competition_Since_X_months', 'weeks_since_promo2']

# Columns not listed above are passed through to the estimator unchanged.
preprocessing = make_column_transformer(
                (cat_pipeline, cat_features),
                (num_pipeline, num_features),
                remainder='passthrough')

In [9]:
# Merge the sales records with the store data.
df = f.join_with_store(data)

# Hold out 30 % of the rows as a validation set; the split is shuffled, seeded
# and stratified on the store id.
train_df, val_df = train_test_split(
    df,
    test_size=0.3,
    shuffle=True,
    stratify=df['Store'],
    random_state=42,
)

print(f"train set shape: {train_df.shape}\n validation set shape: {val_df.shape}")

# Number of distinct stores in each partition (expected to be equal).
train_df['Store'].nunique(), val_df['Store'].nunique()

train set shape: (432931, 18)
 validation set shape: (185542, 18)


(1115, 1115)

In [10]:
# NOTE(review): `df` was already passed through f.join_with_store before the
# split, so this joins the store data a second time. The train_df preview below
# shows duplicated `_x` / `_y` columns (e.g. StoreType_x / StoreType_y).
# Confirm that this second join is intended.
train_df = f.join_with_store(train_df)

In [11]:
# Preview the first five rows of the joined training frame.
train_df.head(n=5)

Unnamed: 0,Date,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType_x,...,PromoInterval_x,StoreType_y,Assortment_y,CompetitionDistance_y,CompetitionOpenSinceMonth_y,CompetitionOpenSinceYear_y,Promo2_y,Promo2SinceWeek_y,Promo2SinceYear_y,PromoInterval_y
0,2013-10-25,221.0,5.0,6068.0,620.0,1.0,1.0,0.0,0.0,d,...,,d,c,13530.0,9.0,2013.0,0,,,
1,2013-02-14,221.0,4.0,4678.0,507.0,1.0,0.0,0.0,0.0,d,...,,d,c,13530.0,9.0,2013.0,0,,,
2,2013-02-18,221.0,1.0,6766.0,580.0,1.0,1.0,0.0,0.0,d,...,,d,c,13530.0,9.0,2013.0,0,,,
3,2013-09-09,221.0,1.0,7858.0,641.0,1.0,1.0,0.0,0.0,d,...,,d,c,13530.0,9.0,2013.0,0,,,
4,2014-03-02,221.0,7.0,0.0,,0.0,,0.0,0.0,d,...,,d,c,13530.0,9.0,2013.0,0,,,


In [None]:

# Run the project's cleaning helper on the training frame; the result replaces train_df.
# NOTE(review): presumably this step also derives the engineered columns listed in
# num_features (e.g. 'Competition_Since_X_months') — confirm in scripts.functions.
# NOTE(review): because train_df is overwritten, re-running this cell feeds already
# cleaned data back into the helper.
train_df = f.data_cleaner(train_df)

In [None]:
# Separate the target column from the predictors.
y = train_df['Sales']
X = train_df.drop(columns='Sales')

# Carve a 33 % test partition out of the training data (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

# Best model
Current best model: the estimator and its hyper-parameters were found in the `model_test_local` and `model_test_colab` notebooks.

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Build the full pipeline: preprocessing followed by a decision tree regressor.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split even with splitter='best', so an unseeded tree can differ between
# runs; 42 matches the seed used for the data splits in this notebook.
best_tree_reg = make_pipeline(preprocessing,
                         DecisionTreeRegressor(
                            criterion = 'friedman_mse',
                            max_depth = None,
                            max_features = None,
                            min_samples_leaf = 2,
                            min_samples_split = 10,
                            splitter = 'best',
                            random_state = 42
                            ))

# Fit the preprocessing steps and the tree on the training partition.
best_tree_reg.fit(X_train, y_train)

# Predict on the held-out test partition.
best_tree_pred = best_tree_reg.predict(X_test)

In [None]:
# Score the test-partition predictions.
# NOTE(review): RMSPE is not symmetric in its arguments — confirm that f.rmspe
# expects (predictions, actuals) in this order.
best_rmspe = f.rmspe(best_tree_pred, y_test)
# Take the square root explicitly: the `squared` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6, while this form works
# on every version.
best_rmse = np.sqrt(mean_squared_error(y_test, best_tree_pred))
print('Decision Tree Regressor with best hyper-params performance metrics:')
print(f'Root Mean Square Error RMSE:             {best_rmse}')
print(f'Root Mean Square Percantage Error RMSPE: {best_rmspe}')

# Test model on validation set

In [None]:
# Validation: score the fitted pipeline on the held-out validation split.

# Give val_df the same preparation that train_df received (store join followed
# by the cleaning helper); presumably the pipeline needs the resulting columns.
# NOTE(review): this mirrors the training cells as written, including the
# repeated store join — keep both paths in sync if either one changes.
val_prepared = f.data_cleaner(f.join_with_store(val_df))
X_val, y_val = val_prepared.drop(columns=['Sales']), val_prepared['Sales']

# make predictions
dt_val_pred = best_tree_reg.predict(X_val)

# Score against the validation targets (y_test belongs to a different set of rows).
dt_val_rmspe = f.rmspe(dt_val_pred, y_val)
# dt_val_mse keeps its original name but holds the *root* mean squared error.
# The square root is taken explicitly because the `squared` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
dt_val_mse = np.sqrt(mean_squared_error(y_val, dt_val_pred))
print('Decision Tree Regressor with best hyper-params performance metrics:')
print(f'Root Mean Square Error RMSE:             {dt_val_mse}')
print(f'Root Mean Square Percantage Error RMSPE: {dt_val_rmspe}')

In [None]:
##### Save Model for pipeline.py
import pickle

filename = 'dt_pipeline.pkl'
# The context manager flushes and closes the file even if pickling raises;
# the previous bare open() left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(best_tree_reg, model_file)