In [54]:
import pandas as pd 
import functions as f
import numpy as np

import category_encoders as ce 
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.metrics import mean_squared_error

from sklearn import set_config

# Render scikit-learn estimators and pipelines as interactive diagrams when
# they are the displayed output of a cell.
set_config(display="diagram")

import warnings
# NOTE(review): this filter silences every warning for the rest of the session,
# which also hides library deprecation notices — consider narrowing it.
warnings.filterwarnings('ignore')

# autoreload automatically reloads the modules before executing the code,
# allowing you to see the changes immediately (mode 2 = reload all modules).
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
# Read the sales records and the store table from the local data directory.
data, stores = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/train.csv", "./data/store.csv")
)

# Preview the first rows of the sales records.
data.head()

Unnamed: 0,Date,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,2013-01-01,1115.0,2.0,0.0,0.0,0.0,0.0,a,1.0
1,2013-01-01,379.0,2.0,0.0,0.0,0.0,0.0,a,1.0
2,2013-01-01,378.0,2.0,0.0,0.0,0.0,0.0,a,1.0
3,2013-01-01,377.0,2.0,0.0,0.0,0.0,0.0,a,1.0
4,2013-01-01,376.0,2.0,0.0,0.0,0.0,0.0,a,1.0


In [29]:
# Combine the sales records with store information via the project helper
# (presumably a merge on the store id — verify in functions.join_with_store).
df = f.join_with_store(data)

# 70/30 shuffled split with a fixed seed, stratified on the Store column so
# both partitions contain rows from every store.
train_df, val_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    shuffle=True,
    stratify=df['Store'],
)

# Only a cell's last expression is rendered, so this is the single value shown:
# the number of distinct stores in each partition.
train_df.Store.nunique(), val_df.Store.nunique()

(1115, 1115)

In [56]:
# Clean both partitions with the project's data_cleaner helper.
# The training call has to stay active: `train_rd` is read by the cells that
# write it to disk and build X / y, so with it commented out a fresh
# "Restart Kernel -> Run All" stops with a NameError.
train_rd = f.data_cleaner(train_df)
val_rd = f.data_cleaner(val_df)

In [53]:
# Persist the cleaned training partition as CSV, leaving out the index column.
train_rd.to_csv(path_or_buf='./data/train_reduced.csv', index=False)

In [57]:
# Persist the cleaned validation partition as CSV, leaving out the index column.
val_rd.to_csv(path_or_buf='./data/val_reduced.csv', index=False)

In [31]:
# Target is the Sales column; every other cleaned column is a predictor.
y = train_rd['Sales']
X = train_rd.drop(columns=['Sales'])

# Hold out 33% of the cleaned training rows for model evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [32]:
# Ordinal encoding with explicit integer codes for the two store attributes.
le = ce.OrdinalEncoder(mapping=[
    {'col': 'Assortment', 'mapping': {'a': 0, 'b': 1, 'c': 2}},
    {'col': 'StoreType',  'mapping': {'a': 0, 'b': 1, 'c': 2, 'd': 3}},
])

# One-hot encode the promotion / holiday columns.
# category_encoders documents 'error', 'return_nan', 'value' and 'indicator'
# for handle_unknown; "ignore" is the scikit-learn spelling and is not one of
# them. 'value' is the documented option that encodes an unseen category as
# all zeros.
ohe = ce.OneHotEncoder(
    cols=['Promo', 'Promo2', 'PromoInterval', 'StateHoliday'],
    handle_unknown="value",
)

# Categorical branch: ordinal mapping first, then one-hot encoding.
cat_pipeline = Pipeline([
    ("label_encoder", le),
    ('one_hot_encoder', ohe)
])

In [33]:
# Numeric branch: fill missing values with the column mean, then standardize.
# Step names are the ones make_pipeline derives from the class names.
num_pipeline = Pipeline([
    ("simpleimputer", SimpleImputer(strategy="mean")),
    ("standardscaler", StandardScaler()),
])

In [34]:
# Set up the column transformer.

# Columns routed to the numeric branch and to the categorical branch.
num_features = ['Customers','CompetitionDistance', 'Competition_Since_X_months', 'weeks_since_promo2']
cat_features = ['Assortment', 'StoreType', 'Promo', 'Promo2', 'PromoInterval', 'StateHoliday']

# Each branch is applied to its own column group; columns that are listed in
# neither group are passed through unchanged.
preprocessing = make_column_transformer(
    (cat_pipeline, cat_features),
    (num_pipeline, num_features),
    remainder='passthrough',
)

In [47]:
from sklearn.linear_model import LinearRegression

# Baseline model: the shared preprocessing followed by ordinary least squares.
lin_reg = make_pipeline(preprocessing, LinearRegression(n_jobs=-1))

# Fit on the training split, then predict the held-out split.
lin_reg.fit(X_train, y_train)
lr_pred = lin_reg.predict(X_test)

# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed, so
# the explicit square root keeps this cell working on old and new versions.
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_pred))
# RMSPE comes from the project helper; the argument order (predictions first)
# is kept from the original call — TODO confirm against f.rmspe's signature.
lr_rmspe = f.rmspe(lr_pred, y_test)

print('Linear Regression performance metrics:')
print(f'Root Mean Square Error RMSE:             {lr_rmse}')
print(f'Root Mean Square Percantage Error RMSPE: {lr_rmspe}')

Linear Regression performance metrics:
Root Mean Square Error RMSE:             2464.6901064517547
Root Mean Square Percantage Error RMSPE: 42.943099670318325


## Decision Tree Regressor

In [45]:
from sklearn.tree import DecisionTreeRegressor

# Shared preprocessing followed by a single decision tree (fixed seed).
tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=42))

# Fit on the training split, then predict the held-out split.
tree_reg.fit(X_train, y_train)
pred_tree = tree_reg.predict(X_test)

# RMSPE comes from the project helper; the argument order (predictions first)
# is kept from the original call — TODO confirm against f.rmspe's signature.
dt_rmspe = f.rmspe(pred_tree, y_test)
# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed, so
# the explicit square root keeps this cell working on old and new versions.
dt_rmse = np.sqrt(mean_squared_error(y_test, pred_tree))

print('Decision Tree Regressor performance metrics:')
print(f'Root Mean Square Error RMSE:             {dt_rmse}')
print(f'Root Mean Square Percantage Error RMSPE: {dt_rmspe}')

Decision Tree Regressor performance metrics:
Root Mean Square Error RMSE:             1424.613354902512
Root Mean Square Percantage Error RMSPE: 21.489054391482416


## Random Forest

In [52]:
from sklearn.ensemble import RandomForestRegressor

# Shared preprocessing followed by a depth-limited random forest
# (150 trees, fixed seed, all CPU cores).
pipe_rf = make_pipeline(
    preprocessing,
    RandomForestRegressor(
        max_depth=10,
        min_samples_split=2,
        random_state=42,
        n_estimators=150,
        n_jobs=-1,
    ),
)

# Fit on the training split, then predict the held-out split.
pipe_rf.fit(X_train, y_train)
pred_rf = pipe_rf.predict(X_test)

# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed, so
# the explicit square root keeps this cell working on old and new versions.
rf_rmse = np.sqrt(mean_squared_error(y_test, pred_rf))
# RMSPE comes from the project helper; the argument order (predictions first)
# is kept from the original call — TODO confirm against f.rmspe's signature.
rf_rmspe = f.rmspe(pred_rf, y_test)

print('Random Forrest Model performance metrics:')
print(f'Root Mean Square Error RMSE:             {rf_rmse}')
print(f'Root Mean Square Percantage Error RMSPE: {rf_rmspe}')

Random Forrest Model performance metrics:
Root Mean Square Error RMSE:             1741.9099230936124
Root Mean Square Percantage Error RMSPE: 27.896006017278964


In [51]:
from sklearn.ensemble import AdaBoostRegressor

# Shared preprocessing followed by AdaBoost (300 estimators, square loss,
# fixed seed).
pipe_adaboost = make_pipeline(
    preprocessing,
    AdaBoostRegressor(
        random_state=42,
        n_estimators=300,
        loss='square',
    ),
)

# Fit on the training split, then predict the held-out split.
pipe_adaboost.fit(X_train, y_train)
pred_ad = pipe_adaboost.predict(X_test)

# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed, so
# the explicit square root keeps this cell working on old and new versions.
ad_rmse = np.sqrt(mean_squared_error(y_test, pred_ad))
# RMSPE comes from the project helper; the argument order (predictions first)
# is kept from the original call — TODO confirm against f.rmspe's signature.
ad_rmspe = f.rmspe(pred_ad, y_test)

print('Ada Boost Regressor performance metrics:')
print(f'Root Mean Square Error RMSE:             {ad_rmse}')
print(f'Root Mean Square Percantage Error RMSPE: {ad_rmspe}')

Ada Boost Regressor performance metrics:
Root Mean Square Error RMSE:             6368.907935554269
Root Mean Square Percantage Error RMSPE: 137.77578010805948
