In [1]:
# ignoring warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# importing required libraries
import pandas as pd
from sklearn.compose import ColumnTransformer 
from sklearn.impute import SimpleImputer
# import category_encoders as ce
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

In [11]:
# read the training data set
data = pd.read_csv('../data/cleaned_data.csv')
data = data[:100000]
# top rows of the data
data.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,year,month,day,weekday,weekend,month_period
0,1,5,2015-07-31,5263,555,1,1,0,1,2015,Jul,Friday,1,0,Ending
1,1,4,2015-07-30,5020,546,1,1,0,1,2015,Jul,Thursday,1,0,Ending
2,1,3,2015-07-29,4782,523,1,1,0,1,2015,Jul,Wednesday,1,0,Ending
3,1,2,2015-07-28,5011,560,1,1,0,1,2015,Jul,Tuesday,1,0,Ending
4,1,1,2015-07-27,6102,612,1,1,0,1,2015,Jul,Monday,1,0,Ending


In [12]:
# seperate the independent and target variables
train_x = data.drop(columns=['Sales'])
train_y = data['Sales']

In [14]:
# import the BaseEstimator
from sklearn.base import BaseEstimator

# define the class OutletTypeEncoder
# This will be our custom transformer that will create 3 new binary columns
# custom transformer must have methods fit and transform
class OutletTypeEncoder(BaseEstimator):

    def __init__(self):
        pass

    def fit(self, documents, y=None):
        return self

    def transform(self, x_dataset):
        x_dataset['month'] = (x_dataset['month'] == 'Jan')*1
        x_dataset['month'] = (x_dataset['month'] == 'Feb')*1
        x_dataset['month'] = (x_dataset['month'] == 'Mar')*1
        x_dataset['month'] = (x_dataset['month'] == 'Apr')*1
        x_dataset['month'] = (x_dataset['month'] == 'May')*1
        x_dataset['month'] = (x_dataset['month'] == 'Jun')*1
        x_dataset['month'] = (x_dataset['month'] == 'Jul')*1        
        x_dataset['month'] = (x_dataset['month'] == 'Aug')*1        
        x_dataset['month'] = (x_dataset['month'] == 'Sep')*1        
        x_dataset['month'] = (x_dataset['month'] == 'Oct')*1        
        x_dataset['month'] = (x_dataset['month'] == 'Nov')*1        
        x_dataset['month'] = (x_dataset['month'] == 'Dec')*1 
        
        return x_dataset

In [15]:
# pre-processsing step
# Drop the columns - 
# Impute the missing values in column Item_Weight by mean
# Scale the data in the column Item_MRP
pre_process = ColumnTransformer(remainder='passthrough',
                                transformers=[('drop_columns', 'drop', ['Date',
                                                                        'Open',
                                                                        'StateHoliday',
                                                                        'SchoolHoliday',
                                                                        'month',
                                                                        'day',
                                                                        'weekday',
                                                                        'weekend',
                                                                        'month_period']),
                                              ('scale_data', StandardScaler(),['Store', 'DayOfWeek', 'Customers',
                                                                               'Promo', 'year'])])

In [16]:
# Define the Pipeline
"""
Step1: get the oultet binary columns
Step2: pre processing
Step3: Train a Random Forest Model
"""
model_pipeline = Pipeline(steps=[('get_outlet_binary_columns', OutletTypeEncoder()),
                                 ('pre_processing',pre_process),
                                 ('random_forest', RandomForestRegressor(max_depth=10,random_state=2))
                                 ])
# fit the pipeline with the training data
model_pipeline.fit(train_x,train_y)

# predict target values on the training data
model_pipeline.predict(train_x)

array([4811.11019795, 4764.03151924, 4633.88828003, ..., 3554.1609131 ,
       4876.33550308, 5993.7206681 ])

In [17]:
# checking accuracy
model_pipeline.score(train_x, train_y)

0.9409894817267572