In [1]:
# Importing the libraries

import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import pickle
from mlflow import log_metric, log_param, log_artifacts

import sys
sys.path.append("..")
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Importing some scripts

from scripts.data_vizualization import Data_Viz 
from scripts.data_cleaning import DataCleaner
from scripts.data_transformation import DataTransformer

# Instantiate the project helpers used throughout the notebook.
# NOTE(review): the saved output of this cell is
#   TypeError: __init__() missing 1 required positional argument: 'filehandler'
# so at least one of these constructors appears to require a `filehandler`
# argument. TODO: confirm which class needs it and pass it here; every cell
# below depends on DV / DT being created successfully.
DC = DataCleaner()
DV = Data_Viz()
DT = DataTransformer()

TypeError: __init__() missing 1 required positional argument: 'filehandler'

In [None]:
# Read the cleaned training CSV with the 'Date' column as the index, then hand
# the frame to DV.summ_columns (project helper; presumably a per-column summary
# used to eyeball outliers - confirm against scripts/data_vizualization).

train = pd.read_csv(
    '../data/train_data_clean.csv',
    index_col='Date',
)
DV.summ_columns(train)

In [None]:
# Bucket each row into a part of the month based on DayOfMonth.
#   day < 10         -> 'Begining of Month'
#   10 <= day < 20   -> 'Mid of Month'
#   day >= 20        -> 'End of Month'
# The mid bucket uses an inclusive lower bound so that day 10 is labelled;
# with strict '> 10' and '< 10' comparisons day 10 matched no bucket and was
# left as NaN. Rows whose DayOfMonth is missing still stay NaN.

day_of_month = train['DayOfMonth']
train.loc[day_of_month >= 20, 'DayStatus'] = 'End of Month'
train.loc[(day_of_month >= 10) & (day_of_month < 20), 'DayStatus'] = 'Mid of Month'
train.loc[day_of_month < 10, 'DayStatus'] = 'Begining of Month'

In [None]:
# Sanity check: how many rows landed in each DayStatus bucket
# (value_counts skips NaN by default, so unlabelled rows are not shown here)

day_status_counts = train['DayStatus'].value_counts()
day_status_counts

In [None]:
# Flag whether a store has a competitor: rows whose CompetitionOpenSinceMonth
# holds the placeholder 'Not Available' get 0, every other row gets 1.

no_competitor = train['CompetitionOpenSinceMonth'] == 'Not Available'
train.loc[~no_competitor, 'HasCompetator'] = 1
train.loc[no_competitor, 'HasCompetator'] = 0

In [None]:
# Split the frame with DT.sep_cat_num, which returns a (categorical, numerical)
# pair, and preview the first rows of the numerical part

split_frames = DT.sep_cat_num(train)
categorical_col, numerical_col = split_frames
numerical_col.head()

In [None]:
# Heatmap of the correlation of every numerical column with 'Sales'.
# The figure is written to disk BEFORE plt.show(): once a figure has been
# shown (and, with the inline backend, closed) a later plt.savefig() operates
# on a fresh empty figure and produces a blank image.

fig, ax = plt.subplots(figsize=(4, 8))
sales_corr = numerical_col.corr().loc['Sales', :].to_frame()
sns.heatmap(sales_corr, annot=True, ax=ax)
fig.savefig('../charts/sales_vs_all_corr.jpg')
plt.show()

# Log the charts directory to MLflow only after the image file exists
log_artifacts("../charts")

#### From the correlation heatmap above we can see that the Weekday, Promo, Open, Customers and DayOfWeek columns have the strongest impact on sales, so we can restrict our analysis to these columns.

In [None]:
# Keep the columns suggested by the correlation analysis, plus store type,
# state holiday and assortment, and preview the resulting subset

useful_columns = [
    'Sales',
    'DayOfWeek',
    'Customers',
    'Open',
    'StoreType',
    'StateHoliday',
    'Assortment',
]
sampled_df = train[useful_columns]
sampled_df.head()

In [None]:
# Show the label of the column at positional index 2 (the 3rd column),
# which is expected to be the Sales column

train.columns[2]

In [None]:
# Build the preprocessing pipeline out of the DataTransformer helpers.
# Each stage wraps one DT method in a FunctionTransformer; the stages run in
# the order labeling -> scaling -> target -> split.

labeling_step = FunctionTransformer(
    DT.cat_labeler,
    kw_args={"cat_cols": categorical_col.columns.to_list()},
)
scaling_step = FunctionTransformer(DT.scaler)
# t=2 presumably is the positional index of the target column - confirm against DT.target_feature
target_step = FunctionTransformer(DT.target_feature, kw_args={"t": 2})
split_step = FunctionTransformer(
    DT.set_splitter,
    kw_args={"test": 0.1, "val": 0.2, "rand_state": 15},
)

pipe = Pipeline(steps=[
    ("labeling", labeling_step),
    ("scaling", scaling_step),
    ("target", target_step),
    ("split", split_step),
])

In [None]:
# For simplicity and speed, work on a sample of up to 300K rows of the ~1M.
# - random_state makes the draw reproducible across kernel restarts (15 is the
#   seed already used for the splitter and the regressor in this notebook).
# - the sample size is capped at len(train) because DataFrame.sample raises
#   ValueError when n exceeds the number of rows and replace=False.

SAMPLE_SIZE = 300000
sampled_train = train.sample(n=min(SAMPLE_SIZE, len(train)), random_state=15)

# Output of the final "split" stage; later cells index it as sets[0]..sets[5]
sets = pipe.fit_transform(sampled_train)



In [None]:
# Train a 200-tree random forest regressor (seeded for reproducibility) on the
# first two elements of `sets` (presumably train features / train target - confirm)

train_features = sets[0]
train_target = sets[1]
regressor = RandomForestRegressor(random_state=15, n_estimators=200)
regressor.fit(train_features, train_target)

###### A loss function is, at its core, a measure of how well a prediction model is able to predict the expected outcome (or value). We convert the learning problem into an optimization problem: we define a loss function and then optimize the algorithm to minimize it. Since our data is normalized and has no outliers, I prefer to use MSE, as it gives more emphasis to large errors, which should make the predictions more reliable.

In [None]:
# Scoring the fitted model on sets[2] / sets[3] (presumably the held-out test split - confirm).
# For a scikit-learn regressor, .score() returns the coefficient of determination (R^2), not a classification accuracy.

score = regressor.score(sets[2], sets[3])
score

In [None]:
# Plot the random forest feature importances as a horizontal bar chart and
# save it. Labels are every train column except 'Sales'.
# NOTE(review): this assumes the pipeline keeps the remaining columns in the
# same order and number as in `train` - confirm.

features = train.columns.to_list()
features.remove('Sales')

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(features, regressor.feature_importances_)
fig.savefig('../charts/rf_feature_imp.jpg')


In [None]:
# Report the feature with the highest importance. The feature name is looked
# up from the fitted model instead of being hard-coded in the message, so the
# output stays correct if a different column turns out to dominate.
# NOTE(review): relies on `features` (built in the plotting cell) being aligned
# with regressor.feature_importances_ - confirm.

importances = regressor.feature_importances_
top_idx = int(np.argmax(importances))
print(f'The maximum importance value is {features[top_idx]} with importance value of: {importances[top_idx]}')

In [None]:
# Capture the current local time and render it as YYYY-MM-DD-HH-MM-SS;
# the string is used to name the serialized model

now = datetime.now()
formated_date = f"{now:%Y-%m-%d-%H-%M-%S}"
formated_date

In [None]:
# Build the model file path: ../models/<timestamp>.pkl

file_path = f"../models/{formated_date}.pkl"
file_path

In [None]:
# Serialize the fitted regressor to file_path with pickle.
# The context manager guarantees the file handle is flushed and closed, so the
# file is complete on disk before it is read back.

with open(file_path, 'wb') as model_file:
    pickle.dump(regressor, model_file)

In [None]:
# Read the pickled model back and score it on sets[4] / sets[5]
# (presumably the validation split produced by the pipeline - confirm).
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook (or another trusted source) wrote.
# NOTE: for a regressor, .score() is the R^2 coefficient of determination,
# even though the message below calls it "accuracy".

with open(file_path, 'rb') as model_file:  # 'rb' = read binary; handle is closed on exit
    model = pickle.load(model_file)
score = model.score(sets[4], sets[5])
print(f'The accuracy of the model saved is: {score}')