## MODELING

In [8]:
# Ignore Warnings
import warnings
warnings.filterwarnings("ignore")

#for easy access to path
import sys
import os

#importing packages for data manipulation and visualization
import numpy as np
import pandas as pd

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

# Visualization Configuration
%matplotlib inline
sns.set()

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import mlflow


In [9]:
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, accuracy_score, precision_recall_curve, f1_score, mean_squared_error, r2_score, mean_absolute_error, auc

In [10]:
# Extend the module search path with the local 'data' directory (absolute)
# and put the project's scripts folder first so its modules can be imported.
sys.path.append(os.path.abspath('data'))
sys.path.insert(0, '../scripts/')

In [11]:
#import local libraries
from clean_train_test_df import CleanStoreDf
from data_preview import DataPreview
from data_loader import load_df_from_csv
from data_manipulation import DataManipulator
from plots import *
from result import ResultPicker
from data_loader import load_df_from_csv
from ml_modeling import *

##### LOAD DATA

In [12]:
# Load our clean merged data.
# The path is relative, so it is resolved against the notebook's current
# working directory.
clean_df =pd.read_csv('../data/store_train.csv')

In [6]:
# 'Date' is deliberately left in the frame at this point: it is removed further
# down in the same call that drops 'Store' and 'Customers'.
# This cell used to assign the return value of an in-place drop back to
# clean_df; DataFrame.drop(..., inplace=True) returns None, so on a fresh
# Restart & Run All the frame was replaced by None and every later cell failed.
assert 'Date' in clean_df.columns, "expected a 'Date' column in the loaded data"

In [None]:
#clean_df['Date'] = pd.to_datetime(clean_df['Date'],format = '%Y-%m-%d')

In [13]:
# Preview the first rows of the loaded frame.
clean_df.head()

Unnamed: 0.1,Unnamed: 0,Store,DayOfWeek,Date,Year,Month,Day,Sales,Customers,Open,...,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,0,1,5,2015-07-31,2015,7,31,5263,555,1,...,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
1,1,1,4,2015-07-30,2015,7,30,5020,546,1,...,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
2,2,1,3,2015-07-29,2015,7,29,4782,523,1,...,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
3,3,1,2,2015-07-28,2015,7,28,5011,560,1,...,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
4,4,1,1,2015-07-27,2015,7,27,6102,612,1,...,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0


In [14]:
# Remove the leftover 'Unnamed: 0' column that came in with the CSV.
# Mutates clean_df in place; running this cell twice raises a KeyError.
clean_df.drop(columns=['Unnamed: 0'], inplace=True)

In [15]:
# Look at the frame again after removing the 'Unnamed: 0' column.
clean_df.head()

Unnamed: 0,Store,DayOfWeek,Date,Year,Month,Day,Sales,Customers,Open,Promo,...,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-31,2015,7,31,5263,555,1,1,...,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
1,1,4,2015-07-30,2015,7,30,5020,546,1,1,...,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
2,1,3,2015-07-29,2015,7,29,4782,523,1,1,...,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
3,1,2,2015-07-28,2015,7,28,5011,560,1,1,...,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
4,1,1,2015-07-27,2015,7,27,6102,612,1,1,...,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0


In [16]:
# Drop 'Store', 'Date' and 'Customers' so they do not end up in the feature
# matrix. Mutates clean_df in place.
clean_df.drop(
    columns=['Store', 'Date', 'Customers'],
    inplace=True,
)

In [17]:
# Confirm that 'Store', 'Date' and 'Customers' are no longer in the frame.
clean_df.head()

Unnamed: 0,DayOfWeek,Year,Month,Day,Sales,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,5,2015,7,31,5263,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
1,4,2015,7,30,5020,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
2,3,2015,7,29,4782,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
3,2,2015,7,28,5011,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0
4,1,2015,7,27,6102,1,1,0,1,c,a,1270.0,9.0,2008.0,0,0.0,0.0,0


In [18]:
# List the column dtypes to see which columns are still non-numeric (object)
# and therefore need encoding before modelling.
clean_df.dtypes

DayOfWeek                      int64
Year                           int64
Month                          int64
Day                            int64
Sales                          int64
Open                           int64
Promo                          int64
StateHoliday                  object
SchoolHoliday                  int64
StoreType                     object
Assortment                    object
CompetitionDistance          float64
CompetitionOpenSinceMonth    float64
CompetitionOpenSinceYear     float64
Promo2                         int64
Promo2SinceWeek              float64
Promo2SinceYear              float64
PromoInterval                 object
dtype: object

In [19]:
# Wrap the frame in the project's DataManipulator helper; the feature
# engineering calls in the following cells go through it.
# NOTE(review): the later dtypes output shows the new columns on clean_df
# itself, so the helper appears to modify the frame it was given in place —
# confirm in scripts/data_manipulation.
manipulator = DataManipulator(clean_df)

In [20]:
# Add a 'WeekDay' column based on 'DayOfWeek' (see the helper's log line).
# The exact encoding is defined in DataManipulator.add_week_day, not here.
manipulator.add_week_day('DayOfWeek')

Data Manipulatior:INFO->Successfully Added WeekDay Column to the DataFrame


In [21]:
# Add a 'MonthTiming' column based on the day of the month ('Day').
# The bucket boundaries are defined in DataManipulator.add_month_timing.
manipulator.add_month_timing('Day')

Data Manipulatior:INFO->Successfully Added MonthTiming Column


In [22]:
# Add a 'Season' column based on 'Month'. It is label-encoded a few cells
# further down together with the other categorical columns.
manipulator.add_season('Month')

Data Manipulatior:INFO->Successfully Added Season Column


In [23]:
# Add a 'DaysToHoliday' column based on 'StateHoliday'.
# NOTE(review): 'Store' and 'Date' have already been dropped at this point, so
# the helper can only rely on row order — confirm that this gives the intended
# count when rows of several stores follow one another.
manipulator.add_number_of_days_to_holiday('StateHoliday')

Data Manipulatior:INFO->Successfully Added DaysToHoliday Column


In [24]:
# Add a 'DaysAfterHoliday' column based on 'StateHoliday'.
# NOTE(review): like the 'DaysToHoliday' step, this runs after 'Store' and
# 'Date' were dropped — confirm the helper's row-order assumption.
manipulator.add_number_of_days_after_holiday('StateHoliday')

Data Manipulatior:INFO->Successfully Added DaysAfterHoliday Column


In [25]:
# Label-encode the categorical columns so the model receives numeric input.
# The call returns a dict of column name -> LabelEncoder (shown as the cell
# output). It is not stored here; capture it if the encodings ever have to be
# inverted or re-applied to new data.
manipulator.label_columns(['Season', 'StateHoliday', 'StoreType', 'Assortment'])

{'Season': LabelEncoder(),
 'StateHoliday': LabelEncoder(),
 'StoreType': LabelEncoder(),
 'Assortment': LabelEncoder()}

In [26]:
# Inspect the PromoInterval categories before encoding them; the output shows
# a 0 entry alongside three month-list strings.
clean_df['PromoInterval'].value_counts()

0                   508031
Jan,Apr,Jul,Oct     293122
Feb,May,Aug,Nov     118596
Mar,Jun,Sept,Dec     97460
Name: PromoInterval, dtype: int64

In [27]:
# Encode 'PromoInterval' the same way as the other categorical columns.
# As before, the returned encoder dict is displayed but not kept.
manipulator.label_columns(['PromoInterval'])

{'PromoInterval': LabelEncoder()}

In [28]:
# Verify that no object-typed columns remain after encoding.
clean_df.dtypes

DayOfWeek                      int64
WeekDay                        int64
Year                           int64
Month                          int64
Season                         int32
Day                            int64
MonthTiming                    int64
Sales                          int64
Open                           int64
Promo                          int64
StateHoliday                   int32
DaysAfterHoliday               int64
DaysToHoliday                  int64
SchoolHoliday                  int64
StoreType                      int32
Assortment                     int32
CompetitionDistance          float64
CompetitionOpenSinceMonth    float64
CompetitionOpenSinceYear     float64
Promo2                         int64
Promo2SinceWeek              float64
Promo2SinceYear              float64
PromoInterval                  int32
dtype: object

In [29]:
# NOTE(review): this re-creates the DataManipulator on the same frame. None of
# the cells visible after this point use `manipulator`, so the cell may be
# redundant — confirm before removing it.
manipulator = DataManipulator(clean_df)


###### Random Forest

In [30]:
# Separate the feature columns from the 'Sales' column the model should predict.
x_values = clean_df.drop(columns=['Sales'])
y_values = clean_df['Sales']

In [31]:
# Splitting Data (60,20,20) into train / validation / test.
# Step 1: hold out 20% of all rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x_values, y_values, test_size=0.2, random_state=42
)
# Step 2: take 25% of the remaining 80% (= 20% of all rows) for validation.
# A second test_size of 0.2 would produce a 64/16/20 split instead.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=0.25, random_state=42
)
# NOTE(review): rows are assigned at random; for a sales forecast a time-based
# split may be more appropriate — confirm.

Our target variable (the quantity the model predicts) is Sales.

In [32]:
# Confirm the feature matrix holds every column except the 'Sales' target.
x_values.dtypes

DayOfWeek                      int64
WeekDay                        int64
Year                           int64
Month                          int64
Season                         int32
Day                            int64
MonthTiming                    int64
Open                           int64
Promo                          int64
StateHoliday                   int32
DaysAfterHoliday               int64
DaysToHoliday                  int64
SchoolHoliday                  int64
StoreType                      int32
Assortment                     int32
CompetitionDistance          float64
CompetitionOpenSinceMonth    float64
CompetitionOpenSinceYear     float64
Promo2                         int64
Promo2SinceWeek              float64
Promo2SinceYear              float64
PromoInterval                  int32
dtype: object

In [33]:
from ml_modeling import *

In [34]:
def calculate_metrics(y_test, y_preds, name: str = '') -> dict:
    """Compute RMSE, R^2 and MAE for a set of regression predictions.

    Args:
        y_test: Ground-truth target values.
        y_preds: Predicted values, aligned with ``y_test``.
        name: Optional prefix prepended verbatim to every metric key
            (e.g. ``'Valid '``), so results for several data splits can be
            told apart.

    Returns:
        A dict with the keys ``'{name}RMSE Score'``, ``'{name}R2_Squared'`` and
        ``'{name}MAE Score'``. If the computation fails, the error is printed
        and an empty dict is returned, so the result is always a mapping.
    """
    try:
        rmse = np.sqrt(mean_squared_error(y_test, y_preds))
        r_sq = r2_score(y_test, y_preds)
        mae = mean_absolute_error(y_test, y_preds)
    except Exception as e:
        # Best-effort: report what went wrong instead of hiding it, and hand
        # back an empty mapping rather than an implicit None, which would
        # break callers that pass the result straight to a logger.
        print(f"Model Metrics Calculation failed: {e}")
        return {}

    return {f'{name}RMSE Score': rmse, f'{name}R2_Squared': r_sq, f'{name}MAE Score': mae}


In [35]:
# Single-step pipeline: every feature column is already numeric at this point,
# so no preprocessing step is attached (the placeholder is kept for later use).
pipeline = Pipeline(steps=[
    # ('preprocessor', preprocessor),
    # Fixed seed so that repeated runs train the same forest and the metrics
    # logged for different runs are comparable.
    ('regressor', RandomForestRegressor(random_state=42))
])

In [None]:
# Fit the pipeline with the data inside an MLflow run and log the scores of
# every split under its own, clearly prefixed metric name.
mlflow.autolog(log_input_examples=True, disable_for_unsupported_versions=True, silent=True)
with mlflow.start_run() as run:
    best_model = pipeline.fit(x_train, y_train)

    # Pipeline.score delegates to the final estimator (R^2 for a regressor).
    train_score = best_model.score(x_train, y_train)
    valid_score = best_model.score(x_valid, y_valid)
    test_score = best_model.score(x_test, y_test)

    # Give each split its own key prefix. Without it both calls return the same
    # keys, and the test values are logged on top of the validation values.
    valid_metrics = calculate_metrics(y_valid, best_model.predict(x_valid), name='Valid ')
    test_metrics = calculate_metrics(y_test, best_model.predict(x_test), name='Test ')

    mlflow.log_metric("Train Score", train_score)
    mlflow.log_metric("Valid Score", valid_score)
    mlflow.log_metric("Test Score", test_score)

    # calculate_metrics reports failures itself; skip logging when it produced
    # nothing so a metrics problem does not abort the whole run.
    if valid_metrics:
        mlflow.log_metrics(valid_metrics)
    if test_metrics:
        mlflow.log_metrics(test_metrics)