<a href="https://colab.research.google.com/github/JDevine1981/Prediction-of-Product-Sales/blob/main/Project1Part6Core.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [49]:
# Import standard packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Import modeling tools
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
# set the default output to pandas
from sklearn import set_config
set_config(transform_output='pandas')

## LOAD THE DATA

In [12]:
# Location of the cleaned sales CSV. This absolute path presumably resolves only
# when Google Drive is mounted at /content/drive in Colab — TODO confirm / add mount cell.
fpath = '/content/drive/MyDrive/CodingDojo/02-IntroML/Week05/Data/sales-predictioncore+++-eda.csv'
df = pd.read_csv(fpath)
# Print dtypes and non-null counts, then preview the first rows
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          8523 non-null   int64  
 1   Weight              7060 non-null   float64
 2   Fat_Content         8523 non-null   object 
 3   Visibility          7997 non-null   float64
 4   Type                8523 non-null   object 
 5   MRP                 8523 non-null   float64
 6   Outlet_Identifier   8523 non-null   object 
 7   Establishment_Year  8523 non-null   int64  
 8   Outlet_Size         6113 non-null   object 
 9   Location_Type       8523 non-null   object 
 10  Outlet_Type         8523 non-null   object 
 11  Item_Outlet_Sales   8523 non-null   float64
dtypes: float64(4), int64(2), object(6)
memory usage: 799.2+ KB


Unnamed: 0.1,Unnamed: 0,Weight,Fat_Content,Visibility,Type,MRP,Outlet_Identifier,Establishment_Year,Outlet_Size,Location_Type,Outlet_Type,Item_Outlet_Sales
0,0,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,1,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,2,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,3,19.2,Regular,,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,4,8.93,Low Fat,,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [13]:
# Drop column 'Unnamed: 0'
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [15]:
# Confirm changes: 'Unnamed: 0' should no longer appear in the column list
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Weight              7060 non-null   float64
 1   Fat_Content         8523 non-null   object 
 2   Visibility          7997 non-null   float64
 3   Type                8523 non-null   object 
 4   MRP                 8523 non-null   float64
 5   Outlet_Identifier   8523 non-null   object 
 6   Establishment_Year  8523 non-null   int64  
 7   Outlet_Size         6113 non-null   object 
 8   Location_Type       8523 non-null   object 
 9   Outlet_Type         8523 non-null   object 
 10  Item_Outlet_Sales   8523 non-null   float64
dtypes: float64(4), int64(1), object(6)
memory usage: 732.6+ KB


## ASSIGN TARGET AND FEATURE

In [16]:
# Name the target column once, then split the frame into features and target
target = 'Item_Outlet_Sales'
X = df.drop(columns=target)
y = df[target]

## TRAIN/TEST/SPLIT

In [18]:
# Hold out a test set using train_test_split's default split size;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Preview the training features
X_train.head()

Unnamed: 0,Weight,Fat_Content,Visibility,Type,MRP,Outlet_Identifier,Establishment_Year,Outlet_Size,Location_Type,Outlet_Type
4776,16.35,Low Fat,0.029565,Household,256.4646,OUT018,2009,Medium,Tier 3,Supermarket Type2
7510,15.25,Regular,,Snack Foods,179.766,OUT018,2009,Medium,Tier 3,Supermarket Type2
5828,12.35,Regular,0.158716,Meat,157.2946,OUT049,1999,Medium,Tier 1,Supermarket Type1
5327,7.975,Low Fat,0.014628,Baking Goods,82.325,OUT035,2004,Small,Tier 2,Supermarket Type1
4810,19.35,Low Fat,0.016645,Frozen Foods,120.9098,OUT045,2002,,Tier 2,Supermarket Type1


In [19]:
# Check dtypes and missing-value counts in the training features
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6392 entries, 4776 to 7270
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Weight              5285 non-null   float64
 1   Fat_Content         6392 non-null   object 
 2   Visibility          5992 non-null   float64
 3   Type                6392 non-null   object 
 4   MRP                 6392 non-null   float64
 5   Outlet_Identifier   6392 non-null   object 
 6   Establishment_Year  6392 non-null   int64  
 7   Outlet_Size         4580 non-null   object 
 8   Location_Type       6392 non-null   object 
 9   Outlet_Type         6392 non-null   object 
dtypes: float64(3), int64(1), object(6)
memory usage: 549.3+ KB


## PREPROCESSING

FEATURES BY TYPE:

  - Numeric
     
     - Weight
     - MRP
     - Establishment_Year
     - Visibility

  - Nominal(Categorical)

     - Fat_Content
     - Type
     - Outlet_Identifier
     - Outlet_Type

  - Ordinal

     - Outlet_Size
     - Location_Type

ISOLATE FEATURES

PREPROCESSING PIPELINES

In [24]:
# Save list of ordinal columns
ord_cols = ['Outlet_Size', 'Location_Type']
# Ordinal category lists, each written from lowest to highest level
outlet_size_list = ['Small', 'Medium', 'High']
location_type_list = ['Tier 1', 'Tier 2', 'Tier 3']
# Transformers
# The encoder is named ord_encoder (not `ord`) so Python's built-in ord() is not
# shadowed. The category lists are passed in the same order as ord_cols.
ord_encoder = OrdinalEncoder(categories=[outlet_size_list, location_type_list])
# Fill missing ordinal values with the most frequent category before encoding
freq_imputer = SimpleImputer(strategy='most_frequent')
# Pipeline: impute first, then encode
ord_pipeline = make_pipeline(freq_imputer, ord_encoder)
# Tuple for the ColumnTransformer: (name, transformer, columns)
ord_tuple = ('ordinal', ord_pipeline, ord_cols)

In [25]:
# Save a list of nominal columns: every object-dtype column that is not ordinal
cat_cols = X_train.select_dtypes('object').drop(columns=ord_cols).columns
# Transformers
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 (old keyword removed
# in 1.4). Dense output is required because transform_output is set to 'pandas'.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Pipeline
cat_pipeline = make_pipeline(ohe)
# Tuple for the ColumnTransformer: (name, transformer, columns)
cat_tuple = ('nominal', cat_pipeline, cat_cols)

In [26]:
# Save list of numeric columns
num_cols = X_train.select_dtypes('number').columns
# Confirm results (NOTE: not rendered here — only a cell's last expression is displayed)
num_cols
# Transformers: fill missing values with the column mean, then standardize
mean_imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
# Pipeline: impute before scaling
num_pipeline = make_pipeline(mean_imputer, scaler)
# Tuple for the ColumnTransformer: (name, transformer, columns)
num_tuple = ('numeric', num_pipeline, num_cols)

In [28]:
# Instantiate the preprocessor/ColumnTransformer from the three
# (name, pipeline, columns) tuples. verbose_feature_names_out=False keeps the
# output column names free of the transformer-name prefix.
preprocessor = ColumnTransformer([num_tuple, cat_tuple, ord_tuple],
                                 verbose_feature_names_out=False)
preprocessor

In [31]:
# import packages
from sklearn.linear_model import LinearRegression

In [32]:
# Instantiate a linear regression model
linreg = LinearRegression()
# Combine the preprocessing ColumnTransformer and the linear regression model in a Pipeline
# so that preprocessing is fit only on whatever data the pipeline is fit on
linreg_pipe = make_pipeline(preprocessor, linreg)
linreg_pipe

In [34]:
# Fit the model pipeline (preprocessing + regression) on the training data only
linreg_pipe.fit(X_train, y_train)



## EVALUATE THE MODEL

In [38]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [39]:
def regression_metrics(y_true, y_pred, label='', verbose = True, output_dict=False):
  """Compute MAE, MSE, RMSE and R^2 for one set of predictions.

  Args:
    y_true: observed target values.
    y_pred: predicted target values, aligned with y_true.
    label: name of the data split; shown in the printed header and stored
      under the 'Label' key of the returned dict.
    verbose: when truthy, print a formatted report of the four metrics.
    output_dict: when truthy, return the metrics as a dict.

  Returns:
    A dict with keys 'Label', 'MAE', 'MSE', 'RMSE' and 'R^2' when output_dict
    is truthy; otherwise None.
  """
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  # Derive RMSE from the MSE already computed: the `squared=False` keyword of
  # mean_squared_error is deprecated (scikit-learn 1.4) and later removed.
  rmse = np.sqrt(mse)
  r_squared = r2_score(y_true, y_pred)

  if verbose:
    header = "-"*60
    print(header, f"Regression Metrics: {label}", header, sep='\n')
    print(f"- MAE = {mae:,.3f}")
    print(f"- MSE = {mse:,.3f}")
    print(f"- RMSE = {rmse:,.3f}")
    print(f"- R^2 = {r_squared:,.3f}")

  if output_dict:
    return {'Label':label, 'MAE':mae,
            'MSE':mse, 'RMSE':rmse, 'R^2':r_squared}

In [40]:
def evaluate_regression(reg, X_train, y_train, X_test, y_test, verbose = True,
                        output_frame=False):
  """Report regression metrics for a fitted model on the training and test splits.

  Args:
    reg: fitted estimator or pipeline exposing .predict().
    X_train, y_train: training features and target.
    X_test, y_test: test features and target.
    verbose: when truthy, print the metric report for each split.
    output_frame: when truthy, return the metrics as a DataFrame.

  Returns:
    A DataFrame indexed by 'Training Data' / 'Test Data' with the metrics
    rounded to 3 decimals when output_frame is truthy; otherwise None.
  """
  # Training split
  y_train_pred = reg.predict(X_train)
  results_train = regression_metrics(y_train, y_train_pred, verbose = verbose,
                                     output_dict=output_frame,
                                     label='Training Data')
  # The blank separator line belongs to the printed report, so emit it only
  # when the report itself is printed.
  if verbose:
    print()
  # Test split
  y_test_pred = reg.predict(X_test)
  results_test = regression_metrics(y_test, y_test_pred, verbose = verbose,
                                    output_dict=output_frame,
                                    label='Test Data')

  if output_frame:
    results_df = pd.DataFrame([results_train, results_test])
    results_df = results_df.set_index('Label')
    results_df.index.name = None
    return results_df.round(3)

## LINEAR REGRESSION

In [41]:
# Obtain model evaluation (training and test metrics) using the custom function
evaluate_regression(linreg_pipe, X_train, y_train, X_test, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 847.268
- MSE = 1,297,802.367
- RMSE = 1,139.211
- R^2 = 0.561

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 803.751
- MSE = 1,193,975.091
- RMSE = 1,092.692
- R^2 = 0.567


### EVALUATION OF LINEAR REGRESSION RESULTS

- The R^2 scores of .561 on the training data
  and .567 on the testing data indicate that the model underfits the data and has high bias. This may be due to the model being too simple, a lack of data, and/or features that do not correlate with the target.

- The Mean Absolute Error is $803.75

- Mean Squared Error is 1,193,975.09 (in squared dollars, so not directly comparable to the dollar-valued MAE and RMSE)

- Root Mean Squared Error is $1,092.69

## RANDOM FOREST

In [42]:
# Import Packages
from sklearn.ensemble import RandomForestRegressor

In [43]:
# Instantiate default random forest model (random_state fixed for reproducible results)
rf = RandomForestRegressor(random_state = 42)
# Model Pipeline: same preprocessor as the linear model, followed by the forest
rf_pipe = make_pipeline(preprocessor, rf)

In [44]:
# Fit the model pipeline (preprocessing + random forest) on the training data only
rf_pipe.fit(X_train, y_train)



In [45]:
# Use custom function to evaluate the default (untuned) random forest on both splits
evaluate_regression(rf_pipe, X_train, y_train, X_test, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 296.986
- MSE = 184,958.942
- RMSE = 430.069
- R^2 = 0.938

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 770.075
- MSE = 1,227,435.249
- RMSE = 1,107.897
- R^2 = 0.555


## EVALUATION

- The model performed well on the training
  data, but poorly on the testing data, indicating high variance and overfitting.

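- Compared to the default Random Forest, the
  linear regression model has a better test R^2 (.567 vs .555) and RMSE ($1,092.69 vs $1,107.90), although the random forest has the lower test MAE ($770.08 vs $803.75).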

## TUNING RANDOM FOREST MODEL

In [46]:
# Parameters available for tuning: the keys of this dict are the names
# accepted in a GridSearchCV parameter grid for this pipeline
rf_pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('numeric',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    Index(['Weight', 'Visibility', 'MRP', 'Establishment_Year'], dtype='object')),
                                   ('nominal',
                                    Pipeline(steps=[('onehotencoder',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   sparse=False))]),
                                    Index(['Fat_Content', 'Type', 'Outlet_Identifier', 'Outlet_Type'], dtype='object')),
                                   ('ordinal',
                                    Pipe

In [47]:
# Define param grid with options to try.
# Every key is prefixed with the pipeline step name plus a double underscore so
# the values are routed to the RandomForestRegressor step.
rf_step_prefix = 'randomforestregressor__'
rf_options = {
    'max_depth': [None,10,15,20],
    'n_estimators': [10,100,150,200],
    'min_samples_leaf': [2,3,4],
    'max_features': ['sqrt','log2',None],
    'oob_score': [True,False],
}
params = {rf_step_prefix + name: values for name, values in rf_options.items()}

In [53]:
# Instantiate the gridsearch: 3-fold cross-validation over every combination in
# `params`, run on all available cores (n_jobs=-1), with progress messages
gridsearch = GridSearchCV(rf_pipe, params, n_jobs=-1, cv = 3, verbose=1)
# Fit the gridsearch on training data
gridsearch.fit(X_train, y_train)

Fitting 3 folds for each of 288 candidates, totalling 864 fits




In [54]:
# Obtain the parameter combination with the best mean cross-validated score
gridsearch.best_params_

{'randomforestregressor__max_depth': 10,
 'randomforestregressor__max_features': None,
 'randomforestregressor__min_samples_leaf': 2,
 'randomforestregressor__n_estimators': 100,
 'randomforestregressor__oob_score': True}

In [55]:
# Retrieve the best pipeline found by the search and evaluate it on both splits.
# GridSearchCV refits the best estimator on the full training set by default
# (refit=True), so no manual refit is needed here.
best_rf = gridsearch.best_estimator_
evaluate_regression(best_rf, X_train, y_train, X_test, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 648.020
- MSE = 847,575.968
- RMSE = 920.639
- R^2 = 0.714

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 737.381
- MSE = 1,125,804.833
- RMSE = 1,061.040
- R^2 = 0.592


## EVALUATION OF TUNED MODEL

 - Using GridSearch, our testing R^2 improved
   from .555 to .592.

## OVERALL EVALUATION/RECOMMENDATION

- The tuned model produces the best goodness
  of fit, with a testing R^2 score of 0.592.

- While some bias still exists in this model,
  its mean absolute error on the test data is $737.38, compared to $770.08 for the untuned model and $803.75 for the linear regression.