<a href="https://colab.research.google.com/github/Loris99/Axsos_DS/blob/main/Project_core.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## Typical Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Modeling & preprocessing import
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import ColumnTransformer,make_column_transformer,make_column_selector
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor # NEW

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn import set_config
set_config(transform_output='pandas')

In [None]:
# Load the sales data from a Google Drive path (Colab-specific; assumes the
# drive is already mounted — adjust DATA_PATH when running elsewhere)
DATA_PATH = '/content/drive/MyDrive/CodingDojo/02-IntroML/Week05/Data/sales_predictions_2023.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [None]:
# Summarize each column's dtype and non-null count to spot missing data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [None]:
# Count fully duplicated rows (0 means every row is unique)
df.duplicated().sum()

0

In [None]:
# Count missing values per column
df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [None]:
# Count occurrences of each distinct full row.
# NOTE(review): DataFrame.value_counts() skips rows containing NaN by default,
# so rows missing Item_Weight / Outlet_Size are presumably absent here — confirm.
df.value_counts()

Item_Identifier  Item_Weight  Item_Fat_Content  Item_Visibility  Item_Type     Item_MRP  Outlet_Identifier  Outlet_Establishment_Year  Outlet_Size  Outlet_Location_Type  Outlet_Type        Item_Outlet_Sales
DRA12            11.600       LF                0.000000         Soft Drinks   141.9154  OUT035             2004                       Small        Tier 2                Supermarket Type1  992.7078             1
FDV35            19.500       Low Fat           0.128182         Breads        156.1314  OUT035             2004                       Small        Tier 2                Supermarket Type1  2792.3652            1
FDV25            5.905        low fat           0.045614         Canned        221.5456  OUT013             1987                       High         Tier 3                Supermarket Type1  6852.4136            1
                              Low Fat           0.045838         Canned        222.5456  OUT018             2009                       Medium       Tier 3   

In [None]:
# Inspect the Item_Fat_Content labels for inconsistent spellings
df['Item_Fat_Content'].value_counts()

Item_Fat_Content
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: count, dtype: int64

In [None]:

# Collapse the inconsistent spellings of Item_Fat_Content into two labels
name_map = dict([
    ('LF', 'Low Fat'),
    ('reg', 'Regular'),
    ('low fat', 'Low Fat'),
])

df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(to_replace=name_map)
# Confirm that only the standardized labels remain
df['Item_Fat_Content'].value_counts()

Item_Fat_Content
Low Fat    5517
Regular    3006
Name: count, dtype: int64

In [None]:
# Convert the 'Tier 1' / 'Tier 2' / 'Tier 3' labels to the numbers 1.0 / 2.0 / 3.0
# by stripping the 'Tier ' prefix. Casting to str first keeps the cell
# re-runnable: once the column is float64, the .str accessor would otherwise
# raise an AttributeError on a second execution.
Location_Type_values = (
    df['Outlet_Location_Type']
    .astype(str)
    .str.replace('Tier ', '', regex=False)
)
df['Outlet_Location_Type'] = Location_Type_values.astype('float64')
# Display the converted column to confirm the change
df['Outlet_Location_Type']

0       1.0
1       3.0
2       1.0
3       3.0
4       3.0
       ... 
8518    3.0
8519    2.0
8520    2.0
8521    3.0
8522    1.0
Name: Outlet_Location_Type, Length: 8523, dtype: float64

In [None]:
# Inspect how many rows fall under each outlet type
df['Outlet_Type'].value_counts()

Outlet_Type
Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2     928
Name: count, dtype: int64

# Defining X and y


In [None]:
# Separate the prediction target from the feature matrix
target = 'Item_Outlet_Sales'
# NOTE(review): 'Item_Identifier' stays in X and is later one-hot encoded into
# well over a thousand columns; consider dropping it here too — TODO confirm.
X = df.drop(columns=[target])
y = df[target]

In [None]:
# Split into train and test sets; random_state fixes the shuffle for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

# Create a ColumnTransformer to preprocess the data

## Create lists of column names for numeric and categorical columns.



In [None]:
# Preview the object-dtype (categorical) columns of the training features
X_train.select_dtypes('object')

Unnamed: 0,Item_Identifier,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Type
4776,NCG06,Low Fat,Household,OUT018,Medium,Supermarket Type2
7510,FDV57,Regular,Snack Foods,OUT018,Medium,Supermarket Type2
5828,FDM27,Regular,Meat,OUT049,Medium,Supermarket Type1
5327,FDG24,Low Fat,Baking Goods,OUT035,Small,Supermarket Type1
4810,FDD05,Low Fat,Frozen Foods,OUT045,,Supermarket Type1
...,...,...,...,...,...,...
5734,FDY08,Regular,Fruits and Vegetables,OUT010,,Grocery Store
5191,FDC41,Low Fat,Frozen Foods,OUT017,,Supermarket Type1
5390,NCQ53,Low Fat,Health and Hygiene,OUT045,,Supermarket Type1
860,FDL46,Low Fat,Snack Foods,OUT017,,Supermarket Type1


## Create a StandardScaler for scaling numeric columns.


## A) Simple imputer


In [None]:
# Count null values per column in X_train to see which features need imputing
X_train.isna().sum()

Item_Identifier                 0
Item_Weight                  1107
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1812
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [None]:
# Check X_train dtypes to tell numeric features apart from categorical ones
X_train.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type         float64
Outlet_Type                   object
dtype: object

#HEREEEEEEEEE


### Impute the numeric columns

In [None]:
# Collect the numeric column names; the bare expression displays them
numeric_cols = X_train.select_dtypes('number').columns
numeric_cols

Index(['Item_Weight', 'Item_Visibility', 'Item_MRP',
       'Outlet_Establishment_Year', 'Outlet_Location_Type'],
      dtype='object')

In [None]:
# Instantiate the median imputer and fit it on the numeric training features only
impute_median = SimpleImputer(strategy='median')
impute_median.fit(X_train[numeric_cols])

In [None]:
# Apply the fitted imputer to the numeric features of both splits and keep the
# results under new names
X_train_num_imputed = impute_median.transform(X_train[numeric_cols])
X_test_num_imputed = impute_median.transform(X_test[numeric_cols])

# Confirm no null values remain (the result is a DataFrame because
# set_config(transform_output='pandas') was called in the imports cell)
X_train_num_imputed.isna().sum()

Item_Weight                  0
Item_Visibility              0
Item_MRP                     0
Outlet_Establishment_Year    0
Outlet_Location_Type         0
dtype: int64

### Impute the categorical columns

In [None]:
# Collect the object-dtype (categorical) column names; the bare expression displays them
categorical_cols = X_train.select_dtypes('object').columns
categorical_cols

Index(['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
       'Outlet_Size', 'Outlet_Type'],
      dtype='object')

In [None]:
# Instantiate the imputer that fills missing categories with the constant
# 'MISSING', and fit it on the categorical training features only
impute_missing = SimpleImputer(strategy='constant', fill_value='MISSING')
impute_missing.fit(X_train[categorical_cols])

In [None]:
# Apply the fitted imputer to the categorical features of both splits and keep
# the results under new names
X_train_cat_imputed = impute_missing.transform(X_train[categorical_cols])
X_test_cat_imputed = impute_missing.transform(X_test[categorical_cols])

# Confirm no null values remain in the imputed training categories
X_train_cat_imputed.isna().sum()

Item_Identifier      0
Item_Fat_Content     0
Item_Type            0
Outlet_Identifier    0
Outlet_Size          0
Outlet_Type          0
dtype: int64

### Scale Numeric Features

In [None]:
# Instantiate the standard scaler and fit it on the imputed numeric training data only
scaler = StandardScaler()
scaler.fit(X_train_num_imputed)

In [None]:
# Apply the fitted scaler to both splits, then preview the scaled training data
X_train_num_scaled = scaler.transform(X_train_num_imputed)
X_test_num_scaled = scaler.transform(X_test_num_imputed)
X_train_num_scaled.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Location_Type
4776,0.827485,-0.712775,1.828109,1.327849,1.084948
7510,0.566644,-1.291052,0.603369,1.327849,1.084948
5828,-0.121028,1.813319,0.244541,0.136187,-1.384777
5327,-1.158464,-1.004931,-0.952591,0.732018,-0.149914
4810,1.53887,-0.965484,-0.33646,0.493686,-0.149914


In [None]:
# Summary statistics of the numeric training features before scaling
X_train_num_imputed.describe().round(2)

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Location_Type
count,6392.0,6392.0,6392.0,6392.0,6392.0
mean,12.86,0.07,141.98,1997.86,2.12
std,4.22,0.05,62.63,8.39,0.81
min,4.56,0.0,31.29,1985.0,1.0
25%,9.5,0.03,94.15,1987.0,1.0
50%,12.65,0.05,144.11,1999.0,2.0
75%,16.1,0.09,186.9,2004.0,3.0
max,21.35,0.33,266.89,2009.0,3.0


In [None]:
# Summary statistics after scaling: each mean should be ~0 and each std ~1
X_train_num_scaled.describe().round(2)

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Location_Type
count,6392.0,6392.0,6392.0,6392.0,6392.0
mean,-0.0,-0.0,0.0,-0.0,0.0
std,1.0,1.0,1.0,1.0,1.0
min,-1.97,-1.29,-1.77,-1.53,-1.38
25%,-0.8,-0.76,-0.76,-1.29,-1.38
50%,-0.05,-0.23,0.03,0.14,-0.15
75%,0.77,0.56,0.72,0.73,1.08
max,2.01,5.13,1.99,1.33,1.08


### Encode the Categorical Data with OneHotEncoder

In [None]:
# NOTE(review): redundant — both encoders are already imported in the first cell
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [None]:
# Instantiate the one-hot encoder and fit it on the imputed training categories.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros;
# sparse_output=False returns a dense result instead of a sparse matrix.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(X_train_cat_imputed)

In [None]:
# One-hot encode the categorical features of both splits, then preview the
# encoded training data
X_train_cat_encoded = encoder.transform(X_train_cat_imputed)
X_test_cat_encoded = encoder.transform(X_test_cat_imputed)
X_train_cat_encoded.head()

Unnamed: 0,Item_Identifier_DRA12,Item_Identifier_DRA24,Item_Identifier_DRA59,Item_Identifier_DRB01,Item_Identifier_DRB13,Item_Identifier_DRB24,Item_Identifier_DRB25,Item_Identifier_DRB48,Item_Identifier_DRC01,Item_Identifier_DRC12,...,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049,Outlet_Size_High,Outlet_Size_MISSING,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
4776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
7510,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
5327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4810,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
# Compare the column counts before and after one-hot encoding
print("X_train_cat_imputed: ", X_train_cat_imputed.shape)
print("X_train_cat_encoded: ", X_train_cat_encoded.shape)

X_train_cat_imputed:  (6392, 6)
X_train_cat_encoded:  (6392, 1586)


# Prepare Categorical pipeline


In [None]:
# Prepare Categorical pipeline
# Object-dtype columns are the ones routed through this pipeline
cat_cols = X_train.select_dtypes('object').columns

# NOTE: rebinds impute_missing to a new, unfitted imputer for use in the pipeline
impute_missing = SimpleImputer(strategy='constant',fill_value='MISSING')
ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Chain the steps: fill missing categories first, then one-hot encode
ohe_pipe = make_pipeline(impute_missing, ohe_encoder)
ohe_pipe

# Define Categorical Tuple for ColumnTransformer


In [None]:
# Build the (name, transformer, columns) tuple that ColumnTransformer expects
# for the categorical features
ohe_tuple = ('categorical', ohe_pipe, cat_cols)
ohe_tuple


('categorical',
 Pipeline(steps=[('simpleimputer',
                  SimpleImputer(fill_value='MISSING', strategy='constant')),
                 ('onehotencoder',
                  OneHotEncoder(handle_unknown='ignore', sparse_output=False))]),
 Index(['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
        'Outlet_Size', 'Outlet_Type'],
       dtype='object'))

# Prepare numeric pipeline


In [None]:
# Prepare numeric pipeline
num_cols = X_train.select_dtypes('number').columns

# NOTE(review): this pipeline imputes with the mean, whereas the manual
# walkthrough earlier in the notebook used the median — confirm which is intended
mean_imputer = SimpleImputer(strategy="mean")
# NOTE: rebinds scaler to a new, unfitted StandardScaler for use in the pipeline
scaler = StandardScaler()

# Numeric pipeline: impute first, then scale
numeric_pipe = make_pipeline(mean_imputer, scaler)
numeric_pipe

# Define Numeric Tuple for ColumnTransformer


In [None]:
# Build the (name, transformer, columns) tuple that ColumnTransformer expects
# for the numeric features
num_tuple = ('numeric', numeric_pipe, num_cols)
num_tuple

('numeric',
 Pipeline(steps=[('simpleimputer', SimpleImputer()),
                 ('standardscaler', StandardScaler())]),
 Index(['Item_Weight', 'Item_Visibility', 'Item_MRP',
        'Outlet_Establishment_Year', 'Outlet_Location_Type'],
       dtype='object'))

## Instantiate the ColumnTransformer

In [None]:
# Combine the numeric and categorical pipelines into one ColumnTransformer.
# verbose_feature_names_out=False keeps the output column names free of the
# 'numeric__' / 'categorical__' transformer-name prefixes.
preprocessor = ColumnTransformer([num_tuple, ohe_tuple],
                                    verbose_feature_names_out=False)
preprocessor

In [None]:
# Fit the preprocessor on the training data only
preprocessor.fit(X_train)

In [None]:
# Transform the training data with the fitted preprocessor
X_train_tf  = preprocessor.transform(X_train)
# Transform the testing data with the same fitted preprocessor
X_test_tf  = preprocessor.transform(X_test)
# View the first rows of the processed training data
X_train_tf .head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Location_Type,Item_Identifier_DRA12,Item_Identifier_DRA24,Item_Identifier_DRA59,Item_Identifier_DRB01,Item_Identifier_DRB13,...,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049,Outlet_Size_High,Outlet_Size_MISSING,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
4776,0.817249,-0.712775,1.828109,1.327849,1.084948,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
7510,0.55634,-1.291052,0.603369,1.327849,1.084948,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5828,-0.131512,1.813319,0.244541,0.136187,-1.384777,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
5327,-1.169219,-1.004931,-0.952591,0.732018,-0.149914,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4810,1.528819,-0.965484,-0.33646,0.493686,-0.149914,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
# Confirm the size and dtypes of the processed training data
X_train_tf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6392 entries, 4776 to 7270
Columns: 1591 entries, Item_Weight to Outlet_Type_Supermarket Type3
dtypes: float64(1591)
memory usage: 77.6 MB


# Build a linear regression model.


In [None]:
# Define the custom functions for regression evaluation
def regression_metrics(y_true, y_pred, label='', verbose = True, output_dict=False):
  """Compute MAE, MSE, RMSE and R^2 for one set of predictions.

  Args:
    y_true: True target values.
    y_pred: Predicted target values, aligned with y_true.
    label: Text shown in the printed header and stored under 'Label'.
    verbose: When truthy, print the four metrics under a header.
    output_dict: When truthy, return the metrics as a dictionary.

  Returns:
    A dict with keys 'Label', 'MAE', 'MSE', 'RMSE' and 'R^2' when
    output_dict is truthy; otherwise None.
  """
  # Get metrics
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  # Take the root of the MSE computed above instead of passing squared=False:
  # that keyword is deprecated in recent scikit-learn releases and later removed.
  # For a single target column the result is the same.
  rmse = np.sqrt(mse)
  r_squared = r2_score(y_true, y_pred)
  if verbose:
    # Print Result with Label and Header
    header = "-"*60
    print(header, f"Regression Metrics: {label}", header, sep='\n')
    print(f"- MAE = {mae:,.3f}")
    print(f"- MSE = {mse:,.3f}")
    print(f"- RMSE = {rmse:,.3f}")
    print(f"- R^2 = {r_squared:,.3f}")
  if output_dict:
    return {'Label':label, 'MAE':mae,
            'MSE':mse, 'RMSE':rmse, 'R^2':r_squared}
  return None

def evaluate_regression(reg, X_train, y_train, X_test, y_test, verbose = True,
                        output_frame=False):
  """Evaluate a fitted regressor on the training and test splits.

  Args:
    reg: Fitted model (or pipeline) exposing a .predict method.
    X_train, y_train: Training features and targets.
    X_test, y_test: Test features and targets.
    verbose: When truthy, print the metrics for each split.
    output_frame: When truthy, return the metrics as a DataFrame.

  Returns:
    A DataFrame indexed by 'Training Data' / 'Test Data', rounded to three
    decimals, when output_frame is truthy; otherwise None.
  """
  # Get predictions for training data and score them
  y_train_pred = reg.predict(X_train)
  results_train = regression_metrics(y_train, y_train_pred, verbose = verbose,
                                     output_dict=output_frame,
                                     label='Training Data')
  # Blank separator line between the two printed reports; skipped when quiet
  # so that verbose=False produces no output at all
  if verbose:
    print()
  # Get predictions for test data and score them
  y_test_pred = reg.predict(X_test)
  results_test = regression_metrics(y_test, y_test_pred, verbose = verbose,
                                    output_dict=output_frame,
                                    label='Test Data')

  # Store results in a dataframe if output_frame is set
  if output_frame:
    results_df = pd.DataFrame([results_train, results_test])
    # Use the label as the index
    results_df = results_df.set_index('Label')
    # Clear index.name for a cleaner looking result
    results_df.index.name = None
    return results_df.round(3)
  return None

In [None]:
## Make and fit model: the preprocessor and LinearRegression chained in one
## pipeline, fitted on the training data
linreg_pipe = make_pipeline(preprocessor,LinearRegression())
linreg_pipe.fit(X_train, y_train)

In [None]:
# Call the custom function to print evaluation metrics for the train and test splits
evaluate_regression (linreg_pipe, X_train, y_train, X_test, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 739.963
- MSE = 977,329.602
- RMSE = 988.600
- R^2 = 0.670

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 701,867,001,711.983
- MSE = 122,179,197,776,264,841,951,969,280.000
- RMSE = 11,053,469,942,794.654
- R^2 = -44,284,234,711,259,791,360.000


# Calculate R2


In [None]:
# Get predictions for both splits to use when evaluating the model.
# NOTE(review): y_hat_train / y_hat_test are not used in the cells shown here;
# the R^2 values come from evaluate_regression above — confirm whether an
# explicit r2_score call was intended.
y_hat_train = linreg_pipe.predict(X_train)
y_hat_test = linreg_pipe.predict(X_test)

Compare the training vs. test R-squared values and answer the question: to what extent is this model overfit/underfit?

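-> The model is severely overfit: the training R² is 0.670, but the test R² is hugely negative (about −4.4 × 10¹⁹), so it does not generalize to unseen data.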


# Your second task is to build a Random Forest model to predict sales.



In [None]:
# Instantiate a random forest with default hyperparameters; random_state makes
# the result reproducible
rf = RandomForestRegressor(random_state = 42)
# Model pipeline: preprocessing followed by the random forest
rf_pipe = make_pipeline(preprocessor, rf)

In [None]:
# Fit the model pipeline on the training data only
rf_pipe.fit(X_train, y_train)

In [None]:
# Use the custom function to print metrics for the default random forest pipeline
evaluate_regression(rf_pipe, X_train, y_train, X_test, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 297.517
- MSE = 184,658.585
- RMSE = 429.719
- R^2 = 0.938

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 773.321
- MSE = 1,243,035.006
- RMSE = 1,114.915
- R^2 = 0.549


Compare the training vs. test R-squared values and answer the question: to what extent is this model overfit/underfit?

-> The model is overfit: the training R² (0.938) is much higher than the test R² (0.549). However, it has far better test scores than the linear regression model.


# Use GridSearchCV to tune at least two hyperparameters for a Random Forest model.



In [None]:
# List the pipeline's tunable parameters; nested ones are addressed as
# '<step name>__<parameter>', which is the form the param grid must use
rf_pipe.get_params()


{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('numeric',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    Index(['Item_Weight', 'Item_Visibility', 'Item_MRP',
          'Outlet_Establishment_Year', 'Outlet_Location_Type'],
         dtype='object')),
                                   ('categorical',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(fill_value='MISSING',
                                                                   strategy='constant')),
                                                    ('onehotencoder',
                                                     OneHotEncoder(

In [None]:
# Search space for the random forest step of the pipeline. Each key is the
# hyperparameter name prefixed with the pipeline step name and '__'.
params = {
    f'randomforestregressor__{hyperparameter}': candidates
    for hyperparameter, candidates in [
        ('max_depth', [None, 10, 15, 20]),
        ('n_estimators', [10, 100, 150, 200]),
        ('min_samples_leaf', [2, 3, 4]),
        ('max_features', ['sqrt', 'log2', None]),
    ]
}

In [None]:
# Instantiate the grid search: 5-fold cross-validation over every combination
# in params, using all available cores (n_jobs=-1) and printing progress
gridsearch = GridSearchCV(rf_pipe, params, n_jobs=-1, cv = 5, verbose=1)
# Fit the gridsearch on training data only
gridsearch.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


In [None]:
# Show the hyperparameter combination with the best cross-validated score
gridsearch.best_params_

In [None]:
# Retrieve the best pipeline found by the search (GridSearchCV already refits
# it on the whole training set by default) and evaluate it on both splits
best_rf = gridsearch.best_estimator_
evaluate_regression(best_rf, X_train, y_train, X_test, y_test)