<a href="https://colab.research.google.com/github/MayBornWitIt/sales-predictions/blob/main/Project_1_Part_5_Final(Core).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Project 1 - Part 5**

## Import Libraries

In [1]:
## Pandas
import pandas as pd
## Numpy
import numpy as np
## MatPlotLib
import matplotlib.pyplot as plt

## Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

## Models
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeClassifier

## Regression Metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


## Set global scikit-learn configuration 
from sklearn import set_config
from sklearn.datasets import load_iris

## Display estimators as a diagram
set_config(display='diagram') # 'text' or 'diagram'}

from IPython.core.display import clear_output

## Functions

In [2]:
## Create a function to take the true and predicted values
## and print MAE, MSE, RMSE, and R2 metrics for a model
def eval_regression(y_true, y_pred, name='model'):
  """Takes true targets and predictions from a regression model and prints
  MAE, MSE, RMSE, AND R2 scores
  Set 'name' to name of model and 'train' or 'test' as appropriate"""
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  rmse = np.sqrt(mse)
  r2 = r2_score(y_true, y_pred)

  print(f'{name} Scores')
  print(f'MAE: {mae:,.4f} \nMSE: {mse:,.4f} \nRMSE: {rmse:,.4f} \nR2: {r2:.4f}\n')

In [3]:
## Create a function to take the true and predicted values
## and print MAE, MSE, RMSE, and R2 metrics
def model_metrics(pipe, X_train, y_train, X_test, y_test, 
                       model_name='Regression Model'):
  
  """Takes a regression model, training features and target and testing 
  features and target.
  Returns a dataframe of regression metrics"""

  ## Predictions
  train_pred = pipe.predict(X_train)
  test_preds = pipe.predict(X_test)
  
  metrics = ['train_MAE','test_MAE', 
             'train_RMSE', 'test_RMSE', 
             'train_R2', 'test_R2']
  
  ## Train
  train_mae = round(mean_absolute_error(y_train, train_pred),4)
  train_rmse = round(np.sqrt(mean_squared_error(y_train, train_pred)),4)
  train_r2 = round(r2_score(y_train, train_pred),4)

  ## Test
  test_mae = round(mean_absolute_error(y_test, test_preds),4)
  test_rmse = round(np.sqrt(mean_squared_error(y_test, test_preds)),4)
  test_r2 = round(r2_score(y_test, test_preds),4)

  ## Output Dataframe
  scores = pd.DataFrame(columns=metrics, index=[f'{model_name}'])
  scores.loc[f'{model_name}', :] = [train_mae, test_mae,
                                          train_rmse, test_rmse,
                                          train_r2, test_r2]

  return scores

## Load and Inspect the Data

### Load the Data

In [4]:
# Load the Data
df = pd.read_csv('/content/drive/MyDrive/sales_predictions.csv')

### Inspect the Data

In [5]:
# Display the first (5) rows of the dataframe
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [6]:
## Display the number of rows and columns for the dataframe
df.shape
print(f'There are {df.shape[0]} rows, and {df.shape[1]} columns.')
print(f'The rows represent {df.shape[0]} observations, and the columns represent {df.shape[1]-1} features and 1 target variable.')

There are 8523 rows, and 12 columns.
The rows represent 8523 observations, and the columns represent 11 features and 1 target variable.


In [7]:
## Display the column names, count of non-null values, and their datatypes
## Columns with mixed datatypes are identified as an object datatype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


- We will need to impute any missing values from columns "Item_Weight" and "Outlet_Size"
  - We will need to use SimpleImputer in our preprocessing steps.

In [8]:
## Display the descriptive statistics for the numeric columns
df.describe(include="number") ## or 'object'

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [9]:
## Display the descriptive statistics for the non-numeric (categorical) columns
df.describe(include="object") ## or 'number'

Unnamed: 0,Item_Identifier,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
count,8523,8523,8523,8523,6113,8523,8523
unique,1559,5,16,10,3,3,4
top,FDW13,Low Fat,Fruits and Vegetables,OUT027,Medium,Tier 3,Supermarket Type1
freq,10,5089,1232,935,2793,3350,5577


## Clean the Data

### Remove Unnecessary Columns

In [10]:
df = df.drop(columns=['Item_Identifier'])
df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


### Remove Unnecessary Rows

#### Duplicates

In [11]:
## Display the number of duplicate rows in the dataset
print(f'There are {df.duplicated().sum()} duplicate rows.')

There are 0 duplicate rows.


#### Categorical Columns

In [12]:
df.nunique()

Item_Weight                   415
Item_Fat_Content                5
Item_Visibility              7880
Item_Type                      16
Item_MRP                     5938
Outlet_Identifier              10
Outlet_Establishment_Year       9
Outlet_Size                     3
Outlet_Location_Type            3
Outlet_Type                     4
Item_Outlet_Sales            3493
dtype: int64

In [13]:
## Print the unique values for the column
print('Unique Item Fat Content types:\n', df['Item_Fat_Content'].unique())
print('\n')
## Print the unique values for the column
print('Unique Item types:\n', df['Item_Type'].unique())
print('\n')
## Print the unique values for the column
print('Unique Outlet Identifier types:\n', df['Outlet_Identifier'].unique())
print('\n')
## Print the unique values for the column
print('Unique Outlet Size types:\n', df['Outlet_Size'].unique())
print('\n')
## Print the unique values for the column
print('Unique Outlet Location types:\n', df['Outlet_Location_Type'].unique())
print('\n')
## Print the unique values for the column
print('Unique Outlet types:\n', df['Outlet_Type'].unique())
print('\n')

Unique Item Fat Content types:
 ['Low Fat' 'Regular' 'low fat' 'LF' 'reg']


Unique Item types:
 ['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']


Unique Outlet Identifier types:
 ['OUT049' 'OUT018' 'OUT010' 'OUT013' 'OUT027' 'OUT045' 'OUT017' 'OUT046'
 'OUT035' 'OUT019']


Unique Outlet Size types:
 ['Medium' nan 'High' 'Small']


Unique Outlet Location types:
 ['Tier 1' 'Tier 3' 'Tier 2']


Unique Outlet types:
 ['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3']




In [14]:
df['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [15]:
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace('LF','Low Fat')
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace('reg','Regular')
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace('low fat','Low Fat')
df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

### Missing Values

In [16]:
## Display the total number of missing values
print(f'There are {df.isna().sum().sum()} missing values.')

There are 3873 missing values.


In [17]:
## Display the count of missing values by column
print(df.isna().sum())

Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64


- Missing values will be imputed with SimpleImputer after splitting data as to not cause data leakage.

## Split the Data

In [18]:
# Identifing the features (X) and target (y): Assigning "Item_Outlet_Sales" column as the target and the rest of the relevant variables as the features matrix. 

target = "Item_Outlet_Sales"
X = df.drop(columns=[target])
y = df[target]

In [19]:
# Performing a train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [20]:
# Display first (5) rows of the dataframe features (X)
X.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2
2,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store
4,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1


In [21]:
# Display first (5) rows of the dataframe target (y)
y.head()

0    3735.1380
1     443.4228
2    2097.2700
3     732.3800
4     994.7052
Name: Item_Outlet_Sales, dtype: float64

## Prepare the Data

### Identify the datatypes for each feature

Ordinal: 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'

Numeric: 'Item_Weight',	'Item_Visibility',	'Item_MRP',	'Outlet_Establishment_Year',	'Item_Outlet_Sales'

Nominal: 'Item_Identifier',	'Item_Fat_Content',	'Item_Type',	'Outlet_Identifier'

### Column Selector

In [22]:
## Instantiate the column selectors
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

### Imputers

In [23]:
# Imputation of missing values
freq_imputer = SimpleImputer(strategy='most_frequent', fill_value='missing')

### Transformers

In [24]:
## Instantiate the transformers
scaler = StandardScaler()
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

### Tuples

In [25]:
## Create tuples
number_tuple = (scaler, num_selector)
category_tuple = (ohe, cat_selector)

### Check that the preprocessing is working

In [26]:
## Create the preprocessor using make_column_transformer
preprocessor = make_column_transformer(number_tuple,
                                       remainder='drop')

In [27]:
# fit on train
preprocessor.fit(X_train)

# transform train and test
X_train_processed = pd.DataFrame(preprocessor.transform(X_train),
                                 columns=preprocessor.get_feature_names_out())

X_test_processed = pd.DataFrame(preprocessor.transform(X_test),
                                 columns=preprocessor.get_feature_names_out())
# Check for missing values and that data is scaled and one-hot encoded
display(X_train_processed.head(2))
X_train_processed.info()

Unnamed: 0,standardscaler__Item_Weight,standardscaler__Item_Visibility,standardscaler__Item_MRP,standardscaler__Outlet_Establishment_Year
0,0.743119,-0.712775,1.828109,1.327849
1,0.505876,-1.291052,0.603369,1.327849


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6392 entries, 0 to 6391
Data columns (total 4 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   standardscaler__Item_Weight                5285 non-null   float64
 1   standardscaler__Item_Visibility            6392 non-null   float64
 2   standardscaler__Item_MRP                   6392 non-null   float64
 3   standardscaler__Outlet_Establishment_Year  6392 non-null   float64
dtypes: float64(4)
memory usage: 199.9 KB


In [28]:
# Transformers
mean_imputer = SimpleImputer(strategy='mean', fill_value='missing')
scaler = StandardScaler()

#Pipeline
num_pipe = make_pipeline(mean_imputer, scaler)

#Selector
num_selector = make_column_selector(dtype_include='number')

#Tuple
num_tuple = (num_pipe, num_selector)

In [29]:
# Transformers
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
freq_imputer = SimpleImputer(strategy='most_frequent', fill_value='missing')

#Pipeline
cat_pipe = make_pipeline(freq_imputer, ohe)

#Selector
cat_cols = make_column_selector(dtype_include='object')

#Tuple
cat_tuple = (cat_pipe, cat_cols)

In [30]:
# Instantiate the make column transformer
preprocessor = make_column_transformer(num_tuple, cat_tuple,remainder='passthrough')
preprocessor

In [31]:
# Fit the column transformer on the X_train
preprocessor.fit(X_train)

In [32]:
# Transforming train and test
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)


In [33]:
X_train_processed

array([[ 0.81724868, -0.71277507,  1.82810922, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.5563395 , -1.29105225,  0.60336888, ...,  0.        ,
         1.        ,  0.        ],
       [-0.13151196,  1.81331864,  0.24454056, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 1.11373638, -0.92052713,  1.52302674, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.76600931, -0.2277552 , -0.38377708, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.81724868, -0.95867683, -0.73836105, ...,  1.        ,
         0.        ,  0.        ]])

## **Project 1 - Final**

### Linear Regression Model

In [34]:
from sklearn.linear_model import LinearRegression
linreg = LinearRegression() 

In [35]:
## Create an instance of the model
dummy = DummyRegressor(strategy='median')
# Instantiate the model
linreg_pipe = make_pipeline(preprocessor, linreg)

In [36]:
# Fit the training set
linreg_pipe.fit(X_train, y_train)

In [37]:
# Model Performance
lin_train_pred = linreg_pipe.predict(X_train)
lin_test_pred = linreg_pipe.predict(X_test)

eval_regression(y_train, lin_train_pred, name='linreg_train')
eval_regression(y_test, lin_test_pred, name='linreg_test')

linreg_train Scores
MAE: 847.1280 
MSE: 1,297,558.1834 
RMSE: 1,139.1041 
R2: 0.5616

linreg_test Scores
MAE: 804.1181 
MSE: 1,194,347.6143 
RMSE: 1,092.8621 
R2: 0.5671



In [38]:
# Evaluate your model performance using R^2 on the training set and on the test set.

train_r2 = r2_score(y_train, lin_train_pred)
test_r2 = r2_score(y_test, lin_test_pred)

print(f'Training R2:{train_r2}')
print(f'Testing R2:{test_r2}')

Training R2:0.5615551260381082
Testing R2:0.5671049487900051


In [39]:
# Evaluate your model performance using RMSE on the training set and on the test set

train_RMSE = np.sqrt(np.mean(np.abs(lin_train_pred - y_train)**2))
test_RMSE = np.sqrt(np.mean(np.abs(lin_test_pred - y_test)**2))

print(f'Training RMSE:{train_RMSE}')
print(f'Testing RMSE:{test_RMSE}')

Training RMSE:1139.1041143666143
Testing RMSE:1092.8621204398453


### Regression Tree Model

In [40]:
rf = RandomForestRegressor(random_state = 42)

In [41]:
rf.fit(X_train, y_train)

ValueError: ignored

In [None]:
rf_train_score = rf.score(X_train, y_train)
rf_test_score = rf.score(X_test, y_test)
print(rf_train_score)
print(rf_test_score)


In [None]:
est_depths = [estimator.get_depth() for estimator in rf.estimators_]
max(est_depths)

In [None]:
depths = range(1, max(est_depths))
scores = pd.DataFrame(index=depths, columns=['Test Score'])
for depth in depths:    
   model = RandomForestRegressor(max_depth=depth)
   model.fit(X_train, y_train)
   scores.loc[depth, 'Train Score'] = model.score(X_train, y_train)
   scores.loc[depth, 'Test Score'] = model.score(X_test, y_test)
   scores.head()

In [None]:
plt.plot(scores['Test Score'])

In [None]:
sorted_scores = scores.sort_values(by='Test Score', ascending=False)
sorted_scores.head()

In [None]:
rf = RandomForestRegressor

rf_tree_19 = RandomForestRegressor(max_depth = 19, random_state = 42)
rf_tree_19.fit(X_train, y_train)
train_19_score = rf_tree_19.score(X_train, y_train)
test_19_score = rf_tree_19.score(X_test, y_test)
print(train_19_score)
print(test_19_score)

### Recommendation:
