<a href="https://colab.research.google.com/github/Jeremy-Alekai/food-sales-predictions./blob/main/Food_sales_Prediction_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [94]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [95]:
# importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import set_config
set_config(display='diagram')


In [96]:
#load file
filename = '/content/drive/MyDrive/Colab Notebooks/Coding Dojo/Project 1 - Food Sales prediction/sales_predictions.csv'
df = pd.read_csv(filename)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [97]:
df.shape

(8523, 12)

In [98]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [99]:
df['Outlet_Size'].value_counts()

Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64

In [100]:
# Replaceing the ordinal column. OUTLET SIZE
df['Outlet_Size'] = df['Outlet_Size'].replace({'Small': '0', 'Medium': '1', 'High': '2'})
df['Outlet_Size'].value_counts()


1    2793
0    2388
2     932
Name: Outlet_Size, dtype: int64

In [101]:
# assign X and y
X = df.drop('Item_Outlet_Sales', axis=1)
y = df['Item_Outlet_Sales']

In [102]:
# split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [103]:
# make column selector
num_selector = make_column_selector(dtype_include= 'number')
cat_selector = make_column_selector(dtype_include= 'object')

In [104]:
# imputers, scaler and one hot encoder
freq_imputer = SimpleImputer(strategy = 'most_frequent')
num_imputer = SimpleImputer(strategy = 'mean')
scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown= 'ignore', sparse_output = False)


In [105]:
# create the numeric pipeline
num_pipe = make_pipeline(num_imputer, scaler)
num_pipe

In [106]:
cat_pipe = make_pipeline(freq_imputer, ohe)
cat_pipe

In [107]:
# creating tuples that will go into the column transformer
num_tuple = (num_pipe, num_selector)
cat_tuple = (cat_pipe, cat_selector)


In [108]:
# column transformer
sales_preprocessor = make_column_transformer(num_tuple, cat_tuple)
sales_preprocessor

In [109]:
from sklearn.linear_model import LinearRegression

In [110]:
# linear regression model to predict sales
reg = LinearRegression()
reg_pipeline = make_pipeline(sales_preprocessor, reg)
reg_pipeline

In [111]:
# fit the model
reg_pipeline.fit(X_train, y_train)

In [112]:
# Predict the target values for the training set and the test set.
train_pred = reg_pipeline.predict(X_train)
test_pred = reg_pipeline.predict(X_test)

In [113]:
# Evaluate the performance of your model based on r^2.
train_score = reg_pipeline.score(X_train, y_train)
test_score = reg_pipeline.score(X_test, y_test)
print(train_score)
print(test_score)

0.6580833393432646
-2.008495080023361e+19


In [114]:
# evaluating using R2
train_R2 = np.corrcoef(y_train, train_pred)[0][1]**2
test_R2 = np.corrcoef(y_test, test_pred)[0][1]**2
print(train_R2)
print(test_R2)

0.6583244461716079
5.761005850303654e-05


In [115]:
# Evaluate the performance of your model based on rmse.
train_RMSE = np.sqrt(np.mean(np.abs(train_pred - y_train)**2))
test_RMSE = np.sqrt(np.mean(np.abs(test_pred - y_test)**2))
print({train_RMSE})
print({test_RMSE})

{1005.9260469106301}
{7444052866856.778}


2) Build a regression tree model to predict sales.

  - Build a simple regression tree model.

  - Compare the performance of your model based on r^2.

  - Compare the performance of your model based on rmse.  




In [116]:
from sklearn.ensemble import RandomForestRegressor

In [117]:
rf = RandomForestRegressor(random_state = 42)

In [118]:
tree_pipeline = make_pipeline(sales_preprocessor, rf)
tree_pipeline

In [119]:
tree_pipeline.fit(X_train, y_train)

In [120]:
# Predict the target values for the training set and the test set.
train_pred2 = tree_pipeline.predict(X_train)
test_pred2 = tree_pipeline.predict(X_test)

In [121]:
# Evaluate the performance of your model based on r^2.
train_score2 = tree_pipeline.score(X_train, y_train)
test_score2 = tree_pipeline.score(X_test, y_test)
print(train_score2)
print(test_score2)
# the model is overfitting and peorming poorly on the testing set

0.937925043125691
0.5507906556585676


In [122]:
# Evaluate the performance of your model based on rmse.
train_RMSE2 = np.sqrt(np.mean(np.abs(train_pred2 - y_train)**2))
test_RMSE2 = np.sqrt(np.mean(np.abs(test_pred2 - y_test)**2))
print({train_RMSE2})
print({test_RMSE2})

{428.6118118180061}
{1113.2647387340223}


In [123]:
# To see what the depth of each tree in your random forest was when the max_depth was unlimited
est_depths = [estimator.get_depth() for estimator in rf.estimators_]
max(est_depths)

86

In [None]:
# looping through the depths to see performance on various depths of the tree
depths = range(10, 85)
scores = pd.DataFrame(index=depths, columns=['Test Score'])
for depth in depths:
   model = RandomForestRegressor(max_depth=depth)
   tree_pipeline = make_pipeline(sales_preprocessor, model)
   tree_pipeline.fit(X_train, y_train)
   scores.loc[depth, 'Train Score'] = tree_pipeline.score(X_train, y_train)
   scores.loc[depth, 'Test Score'] = tree_pipeline.score(X_test, y_test)


In [None]:
# Sort in ascending to see which is performing best
sorted_scores = scores.sort_values(by = 'Test Score', ascending = False)
sorted_scores.head()
# the best performing depth on the test set is 22.

In [None]:
# using a max_depth of 10
model = RandomForestRegressor(max_depth=10, random_state = 42)
tree_pipeline = make_pipeline(sales_preprocessor, model)
tree_pipeline.fit(X_train, y_train)

In [None]:
train_pred10 = tree_pipeline.predict(X_train)
test_pred10 = tree_pipeline.predict(X_test)

In [None]:
# Model performance based on RMSE
train_RMSE10 = np.sqrt(np.mean(np.abs(train_pred10 - y_train)**2))
test_RMSE10 = np.sqrt(np.mean(np.abs(test_pred10 - y_test)**2))
print({train_RMSE10})
print({test_RMSE10})

In [None]:
# Model performance based on R2
train_R2 = np.corrcoef(y_train, train_pred10)[0][1]**2
test_R2 = np.corrcoef(y_test, test_pred10)[0][1]**2
print({train_R2})
print({test_R2})


I recommend that we use the Random Forest model for the predictions because on it the test set has performed better than for the linear regression.
