<a href="https://colab.research.google.com/github/JFaberSFSD/Food_Sales_Predictions/blob/main/Sales_Predictions_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Justin Faber - Sales Predictions Project
## June 2022


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import set_config
# Show scikit-learn estimators as diagrams when a cell displays them
set_config(display='diagram')

# Location of the sales CSV (presumably a mounted Google Drive folder -- confirm Drive is mounted before running)
DATA_PATH = '/content/drive/MyDrive/01 - Week 1 (June 20th)/Sales_Predictions_Updated.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first rows of the raw data
df.head()


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [3]:
# Count the rows that duplicate an earlier row of the dataframe

duplicate_rows = df.duplicated()
duplicate_rows.sum()

0

In [4]:
# Tally the missing / null entries in each column of the dataframe

null_mask = df.isna()
null_mask.sum()

Item_Identifier                0
Item_Weight                    4
Item_Fat_Content               0
Item_Visibility                0
Item_Type                      0
Item_MRP                       0
Outlet_Identifier              0
Outlet_Establishment_Year      0
Outlet_Size                  555
Outlet_Location_Type           0
Outlet_Type                    0
Item_Outlet_Sales              0
dtype: int64

In [5]:
# Outlet_Size has 555 null values. Looking over the data, every one of them belongs to a Tier 3 Grocery Store,
# and no Tier 3 Grocery Store has a known size to impute from, so we label them "Unknown" instead.
# The filled column is assigned back to df rather than calling fillna(..., inplace=True) on the column
# selection: that form acts on an intermediate object, warns on recent pandas, and leaves df unchanged
# once pandas Copy-on-Write is active.

df['Outlet_Size'] = df['Outlet_Size'].fillna('Unknown')

df.isna().sum()

# Item_Weight still has 4 null rows (and no available lookup). With only 4 rows missing data, we remove those rows.

df.dropna(subset=['Item_Weight'],inplace=True)

df.isna().sum()

# All null values have now been eliminated.

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [8]:
# Review the categorical columns to see whether any of them need cleanup
# (.value_counts() on an individual column lists its distinct values)

# "Item_Fat_Content" is the first candidate: it really only has two values, "Low Fat" and "Regular",
# but abbreviations and alternate versions of those strings are also present

df['Item_Fat_Content'].value_counts()

# Map every abbreviation / alternate version onto its standard string in a single replace call
fat_content_aliases = {"LF": "Low Fat", "reg": "Regular", "low fat": "Low Fat"}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_aliases)

df['Item_Fat_Content'].value_counts()

# Fixed!

# No other categorical column appeared to have duplicate or unnecessary values

df.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [7]:
# The data is now organized and reasonably clean, so separate the target from the features
# and split both into train and test sets

y = df['Item_Outlet_Sales']
X = df.drop(columns='Item_Outlet_Sales')

# random_state fixes the shuffle so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [45]:
# Column selectors, one for each dtype group in the feature set

float_selector = make_column_selector(dtype_include='float64')
cat_selector = make_column_selector(dtype_include='object')

# One numeric column (Outlet_Establishment_Year) won't work well with a MEAN, so it gets its own
# selector to feed a MEDIAN pipeline
#   (this dataframe has no null values in that column; the third selector is here as practice
#    in building three pipelines instead of two)

int_selector = make_column_selector(dtype_include='int64')


In [46]:
# One imputer per selector: mean for the float columns, median for the int column,
# and most-frequent value for the categorical columns

mean_imputer, median_imputer, freq_imputer = (
    SimpleImputer(strategy=strategy_name)
    for strategy_name in ('mean', 'median', 'most_frequent')
)



In [47]:
# After imputing, numeric data is scaled and categorical data is one-hot encoded. Create an instance of each.

scaler = StandardScaler()

# Ask for a dense array from the encoder. scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output`
# and later releases removed the old name, so try the current keyword first and fall back for older installs.
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros instead of raising.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # scikit-learn < 1.2 only accepts the original keyword name
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [48]:
# Every dtype group follows its own impute-then-transform pipeline

# Categorical columns: fill with the most frequent value, then one-hot encode
categorical_pipe = make_pipeline(
    freq_imputer,
    ohe,
)

# Int columns: fill with the median, then scale
int_pipe = make_pipeline(
    median_imputer,
    scaler,
)

# Float columns: fill with the mean, then scale
float_pipe = make_pipeline(
    mean_imputer,
    scaler,
)


In [49]:
# The column transformer takes (pipeline, selector) tuples, so pair each pipeline with its selector.
# zip walks the two sequences in step and yields one 2-tuple per dtype group.

float_tuple, int_tuple, category_tuple = zip(
    (float_pipe, int_pipe, categorical_pipe),
    (float_selector, int_selector, cat_selector),
)

In [50]:
# Assemble the preprocessor from all three (pipeline, selector) tuples;
# any column not matched by a selector is passed through untouched

preprocessor = make_column_transformer(
    float_tuple,
    int_tuple,
    category_tuple,
    remainder='passthrough',
)

In [51]:
# Fit the column transformer on the TRAIN DATA ONLY, so nothing is learned from the test set

preprocessor.fit(X=X_train)

In [52]:
# Apply the fitted preprocessor to both splits (train first, then test)

X_train_processed, X_test_processed = (
    preprocessor.transform(split)
    for split in (X_train, X_test)
)

In [54]:
# Wrap the processed training array in a DataFrame so it can be previewed
X_train_processed_df = pd.DataFrame(data=X_train_processed)

# Preprocessed and ready to go -- show the first five rows
X_train_processed_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1578,1579,1580,1581,1582,1583,1584,1585,1586,1587
0,0.739662,-0.705576,1.837974,1.332059,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,-0.196563,1.304008,-0.281413,-1.535168,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-1.669773,0.346318,1.691163,0.137381,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.013623,0.837109,1.467227,0.495785,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.341497,-1.284039,-1.604467,1.332059,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
