<a href="https://colab.research.google.com/github/KAlikhanov/food-sales-prediction-proj/blob/main/FSP_Data_Preparation_For_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparing Data for Machine-Learning

## Importing libraries and loading in the data.

In [212]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer 
from sklearn import set_config
# Display scikit-learn estimators as diagrams when they are the last
# expression of a cell (used below to show each pipeline).
set_config(display='diagram')

In [213]:
# Load the raw dataset from CSV and preview the first rows.
# NOTE(review): this is an absolute Google Drive path; it presumably requires
# Drive to be mounted at /content/drive first -- no mount call is visible in
# this notebook, so confirm before a fresh Restart & Run All.
filename = '/content/drive/MyDrive/Colab Notebooks/Sales Prediction Project/sales_predictions.csv'
df = pd.read_csv(filename)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


## Creating a copy of the data and doing preliminary manipulations
The manipulations include dropping duplicate rows and fixing inconsistencies with categorical data.

In [214]:
# Work on a copy so the raw dataframe `df` is left untouched by the
# cleaning steps that follow.
ml_df = df.copy()

In [215]:
# Remove exact duplicate rows. None are expected in this dataset; this is
# only a safeguard. The de-duplicated frame is bound back to ml_df.
ml_df = ml_df.drop_duplicates()

In [216]:
# Inspect dtypes and non-null counts; columns with fewer non-null entries
# than rows (here Item_Weight and Outlet_Size) contain missing values.
ml_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 865.6+ KB


In [217]:
# List the distinct Item_Fat_Content labels with their frequencies to spot
# inconsistent spellings of the same category (e.g. 'LF', 'low fat', 'reg').
ml_df['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [218]:
# Fix inconsistent spellings of the fat-content labels before splitting.
# The mapping is applied to Item_Fat_Content only: a DataFrame-wide
# .replace() would also rewrite any cell in another column whose value
# happened to equal 'LF', 'reg' or 'low fat'.
replace_dic = {'LF':'Low Fat',
               'reg':'Regular',
               'low fat':'Low Fat'}
# .assign returns a new frame, so re-running this cell is idempotent.
ml_df = ml_df.assign(
    Item_Fat_Content=ml_df['Item_Fat_Content'].replace(replace_dic)
)
# Confirm that only the two canonical labels remain.
ml_df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [219]:
# Summary statistics for the numeric columns (count, mean, spread, quartiles).
ml_df.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


## Creating the target and feature dataframes and splitting the data using train_test_split.
Extraneous columns are also dropped.

In [220]:
# Count missing values in the target column. The result is 0, so the
# target needs no imputation or row removal.
ml_df['Item_Outlet_Sales'].isna().sum()

0

In [221]:
# Target: the sales value we want to predict.
y = ml_df['Item_Outlet_Sales']

# Features: every column except the target and the ones excluded below.
#   - Item_Identifier / Outlet_Identifier: product and outlet ids, judged
#     not useful for ML.
#   - Outlet_Establishment_Year: judged not relevant to this problem.
X = ml_df.drop(
    columns=[
        'Item_Outlet_Sales',
        'Item_Identifier',
        'Outlet_Identifier',
        'Outlet_Establishment_Year',
    ]
)

In [222]:
# Split features and target into train and test sets. random_state is fixed
# so the split is reproducible. No test_size is passed; the resulting
# training set holds 6392 of the 8523 rows (about 75%).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

## Sorting the different types of data.
I sort the columns to be either nominal, ordinal, or numerical.

In [223]:
# Check dtypes and non-null counts of the training features to decide how
# each column should be imputed and encoded.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6392 entries, 4776 to 7270
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Item_Weight           5285 non-null   float64
 1   Item_Fat_Content      6392 non-null   object 
 2   Item_Visibility       6392 non-null   float64
 3   Item_Type             6392 non-null   object 
 4   Item_MRP              6392 non-null   float64
 5   Outlet_Size           4580 non-null   object 
 6   Outlet_Location_Type  6392 non-null   object 
 7   Outlet_Type           6392 non-null   object 
dtypes: float64(3), object(5)
memory usage: 449.4+ KB


In [224]:
# Preview a few training rows to see the actual category values.
X_train.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type
4776,16.35,Low Fat,0.029565,Household,256.4646,Medium,Tier 3,Supermarket Type2
7510,15.25,Regular,0.0,Snack Foods,179.766,Medium,Tier 3,Supermarket Type2
5828,12.35,Regular,0.158716,Meat,157.2946,Medium,Tier 1,Supermarket Type1
5327,7.975,Low Fat,0.014628,Baking Goods,82.325,Small,Tier 2,Supermarket Type1
4810,19.35,Low Fat,0.016645,Frozen Foods,120.9098,,Tier 2,Supermarket Type1


**After exploring the data, I decided to treat Outlet_Type and Outlet_Location_Type as nominal. If I knew more about what the outlet types and location tiers actually represent, I might be able to treat them as ordinal, but without that information they are nominal.**

Numerical - Item_Weight, Item_Visibility, Item_MRP

Ordinal - Outlet_Size 

Nominal -  Item_Fat_Content, Item_Type, Outlet_Type, Outlet_Location_Type 

How I want to manipulate each type of data:

Numerical -> Impute missing values (Mean) -> Scale the data

Ordinal -> Impute missing values (Most_Frequent) -> Ordinal Encode the data

Nominal -> Impute missing values (Missing) -> OHE the data.

## Creating a different pipeline for each type of data and putting them into a column transformer.

### Creating the pipelines.

In [225]:
# Building blocks for the preprocessing pipelines, grouped by role.

# --- Imputers: one per missing-value strategy ---
mean_imputer = SimpleImputer(strategy='mean')
freq_imputer = SimpleImputer(strategy='most_frequent')
constant_imputer = SimpleImputer(
    strategy='constant',
    fill_value='Missing',
)

# --- Scaler and encoders ---
scaler = StandardScaler()
# Dense (non-sparse) output; unknown categories are handled with 'ignore'.
# NOTE(review): newer scikit-learn releases rename `sparse` to
# `sparse_output` -- confirm against the installed version before upgrading.
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
# Explicit category order handed to the ordinal encoder.
ordered_label = [['Small', 'Medium', 'High']]
ordinal = OrdinalEncoder(categories=ordered_label)

In [226]:
# Numeric pipeline: fill missing values with the mean, then scale with
# StandardScaler. The bare name on the last line displays the pipeline.
numeric_pipeline = make_pipeline(mean_imputer, scaler)
numeric_pipeline

In [227]:
# Ordinal pipeline: fill missing values with the most frequent value, then
# ordinal-encode using the explicit category order defined above.
ordinal_pipeline = make_pipeline(freq_imputer, ordinal)
ordinal_pipeline

In [228]:
# Nominal pipeline: fill missing values with the constant 'Missing', then
# one-hot encode the data.
nominal_pipeline = make_pipeline(constant_imputer, ohe)
nominal_pipeline

### Creating the column transformer.

In [229]:
# The column transformer needs to know which columns feed each pipeline.

# Numeric columns are selected by dtype rather than listed by name.
numeric_columns = make_column_selector(dtype_include='number')

# Outlet_Size is the only column treated as ordinal.
ordinal_columns = ['Outlet_Size']

# Categorical columns treated as nominal (to be one-hot encoded).
nominal_columns = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Type',
    'Outlet_Location_Type',
]

In [230]:
# make_column_transformer takes (transformer, columns) tuples, so pair each
# pipeline with the columns it should be applied to.
numeric_tuple = (numeric_pipeline, numeric_columns)
ordinal_tuple = (ordinal_pipeline, ordinal_columns)
nominal_tuple = (nominal_pipeline, nominal_columns)

In [231]:
# Combine the three pipelines into one column transformer. Tuples are passed
# in the order ordinal, nominal, numeric; remainder='drop' discards any
# column not claimed by one of them.
preprocessor = make_column_transformer(ordinal_tuple,
                                       nominal_tuple,
                                       numeric_tuple,
                                       remainder='drop')
preprocessor

In [232]:
# Fit the preprocessor on the training features only; X_test is not passed
# to fit, so it does not influence what is learned here.
preprocessor.fit(X_train)

In [233]:
# Apply the already-fitted preprocessor to both splits, then display the
# processed training array.
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)
X_train_processed

array([[ 1.        ,  1.        ,  0.        , ...,  0.81724868,
        -0.71277507,  1.82810922],
       [ 1.        ,  0.        ,  1.        , ...,  0.5563395 ,
        -1.29105225,  0.60336888],
       [ 1.        ,  0.        ,  1.        , ..., -0.13151196,
         1.81331864,  0.24454056],
       ...,
       [ 1.        ,  1.        ,  0.        , ...,  1.11373638,
        -0.92052713,  1.52302674],
       [ 1.        ,  1.        ,  0.        , ...,  1.76600931,
        -0.2277552 , -0.38377708],
       [ 0.        ,  1.        ,  0.        , ...,  0.81724868,
        -0.95867683, -0.73836105]])