<a href="https://colab.research.google.com/github/JDevine1981/Prediction-of-Product-Sales/blob/main/Project1Part5Core%2B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns',100)
import missingno
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

from sklearn import set_config
set_config(transform_output='pandas')

## LOADING

In [2]:
fpath = '/content/drive/MyDrive/CodingDojo/02-IntroML/Week05/Data/sales_predictions_2023.csv'
df = pd.read_csv(fpath)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


## INITIAL INSPECTION

In [3]:
df.shape

(8523, 12)

In [4]:
df.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [5]:
df.duplicated().sum()

0

In [6]:
df.nunique()

Item_Identifier              1559
Item_Weight                   415
Item_Fat_Content                5
Item_Visibility              7880
Item_Type                      16
Item_MRP                     5938
Outlet_Identifier              10
Outlet_Establishment_Year       9
Outlet_Size                     3
Outlet_Location_Type            3
Outlet_Type                     4
Item_Outlet_Sales            3493
dtype: int64

All columns appear to be assigned the appropriate data type. Further inspection will determine if any columns
will need to be converted.

# DATA CLEANING

The column Item_Fat_Content contains five unqiue values, but according to our data dictionary,
it should only contain two: Low-Fat and Regular.

In [7]:
# Reviewing value counts for Item_Fat_Content
df['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [8]:
# Replacing and Standardizing Inconsistent Values
df["Item_Fat_Content"] = df['Item_Fat_Content'].replace({'low fat':'Low Fat',
'reg':'Regular', 'LF': 'Low Fat'})
df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [9]:
df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [10]:
df.isna().sum() / len(df) * 100

Item_Identifier               0.000000
Item_Weight                  17.165317
Item_Fat_Content              0.000000
Item_Visibility               0.000000
Item_Type                     0.000000
Item_MRP                      0.000000
Outlet_Identifier             0.000000
Outlet_Establishment_Year     0.000000
Outlet_Size                  28.276428
Outlet_Location_Type          0.000000
Outlet_Type                   0.000000
Item_Outlet_Sales             0.000000
dtype: float64

## Missing Values:

  There are 1,463 missing values for Item_Weight (17%).

  There are 2,410 missing values for Outlet_size (28%).

  We will impute the missing values post test/train split to avoid data leakage.

In [11]:
# Create a filter to use in a for loop to loop through the object type columns
data_types = df.dtypes
str_cols = data_types[data_types=='object'].index
str_cols

Index(['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
       'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'],
      dtype='object')

In [12]:
# Run the for loop
for col in str_cols:
  print(f'-{col}:')
  print(df[col].value_counts(dropna=False))
  print('\n\n')

-Item_Identifier:
FDW13    10
FDG33    10
NCY18     9
FDD38     9
DRE49     9
         ..
FDY43     1
FDQ60     1
FDO33     1
DRF48     1
FDC23     1
Name: Item_Identifier, Length: 1559, dtype: int64



-Item_Fat_Content:
Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64



-Item_Type:
Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: Item_Type, dtype: int64



-Outlet_Identifier:
OUT027    935
OUT013    932
OUT049    930
OUT046    930
OUT035    930
OUT045    929
OUT018    928
OUT017    926
OUT010    555
OUT019    528
Name: Outlet_Identifi

We can rename several columns by removing the terms item and outlet, as it is implied the data refers to the products and locations.

In [13]:
# Create a rename dictionary
rename_dict = {'Item_Weight':'Weight', 'Item_Fat_Content':'Fat_Content', 'Item_Visibility':'Visibility',
'Item_Type':'Type', 'Item_MRP': 'MRP', 'Outlet_Establishment_Year':'Establishment_Year', 'Outlet_Location_Type':
'Location_Type'}
rename_dict

{'Item_Weight': 'Weight',
 'Item_Fat_Content': 'Fat_Content',
 'Item_Visibility': 'Visibility',
 'Item_Type': 'Type',
 'Item_MRP': 'MRP',
 'Outlet_Establishment_Year': 'Establishment_Year',
 'Outlet_Location_Type': 'Location_Type'}

In [14]:
# Dictionary substitution using rename method
df = df.rename(rename_dict, axis=1)
df.head()

Unnamed: 0,Item_Identifier,Weight,Fat_Content,Visibility,Type,MRP,Outlet_Identifier,Establishment_Year,Outlet_Size,Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Item_Identifier     8523 non-null   object 
 1   Weight              7060 non-null   float64
 2   Fat_Content         8523 non-null   object 
 3   Visibility          8523 non-null   float64
 4   Type                8523 non-null   object 
 5   MRP                 8523 non-null   float64
 6   Outlet_Identifier   8523 non-null   object 
 7   Establishment_Year  8523 non-null   int64  
 8   Outlet_Size         6113 non-null   object 
 9   Location_Type       8523 non-null   object 
 10  Outlet_Type         8523 non-null   object 
 11  Item_Outlet_Sales   8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


SUMMARY STATISTICS FOR NUMERIC VALUES

The minimum value for visibility is 0, which is not possible. For the time being, we will convert this to a null value. Following the test/train split, we can then impute it with another value, such as the mean.

In [16]:
df.describe().round(2)

Unnamed: 0,Weight,Visibility,MRP,Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.86,0.07,140.99,1997.83,2181.29
std,4.64,0.05,62.28,8.37,1706.5
min,4.56,0.0,31.29,1985.0,33.29
25%,8.77,0.03,93.83,1987.0,834.25
50%,12.6,0.05,143.01,1999.0,1794.33
75%,16.85,0.09,185.64,2004.0,3101.3
max,21.35,0.33,266.89,2009.0,13086.96


In [17]:
# Create a filter to identify the rows where Visibility = 0.00
filter_zero_vis = df['Visibility'] == 0.00

In [18]:
# Change to a null value
df.loc[filter_zero_vis, 'Visibility']=np.nan

In [19]:
# Confirm change to null value
df['Visibility'].describe()

count    7997.000000
mean        0.070482
std         0.050308
min         0.003575
25%         0.031403
50%         0.057792
75%         0.098109
max         0.328391
Name: Visibility, dtype: float64

In [20]:
df['Visibility'].isna().sum()

526

We will drop the column 'Item_Identifier' due to its high cardinality.

In [21]:
df.drop(columns='Item_Identifier', inplace=True)

In [22]:
# Saving the cleaned data to Google Drive
fpath_out = "/content/drive/MyDrive/CodingDojo/02-IntroML/Week05/Data/sales-predictioncore+++-eda.csv"
df.to_csv(fpath_out, index=True)

In [23]:
# Loading and inspecting saved file
loaded = pd.read_csv(fpath_out)
loaded.head()
df.info()
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Weight              7060 non-null   float64
 1   Fat_Content         8523 non-null   object 
 2   Visibility          7997 non-null   float64
 3   Type                8523 non-null   object 
 4   MRP                 8523 non-null   float64
 5   Outlet_Identifier   8523 non-null   object 
 6   Establishment_Year  8523 non-null   int64  
 7   Outlet_Size         6113 non-null   object 
 8   Location_Type       8523 non-null   object 
 9   Outlet_Type         8523 non-null   object 
 10  Item_Outlet_Sales   8523 non-null   float64
dtypes: float64(4), int64(1), object(6)
memory usage: 732.6+ KB


Weight                1463
Fat_Content              0
Visibility             526
Type                     0
MRP                      0
Outlet_Identifier        0
Establishment_Year       0
Outlet_Size           2410
Location_Type            0
Outlet_Type              0
Item_Outlet_Sales        0
dtype: int64

## TRAIN/TEST SPLIT AND PREPOCESSING

In [24]:
# Assign target column
y = df['Item_Outlet_Sales']
# Assign the features
X = df.drop(columns='Item_Outlet_Sales')

In [25]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## IDENTIFY FEATURES

In [26]:
df.dtypes

Weight                float64
Fat_Content            object
Visibility            float64
Type                   object
MRP                   float64
Outlet_Identifier      object
Establishment_Year      int64
Outlet_Size            object
Location_Type          object
Outlet_Type            object
Item_Outlet_Sales     float64
dtype: object

FEATURES BY TYPE:

  - Numeric
     
     - Weight
     - MRP
     - Establishment_Year
     - Visibility

  - Nominal(Categorical)

     - Fat_Content
     - Type
     - Outlet_Identifier
     - Outlet_Type

  - Ordinal

     - Outlet_Size
     - Location_Type

ISOLATE FEATURES

In [28]:
# Save list of ordinal columns
ord_cols = ['Outlet_Size', 'Location_Type']
# Oridnal Column Lists
outlet_size_list = ['Small', 'Medium', 'High']
location_type_list = ['Tier 1', 'Tier 2', 'Tier 3']
# Transformers
ord = OrdinalEncoder(categories=[outlet_size_list, location_type_list])
freq_imputer = SimpleImputer(strategy='most_frequent')
# Pipeline
ord_pipeline = make_pipeline(freq_imputer, ord)
# Tuple
ord_tuple = ('ordinal', ord_pipeline, ord_cols)

In [29]:
# Save a list of nominal columns
cat_cols = X_train.select_dtypes('object').columns
# Confirm Results
cat_cols
# Transformers
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
# Pipeline
cat_pipeline = make_pipeline(ohe)
# Tuple
cat_tuple = ('nominal', cat_pipeline, cat_cols)

In [33]:
# Save list of numeric columns
num_cols = X_train.select_dtypes('number').columns
# Confirm results
num_cols
# Transformers
mean_imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
# Pipeline
num_pipeline = make_pipeline(mean_imputer, scaler)
#Tuple
num_tuple = ('numeric', num_pipeline, num_cols)

TRANSFORM

In [34]:
col_transformer = ColumnTransformer([num_tuple, ord_tuple, cat_tuple], verbose_feature_names_out=False)

In [35]:
col_transformer.fit(X_train)



In [37]:
X_train_processed = col_transformer.transform(X_train)
X_test_processed = col_transformer.transform(X_test)

In [38]:
X_train_processed.shape

(6392, 45)

In [39]:
X_train_processed.isna().sum()

Weight                           0
Visibility                       0
MRP                              0
Establishment_Year               0
Outlet_Size                      0
Location_Type                    0
Fat_Content_Low Fat              0
Fat_Content_Regular              0
Type_Baking Goods                0
Type_Breads                      0
Type_Breakfast                   0
Type_Canned                      0
Type_Dairy                       0
Type_Frozen Foods                0
Type_Fruits and Vegetables       0
Type_Hard Drinks                 0
Type_Health and Hygiene          0
Type_Household                   0
Type_Meat                        0
Type_Others                      0
Type_Seafood                     0
Type_Snack Foods                 0
Type_Soft Drinks                 0
Type_Starchy Foods               0
Outlet_Identifier_OUT010         0
Outlet_Identifier_OUT013         0
Outlet_Identifier_OUT017         0
Outlet_Identifier_OUT018         0
Outlet_Identifier_OU

In [40]:
X_train_processed.dtypes

Weight                           float64
Visibility                       float64
MRP                              float64
Establishment_Year               float64
Outlet_Size                      float64
Location_Type                    float64
Fat_Content_Low Fat              float64
Fat_Content_Regular              float64
Type_Baking Goods                float64
Type_Breads                      float64
Type_Breakfast                   float64
Type_Canned                      float64
Type_Dairy                       float64
Type_Frozen Foods                float64
Type_Fruits and Vegetables       float64
Type_Hard Drinks                 float64
Type_Health and Hygiene          float64
Type_Household                   float64
Type_Meat                        float64
Type_Others                      float64
Type_Seafood                     float64
Type_Snack Foods                 float64
Type_Soft Drinks                 float64
Type_Starchy Foods               float64
Outlet_Identifie

In [42]:
X_train_processed.head()

Unnamed: 0,Weight,Visibility,MRP,Establishment_Year,Outlet_Size,Location_Type,Fat_Content_Low Fat,Fat_Content_Regular,Type_Baking Goods,Type_Breads,Type_Breakfast,Type_Canned,Type_Dairy,Type_Frozen Foods,Type_Fruits and Vegetables,Type_Hard Drinks,Type_Health and Hygiene,Type_Household,Type_Meat,Type_Others,Type_Seafood,Type_Snack Foods,Type_Soft Drinks,Type_Starchy Foods,Outlet_Identifier_OUT010,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Size_nan,Location_Type_Tier 1,Location_Type_Tier 2,Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
4776,0.817249,-0.8475007,1.828109,1.327849,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7510,0.55634,-2.879313e-16,0.603369,1.327849,1.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5828,-0.131512,1.832065,0.244541,0.136187,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
5327,-1.169219,-1.157407,-0.952591,0.732018,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4810,1.528819,-1.115563,-0.33646,0.493686,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
