<a href="https://colab.research.google.com/github/HadilGhaith/Prediction-of-Product-Sales/blob/main/Project_1_Part_5_(Core).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the CSV path used below lives under this mount point
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import set_config
set_config(transform_output='pandas')
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

In [3]:
# Location of the sales CSV on the mounted Drive
path = '/content/drive/MyDrive/CodingDojo/02-IntroML/Week05/Data/sales_predictions_2023.csv'
# Read the file into a DataFrame, then preview the first five rows
df = pd.read_csv(filepath_or_buffer=path)
df.head(5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
# Summarize column dtypes and non-null counts.
# Called with no arguments: the first positional parameter of info() is `verbose`,
# so an empty tuple there only worked by falling through to the default behaviour.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


## Cleaning Data

In [5]:
# Count fully duplicated rows (a result of 0 means there is nothing to drop)
df.duplicated().sum()

0

In [6]:
# Gather the names of the object-dtype (string) columns so their
# categories can be inspected for inconsistent labels
cat_cols = df.select_dtypes(include='object').columns
cat_cols

Index(['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
       'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'],
      dtype='object')

In [7]:
# Print the category frequencies of every string column to spot inconsistent labels
for col in cat_cols:
    counts = df[col].value_counts()
    print(f'Value counts for {col}\n', counts, '\n')


Value counts for Item_Identifier
 FDW13    10
FDG33    10
NCY18     9
FDD38     9
DRE49     9
         ..
FDY43     1
FDQ60     1
FDO33     1
DRF48     1
FDC23     1
Name: Item_Identifier, Length: 1559, dtype: int64 

Value counts for Item_Fat_Content
 Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64 

Value counts for Item_Type
 Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: Item_Type, dtype: int64 

Value counts for Outlet_Identifier
 OUT027    935
OUT013    932
OUT049    930
OUT046    

In [8]:
# Fix the inconsistencies in Item_Fat_Content:
# map every variant spelling onto its canonical label
fat_content_map = {'LF':'Low Fat', 'low fat':'Low Fat', 'reg':'Regular'}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_map)
# Confirm that only the two canonical labels remain
df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [9]:
# Count the missing values in each column
df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

## Preprocessing for Machine Learning

In [10]:
# Define the feature matrix and the target vector
# Features: everything except the target and the Item_Identifier column
X = df.drop(
    columns=['Item_Outlet_Sales', 'Item_Identifier'],
)
# Target: the sales figure to be predicted
y = df['Item_Outlet_Sales']

In [11]:
# Train/test split; the fixed random_state makes the split reproducible.
# No test_size is passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [12]:
# Preprocessing for numeric columns
# Select the numeric feature names from the training split
num_cols = X_train.select_dtypes('number').columns
print('Numeric Columns:', num_cols)
# Impute missing numeric values with the column mean learned at fit time
# (Item_Weight is the numeric column with missing values)
mean_imputer = SimpleImputer(strategy='mean')
# Scale the numeric columns
num_scaler = StandardScaler()
# Numeric pipeline: impute first, then scale
num_pipe = make_pipeline(mean_imputer, num_scaler)
# (name, transformer, columns) tuple consumed by the ColumnTransformer
num_tuple = ("numeric", num_pipe, num_cols)
# Only the last expression of a cell is displayed, so show just the pipeline
num_pipe

Numeric Columns: Index(['Item_Weight', 'Item_Visibility', 'Item_MRP',
       'Outlet_Establishment_Year'],
      dtype='object')


In [13]:
# Category frequencies of Outlet_Size in the training split
# (value_counts leaves out missing values by default)
X_train['Outlet_Size'].value_counts()

Medium    2103
Small     1788
High       689
Name: Outlet_Size, dtype: int64

In [14]:
# Category frequencies of Outlet_Location_Type in the training split
X_train['Outlet_Location_Type'].value_counts()

Tier 3    2531
Tier 2    2106
Tier 1    1755
Name: Outlet_Location_Type, dtype: int64

In [15]:
# Preprocessing for ordinal features
# Define the ordinal features
ord_cols = ['Outlet_Size', 'Outlet_Location_Type']
print('Ordinal Columns:', ord_cols)
# Impute each ordinal column with its own most frequent training category.
# A single constant cannot serve both columns: 'Medium' is a valid Outlet_Size
# but not a valid Outlet_Location_Type, so a missing tier filled with it would
# not match any category given to the encoder below. For Outlet_Size the most
# frequent training category is 'Medium', so its imputed value is unchanged.
ord_imputer = SimpleImputer(strategy='most_frequent')
# Category order (lowest to highest), one list per column in ord_cols
size_order = ['Small', 'Medium', 'High']
type_order = ['Tier 1', 'Tier 2', 'Tier 3']
ord_order = [size_order, type_order]
ord_encoder = OrdinalEncoder(categories=ord_order)
# Scale the encoded ordinal features
ord_scaler = StandardScaler()
# Ordinal pipeline: impute, then encode, then scale
ord_pipe = make_pipeline(ord_imputer, ord_encoder, ord_scaler)
# (name, transformer, columns) tuple consumed by the ColumnTransformer
ord_tuple = ('ordinal', ord_pipe, ord_cols)
ord_pipe

Ordinal Columns: ['Outlet_Size', 'Outlet_Location_Type']


In [16]:
# Preprocessing for nominal features
# Nominal columns are the object-dtype columns not already handled as ordinal
ohe_cols = (
    X_train
    .select_dtypes(include='object')
    .drop(columns=ord_cols)
    .columns
)
print('Nominal Columns:', ohe_cols)
# One-hot encode with dense output; categories not seen during fit are ignored
ohe_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# Single-step pipeline, kept as a pipeline to mirror the numeric and ordinal branches
ohe_pipe = make_pipeline(ohe_encoder)
# (name, transformer, columns) tuple consumed by the ColumnTransformer
ohe_tuple = ('Nominal', ohe_pipe, ohe_cols)
ohe_pipe

Nominal Columns: Index(['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Type'], dtype='object')


# Instantiate the Column Transformer

In [17]:
# Combine the numeric, ordinal and nominal branches into one transformer;
# verbose_feature_names_out=False keeps output column names free of branch prefixes
col_transformer = ColumnTransformer(
    transformers=[num_tuple, ord_tuple, ohe_tuple],
    verbose_feature_names_out=False,
)
col_transformer

# Fit the Column Transformer on the Training Data

In [18]:
# Fit on the training split only, so nothing is learned from the test split
col_transformer.fit(X_train)

# Transform the Training and Testing Data

In [19]:
# Apply the already-fitted transformer to the training split, then to the test split
X_train_tf, X_test_tf = (
    col_transformer.transform(split) for split in (X_train, X_test)
)
# Preview the processed training features
X_train_tf.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,...,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
4776,0.817249,-0.712775,1.828109,1.327849,0.287374,1.084948,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7510,0.55634,-1.291052,0.603369,1.327849,0.287374,1.084948,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5828,-0.131512,1.813319,0.244541,0.136187,0.287374,-1.384777,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
5327,-1.169219,-1.004931,-0.952591,0.732018,-1.384048,-0.149914,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4810,1.528819,-0.965484,-0.33646,0.493686,0.287374,-0.149914,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
