<a href="https://colab.research.google.com/github/MayBornWitIt/sales-predictions/blob/main/Project_1_Part_5_(Core).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

from sklearn import set_config
set_config(display='diagram')

In [2]:
# Load the Data
# load csv from url, display info, and preview rows 
df = pd.read_csv('/content/sales_predictions.csv')
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [4]:
# Checking amount of rows and columns in dataset
df.shape

(8523, 12)

In [5]:
# Checking datatypes
df.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [6]:
# Checking duplicates
df.duplicated().sum()

0

In [7]:
# 4) Identify missing values.
df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [8]:
# (Numerical Column) Explanation: Obtain the value that is likely to be closest to the correct value given the data provided
median_i_weight = df['Item_Weight'].median()
df['Item_Weight'].fillna(median_i_weight, inplace = True)

In [9]:
# (Categorical Column) Explanation: Replace missing values with the value 'Missing'
df['Outlet_Size'].fillna('Missing', inplace = True)

In [10]:
# Confirming there are no missing values after addressing them.
df.isna().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [11]:
# Checking any inconsistent categories of data
df['Outlet_Size'].value_counts()

Medium     2793
Missing    2410
Small      2388
High        932
Name: Outlet_Size, dtype: int64

In [12]:
df.describe(include="number")

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,8523.0,8523.0,8523.0,8523.0,8523.0
mean,12.81342,0.066132,140.992782,1997.831867,2181.288914
std,4.22724,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,9.31,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.0,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [13]:
# Identifing the features (X) and target (y): Assigning "Item_Outlet_Sales" column as the target and the rest of the relevant variables as the features matrix.
X = df.drop(columns=['Item_Weight', 'Item_Visibility','Item_MRP', 'Outlet_Establishment_Year'])
y = df['Item_Outlet_Sales']
X.head()

Unnamed: 0,Item_Identifier,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,Low Fat,Dairy,OUT049,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,Regular,Soft Drinks,OUT018,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,Low Fat,Meat,OUT049,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,Regular,Fruits and Vegetables,OUT010,Missing,Tier 3,Grocery Store,732.38
4,NCD19,Low Fat,Household,OUT013,High,Tier 3,Supermarket Type1,994.7052


In [14]:
y.head()

0    3735.1380
1     443.4228
2    2097.2700
3     732.3800
4     994.7052
Name: Item_Outlet_Sales, dtype: float64

In [15]:
# Performing a train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [16]:
nom_selector = make_column_selector(dtype_include='object')
num_selector = make_column_selector(dtype_include='number')

In [17]:
nom_cols = nom_selector(X_train)
nom_cols

['Item_Identifier',
 'Item_Fat_Content',
 'Item_Type',
 'Outlet_Identifier',
 'Outlet_Size',
 'Outlet_Location_Type',
 'Outlet_Type']

In [18]:
num_cols = num_selector(X_train)
num_cols

['Item_Outlet_Sales']

In [19]:
# Transformers
mean_imputer = SimpleImputer(strategy='mean', fill_value='missing')
scaler = StandardScaler()

#Pipeline
num_pipe = make_pipeline(mean_imputer, scaler)

#Selector
num_selector = make_column_selector(dtype_include='number')

#Tuple
num_tuple = (num_pipe, num_selector)

In [20]:
# Transformers
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
freq_imputer = SimpleImputer(strategy='most_frequent', fill_value='missing')

#Pipeline
nom_pipe = make_pipeline(freq_imputer, ohe)

#Selector
nom_cols = ['Outlet_Type']

#Tuple
nom_tuple = (nom_pipe, nom_cols)

In [21]:
# Instantiate the make column transformer
preprocessor = make_column_transformer(num_tuple, nom_tuple,remainder='passthrough')
preprocessor

In [22]:
# Fit the column transformer on the X_train
preprocessor.fit(X_train)

In [23]:
# Transforming train and test
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)


In [24]:
X_train_processed

array([[-0.9850859710989077, 0.0, 0.0, ..., 'OUT018', 'Medium', 'Tier 3'],
       [0.4917975163416224, 0.0, 0.0, ..., 'OUT018', 'Medium', 'Tier 3'],
       [-0.3673957074838433, 0.0, 1.0, ..., 'OUT049', 'Medium', 'Tier 1'],
       ...,
       [2.287588758931785, 0.0, 1.0, ..., 'OUT045', 'Missing', 'Tier 2'],
       [-0.3255971182166585, 0.0, 1.0, ..., 'OUT017', 'Missing',
        'Tier 2'],
       [-0.7234577642043066, 0.0, 1.0, ..., 'OUT046', 'Small', 'Tier 1']],
      dtype=object)