<a href="https://colab.research.google.com/github/JosephHobbs9292/Dojo/blob/main/Project_1_Part_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
## Importing Libraries

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

from sklearn import set_config
set_config(display='diagram')

In [3]:
## Mounting Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
## Load in dataset
df_sales = pd.read_csv("/content/sales_predictions.csv")
df_sales.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [None]:
## Checking duplicates
df_sales.duplicated().sum()

0

In [6]:
## Displaying summary statistics of numeric features

df_sales.describe().round(2)

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.86,0.07,140.99,1997.83,2181.29
std,4.64,0.05,62.28,8.37,1706.5
min,4.56,0.0,31.29,1985.0,33.29
25%,8.77,0.03,93.83,1987.0,834.25
50%,12.6,0.05,143.01,1999.0,1794.33
75%,16.85,0.09,185.64,2004.0,3101.3
max,21.35,0.33,266.89,2009.0,13086.96


---

Finding and Replacing Bad Values

---

In [12]:
## Checking for bad values
df_sales["Item_Fat_Content"].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [None]:
df_sales.replace(["LF", "reg", "low fat"], ["Low Fat", "Regular", "Low Fat"], inplace = True)

In [None]:
df_sales["Outlet_Size"].value_counts()

Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64

In [14]:
df_sales["Outlet_Location_Type"].value_counts()

Tier 3    3350
Tier 2    2785
Tier 1    2388
Name: Outlet_Location_Type, dtype: int64

In [15]:
df_sales["Outlet_Type"].value_counts()

Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2     928
Name: Outlet_Type, dtype: int64

In [19]:
df_sales["Item_Type"].value_counts()

Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: Item_Type, dtype: int64

In [23]:
## Spliting the data and running the train test split
X = df_sales[["Item_Weight", "Outlet_Establishment_Year", "Outlet_Size", "Outlet_Location_Type", "Outlet_Type", "Item_Fat_Content", "Item_Visibility", "Item_Type", "Item_MRP"]].copy()
y = df_sales["Item_Outlet_Sales"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [22]:
scaler = StandardScaler()
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

mean_imputer = SimpleImputer(strategy='mean')
freq_imputer = SimpleImputer(strategy='most_frequent')
 
missing_imputer = SimpleImputer(strategy='constant', fill_value='missing')

In [26]:
## Creating the ordinal lists in order of least to most
size_labels = ["Small", "Medium", "High"]
outlet_location_type_labels = ["Tier 1", "Tier 2", "Tier 3"]
ordered_labels = [size_labels, outlet_location_type_labels]

ordinal = OrdinalEncoder(categories = ordered_labels)

In [31]:
## Creating pipelines
num_pipeline = make_pipeline(mean_imputer, scaler)
ord_pipeline = make_pipeline(freq_imputer, ordinal)
nom_pipelne = make_pipeline(missing_imputer, ohe)

In [32]:
X = df_sales[["Item_Weight", "Outlet_Establishment_Year", "Outlet_Size", "Outlet_Location_Type", "Outlet_Type", "Item_Fat_Content", "Item_Visibility", "Item_Type", "Item_MRP"]].copy()

ordinal_columns = ["Outlet_Size", "Outlet_Location_Type"]
numerical_columns = ["Item_Weight", "Outlet_Establishment_Year", "Item_Visibility", "Item_MRP"]
nominal_columns = ["Item_Fat_Content", "Outlet_Type", "Item_Fat_Content", "Item_Type"]

ordinal_tuple = (ord_pipeline, ordinal_columns)
numerical_tuple = (num_pipeline, numerical_columns)
nominal_tuple = (nom_pipelne, nominal_columns)

In [33]:
preprocessor = make_column_transformer(ordinal_tuple, numerical_tuple, nominal_tuple, remainder='drop')
preprocessor.fit(X_train)

In [34]:
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [35]:
X_train_transformed.shape

(6392, 36)

In [36]:
X_train_transformed[:10]

array([[ 1.00000000e+00,  2.00000000e+00,  8.17248678e-01,
         1.32784893e+00, -7.12775072e-01,  1.82810922e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.00000000e+00,  2.00000000e+00,  5.56339503e-01,
         1.32784893e+00, -1.29105225e+00,  6.03368881e-01,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+