# Preliminary Steps

In [32]:
# Imports: pandas/numpy for data handling, scikit-learn for preprocessing.
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import set_config
# Show estimators/pipelines as diagrams instead of plain-text reprs.
set_config(display='diagram')

In [33]:
# Loading data
# NOTE(review): absolute path, presumably a Colab upload location -- adjust
# it when running the notebook in another environment.
csv_path = '/content/sales_predictions.csv'
df = pd.read_csv(csv_path)
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


# Getting rid of unnecessary columns

In [34]:
# Columns removed before modelling: the two identifier columns and Item_Weight.
unused_columns = ['Item_Identifier', 'Outlet_Identifier', 'Item_Weight']
df = df.drop(columns=unused_columns)

In [35]:
# Dtypes and non-null counts per column, to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Fat_Content           8523 non-null   object 
 1   Item_Visibility            8523 non-null   float64
 2   Item_Type                  8523 non-null   object 
 3   Item_MRP                   8523 non-null   float64
 4   Outlet_Establishment_Year  8523 non-null   int64  
 5   Outlet_Size                6113 non-null   object 
 6   Outlet_Location_Type       8523 non-null   object 
 7   Outlet_Type                8523 non-null   object 
 8   Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(3), int64(1), object(5)
memory usage: 599.4+ KB


## The categorical 'Outlet_Size' column has missing values (only 6113 of 8523 entries are non-null) that will need to be imputed.

## Looking for duplicate values.

In [36]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [37]:
# Keep the column index around; the per-column inspection loops iterate over it.
columns = df.columns
columns

Index(['Item_Fat_Content', 'Item_Visibility', 'Item_Type', 'Item_MRP',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')

# Checking for questionable values. 

In [38]:
# Print summary statistics for every column, one block per column.
for col in columns:
    # Single print call: header line, the describe() summary, then the
    # blank-line spacer, each separated by a newline.
    print(f'{col}:', df[col].describe(), '\n\n', sep='\n')

Item_Fat_Content:
count        8523
unique          5
top       Low Fat
freq         5089
Name: Item_Fat_Content, dtype: object



Item_Visibility:
count    8523.000000
mean        0.066132
std         0.051598
min         0.000000
25%         0.026989
50%         0.053931
75%         0.094585
max         0.328391
Name: Item_Visibility, dtype: float64



Item_Type:
count                      8523
unique                       16
top       Fruits and Vegetables
freq                       1232
Name: Item_Type, dtype: object



Item_MRP:
count    8523.000000
mean      140.992782
std        62.275067
min        31.290000
25%        93.826500
50%       143.012800
75%       185.643700
max       266.888400
Name: Item_MRP, dtype: float64



Outlet_Establishment_Year:
count    8523.000000
mean     1997.831867
std         8.371760
min      1985.000000
25%      1987.000000
50%      1999.000000
75%      2004.000000
max      2009.000000
Name: Outlet_Establishment_Year, dtype: float64



Outlet_Size:

Checking the value counts of every column to identify nominal and ordinal categorical features.

In [39]:
# Print the frequency of each distinct value, one block per column.
for col in columns:
    # Single print call: header line, the value counts, then the blank-line
    # spacer, each separated by a newline.
    print(f'{col}:', df[col].value_counts(), '\n\n', sep='\n')

Item_Fat_Content:
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64



Item_Visibility:
0.000000    526
0.076975      3
0.162462      2
0.076841      2
0.073562      2
           ... 
0.013957      1
0.110460      1
0.124646      1
0.054142      1
0.044878      1
Name: Item_Visibility, Length: 7880, dtype: int64



Item_Type:
Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: Item_Type, dtype: int64



Item_MRP:
172.0422    7
170.5422    6
196.5084    6
188.1872    6
142.0154    6
           .

## The 'Outlet_Size' and 'Outlet_Location_Type' columns will need to be ordinal encoded, and the 'Item_Fat_Content' column has inconsistent labels ('LF', 'low fat', 'reg') that need to be standardized.

In [40]:
# Ordinal encoding for Outlet_Size: larger outlets get larger integers.
# ('Low' is kept from the original mapping even though it does not appear in
# the value counts of this column.)
replace = {
    'Low': 0,
    'Small': 1,
    'Medium': 2,
    'High': 3,
}
# Assign the encoded column back to the frame rather than calling
# `df.Outlet_Size.replace(..., inplace=True)`: that chained in-place call
# mutates an intermediate Series, which pandas warns about and which leaves
# `df` untouched once Copy-on-Write is enabled.
# `map` yields a numeric column directly; missing sizes stay NaN so they can
# be imputed later. Any label absent from the mapping would also become NaN.
df['Outlet_Size'] = df['Outlet_Size'].map(replace)
# Check the encoded distribution.
df['Outlet_Size'].value_counts()

2.0    2793
1.0    2388
3.0     932
Name: Outlet_Size, dtype: int64

In [41]:
# Ordinal encoding for Outlet_Location_Type: Tier 1 < Tier 2 < Tier 3.
replais = {
    'Tier 1': 0,
    'Tier 2': 1,
    'Tier 3': 2,
}
# Assign the encoded column back to the frame rather than calling
# `df.Outlet_Location_Type.replace(..., inplace=True)`: that chained in-place
# call mutates an intermediate Series, which pandas warns about and which
# leaves `df` untouched once Copy-on-Write is enabled.
# `map` yields a numeric column directly; any label absent from the mapping
# would become NaN.
df['Outlet_Location_Type'] = df['Outlet_Location_Type'].map(replais)
# Check the encoded distribution.
df['Outlet_Location_Type'].value_counts()

2    3350
1    2785
0    2388
Name: Outlet_Location_Type, dtype: int64

In [42]:
# Collapse the inconsistent spellings of the fat-content labels into the two
# canonical categories, 'Low Fat' and 'Regular'.
# The result is assigned back to the frame instead of using a chained
# `df.Item_Fat_Content.replace(..., inplace=True)`, which mutates an
# intermediate Series and leaves `df` untouched once Copy-on-Write is enabled.
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular',
})
# Only the two canonical labels should remain.
df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

## Now we can split our data

# Validation Split

In [43]:
# Features: every column except the sales figure; target: the sales figure.
X = df.drop(columns='Item_Outlet_Sales')
y = df['Item_Outlet_Sales']

In [44]:
# Hold out a test set; no test_size is passed, so scikit-learn's default split
# proportion applies. random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42)

# Instantiating column selectors

In [45]:
# Numeric: selects every column with a numeric dtype. This includes
# Outlet_Size and Outlet_Location_Type, which were ordinal-encoded to numbers
# in earlier cells.
num_selector = make_column_selector(dtype_include = 'number')
# Calling the selector on the training frame lists the columns it matches.
num_selector(X_train)

['Item_Visibility',
 'Item_MRP',
 'Outlet_Establishment_Year',
 'Outlet_Size',
 'Outlet_Location_Type']

In [46]:
# Categorical: selects the remaining object-dtype (string) columns.
cat_selector = make_column_selector(dtype_include = 'object')
# Calling the selector on the training frame lists the columns it matches.
cat_selector(X_train)

['Item_Fat_Content', 'Item_Type', 'Outlet_Type']

# Instantiating transformers

In [47]:
# Imputers: mean for the numeric pipeline, most frequent value for the
# categorical pipeline.
mean_imputer = SimpleImputer(strategy='mean')
mfrq_imputer = SimpleImputer(strategy='most_frequent')

# One-hot encoder returning a dense array; categories not seen during fit are
# ignored at transform time instead of raising.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# update this argument when upgrading.
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Scaler for the numeric features.
scaler = StandardScaler()
# Preview of the scaled numeric training columns. No imputation has been
# applied at this point, so missing values are still NaN in this output.
scaler.fit_transform(X_train.loc[:, num_selector(X_train)])

array([[-0.71277507,  1.82810922,  1.32784893,  0.34518029,  1.08494779],
       [-1.29105225,  0.60336888,  1.32784893,  0.34518029,  1.08494779],
       [ 1.81331864,  0.24454056,  0.13618724,  0.34518029, -1.38477667],
       ...,
       [-0.92052713,  1.52302674,  0.49368575,         nan, -0.14991444],
       [-0.2277552 , -0.38377708,  1.0895166 ,         nan, -0.14991444],
       [-0.95867683, -0.73836105, -0.10214509, -1.09333266, -1.38477667]])

In [48]:
# Instantiating pipelines
# Numeric columns: impute missing values with the mean, then standardize.
num_pipe = make_pipeline(mean_imputer, scaler)
# Categorical columns: impute with the most frequent value, then one-hot encode.
cat_pipe = make_pipeline(mfrq_imputer, ohe)
# Show both pipelines: the first via display(), the second as the cell output.
display(num_pipe)
cat_pipe

# Making our ColumnTransformer

In [49]:
# First, pair each pipeline with the selector that picks its columns.
num_tuple = (num_pipe, num_selector)
cat_tuple = (cat_pipe, cat_selector)

In [50]:
# make_column_transformer takes the (pipeline, column selector) tuples and
# applies each pipeline to the columns its selector matches.
preprocessor = make_column_transformer(num_tuple, cat_tuple)
preprocessor

In [51]:
# Fit on the training data only, so that imputation values, scaling statistics
# and encoder categories are learned without looking at the test set.
preprocessor.fit(X_train)

In [52]:
# Apply the fitted preprocessor to both splits (training first, then test).
X_train_processed, X_test_processed = (
    preprocessor.transform(X_train),
    preprocessor.transform(X_test),
)

In [53]:
# Inspect both processed arrays: the training array via display(), the test
# array as the cell output.
display(X_train_processed)
X_test_processed

array([[-0.71277507,  1.82810922,  1.32784893, ...,  0.        ,
         1.        ,  0.        ],
       [-1.29105225,  0.60336888,  1.32784893, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.81331864,  0.24454056,  0.13618724, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.92052713,  1.52302674,  0.49368575, ...,  1.        ,
         0.        ,  0.        ],
       [-0.2277552 , -0.38377708,  1.0895166 , ...,  1.        ,
         0.        ,  0.        ],
       [-0.95867683, -0.73836105, -0.10214509, ...,  1.        ,
         0.        ,  0.        ]])

array([[-0.77664625, -0.99881554, -1.29380678, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.1003166 , -1.58519423, -0.10214509, ...,  1.        ,
         0.        ,  0.        ],
       [-0.48299432, -1.59578435,  0.13618724, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 1.21832428,  1.09397975,  0.49368575, ...,  1.        ,
         0.        ,  0.        ],
       [-0.77809567, -0.36679966,  0.13618724, ...,  1.        ,
         0.        ,  0.        ],
       [-0.77976293,  0.11221189,  1.0895166 , ...,  1.        ,
         0.        ,  0.        ]])