## Processing on categorical columns

In [51]:
# Importing the required packages
import numpy as np
import pandas as pd 
import seaborn as sns
# pd.options.display.float_format = '{:,.2f}'.format

In [52]:
# Loading the data
filePath = 'data/bigmart.csv'
data = pd.read_csv(filePath)
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.02,Dairy,249.81,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.14
1,DRC01,5.92,Regular,0.02,Soft Drinks,48.27,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.42
2,FDN15,17.5,Low Fat,0.02,Meat,141.62,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.09,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.86,OUT013,1987,High,Tier 3,Supermarket Type1,994.71


In [53]:
# Drop the item identifier columns since it will add no value to the data at this point
data=data.drop(['Item_Identifier'], axis=1)

In [54]:
# check for all the objects data types that has missing values on them
data.select_dtypes('object').isnull().sum()

Item_Fat_Content           0
Item_Type                  0
Outlet_Identifier          0
Outlet_Size             2410
Outlet_Location_Type       0
Outlet_Type                0
dtype: int64

In [55]:
# Checking for the frequency of the contents on the Outlet_Size
data['Outlet_Size'].value_counts()

Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64

In [56]:
# Replace the missing values with the medium
data['Outlet_Size'].fillna('Medium',inplace=True)

In [57]:
# Check for the missing values of only objects
data.select_dtypes('object').isnull().sum()

Item_Fat_Content        0
Item_Type               0
Outlet_Identifier       0
Outlet_Size             0
Outlet_Location_Type    0
Outlet_Type             0
dtype: int64

### Label Encoding

In [58]:
# Importing the required package
from sklearn.preprocessing import LabelEncoder

In [59]:
# Inialize the encoder
lab_encoder = LabelEncoder()

In [60]:
data['Item_Fat_Content_encode'] = lab_encoder.fit_transform(data['Item_Fat_Content'])
data['Item_Type_encode'] = lab_encoder.fit_transform(data['Item_Type'])
data['Outlet_Identifier_encode'] = lab_encoder.fit_transform(data['Outlet_Identifier'])
data['Outlet_Size_encode'] = lab_encoder.fit_transform(data['Outlet_Size'])
data['Outlet_Location_Type_encode'] = lab_encoder.fit_transform(data['Outlet_Location_Type'])
data['Outlet_Type_encode'] = lab_encoder.fit_transform(data['Outlet_Type'])

In [61]:
data.head(7)

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Fat_Content_encode,Item_Type_encode,Outlet_Identifier_encode,Outlet_Size_encode,Outlet_Location_Type_encode,Outlet_Type_encode
0,9.3,Low Fat,0.02,Dairy,249.81,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.14,1,4,9,1,0,1
1,5.92,Regular,0.02,Soft Drinks,48.27,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.42,2,14,3,1,2,2
2,17.5,Low Fat,0.02,Meat,141.62,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,1,10,9,1,0,1
3,19.2,Regular,0.0,Fruits and Vegetables,182.09,OUT010,1998,Medium,Tier 3,Grocery Store,732.38,2,6,0,1,2,0
4,8.93,Low Fat,0.0,Household,53.86,OUT013,1987,High,Tier 3,Supermarket Type1,994.71,1,9,1,0,2,1
5,10.39,Regular,0.0,Baking Goods,51.4,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.61,2,0,3,1,2,2
6,13.65,Regular,0.01,Snack Foods,57.66,OUT013,1987,High,Tier 3,Supermarket Type1,343.55,2,13,1,0,2,1


## Feature mapping for ordinal variable


In [62]:
# Importing the data set
data2 = pd.read_csv(filePath)
data2.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.02,Dairy,249.81,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.14
1,DRC01,5.92,Regular,0.02,Soft Drinks,48.27,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.42
2,FDN15,17.5,Low Fat,0.02,Meat,141.62,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.09,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.86,OUT013,1987,High,Tier 3,Supermarket Type1,994.71


In [63]:
# Check for missing values on the object data types
data2.select_dtypes('object').isnull().sum()

Item_Identifier            0
Item_Fat_Content           0
Item_Type                  0
Outlet_Identifier          0
Outlet_Size             2410
Outlet_Location_Type       0
Outlet_Type                0
dtype: int64

In [64]:
# Check the content of the values in the Outlet_Size
data2['Outlet_Size'].value_counts()

Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64

In [65]:
# Fill in missing values for the Outlet_Size using the mode
data2['Outlet_Size'].fillna(data2['Outlet_Size'].mode()[0], inplace=True)

In [66]:
data.select_dtypes('object').isnull().sum()

Item_Fat_Content        0
Item_Type               0
Outlet_Identifier       0
Outlet_Size             0
Outlet_Location_Type    0
Outlet_Type             0
dtype: int64

In [67]:
# Code the categores of the content of Outlet_Size with medium being 1, small being 2 and high being 3
data2['Outlet_Size'] = data2['Outlet_Size'].replace(('Medium', 'Small', 'High'), (1,2,3))

In [68]:
data2['Outlet_Size'].value_counts()

1    5203
2    2388
3     932
Name: Outlet_Size, dtype: int64

## One HotEncoding with get dummies

In [69]:
# Loading the data
filePath2 = 'data/employee.csv'
data3 = pd.read_csv(filePath2)
data3.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [70]:
# One hot encode the department column
df = pd.get_dummies(data3['Department'])

In [71]:
df.head()

Unnamed: 0,Human Resources,Research & Development,Sales
0,0,0,1
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0


In [72]:
data3 = pd.concat([data3,df], axis=1)

In [73]:
# Check the columns of the data frame
data3.columns.tolist() 

['Age',
 'Attrition',
 'BusinessTravel',
 'DailyRate',
 'Department',
 'DistanceFromHome',
 'Education',
 'EducationField',
 'EmployeeCount',
 'EmployeeNumber',
 'EnvironmentSatisfaction',
 'Gender',
 'HourlyRate',
 'JobInvolvement',
 'JobLevel',
 'JobRole',
 'JobSatisfaction',
 'MaritalStatus',
 'MonthlyIncome',
 'MonthlyRate',
 'NumCompaniesWorked',
 'Over18',
 'OverTime',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StandardHours',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager',
 'Human Resources',
 'Research & Development',
 'Sales']

In [74]:
# Dropping the department column
data3 = data3.drop(['Department'], axis=1)

In [75]:
data3.columns.tolist()

['Age',
 'Attrition',
 'BusinessTravel',
 'DailyRate',
 'DistanceFromHome',
 'Education',
 'EducationField',
 'EmployeeCount',
 'EmployeeNumber',
 'EnvironmentSatisfaction',
 'Gender',
 'HourlyRate',
 'JobInvolvement',
 'JobLevel',
 'JobRole',
 'JobSatisfaction',
 'MaritalStatus',
 'MonthlyIncome',
 'MonthlyRate',
 'NumCompaniesWorked',
 'Over18',
 'OverTime',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StandardHours',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager',
 'Human Resources',
 'Research & Development',
 'Sales']

In [76]:
data3.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,...,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Human Resources,Research & Development,Sales
0,41,Yes,Travel_Rarely,1102,1,2,Life Sciences,1,1,2,...,8,0,1,6,4,0,5,0,0,1
1,49,No,Travel_Frequently,279,8,1,Life Sciences,1,2,3,...,10,3,3,10,7,1,7,0,1,0
2,37,Yes,Travel_Rarely,1373,2,2,Other,1,4,4,...,7,3,3,0,0,0,0,0,1,0
3,33,No,Travel_Frequently,1392,3,4,Life Sciences,1,5,4,...,8,3,3,8,7,3,0,0,1,0
4,27,No,Travel_Rarely,591,2,1,Medical,1,7,1,...,6,3,3,2,2,2,2,0,1,0


## One HotEncoding using OneHotEncoder

In [77]:
# Loading the data
filePath2 = 'data/employee.csv'
data4 = pd.read_csv(filePath2)
data4.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [78]:
# Importing the one hot encoder
from sklearn.preprocessing import OneHotEncoder

In [79]:
oneEncoder = OneHotEncoder()

In [80]:
one_encoder = oneEncoder.fit(data4['Department'].values.reshape(-1,1))

In [81]:
one_encoder.categories_

[array(['Human Resources', 'Research & Development', 'Sales'], dtype=object)]

In [82]:
hotencoder = one_encoder.transform(data4['Department'].values.reshape(-1,1)).toarray()

In [83]:
hotencoder

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [84]:
# Creating a data frame for the hot encoder
label_df = pd.DataFrame()

In [85]:
label_df['Human_Resources_enc'] = hotencoder[:,0]
label_df['Research_Development_enc'] = hotencoder[:,1]
label_df['Sales_enc'] = hotencoder[:,2]

In [86]:
label_df

Unnamed: 0,Human_Resources_enc,Research_Development_enc,Sales_enc
0,0.00,0.00,1.00
1,0.00,1.00,0.00
2,0.00,1.00,0.00
3,0.00,1.00,0.00
4,0.00,1.00,0.00
...,...,...,...
1465,0.00,1.00,0.00
1466,0.00,1.00,0.00
1467,0.00,1.00,0.00
1468,0.00,0.00,1.00


In [87]:
data4 = pd.concat([data4, label_df])
data4

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Human_Resources_enc,Research_Development_enc,Sales_enc
0,41.00,Yes,Travel_Rarely,1102.00,Sales,1.00,2.00,Life Sciences,1.00,1.00,...,8.00,0.00,1.00,6.00,4.00,0.00,5.00,,,
1,49.00,No,Travel_Frequently,279.00,Research & Development,8.00,1.00,Life Sciences,1.00,2.00,...,10.00,3.00,3.00,10.00,7.00,1.00,7.00,,,
2,37.00,Yes,Travel_Rarely,1373.00,Research & Development,2.00,2.00,Other,1.00,4.00,...,7.00,3.00,3.00,0.00,0.00,0.00,0.00,,,
3,33.00,No,Travel_Frequently,1392.00,Research & Development,3.00,4.00,Life Sciences,1.00,5.00,...,8.00,3.00,3.00,8.00,7.00,3.00,0.00,,,
4,27.00,No,Travel_Rarely,591.00,Research & Development,2.00,1.00,Medical,1.00,7.00,...,6.00,3.00,3.00,2.00,2.00,2.00,2.00,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,,,,,,,,,,,...,,,,,,,,0.00,1.00,0.00
1466,,,,,,,,,,,...,,,,,,,,0.00,1.00,0.00
1467,,,,,,,,,,,...,,,,,,,,0.00,1.00,0.00
1468,,,,,,,,,,,...,,,,,,,,0.00,0.00,1.00


THE END