# Encoding

### Feature Encoding

Feature encoding is the process of transforming categorical features into numeric features. This is necessary because machine learning algorithms can only handle numeric features. There are many different ways to encode categorical features, and each method has its own advantages and disadvantages. In this notebook, we will explore some of the most popular methods for encoding categorical features, such as:

### Features:
Label encoding Ordinal encoding One-hot encoding Binary encoding


In [1]:
#import Libraries
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 

In [2]:
# data load
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [8]:
df.describe()

Unnamed: 0,total_bill,tip,size,encoded_time
count,244.0,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672,0.278689
std,8.902412,1.383638,0.9511,0.449276
min,3.07,1.0,1.0,0.0
25%,13.3475,2.0,2.0,0.0
50%,17.795,2.9,2.0,0.0
75%,24.1275,3.5625,3.0,1.0
max,50.81,10.0,6.0,1.0


In [3]:
df['time'].value_counts()

time
Dinner    176
Lunch      68
Name: count, dtype: int64

In [5]:
df.shape

(244, 7)

In [13]:
df['encoded_time'] =df['time'].map({'Lunch':0 , 'Dinner':1}).astype(int)

In [14]:
df['encoded_time'].value_counts()


encoded_time
1    176
0     68
Name: count, dtype: int64

In [17]:
# Let's encode the time in label encoder with sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

le = LabelEncoder()
df['encoded_time'] = le.fit_transform(df['time'])
df.head()


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,encoded_time
0,16.99,1.01,Female,No,Sun,Dinner,2,0
1,10.34,1.66,Male,No,Sun,Dinner,3,0
2,21.01,3.5,Male,No,Sun,Dinner,3,0
3,23.68,3.31,Male,No,Sun,Dinner,2,0
4,24.59,3.61,Female,No,Sun,Dinner,4,0


In [18]:
df['encoded_time'].value_counts()


encoded_time
0    176
1     68
Name: count, dtype: int64

In [19]:
# encode day column by mapping
df['day'].value_counts()


day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [20]:
# Ordinal Encoding the day column using specific order
oe = OrdinalEncoder(categories=[['Thur', 'Fri', 'Sat', 'Sun']])
df['encoded_day'] = oe.fit_transform(df[['day']])
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,encoded_time,encoded_day
0,16.99,1.01,Female,No,Sun,Dinner,2,0,3.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0,3.0
2,21.01,3.5,Male,No,Sun,Dinner,3,0,3.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0,3.0
4,24.59,3.61,Female,No,Sun,Dinner,4,0,3.0


In [12]:
df['encoded_day'].value_counts()

encoded_day
2.0    87
3.0    76
0.0    62
1.0    19
Name: count, dtype: int64

In [21]:
# One hot encoding the day column
ohe = OneHotEncoder()
df_encoded = ohe.fit_transform(df[['day']]).toarray()
df_encoded

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],


In [14]:
# pip install category_encoders

Note: you may need to restart the kernel to use updated packages.


In [22]:
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Load the Titanic dataset from seaborn
titanic = sns.load_dataset('titanic')

# Create an instance of the OneHotEncoder
onehot_encoder = OneHotEncoder()

# Perform one-hot encoding on the 'embarked' column
embarked_onehot = onehot_encoder.fit_transform(titanic[['embarked']]).toarray()

# Create a DataFrame from the one-hot encoded array with column names
embarked_onehot_df = pd.DataFrame(embarked_onehot, columns=onehot_encoder.categories_[0])

# Concatenate the original Titanic dataset with the one-hot encoded DataFrame
titanic = pd.concat([titanic.reset_index(drop=True), embarked_onehot_df], axis=1)

# Display the first few rows of the modified Titanic dataset
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,C,Q,S,NaN
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,0.0,0.0,1.0,0.0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,1.0,0.0,0.0,0.0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,0.0,0.0,1.0,0.0
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0.0,0.0,1.0,0.0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,0.0,0.0,1.0,0.0


In [23]:
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [17]:
from category_encoders import BinaryEncoder

binary_encoder = BinaryEncoder()
df_binary = binary_encoder.fit_transform(df['day'])
df_binary

Unnamed: 0,day_0,day_1,day_2
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
239,0,1,0
240,0,1,0
241,0,1,0
242,0,1,0


In [18]:
# pandas get dummies
df = pd.get_dummies(df, columns=['sex', 'smoker', 'day', 'time'])
df

Unnamed: 0,total_bill,tip,size,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,16.99,1.01,2,False,True,False,True,False,False,False,True,False,True
1,10.34,1.66,3,True,False,False,True,False,False,False,True,False,True
2,21.01,3.50,3,True,False,False,True,False,False,False,True,False,True
3,23.68,3.31,2,True,False,False,True,False,False,False,True,False,True
4,24.59,3.61,4,False,True,False,True,False,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,True,False,False,True,False,False,True,False,False,True
240,27.18,2.00,2,False,True,True,False,False,False,True,False,False,True
241,22.67,2.00,2,True,False,True,False,False,False,True,False,False,True
242,17.82,1.75,2,True,False,False,True,False,False,True,False,False,True
