### Feature/data encoding: 
Feature encoding is the process of transforming `categorical feature` in `numeric feature `. this is necessary becaues machine learning algorithms can only handle numeric features. there are many different ways to encode categorical features, and each method has its own advantages and disadvantages. in this notebook, we will explore some of the most popular methods for encoding categoricl feature, such as:

+ label encoding
+ Ordinal encoding
+ One-hot encoding
+ Binary encoding

In [52]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [53]:
# data load
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [54]:
df['time'].value_counts()

time
Dinner    176
Lunch      68
Name: count, dtype: int64

In [55]:
# let's encode the time in label encoding with sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
le = LabelEncoder()
df['encoded_time'] = le.fit_transform(df['time'])
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,encoded_time
0,16.99,1.01,Female,No,Sun,Dinner,2,0
1,10.34,1.66,Male,No,Sun,Dinner,3,0
2,21.01,3.5,Male,No,Sun,Dinner,3,0
3,23.68,3.31,Male,No,Sun,Dinner,2,0
4,24.59,3.61,Female,No,Sun,Dinner,4,0


In [56]:
df['encoded_time'].value_counts()

encoded_time
0    176
1     68
Name: count, dtype: int64

In [57]:
df['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [58]:
# ordinal encoding the day column using specific order
ordinal_encoder = OrdinalEncoder(categories=[['Thur', 'Fri', 'Sat', 'Sun']])
df['encoded_day'] = ordinal_encoder.fit_transform(df[['day']])
df.head()   

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,encoded_time,encoded_day
0,16.99,1.01,Female,No,Sun,Dinner,2,0,3.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0,3.0
2,21.01,3.5,Male,No,Sun,Dinner,3,0,3.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0,3.0
4,24.59,3.61,Female,No,Sun,Dinner,4,0,3.0


In [59]:
df['encoded_day'].value_counts()

encoded_day
2.0    87
3.0    76
0.0    62
1.0    19
Name: count, dtype: int64

In [60]:
# one-hot encoding on day column
ohe = OneHotEncoder()
ohe.fit_transform(df[['sex']]).toarray()

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.

In [61]:
# example of one-hot encoding 
titanic_df = sns.load_dataset('titanic')
titanic_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [64]:
titanic_df['embarked'].value_counts()

embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [65]:
onehot = OneHotEncoder(sparse_output=False)
embarked_encoded = onehot.fit_transform(titanic_df[['embarked']])
embarked_encoded_df = pd.DataFrame(embarked_encoded, columns=onehot.get_feature_names_out(['embarked']))
titanic_df = pd.concat([titanic_df.reset_index(drop=True), embarked_encoded_df.reset_index(drop=True)], axis=1)
titanic_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,alive,alone,embarked_C,embarked_Q,embarked_S,embarked_nan,embarked_C.1,embarked_Q.1,embarked_S.1,embarked_nan.1
0,0,3,male,22.0,1,0,7.25,S,Third,man,...,no,False,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,yes,False,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,...,yes,True,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1,1,female,35.0,1,0,53.1,S,First,woman,...,yes,False,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0,3,male,35.0,0,0,8.05,S,Third,man,...,no,True,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


# !pip install category-encoders
https://pypi.org/project/category-encoders/

In [68]:
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [72]:
df['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [71]:
from category_encoders import BinaryEncoder
# binary encoding
be = BinaryEncoder()
df_be = be.fit_transform(df['day'])
df_be.head()


Unnamed: 0,day_0,day_1,day_2
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1


In [75]:
# use pandas get dummies
get_dumies = pd.get_dummies(df , columns=['day'])
get_dumies.head()

Unnamed: 0,total_bill,tip,sex,smoker,time,size,day_Thur,day_Fri,day_Sat,day_Sun
0,16.99,1.01,Female,No,Dinner,2,False,False,False,True
1,10.34,1.66,Male,No,Dinner,3,False,False,False,True
2,21.01,3.5,Male,No,Dinner,3,False,False,False,True
3,23.68,3.31,Male,No,Dinner,2,False,False,False,True
4,24.59,3.61,Female,No,Dinner,4,False,False,False,True


In [76]:
# manual encoding using pandas
manual_encoding = {'Thur': 0, 'Fri': 1, 'Sat': 2, 'Sun': 3}
df['manual_encoded_day'] = df['day'].map(manual_encoding)
df.head()   

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,manual_encoded_day
0,16.99,1.01,Female,No,Sun,Dinner,2,3
1,10.34,1.66,Male,No,Sun,Dinner,3,3
2,21.01,3.5,Male,No,Sun,Dinner,3,3
3,23.68,3.31,Male,No,Sun,Dinner,2,3
4,24.59,3.61,Female,No,Sun,Dinner,4,3
