In [12]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
import seaborn as sns

In [13]:
df = sns.load_dataset('tips')
print(type(df))
print(df.head())

<class 'pandas.core.frame.DataFrame'>
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4


In [14]:
encoder = OneHotEncoder()
encoded = encoder.fit_transform(df[['sex','smoker','day','time']]).toarray()
print(encoded)

[[1. 0. 1. ... 0. 1. 0.]
 [0. 1. 1. ... 0. 1. 0.]
 [0. 1. 1. ... 0. 1. 0.]
 ...
 [0. 1. 0. ... 0. 1. 0.]
 [0. 1. 1. ... 0. 1. 0.]
 [1. 0. 1. ... 1. 1. 0.]]


In [15]:
encoded_df = pd.DataFrame(encoded,columns=encoder.get_feature_names_out())
encoded_df

Unnamed: 0,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
239,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [16]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [19]:
lbl_encoder = LabelEncoder()
encoded_sex = lbl_encoder.fit_transform(df['sex'])
encoded_smoker = lbl_encoder.fit_transform(df['smoker'])
encoded_day = lbl_encoder.fit_transform(df['day'])
encoded_time = lbl_encoder.fit_transform(df['time'])

In [20]:
encoded_df = pd.DataFrame({
    'sex':encoded_sex,
    'smoker':encoded_smoker,
    'day':encoded_day,
    'time':encoded_time
})

In [21]:
encoded_df

Unnamed: 0,sex,smoker,day,time
0,0,0,2,0
1,1,0,2,0
2,1,0,2,0
3,1,0,2,0
4,0,0,2,0
...,...,...,...,...
239,1,0,1,0
240,0,1,1,0
241,1,1,1,0
242,1,0,1,0


### Target Guided Ordinal Encoding

In [2]:
import seaborn as sns
import pandas as pd

In [3]:
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [13]:
grouped = df.groupby('time')['total_bill'].mean().to_dict()
print(grouped)

{'Lunch': 17.168676470588235, 'Dinner': 20.79715909090909}


  grouped = df.groupby('time')['total_bill'].mean().to_dict()


In [17]:
df2 = pd.DataFrame(df['total_bill'],columns=['total_bill'])
df2['time_encoded']=df['time'].map(grouped)

In [18]:
print(df2)

     total_bill time_encoded
0         16.99    20.797159
1         10.34    20.797159
2         21.01    20.797159
3         23.68    20.797159
4         24.59    20.797159
..          ...          ...
239       29.03    20.797159
240       27.18    20.797159
241       22.67    20.797159
242       17.82    20.797159
243       18.78    20.797159

[244 rows x 2 columns]


In [20]:
df3 = pd.concat([df2,df['time']],axis = 1)

In [21]:
df3

Unnamed: 0,total_bill,time_encoded,time
0,16.99,20.797159,Dinner
1,10.34,20.797159,Dinner
2,21.01,20.797159,Dinner
3,23.68,20.797159,Dinner
4,24.59,20.797159,Dinner
...,...,...,...
239,29.03,20.797159,Dinner
240,27.18,20.797159,Dinner
241,22.67,20.797159,Dinner
242,17.82,20.797159,Dinner
