In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning issued from here on so cell output stays clean.
# NOTE(review): this also hides deprecation and data-validation warnings
# from pandas / scikit-learn -- consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# **Nominal Encoding:**

One-hot encoding produces one binary (0/1) column per category.
Use it when the categories have NO meaningful order.

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Toy single-column frame of marital-status labels used for the one-hot demo.
df = pd.DataFrame(
    data=["Single", "Seperated", "Seperated", "Single", "Married"],
    columns=["Status"],
)


In [6]:
df

Unnamed: 0,Status
0,Single
1,Seperated
2,Seperated
3,Single
4,Married


In [7]:
# Create a one-hot encoder with default settings; it is fitted on the
# Status column in the next cell.
encoder = OneHotEncoder()

In [16]:
# Fit the encoder on the Status column and convert the result to a dense
# NumPy array so it can be displayed and wrapped in a DataFrame.
encoded = (
    encoder
    .fit_transform(df[["Status"]])
    .toarray()
)

In [17]:
encoded

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [18]:
encoder.get_feature_names_out()

array(['Status_Married', 'Status_Seperated', 'Status_Single'],
      dtype=object)

In [19]:
# Wrap the dense one-hot matrix in a DataFrame, labelling each column with
# the feature name the fitted encoder reports for it.
encoded_df = pd.DataFrame(
    data=encoded,
    columns=encoder.get_feature_names_out(),
)

In [20]:
encoded_df

Unnamed: 0,Status_Married,Status_Seperated,Status_Single
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0


In [22]:
# Encode one new observation. The encoder was fitted on a DataFrame with a
# "Status" column, so the query is passed as a frame with the same column
# name; a bare nested list triggers scikit-learn's feature-name mismatch
# warning (which the global warning filter would otherwise hide).
encoder.transform(pd.DataFrame({"Status": ["Single"]})).toarray()

array([[0., 0., 1.]])

In [24]:
# Encode one new observation, passing it as a frame whose column name matches
# the "Status" column the encoder was fitted on (avoids the feature-name
# mismatch warning that a bare nested list would raise).
encoder.transform(pd.DataFrame({"Status": ["Married"]})).toarray()

array([[1., 0., 0.]])

In [26]:
df

Unnamed: 0,Status
0,Single
1,Seperated
2,Seperated
3,Single
4,Married


In [32]:
# Show the original labels side by side with their one-hot columns
# (column-wise concatenation, rows aligned on the shared index).
pd.concat(objs=[df, encoded_df], axis="columns")

Unnamed: 0,Status,Status_Married,Status_Seperated,Status_Single
0,Single,0.0,0.0,1.0
1,Seperated,0.0,1.0,0.0
2,Seperated,0.0,1.0,0.0
3,Single,0.0,0.0,1.0
4,Married,1.0,0.0,0.0


# **Label Encoding:**

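Assigns an integer code to each category. The codes follow the sorted order of the labels, so they carry no meaningful ranking. Note that scikit-learn's `LabelEncoder` is intended for target labels (`y`), not for input features.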

In [34]:
from sklearn.preprocessing import LabelEncoder

# Create the label encoder; it is fitted on the Status column in a later cell.
label_encoder = LabelEncoder()

In [51]:
# Re-create the marital-status frame so the label-encoding section starts
# from a known input.
df = pd.DataFrame(
    data=["Single", "Seperated", "Seperated", "Single", "Married"],
    columns=["Status"],
)

In [54]:
# Learn the label -> integer mapping from the Status column and return the
# integer code for every row (displayed as the cell output).
label_encoder.fit_transform(df['Status'])

array([2, 1, 1, 2, 0])

In [38]:
label_encoder.transform(['Single'])

array([2])

# **Ordinal Encoding:**

Used when categories have a natural order

For example: low → medium → high, beginner → advanced, etc.

In [43]:
from sklearn.preprocessing import OrdinalEncoder

# Toy single-column frame of education levels used for the ordinal demo.
df2 = pd.DataFrame(
    data=["High School", "Bachelor", "Master", "PhD"],
    columns=["Education"],
)

In [44]:
df2

Unnamed: 0,Education
0,High School
1,Bachelor
2,Master
3,PhD


In [48]:
# Spell out the category order explicitly so the codes follow the education
# ranking (High School=0 < Bachelor=1 < Master=2 < PhD=3) instead of the
# default sorted order of the labels.
encoder = OrdinalEncoder(categories=[['High School', 'Bachelor', 'Master', 'PhD']])
# Fit on df2 -- the frame that actually holds the Education column. The
# original used `df`, which only has a Status column and raises KeyError
# on a fresh Restart & Run All.
encoder.fit_transform(df2[['Education']])

array([[0.],
       [1.],
       [2.],
       [3.]])

In [47]:
# Encode one new observation, passing it as a frame with the "Education"
# column the ordinal encoder is fitted on; a bare nested list triggers
# scikit-learn's feature-name mismatch warning.
encoder.transform(pd.DataFrame({"Education": ["Bachelor"]}))

array([[1.]])

# **Replace the category with the mean or median of its respective group (target encoding):**

In [58]:
import pandas as pd

# Toy bills table: one row per (meal time, bill amount) observation.
df3 = pd.DataFrame(
    data=[
        ('Lunch', 120),
        ('breakfast', 130),
        ('dinner', 90),
        ('Lunch', 125),
        ('breakfast', 366),
    ],
    columns=['time', 'total_bill'],
)


In [59]:
df3

Unnamed: 0,time,total_bill
0,Lunch,120
1,breakfast,130
2,dinner,90
3,Lunch,125
4,breakfast,366


In [60]:
df3.groupby('time')['total_bill'].mean()

Unnamed: 0_level_0,total_bill
time,Unnamed: 1_level_1
Lunch,122.5
breakfast,248.0
dinner,90.0


In [62]:
# Map each meal time to the average bill of its group, as a plain dict
# ready to be used with Series.map.
mean_price = (
    df3
    .groupby('time')['total_bill']
    .mean()
    .to_dict()
)

In [63]:
mean_price

{'Lunch': 122.5, 'breakfast': 248.0, 'dinner': 90.0}

In [66]:
# Replace each meal-time label with the mean bill of its group.
# The lookup must read df3's own 'time' column; the original read `df`,
# which has no 'time' column and raises KeyError on a fresh run.
df3['time_encoded'] = df3['time'].map(mean_price)

In [67]:
df3

Unnamed: 0,time,total_bill,time_encoded
0,Lunch,120,122.5
1,breakfast,130,248.0
2,dinner,90,90.0
3,Lunch,125,122.5
4,breakfast,366,248.0
