# One Hot Encoding or Nominal Encoding

In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [3]:
## Creating a simple DataFrame

df = pd.DataFrame({
    'color': ['red', 'blue', 'green', 'blue', 'red']
})

In [4]:
## Creating an instance of OneHotEncoder
encoder = OneHotEncoder()

In [5]:
## Perform fit and transform
encoded = encoder.fit_transform(df[['color']]).toarray()

In [6]:
import pandas as pd
encoded_df = pd.DataFrame(encoded, columns = encoder.get_feature_names_out())

In [7]:
encoded_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0


In [10]:
encoder.transform([['blue']]).toarray()



array([[1., 0., 0.]])

In [11]:
pd.concat([df,encoded_df], axis=1)

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,blue,1.0,0.0,0.0
4,red,0.0,0.0,1.0


In [12]:
import seaborn as sns
df = sns.load_dataset('tips')

In [13]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [14]:
encoder = OneHotEncoder()
encoded = encoder.fit_transform(df[['sex','smoker','day','time']]).toarray()

In [15]:
encoded

array([[1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.]], shape=(244, 10))

In [17]:
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out())
encoded_df

Unnamed: 0,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
239,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [18]:
pd.concat([df, encoded_df], axis=1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,21.01,3.50,Male,No,Sun,Dinner,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,27.18,2.00,Female,Yes,Sat,Dinner,2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,17.82,1.75,Male,No,Sat,Dinner,2,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


# Label Encoding

In [20]:
df = pd.DataFrame({
    'color': ['red', 'blue', 'green', 'blue', 'red']
})

In [22]:
df

Unnamed: 0,color
0,red
1,blue
2,green
3,blue
4,red


In [24]:
from sklearn.preprocessing import LabelEncoder
lbl_encoder = LabelEncoder()

In [31]:
lbl_encoder.fit_transform(df[['color']])

  y = column_or_1d(y, warn=True)


array([2, 0, 1, 0, 2])

In [34]:
lbl_encoder.transform(['blue'])

array([0])

# Ordinal Encoding

In [41]:
from sklearn.preprocessing import OrdinalEncoder

In [42]:
df = pd.DataFrame({
    'size': ['small', 'medium', 'large', 'medium', 'small','large']
})

In [43]:
df

Unnamed: 0,size
0,small
1,medium
2,large
3,medium
4,small
5,large


In [44]:
## Creating an instance of OrdinalEncoder and then fit_transform
encoder = OrdinalEncoder(categories=[['small', 'medium', 'large']])

In [45]:
encoder.fit_transform(df[['size']])

array([[0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.]])

In [50]:
encoder.transform([['large']])



array([[2.]])

In [51]:
encoded = encoder.fit_transform(df[['size']])
encoded_df = pd.DataFrame(encoded, columns=['size'])

In [53]:
df = pd.concat([df, encoded_df], axis=1)

In [54]:
df

Unnamed: 0,size,size.1
0,small,0.0
1,medium,1.0
2,large,2.0
3,medium,1.0
4,small,0.0
5,large,2.0


# Target Guided Ordinal Encoding

In [55]:
df = pd.DataFrame({
    'city': ['New York', 'London', 'Paris', 'Tokyo', 'New York','Paris'],
    'price': [200,150,300,250,180,320]
})

In [56]:
df

Unnamed: 0,city,price
0,New York,200
1,London,150
2,Paris,300
3,Tokyo,250
4,New York,180
5,Paris,320


In [58]:
mean_price = df.groupby('city')['price'].mean().to_dict()

In [59]:
mean_price

{'London': 150.0, 'New York': 190.0, 'Paris': 310.0, 'Tokyo': 250.0}

In [60]:
df['city_encoded'] = df['city'].map(mean_price)

In [61]:
df

Unnamed: 0,city,price,city_encoded
0,New York,200,190.0
1,London,150,150.0
2,Paris,300,310.0
3,Tokyo,250,250.0
4,New York,180,190.0
5,Paris,320,310.0


In [62]:
df = sns.load_dataset('tips')

In [63]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [65]:
mean_time = df.groupby('time')['total_bill'].mean()

  mean_time = df.groupby('time')['total_bill'].mean()


In [66]:
df['mean_time'] = df['time'].map(mean_time)

In [67]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,mean_time
0,16.99,1.01,Female,No,Sun,Dinner,2,20.797159
1,10.34,1.66,Male,No,Sun,Dinner,3,20.797159
2,21.01,3.50,Male,No,Sun,Dinner,3,20.797159
3,23.68,3.31,Male,No,Sun,Dinner,2,20.797159
4,24.59,3.61,Female,No,Sun,Dinner,4,20.797159
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,20.797159
240,27.18,2.00,Female,Yes,Sat,Dinner,2,20.797159
241,22.67,2.00,Male,Yes,Sat,Dinner,2,20.797159
242,17.82,1.75,Male,No,Sat,Dinner,2,20.797159


In [69]:
df[['total_bill','time','mean_time']]

Unnamed: 0,total_bill,time,mean_time
0,16.99,Dinner,20.797159
1,10.34,Dinner,20.797159
2,21.01,Dinner,20.797159
3,23.68,Dinner,20.797159
4,24.59,Dinner,20.797159
...,...,...,...
239,29.03,Dinner,20.797159
240,27.18,Dinner,20.797159
241,22.67,Dinner,20.797159
242,17.82,Dinner,20.797159
