### OHE Encoding

In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder


In [3]:
df = pd.DataFrame({
    'color': ['red', 'green', 'blue', 'green', 'red']
})

df.head()


Unnamed: 0,color
0,red
1,green
2,blue
3,green
4,red


In [4]:
## Create an instance of the OneHotEncoder
ohe = OneHotEncoder()

In [5]:
## perform fit and transform

encoded_values = ohe.fit_transform(df[['color']]).toarray()

In [6]:
encoded_df = pd.DataFrame(encoded_values,columns=ohe.get_feature_names_out())

In [7]:
encoded_df.head()

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0


In [8]:
df = pd.concat([df,encoded_df],axis=1)
df.head()

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,green,0.0,1.0,0.0
2,blue,1.0,0.0,0.0
3,green,0.0,1.0,0.0
4,red,0.0,0.0,1.0


In [10]:
import seaborn as sns

df  = sns.load_dataset('tips')

In [11]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [13]:
tips_encoded = ohe.fit_transform(df[['sex','smoker','day','time']]).toarray()

In [14]:
tips_encoded_df = pd.DataFrame(tips_encoded,columns=ohe.get_feature_names_out())

In [15]:
tips_df = pd.concat([df,tips_encoded_df],axis=1)
tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,21.01,3.5,Male,No,Sun,Dinner,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


### Label Encoding

In [17]:
df = pd.DataFrame({
    'color': ['red', 'green', 'blue', 'green', 'red']
})

df.head()

Unnamed: 0,color
0,red
1,green
2,blue
3,green
4,red


In [18]:
from sklearn.preprocessing import LabelEncoder

lbl_encoder = LabelEncoder()

In [19]:
lbl_encoder.fit_transform(df[['color']])

  y = column_or_1d(y, warn=True)


array([2, 1, 0, 1, 2])

In [21]:
## Transform single value
lbl_encoder.transform([['red']])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([2])

### Ordinal Encoding

In [29]:
from sklearn.preprocessing import OrdinalEncoder


In [32]:
df = pd.DataFrame({
    'size' :['small','medium','large','large','small','medium']
})


In [33]:

oe_encoder = OrdinalEncoder(categories=[['small','medium','large']])
encoded_values = oe_encoder.fit_transform(df[['size']])

In [38]:
size_df = pd.DataFrame(encoded_values,columns=['size'])

final_size_df = pd.concat([df,size_df],axis=1)

final_size_df.head()

Unnamed: 0,size,size.1
0,small,0.0
1,medium,1.0
2,large,2.0
3,large,2.0
4,small,0.0


### Target guided ordinal encoding

In [39]:
import pandas as pd

# create a sample dataframe with a categorical variable and a target variable
df = pd.DataFrame({
    'city': ['New York', 'London', 'Paris', 'Tokyo', 'New York', 'Paris'],
    'price': [200, 150, 300, 250, 180, 320]
})

df

Unnamed: 0,city,price
0,New York,200
1,London,150
2,Paris,300
3,Tokyo,250
4,New York,180
5,Paris,320


In [43]:
mean_dict = df.groupby('city')['price'].mean().to_dict()

mean_dict

{'London': 150.0, 'New York': 190.0, 'Paris': 310.0, 'Tokyo': 250.0}

In [44]:
df['city_encoded'] = df['city'].map(mean_dict)

In [45]:
df

Unnamed: 0,city,price,city_encoded
0,New York,200,190.0
1,London,150,150.0
2,Paris,300,310.0
3,Tokyo,250,250.0
4,New York,180,190.0
5,Paris,320,310.0


In [47]:
df_tips = sns.load_dataset('tips')
df_tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [49]:
time_mean_dict = df_tips.groupby('time')['total_bill'].mean().to_dict()

time_mean_dict

  time_mean_dict = df_tips.groupby('time')['total_bill'].mean().to_dict()


{'Lunch': 17.168676470588235, 'Dinner': 20.79715909090909}

In [50]:
df_tips['time_encoded'] = df_tips['time'].map(time_mean_dict)
df_tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,time_encoded
0,16.99,1.01,Female,No,Sun,Dinner,2,20.797159
1,10.34,1.66,Male,No,Sun,Dinner,3,20.797159
2,21.01,3.5,Male,No,Sun,Dinner,3,20.797159
3,23.68,3.31,Male,No,Sun,Dinner,2,20.797159
4,24.59,3.61,Female,No,Sun,Dinner,4,20.797159
