# Nominal/OHE
#### Represents categorical data as numerical data, which is more suitable for machine-learning algorithms
#### **Disadvantages**:
#### 1. If there are 100 categories, we increase the number of features by 100
#### 2. Sparse matrix (each row is all 0s except a single 1): the many extra, mostly empty features can lead to overfitting

In [None]:
# Illustrative one-hot vectors for three colours: red, green, blue.
# NOTE: the column order shown here is only an example; the OneHotEncoder
# output later in this notebook orders its columns alphabetically
# (color_blue, color_green, color_red).
'''
red=[1,0,0]
green=[0,1,0]
blue=[0,0,1]
'''

In [4]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
# Toy dataset: a single nominal column holding three distinct colours.
df = pd.DataFrame(
    data=[['red'], ['green'], ['blue'], ['green'], ['red']],
    columns=['color'],
)

In [5]:
# Preview the first rows of the colour frame (head() shows at most five rows).
df.head()

Unnamed: 0,color
0,red
1,green
2,blue
3,green
4,red


In [7]:
# Build the one-hot encoder and learn the distinct colour categories.
encoder = OneHotEncoder()
encoder.fit(df[['color']])
# Encode the same column; the result is sparse, so densify it for display.
encoded_colors = encoder.transform(df[['color']]).toarray()
encoded_colors

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [9]:
# Wrap the dense one-hot matrix in a DataFrame whose column names come from
# the fitted encoder (one column per category, prefixed with 'color_').
feature_names = encoder.get_feature_names_out(['color'])
encoded_df = pd.DataFrame(encoded_colors, columns=feature_names)
encoded_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0


In [10]:
# Encode new rows with the already-fitted encoder.
# The encoder was fitted on a DataFrame (so it remembers the column name
# 'color'); passing a bare nested list makes scikit-learn warn that
# "X does not have valid feature names". Supplying the new values as a
# DataFrame with the same column name avoids that warning.
encoder.transform(pd.DataFrame({'color': ['blue', 'red']})).toarray()



array([[1., 0., 0.],
       [0., 0., 1.]])

In [11]:
# Place the original colour column next to its one-hot columns
# (rows are aligned on the shared index).
pd.concat(objs=[df, encoded_df], axis='columns')

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,green,0.0,1.0,0.0
2,blue,1.0,0.0,0.0
3,green,0.0,1.0,0.0
4,red,0.0,0.0,1.0


# Label and Ordinal encoding
#### Label encoding: assigns a unique integer label to each category in the variable (in alphabetical order). It is meant for nominal (unordered) categories.
#### For example: blue=0, green=1, red=2
#### Disadvantage of label encoding: a model may interpret the integers as a ranking


In [17]:
from sklearn.preprocessing import LabelEncoder
# Learn the colour labels first, then convert the column to integer codes.
label_encoder = LabelEncoder()
label_encoder.fit(df['color'])
labels = label_encoder.transform(df['color'])
labels

array([2, 1, 0, 1, 2])

In [16]:
# Decode integer codes back into the original colour names ...
decoded = label_encoder.inverse_transform([0, 1, 2, 1, 0])
print(decoded)
# ... and encode new colour names with the fitted encoder.
label_encoder.transform(['red', 'blue'])

['blue' 'green' 'red' 'green' 'blue']


array([2, 0])

### Ordinal encoding: used when the categories have an intrinsic order or ranking
##### For example: "High School"=1, "Associate's Degree"=2, "Bachelor's Degree"=3, "Master's Degree"=4, "PhD"=5

In [21]:
from sklearn.preprocessing import OrdinalEncoder
# Toy frame with one ordered categorical column (this rebinds df).
df = pd.DataFrame({'size': ['S', 'M', 'L', 'M', 'S']})
# Spell out the ranking explicitly so the encoder uses S < M < L.
ordinal_encoder = OrdinalEncoder(categories=[['S', 'M', 'L']])
df

Unnamed: 0,size
0,S
1,M
2,L
3,M
4,S


In [22]:
# Fit on the size frame, then map every size to its position in the
# configured ordering.
# NOTE: despite its name, encoded_df holds a NumPy array, not a DataFrame.
ordinal_encoder.fit(df)
encoded_df = ordinal_encoder.transform(df)
encoded_df

array([[0.],
       [1.],
       [2.],
       [1.],
       [0.]])

In [25]:
# Encode new sizes with the already-fitted encoder.
# The encoder was fitted on a DataFrame with a 'size' column; a bare nested
# list makes scikit-learn warn that "X does not have valid feature names",
# so the new values are passed as a DataFrame with the same column name.
ordinal_encoder.transform(pd.DataFrame({'size': ['L', 'S']}))



array([[2.],
       [0.]])

### Target guided ordinal encoding
##### - Target-guided ordinal encoding is a technique used to convert categorical variables into numeric representations based on their relationship with a target variable.
##### - Useful when the variable has a large number of unique categories.
##### - We replace each category in the categorical variable with a numerical value based on the mean or median of the target variable for that category. This creates a monotonic relationship between the categorical variable and the target variable.


In [27]:
# Toy frame for target-guided encoding: a categorical 'city' column and a
# numeric 'price' target (this rebinds df).
cities = ['New York', 'Los Angeles', 'Chicago', 'Los Angeles', 'New York']
prices = [100, 200, 150, 180, 120]
df = pd.DataFrame({'city': cities, 'price': prices})
df

Unnamed: 0,city,price
0,New York,100
1,Los Angeles,200
2,Chicago,150
3,Los Angeles,180
4,New York,120


In [28]:
# Mean price per city, stored as a {city: mean_price} lookup table.
# NOTE(review): the means are computed over the whole frame; in a real
# modelling workflow compute them on the training split only to avoid
# leaking the target.
price_by_city = df['price'].groupby(df['city'])
mean_price = price_by_city.mean().to_dict()

In [32]:
# Replace every city with the mean price observed for that city.
city_means = df['city'].map(mean_price)
df['city_encoded'] = city_means

In [33]:
# Show the frame with the new encoded column.
# NOTE(review): the recorded output also lists a 'price_encoded' column that
# no visible cell creates — presumably left over from an earlier run of this
# notebook; re-run from the top to confirm.
df

Unnamed: 0,city,price,price_encoded,city_encoded
0,New York,100,110.0,110.0
1,Los Angeles,200,190.0,190.0
2,Chicago,150,150.0,150.0
3,Los Angeles,180,190.0,190.0
4,New York,120,110.0,110.0


In [34]:
# Show each city next to its target-guided encoding.
df.loc[:, ['city', 'city_encoded']]

Unnamed: 0,city,city_encoded
0,New York,110.0
1,Los Angeles,190.0
2,Chicago,150.0
3,Los Angeles,190.0
4,New York,110.0


In [35]:
import seaborn as sns
# Load seaborn's 'tips' example dataset; this rebinds df, replacing the
# city/price frame used above.
df=sns.load_dataset('tips')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [None]:
# Mean total_bill per meal time, stored as a {time: mean_bill} lookup table
# for target-guided encoding.
# `time` is a categorical column in the tips dataset, and grouping on a
# categorical without an explicit `observed` argument emits a pandas
# FutureWarning about the changing default. observed=True keeps only the
# categories that actually occur in the data.
data = df.groupby('time', observed=True)['total_bill'].mean().to_dict()


  data=df.groupby('time')['total_bill'].mean().to_dict()


In [39]:
# Replace every meal time with the mean total_bill observed for it.
# `time` is categorical in the tips dataset, and map() on a categorical with
# a one-to-one mapping returns another categorical; cast to float so the
# encoded feature is a genuinely numeric column.
df['time_encoded'] = df['time'].map(data).astype(float)

In [41]:
# Show each meal time next to its target-guided encoding.
df.loc[:, ['time', 'time_encoded']]

Unnamed: 0,time,time_encoded
0,Dinner,20.797159
1,Dinner,20.797159
2,Dinner,20.797159
3,Dinner,20.797159
4,Dinner,20.797159
...,...,...
239,Dinner,20.797159
240,Dinner,20.797159
241,Dinner,20.797159
242,Dinner,20.797159
