# OneHotEncoding

### If we have many many categorical columns, using onehot encoding it will create a sparse matrix which leads tgo overfitting

In [4]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [6]:
## Create a simple dataframe
df = pd.DataFrame({
    'color' : ['red', 'blue', 'green', 'green', 'red', 'blue']
})

In [8]:
df.head()

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red


In [10]:
## Create an instance of of OneHotEncoder
encoder = OneHotEncoder()

In [16]:
## Perform fit and transform
encoded = encoder.fit_transform(df[['color']]).toarray()

In [18]:
import pandas as pd
encoder_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out())

In [20]:
encoder_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0


In [22]:
## For new data
encoder.transform([['blue']]).toarray()



array([[1., 0., 0.]])

In [24]:
pd.concat([df, encoder_df], axis=1)

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,green,0.0,1.0,0.0
4,red,0.0,0.0,1.0
5,blue,1.0,0.0,0.0


# LabelEncoding

In [27]:
from sklearn.preprocessing import LabelEncoder
lbl_encoder = LabelEncoder()

In [29]:
lbl_encoder.fit_transform(df[['color']])

  y = column_or_1d(y, warn=True)


array([2, 0, 1, 1, 2, 0])

In [31]:
lbl_encoder.transform([['red']])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([2])

### When we are assigning unique labels, red has got 2 so my model when this data will come it will start thinking red may be have higher value as compare to blue or green, so this should not happen, these are nominal values. But if there is any situation where you need to assign rank to this particular values we can definitly do it with the help of
## Ordinal Encoding

In [36]:
from sklearn.preprocessing import OrdinalEncoder

In [38]:
## Create a simple dataframe with an ordinal variable
df = pd.DataFrame({
    'size' : ['small', 'medium', 'large', 'medium', 'small', 'large']
})

In [40]:
df

Unnamed: 0,size
0,small
1,medium
2,large
3,medium
4,small
5,large


In [42]:
encoder = OrdinalEncoder(categories=[['small', 'medium', 'large']])

In [44]:
encoder.fit_transform(df[['size']])

array([[0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.]])

In [46]:
encoder.transform([['small']])



array([[0.]])

# Target Guided Ordinal Encoding
#### It is a technique used to encode categorical variables based on their relationship with the target variable. This encoding is used when we have a categorical variable with a larger number of unique categories.
#### We replace each category in the categorical variable with a numerical value based on the mean or median of the target variable of that category.

In [50]:
import pandas as pd

## Create a simple dataframe with a categorical variable and a target variable
df = pd.DataFrame({
    'city' : ['New York', 'London', 'Paris', 'Tokya', 'New York', 'Paris'],
    'price' : [200, 150, 300, 250, 180, 320]
})

In [52]:
df

Unnamed: 0,city,price
0,New York,200
1,London,150
2,Paris,300
3,Tokya,250
4,New York,180
5,Paris,320


In [60]:
mean_price = df.groupby('city')['price'].mean().to_dict()

In [62]:
mean_price

{'London': 150.0, 'New York': 190.0, 'Paris': 310.0, 'Tokya': 250.0}

In [64]:
df['city_encoded'] = df['city'].map(mean_price)

In [66]:
df

Unnamed: 0,city,price,city_encoded
0,New York,200,190.0
1,London,150,150.0
2,Paris,300,310.0
3,Tokya,250,250.0
4,New York,180,190.0
5,Paris,320,310.0


In [68]:
df[['price', 'city_encoded']]

Unnamed: 0,price,city_encoded
0,200,190.0
1,150,150.0
2,300,310.0
3,250,250.0
4,180,190.0
5,320,310.0
