# Data Encoding
     1. Nominal/OHE encoding
     2. Label & ordinal encoding
     3. Target guided Ordinal encoding

# Nominal/OHE encoding
### One-hot encoding, also known as nominal encoding, is a technique used to represent categorical data as numerical data, which is more suitable for machine learning algorithms. In this technique, each category is represented as a binary vector in which each bit corresponds to a unique category. For example, if we have a categorical variable "color" with three possible values (red, green, blue), we can represent it using one-hot encoding as follows:
    1. Red: [1, 0, 0]
    2. Green: [0, 1, 0]
    3. Blue: [0, 0, 1]

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Toy frame with a single nominal column to demonstrate one-hot encoding.
df = pd.DataFrame(
    ["red", "blue", "green", "green", "red", "blue"],
    columns=["color"],
)

In [3]:
# Preview the first rows of the frame (head() returns five rows by default).
df.head()

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red


In [4]:
# Create a one-hot encoder with scikit-learn's default settings.
encoder = OneHotEncoder()

In [5]:
# Fit on the "color" column (double brackets keep the selection 2-D) and
# convert the encoder's result into a dense NumPy array.
color_matrix = encoder.fit_transform(df[["color"]])
encoded = color_matrix.toarray()

In [6]:
# Label the encoded columns with the names generated by the fitted encoder
# (e.g. "color_blue", "color_green", "color_red").
color_feature_names = encoder.get_feature_names_out()
encoder_df = pd.DataFrame(data=encoded, columns=color_feature_names)

In [7]:
# Display the one-hot encoded frame.
encoder_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0


In [8]:
import seaborn as sns

# Load seaborn's "tips" example dataset; this rebinds `df`, replacing the toy
# color frame used above.
df = sns.load_dataset("tips")

In [9]:
# Preview the first rows of the tips dataset.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [10]:
# Re-fit the same encoder on three categorical columns of the tips data and
# convert the result into a dense NumPy array.
tips_categorical_cols = ["sex", "smoker", "day"]
tips_matrix = encoder.fit_transform(df[tips_categorical_cols])
encoded_tips = tips_matrix.toarray()

In [11]:
# Wrap the encoded array in a frame whose columns carry the encoder's
# generated feature names (one column per category of each input column).
tips_feature_names = encoder.get_feature_names_out()
new_df = pd.DataFrame(data=encoded_tips, columns=tips_feature_names)

In [12]:
# Display the one-hot encoded tips columns.
new_df

Unnamed: 0,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
239,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
240,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
241,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
242,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0


In [13]:
# Small single-column frame used for the label-encoding example.
color_values = ["red", "blue", "green", "green", "red"]
df = pd.DataFrame({"color": color_values})

In [14]:
# Display the color frame.
df

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red


In [15]:
from sklearn.preprocessing import LabelEncoder

# Label encoder: maps each distinct label to an integer code.
lbl_encoder = LabelEncoder()

In [16]:
# LabelEncoder expects a 1-D array of labels, so pass the "color" Series
# (single brackets). Passing the 2-D frame df[["color"]] only works through an
# implicit flattening and triggers scikit-learn's DataConversionWarning.
lbl_encoder.fit_transform(df["color"])

  y = column_or_1d(y, warn=True)


array([2, 0, 1, 1, 2])

### Ordinal encoding

In [17]:
from sklearn.preprocessing import OrdinalEncoder

In [18]:
# Single-column frame with an ordered categorical feature (small < medium < large).
df = pd.DataFrame(
    ["small", "medium", "large", "medium", "small", "large"],
    columns=["size"],
)

In [19]:
# Display the size frame.
df

Unnamed: 0,size
0,small
1,medium
2,large
3,medium
4,small
5,large


In [20]:
# Spell out the category order explicitly so the integer codes follow
# small -> 0, medium -> 1, large -> 2.
size_order = ["small", "medium", "large"]
encoder = OrdinalEncoder(categories=[size_order])

In [21]:
# Encode the "size" column; the result is a 2-D float array with one code per row.
encoder.fit_transform(df[["size"]])

array([[0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.]])

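## Target guided ordinal encoding

In the examples below, each category is replaced by the mean of the target variable for that category (for example, each city is replaced by its mean price).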

In [22]:
# Toy data: a categorical feature ("city") and a numeric target ("price").
cities = ["new york", "london", "paris", "tokyo", "new york", "paris"]
prices = [200, 150, 300, 250, 180, 320]
df = pd.DataFrame({"city": cities, "price": prices})

In [23]:
# Display the city/price frame.
df

Unnamed: 0,city,price
0,new york,200
1,london,150
2,paris,300
3,tokyo,250
4,new york,180
5,paris,320


In [24]:
# Build a {city: mean price} lookup from the target column.
price_by_city = df.groupby("city")["price"]
mean_price = price_by_city.mean().to_dict()

In [25]:
# Inspect the per-city mean prices.
mean_price

{'london': 150.0, 'new york': 190.0, 'paris': 310.0, 'tokyo': 250.0}

In [26]:
# Replace each city with its mean price and store the result in a new column.
df["city_encoded"] = df["city"].map(mean_price)

In [27]:
# Display the frame with the new "city_encoded" column.
df

Unnamed: 0,city,price,city_encoded
0,new york,200,190.0
1,london,150,150.0
2,paris,300,310.0
3,tokyo,250,250.0
4,new york,180,190.0
5,paris,320,310.0


In [28]:
# Show the target next to its encoded feature for a side-by-side comparison.
df[["price", "city_encoded"]]

Unnamed: 0,price,city_encoded
0,200,190.0
1,150,150.0
2,300,310.0
3,250,250.0
4,180,190.0
5,320,310.0


In [29]:
import seaborn as sns

# Reload the "tips" dataset; this rebinds `df`, replacing the city/price frame.
df = sns.load_dataset("tips")

In [30]:
# Display the tips dataset.
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [32]:
# Build a {day: mean total_bill} lookup. (The name `mean_price` is reused from
# the city example; here the values are mean total bills.)
# "day" is loaded by seaborn as a categorical column, so pass `observed`
# explicitly: this avoids pandas' FutureWarning about the changing default and
# keeps categories that have no rows from showing up as NaN entries.
mean_price = df.groupby("day", observed=True)["total_bill"].mean().to_dict()

In [33]:
# Inspect the per-day mean total bill.
mean_price

{'Thur': 17.682741935483868,
 'Fri': 17.15157894736842,
 'Sat': 20.44137931034483,
 'Sun': 21.41}

In [34]:
# Replace each day with its mean total bill. Mapping a categorical column can
# give back a categorical result, so cast to float to guarantee a plain
# numeric feature (the cast is a no-op if the result is already float).
df["day_encoded"] = df["day"].map(mean_price).astype(float)

In [35]:
# Display the tips frame with the new "day_encoded" column.
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,day_encoded
0,16.99,1.01,Female,No,Sun,Dinner,2,21.410000
1,10.34,1.66,Male,No,Sun,Dinner,3,21.410000
2,21.01,3.50,Male,No,Sun,Dinner,3,21.410000
3,23.68,3.31,Male,No,Sun,Dinner,2,21.410000
4,24.59,3.61,Female,No,Sun,Dinner,4,21.410000
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,20.441379
240,27.18,2.00,Female,Yes,Sat,Dinner,2,20.441379
241,22.67,2.00,Male,Yes,Sat,Dinner,2,20.441379
242,17.82,1.75,Male,No,Sat,Dinner,2,20.441379


In [37]:
# Show the target next to its encoded feature for a side-by-side comparison.
df[["total_bill", "day_encoded"]]

Unnamed: 0,total_bill,day_encoded
0,16.99,21.410000
1,10.34,21.410000
2,21.01,21.410000
3,23.68,21.410000
4,24.59,21.410000
...,...,...
239,29.03,20.441379
240,27.18,20.441379
241,22.67,20.441379
242,17.82,20.441379


In [38]:
# Show each day label next to the value it was encoded as.
df[["day", "day_encoded"]]

Unnamed: 0,day,day_encoded
0,Sun,21.410000
1,Sun,21.410000
2,Sun,21.410000
3,Sun,21.410000
4,Sun,21.410000
...,...,...
239,Sat,20.441379
240,Sat,20.441379
241,Sat,20.441379
242,Sat,20.441379
