### Create the dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy dataset: 20 items described by two categorical features and a numeric price.
colors = [
    'Red', 'Green', 'Blue', 'Green', 'Red', 'Blue', 'Green', 'Red', 'Blue', 'Red',
    'Green', 'Blue', 'Red', 'Green', 'Blue', 'Red', 'Green', 'Blue', 'Red', 'Green',
]
sizes = [
    'S', 'M', 'L', 'S', 'M', 'L', 'S', 'M', 'L', 'S',
    'M', 'L', 'S', 'M', 'L', 'S', 'M', 'L', 'S', 'M',
]
prices = [
    10, 15, 20, 12, 14, 21, 11, 16, 19, 13,
    15, 22, 12, 14, 21, 11, 16, 20, 13, 15,
]

# Column name -> list of values; insertion order fixes the column order of the frame.
data = {'Color': colors, 'Size': sizes, 'Price': prices}

df = pd.DataFrame(data)
df.head()

Unnamed: 0,Color,Size,Price
0,Red,S,10
1,Green,M,15
2,Blue,L,20
3,Green,S,12
4,Red,M,14


### Categorical columns and their values

In [3]:
# Print every text-valued column together with its distinct values.
# Comparing `dtype == 'O'` alone skips columns stored with pandas' dedicated
# string dtype, so the dtype helpers are used to accept both representations.
for col in df.columns:
    column = df[col]
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        print( f"{col}->{column.unique()}")

Color->['Red' 'Green' 'Blue']
Size->['S' 'M' 'L']


### Label Encoding

In [4]:
from sklearn.preprocessing import LabelEncoder

##### Label Encoding is a data preprocessing technique that converts categorical text labels (like "Red", "Green", "Blue") into integer codes. scikit-learn's `LabelEncoder` numbers the labels in sorted order, so in this dataset Blue → 0, Green → 1, Red → 2.

In [5]:
# Create an unfitted LabelEncoder instance.
le = LabelEncoder()

In [6]:
# Work on an independent (deep) copy so the label-encoded columns do not touch `df`.
df_le = df.copy()

df_le.head()

Unnamed: 0,Color,Size,Price
0,Red,S,10
1,Green,M,15
2,Blue,L,20
3,Green,S,12
4,Red,M,14


In [7]:
# Label-encode each categorical column with its OWN encoder.
# Refitting a single encoder on the second column would discard the classes it
# learned for the first, making it impossible to decode the Color codes later.
# The fitted encoders are kept in `label_encoders`, keyed by column name.
label_encoders = {}
for col in ['Color', 'Size']:
    le = LabelEncoder()
    df_le[f'{col}_encoder'] = le.fit_transform( df_le[col] )
    label_encoders[col] = le
# After the loop `le` is the encoder fitted on 'Size' (the last column processed).

In [8]:
# Preview the frame with the Color_encoder and Size_encoder columns added.
df_le.head()

Unnamed: 0,Color,Size,Price,Color_encoder,Size_encoder
0,Red,S,10,2,2
1,Green,M,15,1,1
2,Blue,L,20,0,0
3,Green,S,12,1,2
4,Red,M,14,2,1


In [9]:
# `le` was most recently fit on the Size column, so `classes_` lists the Size
# labels; a label's position in this array is its integer code.
le.classes_

array(['L', 'M', 'S'], dtype=object)

### One-Hot Encoding

In [10]:
from sklearn.preprocessing import OneHotEncoder

#### One-hot encoding is a process of converting categorical data into a numerical format by creating a binary vector for each category. It assigns a unique vector to each category where a '1' indicates the presence of that category and '0's indicate its absence.

In [11]:
# Encoder that emits integer 0/1 indicators. No `drop` argument is passed,
# so every category gets its own output column.
oe = OneHotEncoder( dtype='int' )

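#### `drop='first'` is an optional `OneHotEncoder` argument that removes one category per feature after one-hot encoding. The encoder created above does not set it, so all categories are kept in the outputs below.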

#### This is done to avoid the Dummy Variable Trap, which causes multicollinearity.

#### When you one-hot encode a categorical variable with k categories, you create k binary columns.

#### But: One column can be perfectly predicted from the others.

#### This leads to redundant features and multicollinearity.

In [12]:
# Independent (deep) copy of `df` that will receive the one-hot columns.
df_one_hot = df.copy()

df_one_hot.head()

Unnamed: 0,Color,Size,Price
0,Red,S,10
1,Green,M,15
2,Blue,L,20
3,Green,S,12
4,Red,M,14


In [13]:
# Fit the encoder on both categorical columns, then convert the encoded
# result to a dense NumPy array so it can be displayed and wrapped in a frame.
encoded = oe.fit_transform(df_one_hot[['Color', 'Size']])
array = encoded.toarray()
array

array([[0, 0, 1, 0, 0, 1],
       [0, 1, 0, 0, 1, 0],
       [1, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0],
       [1, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0],
       [1, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 1],
       [0, 1, 0, 0, 1, 0],
       [1, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 1],
       [0, 1, 0, 0, 1, 0],
       [1, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 1],
       [0, 1, 0, 0, 1, 0],
       [1, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 1],
       [0, 1, 0, 0, 1, 0]])

In [14]:
# Column labels generated by the fitted encoder, in the same order as the
# columns of the encoded array.
oe.get_feature_names_out()

array(['Color_Blue', 'Color_Green', 'Color_Red', 'Size_L', 'Size_M',
       'Size_S'], dtype=object)

In [15]:
# Wrap the encoded matrix in a DataFrame labelled with the encoder's feature names.
feature_names = oe.get_feature_names_out()
temp_df = pd.DataFrame(array, columns=feature_names)
temp_df.head()

Unnamed: 0,Color_Blue,Color_Green,Color_Red,Size_L,Size_M,Size_S
0,0,0,1,0,0,1
1,0,1,0,0,1,0
2,1,0,0,1,0,0
3,0,1,0,0,0,1
4,0,0,1,0,1,0


In [16]:
# Append the one-hot columns to the frame.
# Any copies appended by an earlier run of this cell are dropped first, so
# re-running the cell does not produce duplicate columns.
df_one_hot = pd.concat(
    [df_one_hot.drop(columns=temp_df.columns, errors='ignore'), temp_df],
    axis=1,
)

In [17]:
# Preview the original columns side by side with their one-hot indicators.
df_one_hot.head()

Unnamed: 0,Color,Size,Price,Color_Blue,Color_Green,Color_Red,Size_L,Size_M,Size_S
0,Red,S,10,0,0,1,0,0,1
1,Green,M,15,0,1,0,0,1,0
2,Blue,L,20,1,0,0,1,0,0
3,Green,S,12,0,1,0,0,0,1
4,Red,M,14,0,0,1,0,1,0


In [18]:
# Pairwise correlations between the indicator columns and Price.
# In this sample every Blue item is size L, so Color_Blue and Size_L are
# identical columns and correlate at exactly 1.0.
corr_cols = ["Color_Blue", "Color_Green", "Color_Red", "Size_L", "Size_M", "Size_S", 'Price']
df_one_hot[corr_cols].corr()

Unnamed: 0,Color_Blue,Color_Green,Color_Red,Size_L,Size_M,Size_S,Price
Color_Blue,1.0,-0.480384,-0.480384,1.0,-0.480384,-0.480384,0.892525
Color_Green,-0.480384,1.0,-0.538462,-0.480384,0.56044,-0.098901,-0.300129
Color_Red,-0.480384,-0.538462,1.0,-0.480384,-0.098901,0.56044,-0.557382
Size_L,1.0,-0.480384,-0.480384,1.0,-0.480384,-0.480384,0.892525
Size_M,-0.480384,0.56044,-0.098901,-0.480384,1.0,-0.538462,-0.100043
Size_S,-0.480384,-0.098901,0.56044,-0.480384,-0.538462,1.0,-0.757468
Price,0.892525,-0.300129,-0.557382,0.892525,-0.100043,-0.757468,1.0


### One-hot encoding using `pd.get_dummies`

In [19]:
# One-hot encode with pandas: each listed column is replaced by one
# indicator column per category; Price is carried over unchanged.
categorical_cols = ['Color', 'Size']
df_onehot = pd.get_dummies(df, columns=categorical_cols)
df_onehot.head()

Unnamed: 0,Price,Color_Blue,Color_Green,Color_Red,Size_L,Size_M,Size_S
0,10,False,False,True,False,False,True
1,15,False,True,False,False,True,False
2,20,True,False,False,True,False,False
3,12,False,True,False,False,False,True
4,14,False,False,True,False,True,False


In [20]:
# Convert the boolean indicator columns to 0/1 integers.
# `astype(int)` does the conversion in one vectorised step and avoids defining
# a variable named `bin`, which would shadow Python's built-in `bin()`.
dummy_cols = ["Color_Blue","Color_Green","Color_Red","Size_L","Size_M","Size_S"]
df_onehot[dummy_cols] = df_onehot[dummy_cols].astype(int)

df_onehot.head()

Unnamed: 0,Price,Color_Blue,Color_Green,Color_Red,Size_L,Size_M,Size_S
0,10,0,0,1,0,0,1
1,15,0,1,0,0,1,0
2,20,1,0,0,1,0,0
3,12,0,1,0,0,0,1
4,14,0,0,1,0,1,0


### Target Encoding

#### Target encoding, also known as mean encoding, is a feature engineering technique that converts categorical data into numerical form by replacing each category with the average (mean) of the target variable for that specific category

In [21]:
# Independent (deep) copy of `df` that will receive the target-encoded columns.
df_tar = df.copy()
df_tar.head()

Unnamed: 0,Color,Size,Price
0,Red,S,10
1,Green,M,15
2,Blue,L,20
3,Green,S,12
4,Red,M,14


In [22]:
# Target (mean) encoding: replace each category with the mean Price of the
# rows that share it, rounded to 3 decimals. The means are taken over the
# whole frame, so each row's own Price contributes to its encoded value.
for col in ['Color', 'Size']:
    target_mean = df_tar.groupby(col)['Price'].mean()
    df_tar[f'{col}_TargetEnc'] = df_tar[col].map(target_mean).round(3)

df_tar.head()

Unnamed: 0,Color,Size,Price,Color_TargetEnc,Size_TargetEnc
0,Red,S,10,12.714,11.714
1,Green,M,15,14.0,15.0
2,Blue,L,20,20.5,20.5
3,Green,S,12,14.0,11.714
4,Red,M,14,12.714,15.0


### Frequency Encoding

#### Frequency encoding is a technique used in machine learning to represent a categorical variable as a numerical value based on the frequency of each category in the dataset.

In [23]:
# Independent (deep) copy of `df` that will receive the frequency-encoded column.
df_freq = df.copy()
df_freq.head()

Unnamed: 0,Color,Size,Price
0,Red,S,10
1,Green,M,15
2,Blue,L,20
3,Green,S,12
4,Red,M,14


In [24]:
# Frequency encoding: map each colour to the number of rows it appears in.
# Categories with equal counts (Red and Green both occur 7 times here) end up
# with the same encoded value.
freq = df_freq['Color'].value_counts()
color_frequency = df_freq['Color'].map(freq)
df_freq['Color_FreqEnc'] = color_frequency
df_freq.head()

Unnamed: 0,Color,Size,Price,Color_FreqEnc
0,Red,S,10,7
1,Green,M,15,7
2,Blue,L,20,6
3,Green,S,12,7
4,Red,M,14,7
