# Preprocessing

In [1]:
# Import relevant libraries
import pandas as pd
import numpy as np

# lets import libraries for encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import LabelBinarizer
from category_encoders import BinaryEncoder
from category_encoders import TargetEncoder
from category_encoders import HashingEncoder
from category_encoders import WOEEncoder

In [2]:
# Read the feature-encoding practice dataset straight from its raw GitHub URL.
FULL_DATASET_URL = "https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/feature_encoding_full_dataset.csv"
df = pd.read_csv(FULL_DATASET_URL)

Encoding gender

In [3]:
# Gender is treated as a binary category, so one integer column is enough
# (label encoding or binary encoding would both work).
# In this data the fitted encoder maps Female to 0 and Male to 1.
le = LabelEncoder()
le.fit(df["Gender"])
df["gender_encoded"] = le.transform(df["Gender"])

# Show the codes next to the original labels to check the mapping.
df.loc[:, ["gender_encoded", "Gender"]]

Unnamed: 0,gender_encoded,Gender
0,1,Male
1,0,Female
2,0,Female
3,0,Female
4,1,Male
...,...,...
295,1,Male
296,0,Female
297,1,Male
298,1,Male


Encode Education level

In [4]:
# List the columns currently in df (the new 'gender_encoded' column should be present).
df.columns

Index(['Gender', 'Education Level', 'Marital Status', 'Employment Type',
       'Country', 'Income', 'gender_encoded'],
      dtype='object')

In [5]:
# Inspect the distinct education levels so an explicit order can be defined for them.
df["Education Level"].unique()

array(['PhD', 'High School', "Bachelor's", "Master's"], dtype=object)

In [6]:
# Education level is assumed to be ordered:
#   "High School" < "Bachelor's" < "Master's" < "PhD"
# so ordinal encoding is used, with that ranking supplied explicitly.

# 1) The ranking, lowest to highest (one inner list per encoded column).
edu_order = [["High School", "Bachelor's", "Master's", "PhD"]]

# 2) Build the encoder and learn the mapping from the column.
oe = OrdinalEncoder(categories=edu_order)
oe.fit(df[["Education Level"]])

# 3) Store the codes (0.0 for "High School" up to 3.0 for "PhD") in a new column.
df["education_encoded"] = oe.transform(df[["Education Level"]])

# Compare the codes with the original labels.
df.loc[:, ["education_encoded", "Education Level"]]

Unnamed: 0,education_encoded,Education Level
0,3.0,PhD
1,3.0,PhD
2,0.0,High School
3,0.0,High School
4,3.0,PhD
...,...,...
295,2.0,Master's
296,1.0,Bachelor's
297,1.0,Bachelor's
298,3.0,PhD


Encode Marital status

In [7]:
# List the distinct marital-status categories before choosing an encoder.
df["Marital Status"].unique()

# One-hot encoding makes sense for this category, especially when you are using a tree-based algorithm.
# NOTE(review): one-hot encoding is usually motivated by the categories being nominal
# (no natural order between them) — confirm the tree-based rationale above.

array(['Single', 'Married', 'Divorced', 'Widowed'], dtype=object)

In [8]:
# One-hot encode marital status into a dense array.
# drop="first" leaves out the first category's indicator column so the remaining
# columns are not perfectly collinear (the "dummy variable trap").
ohe = OneHotEncoder(drop="first", sparse_output=False)
ohe.fit(df[["Marital Status"]])
marital_encoded = ohe.transform(df[["Marital Status"]])

In [9]:
# Wrap the one-hot array in a DataFrame with readable column names and place it
# next to the original "Marital Status" column for comparison.
# index=df.index matters: pd.concat(axis=1) aligns rows by index label, and a frame
# built from a bare array would otherwise get a fresh 0..n-1 index. That only lines
# up with df when df still has its default index; reusing df's index keeps every
# encoded row paired with its source row regardless.
df_ohe = pd.DataFrame(
    marital_encoded,
    columns=ohe.get_feature_names_out(["Marital Status"]),
    index=df.index,
)
df_marital_encod = pd.concat([df["Marital Status"], df_ohe], axis=1)
df_marital_encod.head()

Unnamed: 0,Marital Status,Marital Status_Married,Marital Status_Single,Marital Status_Widowed
0,Single,0.0,1.0,0.0
1,Single,0.0,1.0,0.0
2,Married,1.0,0.0,0.0
3,Divorced,0.0,0.0,0.0
4,Widowed,0.0,0.0,1.0


In [10]:
# List df's columns. The one-hot marital-status columns are not expected here:
# they were concatenated into the separate df_marital_encod frame, not into df.
df.columns

Index(['Gender', 'Education Level', 'Marital Status', 'Employment Type',
       'Country', 'Income', 'gender_encoded', 'education_encoded'],
      dtype='object')

Encode Employment Type

In [11]:
# Load a second dataset (feature_encoding_with_salary.csv) into its own frame, sal_df.
# This is a different file from the one loaded into df earlier, not a reload of it.
sal_df = pd.read_csv("https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/feature_encoding_with_salary.csv")

In [12]:
# Remove the "Income" column from sal_df.
# errors="ignore" keeps this cell safe to re-run: once "Income" is gone, a second
# execution would otherwise raise KeyError because the column no longer exists.
sal_df = sal_df.drop(columns="Income", errors="ignore")

In [13]:
# Confirm which columns remain in sal_df after dropping "Income".
sal_df.columns

Index(['Gender', 'Education Level', 'Marital Status', 'Employment Type',
       'Country', 'Salary'],
      dtype='object')

In [14]:
# Inspect the distinct employment types that will be encoded next.
sal_df["Employment Type"].unique()

array(['Part-time', 'Self-employed', 'Unemployed', 'Full-time'],
      dtype=object)

In [15]:
# Let's assume "Employment Type" correlates strongly with the target variable, "Salary"
# (a column of sal_df), and target-encode it against that column.
# Each category is replaced by one numeric value learned from Salary
# (presumably a smoothed per-category mean — verify against the category_encoders docs).
# NOTE(review): the encoder is fit and applied on the same rows, so target information
# leaks into the feature; for real modelling, fit on the training split only.

te = TargetEncoder()
sal_df["employment_encoded"] = te.fit_transform(sal_df["Employment Type"], sal_df["Salary"])
sal_df[["Employment Type", "employment_encoded"]].head()

Unnamed: 0,Employment Type,employment_encoded
0,Part-time,34136.472609
1,Self-employed,71419.916754
2,Unemployed,4707.339548
3,Part-time,34136.472609
4,Unemployed,4707.339548


Encode Employment Type 2 (frequency encoding)

In [16]:
# Frequency encoding, assuming the categories are imbalanced: replace each
# employment type with the fraction of rows that belong to it.
# Categories that occur equally often receive the same code, so they become
# indistinguishable after encoding.
employment_share = sal_df["Employment Type"].value_counts(normalize=True)
sal_df["Employment_Frequency_Encoded"] = sal_df["Employment Type"].map(employment_share)

# Keep the original and encoded columns side by side for inspection.
encoded = sal_df[["Employment Type", "Employment_Frequency_Encoded"]]
encoded.head()

Unnamed: 0,Employment Type,Employment_Frequency_Encoded
0,Part-time,0.246667
1,Self-employed,0.246667
2,Unemployed,0.283333
3,Part-time,0.246667
4,Unemployed,0.283333


In [17]:
# List sal_df's columns to confirm both encoded employment columns were added.
sal_df.columns

Index(['Gender', 'Education Level', 'Marital Status', 'Employment Type',
       'Country', 'Salary', 'employment_encoded',
       'Employment_Frequency_Encoded'],
      dtype='object')