#Preprocessing

In [2]:
# Import relevant libraries
import pandas as pd
import numpy as np

# lets import libraries for encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import LabelBinarizer
from category_encoders import BinaryEncoder
from category_encoders import TargetEncoder
from category_encoders import HashingEncoder
from category_encoders import WOEEncoder

In [3]:
# Load dataset from github raw
df = pd.read_csv("https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/feature_encoding_full_dataset.csv")

Encoding gender

In [5]:
# This is a binary category(we can have label or binary encoding)
# Lets initialize LabelEncoder()

le = LabelEncoder()
df["gender_encoded"] = le.fit_transform(df["Gender"])
df[["gender_encoded", "Gender"]]

Unnamed: 0,gender_encoded,Gender
0,1,Male
1,0,Female
2,0,Female
3,0,Female
4,1,Male
...,...,...
295,1,Male
296,0,Female
297,1,Male
298,1,Male


Encode Education level

In [6]:
df.columns

Index(['Gender', 'Education Level', 'Marital Status', 'Employment Type',
       'Country', 'Income', 'gender_encoded'],
      dtype='object')

In [7]:
df["Education Level"].unique()

array(['PhD', 'High School', "Bachelor's", "Master's"], dtype=object)

In [13]:
#Assuming that education level has an inherent order
# "High School" < "Bachelor's" < "Master's" < "PhD"

#Using ordinal encoding
#Step1: Define Order
edu_order = [["High School", "Bachelor's", "Master's", "PhD"]]

#Step2: Initialize ordinal encoder
oe = OrdinalEncoder(categories = edu_order)

#Step3: Fit and transform
df["education_encoded"] = oe.fit_transform(df[["Education Level"]])

#Viewing the encoded data against the actual data
df[["education_encoded", "Education Level"]]

Unnamed: 0,education_encoded,Education Level
0,3.0,PhD
1,3.0,PhD
2,0.0,High School
3,0.0,High School
4,3.0,PhD
...,...,...
295,2.0,Master's
296,1.0,Bachelor's
297,1.0,Bachelor's
298,3.0,PhD


Encode Marital status

In [None]:
df["Marital Status"].unique()

# Onehot encoding will make sense for this category expecially when you are using a tree-based algorithm.

array(['Single', 'Married', 'Divorced', 'Widowed'], dtype=object)

In [34]:
#Initializing One-Hot Encoding
ohe = OneHotEncoder(sparse_output=False, drop = "first") #dropping first to avoid dummy variable trap
marital_encoded = ohe.fit_transform(df[["Marital Status"]])
marital_encoded

array([[0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 1.],
       [0., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0

In [35]:
#Converting to DataFram and concatenating with the dataset
df_ohe =pd.DataFrame(marital_encoded, columns=ohe.get_feature_names_out(["Marital Status"]))
df_marital_encod = pd.concat([df["Marital Status"], df_ohe], axis=1)
df_marital_encod.head()

Unnamed: 0,Marital Status,Marital Status_Married,Marital Status_Single,Marital Status_Widowed
0,Single,0.0,1.0,0.0
1,Single,0.0,1.0,0.0
2,Married,1.0,0.0,0.0
3,Divorced,0.0,0.0,0.0
4,Widowed,0.0,0.0,1.0
