# Import the required libraries

In [1]:
import pandas as pd
import numpy as np

# Implement One Hot Encoder Class

In [2]:
class OneHotEncoder:
    def __init__(self, data):
        self.data = data
        self.bin_unique_values = None
        self.unique_values = None

    def binary_one_hot_encode(self, column):
        self.bin_unique_values = column.unique()
        if len(self.bin_unique_values) != 2:
            raise ValueError("Column must have exactly two unique values for binary encoding.")
        return pd.concat([self.data.reset_index(drop=True), pd.DataFrame(columns=[f"one_hot_{column.name}"], data=np.where(column == self.bin_unique_values[0], 1, 0)).reset_index(drop=True)], axis=1).drop(column.name, axis=1)
    
    def one_hot_encode(self, column):
        self.unique_values = column.unique()
        if len(self.unique_values) <= 2:
            raise ValueError("Column must have at least three unique values, if you have two unique values, use `binary_one_hot_encode` instead.")
        tmp_df = pd.DataFrame(index=column.index, columns=self.unique_values)
        for value in self.unique_values:
            tmp_df[value] = np.where(column == value, 1, 0)
            # print(tmp_df, end='\n----------------------------------------------------------------------------------------------\n')
        return pd.concat([self.data, tmp_df], axis=1).drop(column.name, axis=1)

# Read Data

In [3]:
# Dataset whose categorical predictor has more than two unique values
# (used for the multi-value encoding below).
data = pd.read_csv("auto-mpg.csv")
# Dataset whose categorical predictor has exactly two unique values
# (used for the binary encoding below).
bin_data = pd.read_csv("Student_Performance.csv")

# Binary One Hot Encoding

In [4]:
# Wrap the two-category dataset in an encoder.
bin_one_hot_enc = OneHotEncoder(bin_data)
# Replace "Extracurricular Activities" with a single 0/1 column named
# "one_hot_Extracurricular Activities".
bin_encoded_df = bin_one_hot_enc.binary_one_hot_encode(bin_data["Extracurricular Activities"])
print("Binary One-Hot Encoded DataFrame:")
# Last expression of the cell: the notebook renders the first 10 rows.
bin_encoded_df.head(10)

Binary One-Hot Encoded DataFrame:


Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index,one_hot_Extracurricular Activities
0,7,99,9,1,91.0,1
1,4,82,4,2,65.0,0
2,8,51,7,2,45.0,1
3,5,52,5,2,36.0,1
4,7,75,8,5,66.0,0
5,3,78,9,6,61.0,0
6,7,73,5,6,63.0,1
7,8,45,4,6,42.0,1
8,5,77,8,2,61.0,0
9,4,89,4,0,69.0,0


# Multi-value One Hot Encoding

In [5]:
# Wrap the multi-category dataset in an encoder.
one_hot_enc = OneHotEncoder(data)
# `bin_enc` is a second name bound to the same encoder object.
# NOTE(review): this alias looks like a leftover -- confirm whether anything
# still uses it before removing.
bin_enc = one_hot_enc

# Expand 'car name' into one 0/1 indicator column per distinct car name.
encoded_df = one_hot_enc.one_hot_encode(data["car name"])

print("One-Hot Encoded DataFrame:")
# Last expression of the cell: the notebook renders the first 10 rows.
encoded_df.head(10)

One-Hot Encoded DataFrame:


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,chevrolet chevelle malibu,buick skylark 320,...,chrysler lebaron medallion,ford granada l,toyota celica gt,dodge charger 2.2,chevrolet camaro,ford mustang gl,vw pickup,dodge rampage,ford ranger,chevy s-10
0,18.0,8,307.0,130,3504,12.0,70,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,15.0,8,350.0,165,3693,11.5,70,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,18.0,8,318.0,150,3436,11.0,70,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16.0,8,304.0,150,3433,12.0,70,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17.0,8,302.0,140,3449,10.5,70,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,15.0,8,429.0,198,4341,10.0,70,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,14.0,8,454.0,220,4354,9.0,70,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,14.0,8,440.0,215,4312,8.5,70,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,14.0,8,455.0,225,4425,10.0,70,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9,15.0,8,390.0,190,3850,8.5,70,1,0,0,...,0,0,0,0,0,0,0,0,0,0
