# <u>Data Science Essentials</u>

## <u>Topic</u>: One-hot Encoding

## <u>Category</u>: Feature Engineering

### <u>Created By</u>: Mohammed Misbahullah Sheriff
- [LinkedIn](https://www.linkedin.com/in/mohammed-misbahullah-sheriff/)
- [GitHub](https://github.com/MisbahullahSheriff)

## 1. Importing Libraries

In [1]:
import numpy as np
import pandas as pd

import seaborn as sns

import sklearn

from sklearn.preprocessing import OneHotEncoder

from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.utils.validation import check_is_fitted

In [2]:
# Ask scikit-learn transformers to return pandas DataFrames (column names kept)
# from transform / fit_transform instead of bare NumPy arrays
sklearn.set_config(transform_output="pandas")

## 2. Getting the Data

In [3]:
# Load the "tips" dataset through seaborn and preview its first rows
df = sns.load_dataset("tips")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


### 2.1 Splitting into Train and Test Data

In [4]:
# The bill amount is the target; every remaining column is a predictor
y = df["total_bill"].copy()
X = df.drop(columns=["total_bill"])

# Hold out 20% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

# Report (features, target) shapes for the train split, then the test split
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    print(split_X.shape, split_y.shape)

(195, 6) (195,)
(49, 6) (49,)


### 2.2 Features to Encode

In [5]:
# Every non-numeric column of the training split is a feature to encode
non_numeric = X_train.select_dtypes(exclude="number")
cols = non_numeric.columns.tolist()

cols

['sex', 'smoker', 'day', 'time']

## 3. Demo 1 - Scikit-learn

### 3.1 Setting up the Data Preprocessor

In [6]:
# Step 1: replace missing entries with the explicit label "missing"
imputer = SimpleImputer(fill_value="missing", strategy="constant")
# Step 2: expand each category into its own dense indicator column;
# categories unseen during fit are ignored at transform time
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

categorical_steps = [("imputer", imputer), ("encoder", encoder)]
pipeline = Pipeline(categorical_steps)

# Apply the pipeline to the categorical columns only; pass the rest through unchanged
preprocessor = ColumnTransformer(
    [("pre", pipeline, cols)],
    remainder="passthrough",
)

### 3.2 Transforming the Data

In [7]:
# Fit the preprocessor on the training split and encode that split in one step
X_train_enc = preprocessor.fit_transform(X_train)

print("Shape of Preprocessed Data:", X_train_enc.shape)
X_train_enc.head()

Shape of Preprocessed Data: (195, 12)


Unnamed: 0,pre__sex_Female,pre__sex_Male,pre__smoker_No,pre__smoker_Yes,pre__day_Fri,pre__day_Sat,pre__day_Sun,pre__day_Thur,pre__time_Dinner,pre__time_Lunch,remainder__tip,remainder__size
66,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.47,2
208,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,2.03,2
193,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,2.02,2
77,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,4.0,4
96,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,4.0,2


- The original dataset had 4 categorical features to encode
- As a result of one-hot encoding, the 4 features were transformed into 10 features
- The numeric variables were untouched and are located at the end of the dataset

## 4. Demo 2 - Custom Implementation

### 4.1 Custom Class

In [8]:
class CustomOHE(BaseEstimator, TransformerMixin):
  """
  Description:
  ------------
  This class will perform one-hot encoding on a given dataset

  Attributes (available after calling 'fit'):
  -------------------------------------------
  categories_: dict
               Maps each encoded feature to the categories that receive an indicator column,
               i.e. the observed categories minus any category removed by 'drop'

  seen_categories_: dict
                    Maps each encoded feature to every category observed during fit,
                    including a category removed by 'drop'
  """

  def __init__(self,
               drop=None,
               variables=None,
               handle_unknown="ignore"):
    """
    Parameters:
    -----------
    drop: str
          To determine whether or not to drop an encoded feature. Helps to deal with multi-collinearity.
          Possible values are ('first', 'if_binary', None)
          - 'first' is useful for linear models where multi-collinearity can impact interpretation of coefficients.
            A feature with a single category is dropped entirely.
          - 'if_binary' can be used for tree-based models

    variables: list
               List of variables to encode. If 'None', will encode all variables of object type.

    handle_unknown: str
                    Determines how to deal with unknown categories during transform.
                    Possible values are ('ignore', 'error')
    """
    self.drop = drop
    self.variables = variables
    self.handle_unknown = handle_unknown


  def fit(self, X, y=None):
    """
    Description:
    ------------
    This method will identify the categories of each feature to encode

    Parameters:
    -----------
    X: dataframe
       The dataset to encode

    y: array-like
       Optional. Default value is 'None'. Not used by this method.

    Returns:
    --------
    self: the fitted encoder
    """
    if self.variables is None:
      encode_cols = (
          X
          .select_dtypes(include=["object"])
          .columns
          .to_list()
      )
    else:
      encode_cols = self.variables

    self.categories_ = dict()
    self.seen_categories_ = dict()
    for col in encode_cols:
      # categories are kept in order of first appearance
      categories = list(X.loc[:, col].unique())

      # remember everything observed, so that a category removed by 'drop'
      # is not mistaken for an unknown category during transform
      self.seen_categories_[col] = categories

      if self.drop == "first":
        drop_first = True
      elif self.drop == "if_binary":
        drop_first = len(categories) == 2
      else:
        drop_first = False

      # with drop='first', a single-category feature keeps no indicator columns,
      # so transform removes the feature instead of leaking the raw column
      self.categories_[col] = categories[1:] if drop_first else categories

    return self


  def transform(self, X):
    """
    Description:
    ------------
    This method will transform the given dataset

    Parameters:
    -----------
    X: dataframe
       The dataset to transform

    Returns:
    --------
    dataframe: copy of X where each encoded feature is replaced by 0/1 indicator
               columns named '<feature>_<category>', appended after the other columns

    Raises:
    -------
    KeyError: if handle_unknown is 'error' and X holds a category not observed during fit
    """
    check_is_fitted(self, ["categories_", "seen_categories_"])

    temp = X.copy()
    for column, categories in self.categories_.items():
      values = X.loc[:, column]

      if self.handle_unknown == "error":
        # compare against all categories observed during fit, not only the encoded ones
        if not values.isin(self.seen_categories_[column]).all():
          raise KeyError(f"Unknown category found in '{column}' during transform.")

      # an unknown category matches no indicator, so its row is all zeros
      indicators = {
          f"{column}_{category}": values.eq(category).astype(int)
          for category in categories
      }
      temp = temp.assign(**indicators).drop(columns=column)

    return temp

### 4.2 Setting up the Data Preprocessor

In [9]:
# Step 1: replace missing entries with the explicit label "missing"
imputer = SimpleImputer(fill_value="missing", strategy="constant")
# Step 2: one-hot encode the categorical columns with the custom encoder
# (no category dropped, unknown categories ignored)
encoder = CustomOHE(drop=None, variables=cols, handle_unknown="ignore")

custom_steps = [("imputer", imputer), ("encoder", encoder)]
pipeline = Pipeline(custom_steps)

### 4.3 Transforming the Data

In [10]:
# Fit the custom pipeline on the training split and encode that split in one step
X_train_enc = pipeline.fit_transform(X_train)

print("Shape of Preprocessed Data:", X_train_enc.shape)
X_train_enc.head()

Shape of Preprocessed Data: (195, 12)


Unnamed: 0,tip,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Sat,day_Thur,day_Fri,day_Sun,time_Dinner,time_Lunch
66,2.47,2,1,0,1,0,1,0,0,0,1,0
208,2.03,2,0,1,0,1,1,0,0,0,1,0
193,2.02,2,0,1,0,1,0,1,0,0,0,1
77,4.0,4,0,1,1,0,0,1,0,0,0,1
96,4.0,2,0,1,0,1,0,0,1,0,1,0


In [11]:
# Encode the test split with the pipeline already fitted on the training split
pipeline.transform(X_test).head()

Unnamed: 0,tip,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Sat,day_Thur,day_Fri,day_Sun,time_Dinner,time_Lunch
80,3.0,2,0,1,0,1,0,1,0,0,0,1
129,2.18,3,0,1,1,0,0,1,0,0,0,1
3,3.31,2,0,1,1,0,0,0,0,1,1,0
184,3.0,2,0,1,0,1,0,0,0,1,1,0
177,2.0,2,0,1,0,1,0,0,0,1,1,0


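- The custom implementation produces the same one-hot encoded values as the scikit-learn implementation; only the presentation differs: the columns carry no `pre__`/`remainder__` prefix, the untouched numeric columns come first instead of last, the categories are ordered by first appearance rather than alphabetically, and the indicators are integers instead of floats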