# Solutions I: Transformers

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the banking data set (semicolon-separated) and preview 3 rows.
csv_path = "../../0_data/banking/bank-additional-full.csv"
df = pd.read_csv(csv_path, sep=";")
df.head(3)

In [None]:
# Remove the label column and the columns considered irrelevant.
unused_columns = ["y", "duration", "pdays"]
df = df.drop(columns=unused_columns)

## Encode Categorical Data

In [None]:
# Import the transformer from the preprocessing module.
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Create a OneHotEncoder to encode the categorical data.
# Note: sparse_output=False makes transform() return a dense array instead of
# a sparse matrix, which is easier to read when displayed.
ohe = OneHotEncoder(sparse_output=False)

In [None]:
# Fit the encoder to the job column.
# Double brackets select a one-column DataFrame (2D) rather than a Series.
ohe.fit(df[["job"]])

In [None]:
# Transform the job column and look at the encoded rows of the first 3 records.
ohe.transform(df[["job"]])[0:3]

In [None]:
# What are the feature names created by the encoder?
# These label the output columns produced by transform().
ohe.get_feature_names_out()

In [None]:
# Combine the encoded values with the generated feature names
# to present the result as a DataFrame.
encoded_values = ohe.transform(df[["job"]])
encoded_columns = ohe.get_feature_names_out()
pd.DataFrame(data=encoded_values, columns=encoded_columns).head(3)

In [None]:
# Alternative: configure the encoder itself to return a pandas DataFrame.
ohe.set_output(transform="pandas")
encoded_jobs = ohe.transform(df[["job"]])
encoded_jobs.head(3)

## Transformations per Column

Use a ColumnTransformer to define transformations per column.


See documentation: https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

In [None]:
# Import the ColumnTransformer from the compose module.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [None]:
# Collect the column names per data type: object columns are treated as
# categorical, number columns as numerical.
categorical = df.select_dtypes(include="object").columns
numerical = df.select_dtypes(include="number").columns

In [None]:
# Create a OneHotEncoder and StandardScaler.
# Note: this rebinds `ohe` to a fresh, unfitted encoder with dense output.
ohe = OneHotEncoder(sparse_output=False)
ss = StandardScaler()

In [None]:
# Set up the ColumnTransformer.
# Each step is a (name, transformer, columns) triple.
column_steps = [
    ("encode_categorical", ohe, categorical),
    ("scale_numerical", ss, numerical),
]
transformer = ColumnTransformer(transformers=column_steps)

In [None]:
# Request DataFrame output, then fit and transform the data in one step.
transformer.set_output(transform="pandas")
transformed = transformer.fit_transform(df)
transformed.head(3)

## Create Winsorizing Transformer

In [None]:
# Data that needs to be transformed.
# A seeded generator keeps the random columns identical on every run, so the
# outputs shown further down are reproducible.
n = 101
rng = np.random.default_rng(seed=42)
df = pd.DataFrame({
    "linear": np.linspace(0, 100, n),
    "uniform": rng.uniform(0, 100, n),
    "lognormal": rng.lognormal(2.5, 1.2, n),
})

In [None]:
# Create a Transformer that Winsorizes numerical data.

In [None]:
class Winsorizer:
    """Transformer for Winsorizing extreme values in a pandas DataFrame.

    Quantile values are learned per numeric column in ``fit`` and used as
    clipping limits in ``transform``. Non-numeric columns are left untouched.

    Parameters
    ----------
    lower_bound : float, default=0
        Lower quantile to restrict values to, in the range [0, 1].
    upper_bound : float, default=.95
        Upper quantile to restrict values to, in the range [0, 1].

    Raises
    ------
    ValueError
        If a bound lies outside [0, 1] or ``lower_bound`` exceeds
        ``upper_bound``.
    """
    def __init__(self, lower_bound=.0, upper_bound=.95):
        if not 0 <= lower_bound <= 1:
            raise ValueError("Lower bound quantile must be between 0 and 1.")
        if not 0 <= upper_bound <= 1:
            raise ValueError("Upper bound quantile must be between 0 and 1.")
        if lower_bound > upper_bound:
            # Inverted bounds would make the clipping limits meaningless.
            raise ValueError(
                "Lower bound quantile must not exceed upper bound quantile."
            )

        self.lower_bound = lower_bound
        self.upper_bound = upper_bound

    def fit(self, X, y=None):
        """Fit the transformer by storing quantile values per column.

        Parameters
        ----------
        X : pandas.DataFrame
            Data to learn the clipping limits from.
        y : ignored
            Present for API compatibility only.

        Returns
        -------
        self : Winsorizer
            The fitted transformer.
        """
        # numeric_only=True skips text/categorical columns instead of failing
        # on them; for all-numeric frames the result is unchanged.
        self.lower = X.quantile(self.lower_bound, numeric_only=True).to_dict()
        self.upper = X.quantile(self.upper_bound, numeric_only=True).to_dict()

        return self

    def transform(self, X, y=None):
        """Transform X by clipping extreme values.

        Parameters
        ----------
        X : pandas.DataFrame
            Data containing (at least) the columns seen during ``fit``.
        y : ignored
            Present for API compatibility only.

        Returns
        -------
        pandas.DataFrame
            Copy of X in which every fitted column is clipped to its stored
            lower and upper quantile values. X itself is not modified.
        """
        # Define clipping transformation for each fitted column.
        winsorize = {
            column: X[column].clip(lower=lower, upper=self.upper[column])
            for column, lower in self.lower.items()
        }

        # Apply the transformations; assign() returns a new DataFrame.
        return X.assign(**winsorize)

In [None]:
# Create and fit the Transformer.
# The 3% and 97% quantiles of each column in df become the clipping limits.
ws = Winsorizer(lower_bound=.03, upper_bound=.97)
ws.fit(df)

In [None]:
# Transform the dummy data.
# Note: Linear values are indeed clipped (compare the first and last rows
# with the original 0 and 100).
ws.transform(df)

### More formal approach

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
from sklearn.utils.validation import check_is_fitted


class Winsorizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    """Transformer for Winsorizing extreme values.

    Quantile values are learned per column in ``fit`` and used as clipping
    limits in ``transform``. Accepts pandas DataFrames and 2D array-likes.

    Parameters
    ----------
    lower_bound : float, default=0
        Lower quantile to restrict values to, in the range [0, 1].
    upper_bound : float, default=.95
        Upper quantile to restrict values to, in the range [0, 1].

    Attributes
    ----------
    quantiles_ : numpy.ndarray of shape (2, n_features)
        Row 0 holds the lower clipping limits, row 1 the upper limits.
    n_features_in_ : int
        Number of columns seen during ``fit``.
    feature_names_in_ : list of str
        Column names seen during ``fit``; only set when X had columns.

    Raises
    ------
    ValueError
        If a bound lies outside [0, 1] or ``lower_bound`` exceeds
        ``upper_bound``.
    """
    def __init__(self, lower_bound=.0, upper_bound=.95):
        # NOTE(review): scikit-learn convention is to validate parameters in
        # fit(), because set_params() bypasses these checks. Validation is
        # kept here so existing callers see the same errors at construction.
        if not 0 <= lower_bound <= 1:
            raise ValueError("Lower bound quantile must be between 0 and 1.")
        if not 0 <= upper_bound <= 1:
            raise ValueError("Upper bound quantile must be between 0 and 1.")
        if lower_bound > upper_bound:
            # Inverted bounds would make the clipping limits meaningless.
            raise ValueError(
                "Lower bound quantile must not exceed upper bound quantile."
            )

        self.lower_bound = lower_bound
        self.upper_bound = upper_bound

    def fit(self, X, y=None):
        """Fit the transformer by storing quantile values.

        Parameters
        ----------
        X : pandas.DataFrame or array-like of shape (n_samples, n_features)
            Data to learn the clipping limits from.
        y : ignored
            Present for API compatibility only.

        Returns
        -------
        self : Winsorizer
            The fitted transformer.

        Raises
        ------
        ValueError
            If X is not two-dimensional.
        """
        if hasattr(X, "columns"):
            # Store input feature names if available.
            self.feature_names_in_ = list(X.columns)

            # Convert to numpy.ndarray
            X = X.values
        elif hasattr(self, "feature_names_in_"):
            # Refit on unnamed data: drop names left over from an earlier fit.
            del self.feature_names_in_

        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(f"Expected 2D input, got {X.ndim}D input instead.")

        # Needed by get_feature_names_out() when no column names were seen.
        self.n_features_in_ = X.shape[1]

        # Get quantiles from the data using numpy.
        self.quantiles_ = np.quantile(
            X, [self.lower_bound, self.upper_bound], axis=0
        )

        return self

    def transform(self, X, y=None):
        """Transform X by clipping extreme values.

        Parameters
        ----------
        X : pandas.DataFrame or array-like of shape (n_samples, n_features)
            Data to clip; columns must match those seen during ``fit``.
        y : ignored
            Present for API compatibility only.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features)
            New array with every column clipped to its fitted limits. The
            input X is never modified.

        Raises
        ------
        KeyError
            If X has column names that differ from those seen during ``fit``.
        ValueError
            If X does not have the same number of columns as during ``fit``.
        """
        # Check fit was called before transforming.
        check_is_fitted(self)

        if hasattr(X, "columns"):
            # Verify column names, but only if names were seen during fit.
            expected = getattr(self, "feature_names_in_", None)
            if expected is not None and list(X.columns) != expected:
                raise KeyError(f"Column names do not match: {self.feature_names_in_}.")

            # Convert to numpy.ndarray.
            X = X.values

        X = np.asarray(X)
        n_features = self.quantiles_.shape[1]
        if X.ndim != 2 or X.shape[1] != n_features:
            raise ValueError(
                f"X has shape {X.shape}, but Winsorizer is expecting "
                f"{n_features} features as input."
            )

        # Clip all columns at once; the per-column limits broadcast over rows.
        # np.clip returns a new array, so the caller's data stays untouched
        # and integer input is not truncated by an in-place assignment.
        lower, upper = self.quantiles_
        return np.clip(X, lower, upper)

In [None]:
# Create and fit the Transformer.
# `ws` now refers to an instance of the scikit-learn based Winsorizer.
ws = Winsorizer(lower_bound=.03, upper_bound=.97)
ws.fit(df)

In [None]:
# Now inherits get_feature_names_out() (provided by OneToOneFeatureMixin).
ws.get_feature_names_out()

In [None]:
# Also inherits fit_transform method (provided by TransformerMixin).
# Show the first 3 transformed rows.
ws.fit_transform(df)[0:3]

In [None]:
# And supports pandas output.
# After set_output, transform returns a DataFrame instead of an array.
ws.set_output(transform="pandas")
ws.transform(df).head(3)