# Exercises III: Transformer Class

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder

In [None]:
# Generate some dummy data to transform.
# A seeded generator keeps the dummy data identical on every run, so the
# outputs of the cells below are reproducible.
rng = np.random.default_rng(42)

n = 150

# City name -> sampling probability (the probabilities sum to 1).
cities = {
    "Amsterdam": .20,
    "Rotterdam": .20,
    "Den Haag":  .18,
    "Utrecht":   .12,
    "Eindhoven": .10,
    "Groningen": .06,
    "Tilburg":   .04,
    "Almere":    .04,
    "Breda":     .03,
    "Nijmegen":  .03,
}

# Two categorical columns with skewed category frequencies.
df = pd.DataFrame({
    "city": rng.choice(list(cities), size=n, p=list(cities.values())),
    "label": rng.choice(list("ABCDEF"), size=n, p=[.05, .3, .3, .2, .1, .05]),
})
df.head(3)

In [3]:
class InfrequentRecoder:
    """Transformer for recoding infrequent categories.

    ``fit`` remembers, for every text-like column, the ``topn`` most
    frequent categories. ``transform`` replaces every other value in those
    columns with ``replace``.

    Parameters
    ----------
    topn : int, default 3
        Number of most frequent categories to keep per column.
    replace : default "other"
        Value substituted for every category outside the top ``topn``.
    """

    def __init__(self, topn=3, replace="other"):
        # Dict for the most frequent categories per column; filled by fit().
        self._most_frequent = {}

        # Store additional settings.
        self.topn = topn
        self.replace = replace

    def fit(self, X, y=None):
        """Store the most frequent categories per column.

        Parameters
        ----------
        X : pandas.DataFrame
            Data whose text-like columns are inspected.
        y : ignored
            Accepted but not used.

        Returns
        -------
        InfrequentRecoder
            The fitted instance itself, so calls can be chained.
        """
        # Start from a clean slate: without this, refitting on a frame with
        # different columns would keep stale entries and make transform()
        # fail on columns that no longer exist.
        self._most_frequent = {}

        # Get all categorical (text-like) columns. "string" is included so
        # that columns using a dedicated string dtype are picked up as well
        # as plain object columns.
        columns = X.select_dtypes(include=["object", "string"]).columns

        # Store the `topn` most frequent categories per column.
        for column in columns:
            most_frequent = X[column].value_counts().head(self.topn)
            self._most_frequent[column] = most_frequent.index

        return self

    def transform(self, X, y=None):
        """Recode all but the most frequent categories.

        Parameters
        ----------
        X : pandas.DataFrame
            Data containing every column that was stored during ``fit``.
        y : ignored
            Accepted but not used.

        Returns
        -------
        pandas.DataFrame
            A new frame (built with ``X.assign``) in which the fitted
            columns are recoded; ``X`` itself is left untouched.

        Raises
        ------
        KeyError
            If a column stored during ``fit`` is missing from ``X``.
        """
        # Go through the stored columns and categories.
        recodes = {}
        for column, most_frequent in self._most_frequent.items():

            # Create recoded Series for each column: keep a value when it is
            # one of the stored categories, otherwise use the replacement.
            recodes[column] = X[column].map(
                lambda v, keep=most_frequent: v if v in keep else self.replace
            )

        # Return transformed data.
        return X.assign(**recodes)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the recoded version of ``X``."""
        return self.fit(X, y).transform(X)

In [None]:
# Create an InfrequentRecoder with its default settings (topn=3,
# replace="other") and fit it on the dummy data.
recoder = InfrequentRecoder()
recoder.fit(df)

In [None]:
# Transform the dummy data: in every column the recoder was fitted on,
# values outside the stored most frequent categories are replaced.
df_recoded = recoder.transform(df)
df_recoded

In [None]:
# Check recoded city values: count how often each remaining category occurs.
df_recoded["city"].value_counts()

In [None]:
# Check recoded label values: count how often each remaining category occurs.
df_recoded["label"].value_counts()

In [None]:
# Create a OneHotEncoder that produces dense output (sparse_output=False)
# and is configured to return pandas objects from its transform step.
encoder = OneHotEncoder(sparse_output=False)
encoder.set_output(transform="pandas")

In [None]:
# Recode the dummy data, then fit the encoder on the recoded frame and
# transform it in a single step.
df_recoded = recoder.transform(df)
encoder.fit_transform(df_recoded)