# Exercises III: Transformer Class

In [25]:
import numpy as np
import pandas as pd

In [26]:
# Generate some dummy data to transform.
n = 150
cities = {
    "Amsterdam": .20,
    "Rotterdam": .20,
    "Den Haag":  .18,
    "Utrecht":   .12,
    "Eindhoven": .10,
    "Groningen": .06,
    "Tilburg":   .04,
    "Almere":    .04,
    "Breda":     .03,
    "Nijmegen":  .03,
}

df = pd.DataFrame({
    "city": np.random.choice(list(cities), n, p=[p for _, p in cities.items()]),
    "label": np.random.choice(list("ABCDEF"), n, p=[.05, .3, .3, .2, .1, .05])
})
df.head(3)

Unnamed: 0,city,label
0,Amsterdam,D
1,Rotterdam,C
2,Eindhoven,F


In [29]:
class InfrequentRecoder:
    """Transformer for recoding infrequent categories."""
    def __init__(self, topn=3, replace="other"):
        # Dict for storing most frequent categories
        self._most_frequent = {}

        # Store additional settings.
        self.topn = topn
        self.replace = replace

    def fit(self, X, y=None):
        """Store most frequent categories per column."""
        # Get all categorical columns.
        columns = X.select_dtypes("object").columns
        
        # Store top 3 categories per column.
        for column in columns:
            most_frequent = X[column].value_counts().head(self.topn)
            self._most_frequent[column] = most_frequent.index
        
        return self
    
    def transform(self, X, y=None):
        """Recode all but the most frequent categories."""
        
        # Go through the stored columns and categories.
        recodes = {}
        for column, most_frequent in self._most_frequent.items():
            
            # Define recoding for each column.
            recodes[column] = X[column].map(
                lambda v: v if v in most_frequent else self.replace
            )

        # Return transformed data.
        return X.assign(**recodes)

In [30]:
# Create and fit the InfrequentRecoder.
recoder = InfrequentRecoder()
recoder.fit(df)

<__main__.InfrequentRecoder at 0x182e4fb4370>

In [31]:
# Transform the dummy data.
df_recoded = recoder.transform(df)
df_recoded

Unnamed: 0,city,label
0,Amsterdam,D
1,Rotterdam,C
2,other,other
3,other,C
4,other,C
...,...,...
145,Amsterdam,D
146,other,D
147,Rotterdam,other
148,Amsterdam,other


In [32]:
# Check recoded values.
df_recoded["city"].value_counts()

city
other        59
Rotterdam    36
Amsterdam    32
Den Haag     23
Name: count, dtype: int64