# Solutions II: Transformers

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the banking data set; the file uses ";" as its field separator.
df = pd.read_csv(
    filepath_or_buffer="../../0_data/banking/bank-additional-full.csv",
    sep=";",
)
# Preview the first three rows.
df.head(n=3)

In [None]:
# Remove the columns that are not used as features below
# (`y` is presumably the label column -- confirm against the data description).
df = df.drop(["y", "duration", "pdays"], axis="columns")

## Encode Categorical Data

In [None]:
# Import the transformer from the preprocessing module.
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Create a OneHotEncoder to encode the categorical data.
# Note: set sparse_output to False so that transform returns a dense array
# rather than a sparse matrix, which keeps the displayed output readable.
ohe = OneHotEncoder(sparse_output=False)

In [None]:
# Fit the encoder to the job column.
# The double brackets select a one-column DataFrame (2D) instead of a Series,
# because the encoder expects two-dimensional input.
ohe.fit(df[["job"]])

In [None]:
# Encode the job column and show only the first three encoded rows.
ohe.transform(df[["job"]])[:3]

In [None]:
# What are the feature names created by the encoder?
# The result has one name per output column of the fitted encoder.
ohe.get_feature_names_out()

In [None]:
# Combine the encoded values and the feature names to get a DataFrame.
# Passing the index of df keeps every encoded row aligned with its source
# row; without it the result would get a fresh 0..n-1 index.
(
    pd.DataFrame(
        data=ohe.transform(df[["job"]]),
        columns=ohe.get_feature_names_out(),
        index=df.index,
    )
    .head(3)
)

In [None]:
# Alternative: let the encoder return a pandas DataFrame directly.
# set_output returns the encoder itself, so the calls can be chained.
ohe.set_output(transform="pandas").transform(df[["job"]]).head(3)

## Transformations per Column

Use a ColumnTransformer to define transformations per column.


See documentation: https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

In [None]:
# Import the ColumnTransformer from the compose module.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [None]:
# Collect the column labels for each kind of data:
# object-typed columns are treated as categorical, numeric ones as numerical.
categorical = df.select_dtypes(include="object").columns
numerical = df.select_dtypes(include="number").columns

In [None]:
# Instantiate the two transformers used below:
# a dense-output one-hot encoder and a standard scaler.
ohe, ss = OneHotEncoder(sparse_output=False), StandardScaler()

In [None]:
# Build the ColumnTransformer.
# Each entry is a (name, transformer, columns) triple: the encoder is applied
# to the categorical columns and the scaler to the numerical columns.
transformer = ColumnTransformer(
    [
        ("encode_categorical", ohe, categorical),
        ("scale_numerical", ss, numerical),
    ]
)

In [None]:
# Request pandas output, then fit and transform the data in one chain
# (set_output returns the transformer itself); show the first three rows.
transformer.set_output(transform="pandas").fit_transform(df).head(3)