# Solutions II: Transformers

In [1]:
import pandas as pd

In [None]:
# Load the banking data set; the CSV file uses ";" as its field separator.
df = pd.read_csv(
    filepath_or_buffer="../../0_data/banking/bank-additional-full.csv",
    sep=";",
)
# Preview the first three rows.
df.head(3)

In [3]:
# Drop irrelevant / label columns.
# errors="ignore" skips names that are no longer present, so re-running this
# cell after `df` has already been overwritten does not raise a KeyError.
df = df.drop(columns=["y", "duration", "pdays"], errors="ignore")

## Encode Categorical Data

In [4]:
# Import the OneHotEncoder transformer from the preprocessing module.
from sklearn.preprocessing import OneHotEncoder

In [5]:
# Instantiate the one-hot encoder used for categorical data.
# sparse_output=False is chosen so the encoded result is readable when displayed.
encoder = OneHotEncoder(
    sparse_output=False,
)

In [None]:
# Fit the encoder on the job column only.
# The double brackets select a one-column DataFrame rather than a Series.
encoder.fit(X=df[["job"]])

In [None]:
# Encode the job column and show only the first 3 encoded records.
encoder.transform(df[["job"]])[:3]

In [None]:
# What are the feature names created by the encoder?
# These names are used further down as column labels for the transformed data.
encoder.get_feature_names_out()

In [None]:
# Build a labelled DataFrame by hand: the encoded values are the data and
# the encoder's feature names are the column labels. Show the first 3 rows.
job_encoded = encoder.transform(df[["job"]])
job_feature_names = encoder.get_feature_names_out()
pd.DataFrame(job_encoded, columns=job_feature_names).head(3)

In [None]:
# Alternative: configure the encoder with set_output(transform="pandas") so
# that transform() hands back a DataFrame directly. set_output returns the
# estimator itself, which allows the calls to be chained.
encoder.set_output(transform="pandas").transform(df[["job"]]).head(3)

## Transformations per Column

Use a ColumnTransformer to define transformations per column.


See documentation: https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

In [11]:
# Import the ColumnTransformer from the compose module
# and the StandardScaler from the preprocessing module.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [12]:
# Get column names per data type.
# `categorical` is defined as every column that is not numeric, so the two
# groups together always cover all columns of `df`. This does not depend on
# whether pandas stores text as object or as a dedicated string dtype, and it
# prevents columns from being left out of both groups (columns not listed in
# a ColumnTransformer are dropped by default).
# NOTE: any other non-numeric dtype (e.g. bool, datetime) would also end up
# in `categorical`.
numerical = df.select_dtypes(include="number").columns
categorical = df.select_dtypes(exclude="number").columns

In [13]:
# Create fresh transformer instances for the per-column setup:
# a StandardScaler and a OneHotEncoder (the latter replaces the `encoder`
# created earlier in the notebook).
scaler = StandardScaler()
encoder = OneHotEncoder(sparse_output=False)

In [14]:
# Describe each step as (name, transformer, columns it applies to),
# then hand the list of steps to the ColumnTransformer.
per_column_steps = [
    ("encode_categorical", encoder, categorical),
    ("scale_numerical", scaler, numerical),
]
transformer = ColumnTransformer(transformers=per_column_steps)

In [None]:
# Request pandas output, then fit the transformer and transform the data in
# one step. Display the first 3 rows of the result.
transformer.set_output(transform="pandas")
transformed = transformer.fit_transform(df)
transformed.head(3)