In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler, QuantileTransformer

# Banking data: Preparation

In [None]:
# Read the banking CSV into a DataFrame; the file uses ";" as field separator.
df = pd.read_csv(
    "../0_data/banking/bank-additional-full.csv",
    sep=";",
)
# Show (rows, columns) as a quick check that the load worked.
df.shape

## Stateless preparation

In [None]:
# Drop duration as it cannot be used in prediction.
# errors="ignore" keeps this cell idempotent: it rebinds `df`, so re-running it
# without reloading the data would otherwise raise KeyError because the column
# is already gone.
df = df.drop(columns="duration", errors="ignore")

In [None]:
# Set 999 as NaN in pdays.
# NOTE(review): 999 looks like a "no value" sentinel - confirm against the dataset docs.
# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
df["pdays"] = df["pdays"].replace({999: np.nan})

## Categorical features

In [None]:
# One-hot encode a random 5-row sample of "job" with pd.get_dummies().
# Caveat: the resulting dummy columns are derived from whatever values happen
# to be in the sample, so they can differ from run to run.
df["job"].sample(n=5).pipe(pd.get_dummies)

In [None]:
# Using OneHotEncoder and fitting it to a specific sample.
# sparse_output=False makes the encoder return a dense array. It replaces the
# `sparse` keyword, which was deprecated in scikit-learn 1.2 and removed in 1.4
# (requires scikit-learn >= 1.2).
encoder = OneHotEncoder(sparse_output=False)
encoder.fit_transform(df[["job"]].sample(5))

In [None]:
# Sample contained these categories...
# The fitted encoder reports its output column names; these reflect only the
# categories that were present in the sample it was fitted on.
encoder.get_feature_names_out()

In [None]:
# Transforming a new sample which may have new categories.
# Note: Use handle_unknown="ignore" on OneHotEncoder to ignore new categories.
# With the default handle_unknown="error", transform() raises ValueError when the
# new sample contains a category that was not seen during fit. The error is caught
# and shown so that a full "Run All" does not stop at this cell.
new_jobs = df[["job"]].sample(5)
try:
    encoded = encoder.transform(new_jobs)
except ValueError as err:
    encoded = None
    print(f"Encoder rejected the new sample: {err}")
encoded

In [None]:
# Display the column labels currently in the DataFrame.
df.columns

## Numeric features

In [None]:
# Fit a StandardScaler on a random sample of 500 ages; fit_transform() learns
# the scaling parameters from the sample and returns the scaled values.
scaler = StandardScaler()
age_sample = df[["age"]].sample(500)
age = scaler.fit_transform(age_sample)

# Only one column was fitted, so index 0 holds the parameters learned for "age".
learned_mean = scaler.mean_[0]
learned_sd = scaler.scale_[0]
print(f"""
    StandardScaler learned mean and SD
    Mean: {learned_mean:.2f}
    SD:   {learned_sd:.2f}
""")

In [None]:
# Summarise the transformed `age` array: report its mean and standard deviation.
age_mean = age.mean()
age_sd = age.std()
print(f"""
    Transformed age values mean and SD
    ----------------------------------
    Mean: {age_mean:.2f}
    SD:   {age_sd:.2f}
""")

In [None]:
# Histogram of the transformed `age` values.
# squeeze() drops length-1 axes so the array can be wrapped in a Series.
hist_options = dict(bins=25, edgecolor="white", figsize=(4, 2))
pd.Series(age.squeeze()).plot.hist(**hist_options)

In [None]:
# Fit a QuantileTransformer (100 quantiles, normal output distribution) on a
# random sample of 500 ages and keep the transformed values.
# Note: this rebinds `age`, replacing any array an earlier cell stored under that name.
qtransformer = QuantileTransformer(
    output_distribution="normal",
    n_quantiles=100,
)
age_sample = df[["age"]].sample(500)
age = qtransformer.fit_transform(age_sample)

In [None]:
# Histogram of the values currently held in `age`.
# squeeze() drops length-1 axes so the array can be wrapped in a Series.
age_values = pd.Series(age.squeeze())
age_values.plot.hist(bins=25, edgecolor="white", figsize=(4, 2))