# Solutions III: Modelling

In [None]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

## Load Data

In [None]:
# Load the banking data set; the file is semicolon-separated.
data_path = "../../0_data/banking/bank-additional-full.csv"
df = pd.read_csv(data_path, sep=";")
df.head(3)

In [None]:
# Separate the label column "y" from the feature columns.
# Note: "duration" and "pdays" are left out of the features as well.
y = df["y"]
X = df.drop(columns=["duration", "pdays", "y"])

## Data Preparation

In [None]:
# Collect the names of the object-dtype (categorical) columns.
categorical = X.select_dtypes(include="object").columns

In [None]:
# Create a OneHotEncoder.
# Note: sparse_output=False makes the encoder return a dense array, which is
# required for the pandas output that is configured on the transformer below.
ohe = OneHotEncoder(sparse_output=False)

In [None]:
# Set up the ColumnTransformer: one-hot encode the categorical columns and
# pass all remaining columns through unchanged.
encode_step = ("encode_categorical", ohe, categorical)
transformer = ColumnTransformer(transformers=[encode_step], remainder="passthrough")

In [None]:
# Transform the data.
# Note: set_output(transform="pandas") makes the result a DataFrame; it returns
# the transformer itself, so fit_transform can be chained directly.
Xt = transformer.set_output(transform="pandas").fit_transform(X)

In [None]:
# Quick inspection of the transformed data (first 3 rows).
Xt.head(3)

## RandomForest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Create random forest model instance.
# Note: n_jobs=-1 uses all available processors. No random_state is set, so
# the fitted model (and its scores) can differ slightly between runs.
rf = RandomForestClassifier(n_jobs=-1)

In [None]:
# Fit the model to the transformed features and the labels.
rf.fit(Xt, y)

In [None]:
# Create predictions.
# Note: These are predictions for the same rows the model was fitted on.
predicted = rf.predict(Xt)

In [None]:
# Accuracy (fraction of rows where the prediction equals the label) is
# extremely high.
# Note: It is computed on the training data itself, so it does not show how
# well the model generalises to unseen data.
(y == predicted).mean()

### Feature Importances

In [None]:
# Importance per feature as a plain array; not very readable without the
# feature names...
rf.feature_importances_

In [None]:
# Pair every feature name with its importance in a DataFrame.
importance = pd.DataFrame(
    data=dict(
        feature=rf.feature_names_in_,
        importance=rf.feature_importances_,
    )
)

In [None]:
# Show the 10 features with the highest importance, rounded to 3 decimals.
top_features = importance.sort_values(by="importance", ascending=False).head(10)
top_features.round(3)

### Probabilities

In [None]:
# Get the prediction probabilities (one column per class) and show those of
# the first row.
probabilities = rf.predict_proba(Xt)
probabilities[0]

In [None]:
# First probability corresponds to the "no" label.
# Compare with the predicted label of the first row.
predicted[0]

In [None]:
# Combine the data with predictions and probabilities (for "yes").
# Note: The columns of predict_proba follow the order of rf.classes_, so the
# "yes" column is looked up by label instead of hard-coding its position.
yes_column = list(rf.classes_).index("yes")
analysis = df.assign(
    predicted=predicted,
    probability_yes=probabilities[:, yes_column],
)

In [None]:
# Certainty is the absolute distance of the "yes" probability to the 0.5
# decision boundary; uncertain predictions have a value close to 0.
distance_to_boundary = (analysis["probability_yes"] - 0.5).abs()
analysis = analysis.assign(certainty=distance_to_boundary)
analysis.sample(3)

In [None]:
# Find the top 10 most uncertain predictions.
(
    analysis

    # Lowest certainty first, i.e. closest to the 0.5 boundary.
    .sort_values("certainty")

    .head(10)
    .T
)

In [None]:
# Find the 10 most confident predictions that were wrong.
# Keep only the rows where the prediction differs from the label.
wrong = analysis.loc[analysis["y"] != analysis["predicted"]]

# Sort on descending certainty and show the first 10, transposed.
wrong.sort_values("certainty", ascending=False).head(10).T

## Combine with Pipeline

In [None]:
# Column names per data type: object columns are treated as categorical,
# number columns as numerical.
numerical = X.select_dtypes(include="number").columns
categorical = X.select_dtypes(include="object").columns

In [None]:
# Set up the ColumnTransformer with a fresh OneHotEncoder (default settings)
# for the categorical columns; all other columns are passed through unchanged.
encoder = OneHotEncoder()
transformer = ColumnTransformer(
    [("encode_categorical", encoder, categorical)],
    remainder="passthrough",
)

In [None]:
# Set up the model: a random forest classifier.
# Note: n_jobs=-1 uses all available processors.
random_forest = RandomForestClassifier(n_jobs=-1)

In [None]:
# Chain the preparation step and the model into a single Pipeline.
steps = [
    ("preparation", transformer),
    ("model", random_forest),
]
pipe = Pipeline(steps)

In [None]:
# Fit now prepares the data and then feeds it to the model.
# Note: The untransformed features X are passed in; the "preparation" step
# takes care of the encoding.
pipe.fit(X, y)

In [None]:
# Predict also prepares the data before feeding it to the model.
# Note: These are again predictions for the rows the pipeline was fitted on.
pipe.predict(X)

In [None]:
# Accessing a Pipeline step by its name ("model" is the random forest).
pipe["model"]

In [None]:
# Getting feature names produced in preparation, i.e. the columns that are
# fed to the model.
pipe["preparation"].get_feature_names_out()

In [None]:
# Getting feature importances from the fitted model inside the pipeline.
# Note: The values line up with the feature names produced by the
# "preparation" step.
pipe["model"].feature_importances_