# Banking data: Modelling

In [None]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

## Load Data

In [None]:
# Read in the banking data set; the CSV uses ";" as its field separator.
df = pd.read_csv("../../0_data/banking/bank-additional-full.csv", sep=";")
# Preview the first three rows.
df.head(3)

In [None]:
# Split features and labels: the target is the "y" column, and the features
# are all remaining columns except "duration" and "pdays".
# NOTE(review): the reason for excluding "duration" and "pdays" is not stated
# here — confirm (e.g. leakage or data-quality concerns).
X = df.drop(columns=["duration", "pdays", "y"])
y = df["y"]

## Data Preparation

In [None]:
# Get column names per data type: object-dtype columns are treated as
# categorical, numeric-dtype columns as numerical.
categorical = X.select_dtypes("object").columns
numerical = X.select_dtypes("number").columns

In [None]:
# Create a OneHotEncoder and StandardScaler.
# sparse_output=False makes the encoder return a dense array rather than a
# sparse matrix; the pandas output requested from the transformer further
# down needs dense data.
ohe = OneHotEncoder(sparse_output=False)
ss = StandardScaler()

In [None]:
# Route each group of columns to its own preprocessing step:
# categorical columns are one-hot encoded, numerical columns are scaled.
transformer = ColumnTransformer(
    [
        ("encode_categorical", ohe, categorical),
        ("scale_numerical", ss, numerical),
    ],
)

In [None]:
# Make the transformer return a pandas DataFrame (with named columns) instead
# of a NumPy array, then fit it on X and transform X in one step.
transformer.set_output(transform="pandas")
Xt = transformer.fit_transform(X)

In [None]:
# Quick inspection of the first three rows of the transformed data.
Xt.head(3)

## Dummy Model

In [None]:
from sklearn.dummy import DummyClassifier

In [None]:
# Create dummy classifier model instance to serve as a baseline.
# strategy="stratified" predicts labels at random, following the class
# distribution seen during fit.
dm = DummyClassifier(strategy="stratified")

In [None]:
# Fit the dummy model to the transformed features and the labels.
dm.fit(Xt, y)

In [None]:
# Create predictions for the same data the model was fitted on.
predicted = dm.predict(Xt)

In [None]:
# Put the actual labels next to the dummy model's predictions.
result = pd.DataFrame(data={"actual": y, "predicted": predicted})
result.head(n=3)

In [None]:
# Check correspondence using pandas: count how often each
# (actual, predicted) combination occurs.
result.value_counts()

## Linear Model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Create logistic regression model instance.
# max_iter=500 sets the solver's iteration limit — presumably chosen so the
# solver converges on this data; confirm.
lm = LogisticRegression(max_iter=500)

In [None]:
# Fit the logistic regression to the transformed features and the labels.
lm.fit(Xt, y)

In [None]:
# Create predictions for the same data the model was fitted on.
predicted = lm.predict(Xt)

In [None]:
# Put the actual labels next to the logistic regression's predictions.
result = pd.DataFrame(data={"actual": y, "predicted": predicted})
result.head(n=3)

In [None]:
# Check correspondence using pandas: count how often each
# (actual, predicted) combination occurs.
result.value_counts()

In [None]:
# The same comparison, this time using scikit-learn.
from sklearn.metrics import confusion_matrix

# Note: rows correspond to actual labels, columns to predicted labels.
confusion_matrix(y_true=y, y_pred=predicted)

In [None]:
# Tabulate the fitted coefficients per feature, largest first.
# coef_ holds the coefficient values and feature_names_in_ holds the matching
# feature names; after transposing there is one row per feature, and the
# values sit in the column labelled 0.
(
    pd.DataFrame(lm.coef_, columns=lm.feature_names_in_)
    .transpose()
    .sort_values(by=0, ascending=False)
)

## RandomForest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Create random forest model instance; n_jobs=-1 lets it use all available
# processors.
rf = RandomForestClassifier(n_jobs=-1)

In [None]:
# Fit the random forest to the transformed features and the labels.
rf.fit(Xt, y)

In [None]:
# Create predictions for the same data the model was fitted on.
predicted = rf.predict(Xt)

In [None]:
# Put the actual labels next to the random forest's predictions.
result = pd.DataFrame(data={"actual": y, "predicted": predicted})

# Count how often each (actual, predicted) combination occurs.
result.value_counts()

## Combine with Pipeline

In [None]:
# Get column names per data type: object-dtype columns are treated as
# categorical, numeric-dtype columns as numerical.
categorical = X.select_dtypes("object").columns
numerical = X.select_dtypes("number").columns

In [None]:
# Build a fresh ColumnTransformer with default-configured steps:
# categorical columns are one-hot encoded, numerical columns are scaled.
transformer = ColumnTransformer(
    [
        ("encode_categorical", OneHotEncoder(), categorical),
        ("scale_numerical", StandardScaler(), numerical),
    ],
)

In [None]:
# Set up the model: a random forest that may use all available processors
# (n_jobs=-1).
random_forest = RandomForestClassifier(n_jobs=-1)

In [None]:
# Chain the preparation step and the model into a single estimator;
# the steps run in the order listed and can be looked up by name.
pipe = Pipeline(
    [
        ("preparation", transformer),
        ("model", random_forest),
    ],
)

In [None]:
# Fitting the pipeline on the raw features first fits and applies the
# preparation step, then fits the model on the prepared data.
pipe.fit(X, y)

In [None]:
# Predicting likewise runs the raw features through the (already fitted)
# preparation step before feeding them to the model.
pipe.predict(X)

In [None]:
# Accessing a Pipeline step by its name; this returns the estimator that was
# registered under "model".
pipe["model"]

In [None]:
# Getting the names of the features produced by the preparation step.
pipe["preparation"].get_feature_names_out()

In [None]:
# Getting the feature importances of the fitted model; their order matches
# the feature names produced by the preparation step.
pipe["model"].feature_importances_