# Validation

In [1]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

## Load Data

In [None]:
# Load the banking data set from its semicolon-separated CSV file
# and show the first three rows as a quick preview.
df = pd.read_csv(
    "../0_data/banking/bank-additional-full.csv",
    sep=";",
)
df.head(3)

In [3]:
# Separate the label column "y" from the features.
# "duration" and "pdays" are removed from the feature set together with the label.
y = df["y"]
X = df.drop(["duration", "pdays", "y"], axis="columns")

## Create Model

In [4]:
# Get column names per data type.
# Every column that is not numeric is treated as categorical. Selecting only
# "object" columns would skip text-like features stored under another dtype
# (for example category, bool, or a dedicated string dtype), and such columns
# would then belong to neither group.
numerical = X.select_dtypes(include="number").columns
categorical = X.select_dtypes(exclude="number").columns

In [5]:
# Build the preprocessing step: one-hot encode the categorical columns and
# standardise the numerical ones. handle_unknown="ignore" keeps the encoder from
# raising on categories it did not see during fitting.
one_hot = OneHotEncoder(handle_unknown="ignore")
scaler = StandardScaler()
transformer = ColumnTransformer(
    transformers=[
        ("encode_categorical", one_hot, categorical),
        ("scale_numerical", scaler, numerical),
    ]
)

In [6]:
# Linear model: the preprocessing step followed by a logistic regression
# classifier (max_iter raised to 500).
classifier = LogisticRegression(max_iter=500)
linear = Pipeline(steps=[("preparation", transformer), ("model", classifier)])

## Evaluation Metrics

In [7]:
# Fit the pipeline on the full data set and predict on those same rows,
# so the predictions (and any metric computed from them) describe the train set.
predictions = linear.fit(X, y).predict(X)

In [None]:
# Build the confusion matrix from the true labels and the predictions.
# Layout: rows = actual class [N, P], columns = predicted class [N, P].
conf_mtx = confusion_matrix(y_true=y, y_pred=predictions)
conf_mtx

In [None]:
# Accuracy: share of all predictions that match the true label.
accuracy_score(y_true=y, y_pred=predictions)

In [None]:
# Double check: recompute accuracy by hand from the confusion matrix.
# The first row holds the actual negatives, the second row the actual positives.
(tn, fp), (fn, tp) = conf_mtx
total = conf_mtx.sum()

(tn + tp) / total

In [None]:
# Precision: of all rows predicted as "yes", the fraction that really are "yes".
precision_score(y_true=y, y_pred=predictions, pos_label="yes")

In [None]:
# Double check...
# Precision by hand: TP / (TP + FP).
tp / (fp + tp)

In [None]:
# Recall: of all rows that are actually "yes", the fraction predicted as "yes".
recall_score(y_true=y, y_pred=predictions, pos_label="yes")

In [None]:
# Double check: recall by hand is TP / (TP + FN).
tp / (fn + tp)