# Model Validation and Selection

In [None]:
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

## Load Data

In [None]:
# Load the banking data set; the file uses ";" as field separator.
df = pd.read_csv(
    filepath_or_buffer="../../0_data/banking/bank-additional-full.csv",
    sep=";",
)
# Show the first three rows as a quick sanity check.
df.head(n=3)

In [None]:
# Separate the label column from the feature columns.
# Note: "duration" and "pdays" are left out of the features as well.
y = df["y"]
X = df.drop(["duration", "pdays", "y"], axis="columns")

## Create the models

In [None]:
# Collect the feature column names by dtype.
# Note: Object columns are treated as categorical, number columns as numerical.
numerical = X.select_dtypes(include="number").columns
categorical = X.select_dtypes(include="object").columns

In [None]:
# Preprocessing step: one-hot encode the categorical columns and
# standardise the numerical columns.
# Note: handle_unknown="ignore" is set on the encoder so that categories not
# seen while fitting do not raise when transforming.
transformer = ColumnTransformer(
    [
        (
            "encode_categorical",
            OneHotEncoder(handle_unknown="ignore"),
            categorical,
        ),
        (
            "scale_numerical",
            StandardScaler(),
            numerical,
        ),
    ],
)

In [None]:
# Baseline model: a DummyClassifier with the "stratified" strategy.
# Note: No preparation step is attached to this pipeline.
dummy = Pipeline([("model", DummyClassifier(strategy="stratified"))])

In [None]:
# Linear model.
# Note: Pipeline.fit() fits its steps in place, so this pipeline gets its own
# clone of the transformer. Sharing one transformer instance between pipelines
# would let a later fit of another pipeline silently replace the preprocessing
# this model was trained with.
linear = Pipeline(
    steps=[
        ("preparation", clone(transformer)),
        ("model", LogisticRegression(max_iter=500)),
    ]
)

In [None]:
# RandomForest model.
# Note: Pipeline.fit() fits its steps in place, so this pipeline gets its own
# clone of the transformer. Sharing one transformer instance between pipelines
# would let a later fit of another pipeline silently replace the preprocessing
# this model was trained with.
forest = Pipeline(
    steps=[
        ("preparation", clone(transformer)),
        ("model", RandomForestClassifier(n_jobs=-1)),
    ]
)

## Evaluation Metrics

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

In [None]:
# Fit on the full data set and predict for those same rows.
# Note: These are predictions for data the model has already seen.
predictions = linear.fit(X, y).predict(X)

In [None]:
# Confusion matrix of actual versus predicted labels.
# Note: Rows are the actual classes, columns the predicted classes.
conf_mtx = confusion_matrix(y_true=y, y_pred=predictions)
conf_mtx

In [None]:
# Accuracy: fraction of cases where the predicted label equals the actual one.
accuracy_score(y_true=y, y_pred=predictions)

In [None]:
# Verify the accuracy by hand from the confusion matrix.
# Note: Unpacking row by row yields (tn, fp) from the first row and
# (fn, tp) from the second row.
(tn, fp), (fn, tp) = conf_mtx
total = conf_mtx.sum()

(tn + tp) / total

In [None]:
# Precision with "yes" as the positive class.
# Note: Of everything predicted positive, the share that really is positive.
precision_score(y_true=y, y_pred=predictions, pos_label="yes")

In [None]:
# Double check...
# Precision by hand: true positives over all cases predicted positive.
tp / (fp + tp)

In [None]:
# Recall with "yes" as the positive class.
# Note: Of all actual positives, the share that was predicted positive.
recall_score(y_true=y, y_pred=predictions, pos_label="yes")

In [None]:
# Recall by hand: true positives over all actual positives.
tp / (fn + tp)

## Train versus Test Performance

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split X and y into train (70%) and test (the remainder).
# Note: stratify=y keeps the label ratio the same in both parts, and the fixed
# random_state makes the split, and so the scores below, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.7,
    stratify=y,
    random_state=42,
)

In [None]:
# First three rows of the training features.
X_train.head(n=3)

In [None]:
# First three training labels.
y_train.head(n=3)

In [None]:
# Train the linear pipeline on the training split only.
linear.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the rows the model was trained on.
predictions_train = linear.predict(X=X_train)

In [None]:
# Train accuracy.
# Note: Accuracy on the training set is usually high.
accuracy_score(y_true=y_train, y_pred=predictions_train)

In [None]:
# Predicted labels for the held-out test rows.
# Note: The model is only asked to predict here; it is not fitted again.
predictions_test = linear.predict(X=X_test)

In [None]:
# Test accuracy.
# Note: Compare with the train accuracy; a similar value suggests the model
# is not overfitting.
accuracy_score(y_true=y_test, y_pred=predictions_test)

## Best Model on Test Set

In [None]:
# Split X and y into train (70%) and test (the remainder).
# Note: stratify=y keeps the label ratio the same in both parts, and the fixed
# random_state makes the split, and so the model comparison, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.7,
    stratify=y,
    random_state=42,
)

In [None]:
# Train every model on the train split and report accuracy, precision and
# recall for both the train and the test split.
models = {"dummy": dummy, "linear": linear, "random forest": forest}
chars = 45  # Width of the separator lines; matches the width of a metric row.


print("=" * chars)
for model_name, model in models.items():
    # Train the model.
    model.fit(X_train, y_train)

    # Generate predictions for both the train and the test set.
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    # Compute performance metrics as (train, test) pairs.
    # Note: "yes" is the positive class for precision and recall.
    metrics = {
        "accuracy": (
            accuracy_score(y_train, pred_train),
            accuracy_score(y_test, pred_test),
        ),
        "precision": (
            precision_score(y_train, pred_train, pos_label="yes"),
            precision_score(y_test, pred_test, pos_label="yes"),
        ),
        "recall": (
            recall_score(y_train, pred_train, pos_label="yes"),
            recall_score(y_test, pred_test, pos_label="yes"),
        ),
    }

    # Print a small table with a header naming the two score columns.
    print(f"Model: {model_name}")
    print(f"{'metric':20s}: {'train':>8s}    :  {'test':>8s}")
    print("-" * chars)
    for metric, (train, test) in metrics.items():
        print(f"{metric:20s}: {train:8.2f}    :  {test:8.2f}")
    print("=" * chars)

## Most Uncertain Predictions

In [None]:
# Split X and y into train (70%) and test (the remainder).
# Note: stratify=y keeps the label ratio the same in both parts, and the fixed
# random_state makes the split, and so the cases listed below, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.7,
    stratify=y,
    random_state=42,
)

In [None]:
# Train on the train split, then score the test split twice:
# once as class probabilities and once as hard labels.
linear.fit(X_train, y_train)
probabilities = linear.predict_proba(X_test)
predictions = linear.predict(X_test)

In [None]:
# Label predicted for the first test case.
# Note: The original remark says this is "no"; the actual value depends on
# the train/test split that was drawn.
predictions[0]

In [None]:
# Class probabilities for the first test case.
# Note: The first entry corresponds to "no"; presumably the second is "yes"
# (check linear.classes_ to confirm the column order).
probabilities[0]

In [None]:
# Combine features, actual label, predicted label and the predicted
# probability of "yes" into one DataFrame.
# Note: The "yes" column is looked up through classes_ instead of hard-coding
# index 1, so the right probability is used whatever the label order is.
yes_column = list(linear.classes_).index("yes")
analysis = (
    X_test
    .assign(
        actual=y_test,
        predicted=predictions,
        probability_yes=probabilities[:, yes_column],
    )
)
analysis.head(3)

In [None]:
# Certainty = distance of P(yes) to the decision boundary (0.5), rescaled by 2.
# Note: Cases close to the decision boundary are "uncertain", i.e. they get a
# low certainty value.
analysis = analysis.assign(
    certainty=lambda frame: (frame["probability_yes"] - 0.5).abs() * 2,
)
analysis.head(n=3)

In [None]:
# The ten cases with the lowest certainty, shown one case per column.
(
    analysis
    .sort_values(by="certainty")
    .head(n=10)
    .transpose()
)

In [None]:
# Most confident errors for "yes": keep only the wrongly predicted cases and
# rank them by predicted probability of "yes", highest first.
# Note: Sort ascending instead to get the most confident "no" errors.
is_wrong = analysis["actual"] != analysis["predicted"]
(
    analysis
    .loc[is_wrong]
    .sort_values(by="probability_yes", ascending=False)
    .head(n=8)
    .transpose()
)

## Cross Validation

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Five-fold splitter.
# Note: shuffle=False is the default, spelled out here to make it visible.
kfold = KFold(n_splits=5, shuffle=False)

In [None]:
# Print the first and last test index of every fold to show how the test set
# rotates through the data.
# Note: All other cases will end up in the train set.
for fold_no, (_, held_out) in enumerate(kfold.split(X, y), start=1):
    first, last = held_out[0], held_out[-1]
    print(f"Fold {fold_no} -- Test cases:    {first:5d} - {last:5d}")

In [None]:
# Manual 5-fold cross-validation of the linear model; one row of scores per fold.
model = linear
kfold = KFold(n_splits=5)
metrics = {name: [] for name in ("fold", "accuracy", "precision", "recall")}

for fold, (train_idx, test_idx) in enumerate(kfold.split(X, y), start=1):

    # Slice out this fold's data.
    # Note: The splitter yields positions, not index labels, hence .iloc[].
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Fit on the fold's train part and predict its test part.
    predictions = model.fit(X_train, y_train).predict(X_test)
    print(f"Fold {fold} -- Predicted yes: ", (predictions == "yes").sum())

    # Score the fold and append each value to its column.
    scores = {
        "fold": fold,
        "accuracy": accuracy_score(y_test, predictions),
        "precision": precision_score(y_test, predictions, pos_label="yes"),
        "recall": recall_score(y_test, predictions, pos_label="yes"),
    }
    for name, value in scores.items():
        metrics[name].append(value)

pd.DataFrame(metrics)

In [None]:
# Running total of "yes" labels against the row index, to show where in the
# data the "yes" cases are located.
yes_running_total = pd.DataFrame({
    "index": df.index,
    "total_yes": df["y"].eq("yes").cumsum(),
})
yes_running_total.plot(
    x="index",
    y="total_yes",
    title="Label: Yes - Cumulative",
    figsize=(10, 3),
)
# Note: A trailing None keeps the cell from echoing the plot object.
None

In [None]:
# Plot how the campaign month changes along the row index.
# Note: Month names are mapped to 1-12 through a lookup table so the y-axis
# follows calendar order; a name missing from the table would become NaN.
months = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
month_numbers = {name: number for number, name in enumerate(months, start=1)}
ax = (
    pd.DataFrame({
        "index": df.index,
        "month_num": df["month"].map(month_numbers),
    })
    .plot(
        x="index",
        y="month_num",
        title="Months",
        figsize=(10, 3),
        legend=False,
    )
)
# Label every tick 1-12 with its month name.
ax.set_yticks(list(month_numbers.values()), months)
ax.grid(visible=True, color="lightgrey", axis="y")
None

In [None]:
# Plot how the consumer price index ("cons.price.idx") changes along the
# row index.
ax = (
    pd.DataFrame({
        "index": df.index,
        "price_index": df["cons.price.idx"],
    })
    .plot(
        x="index",
        y="price_index",
        title="Price Index",
        figsize=(10, 3),
        legend=False,
    )
)
None

## Learning Curve

In [None]:
# Learning curve: train and test accuracy of the forest for growing train sets.
model = forest
test_size = 1000
train_sizes = [50, 100, 150, 200, 250, 300]

results = {"train_size": [], "train": [], "test": []}
for n_train in train_sizes:

    # Draw a fresh, stratified train/test sample of the requested sizes.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        train_size=n_train,
        test_size=test_size,
        shuffle=True,
        stratify=y,
    )

    # Fit once, then record the accuracy on both samples.
    model.fit(X_train, y_train)
    results["train_size"].append(n_train)
    results["train"].append(accuracy_score(y_train, model.predict(X_train)))
    results["test"].append(accuracy_score(y_test, model.predict(X_test)))

# Plot both accuracies against the train size.
pd.DataFrame(results).plot(x="train_size", y=["train", "test"], marker=".")