In [None]:
# conda env create --force -f environment.yml

In [None]:
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import pandas as pd
import sklearn
import scipy

from typing import List

from sklearn.calibration import calibration_curve
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPRegressor

In [None]:
plt.style.use("seaborn-v0_8-deep")

font = {"family": "serif", "size": 14}

matplotlib.rc("font", **font)

# Load data

In [None]:
median_ensemble_test = pd.read_csv("./delivery_1_cv_v7_seeds/split_v7_2021_test_cc_dispatcher_precision_recall_harmonic_mean_median_ensemble_individual_predictions.csv")
median_ensemble_val = pd.read_csv("./delivery_1_cv_v7_seeds/split_v7_2021_test_cc_dispatcher_precision_recall_harmonic_mean_median_ensemble_individual_predictions_val.csv")

In [None]:
all_ensembles_test = pd.read_csv("./delivery_1_cv_v7_seeds/split_v7_2021_test_cc_dispatcher_precision_recall_harmonic_mean_predictions.csv")
all_ensembles_val = pd.read_csv("./delivery_1_cv_v7_seeds/split_v7_2021_test_cc_dispatcher_precision_recall_harmonic_mean_predictions_val.csv")

In [None]:
# The constituent-model columns are numbered 20-24 in the CSVs; renumber them
# 1-5 so later cells can address them as "logits 1".."logits 5" / "probs 1".."probs 5".
column_map = {
    f"{kind} {20 + offset}": f"{kind} {offset + 1}"
    for kind in ("logits", "probs")
    for offset in range(5)
}

# The test and validation frames receive exactly the same preparation.
for frame in (median_ensemble_test, median_ensemble_val):
    frame.rename(columns=column_map, inplace=True)
    # Hard ensemble decision at the 0.5 probability threshold.
    frame["ensemble_preds"] = frame["ensemble_probs"] > 0.5

median_ensemble_test

In [None]:
def plot_histogram(arrays: List[np.ndarray], labels: List, **kwargs):
    """Overlay one histogram per array on a single, newly created axes.

    Args:
        arrays: Collections of values to histogram, one per series.
        labels: Legend labels, paired positionally with ``arrays``.
        **kwargs: Optional settings: ``bins`` (default 50), ``alpha``
            (default 0.5), ``density`` (default False) and ``yscale``
            (default "log").

    Returns:
        The ``(fig, ax)`` pair of the created figure.
    """
    n_bins = kwargs.get("bins", 50)
    opacity = kwargs.get("alpha", 0.5)
    normalise = kwargs.get("density", False)

    fig, ax = plt.subplots(figsize=(6.4, 4.8))
    for values, name in zip(arrays, labels):
        ax.hist(values, bins=n_bins, alpha=opacity, label=name, density=normalise)

    ax.set_yscale(kwargs.get("yscale", "log"))
    ax.set_xlabel("Predicted probability")
    # The y-axis shows densities only when the histograms were normalised.
    ax.set_ylabel("Density" if normalise else "Count")
    ax.legend()
    return fig, ax

In [None]:
def plot_calibration_curve(targets: List[np.ndarray], model_probs: List[np.ndarray], labels=None, ax=None, **kwargs):
    """Plot reliability (calibration) curves for one or more sets of predictions.

    Args:
        targets: Binary targets, either a single array-like or a list of them.
        model_probs: Predicted probabilities, either a single array-like or a
            list of them. If exactly one of ``targets`` / ``model_probs`` has a
            single entry, that entry is reused for every entry of the other.
        labels: Legend labels paired positionally with the curves. A single
            string is treated as one label. ``None`` draws every curve without
            a legend entry. When fewer labels than curves are given, only the
            first ``len(labels)`` curves are drawn.
        ax: Existing axes to draw on. When omitted, a new figure is created and
            the dotted "Perfect calibration" diagonal is added.
        **kwargs: ``n_bins`` (default 20) and ``strategy`` (default "uniform"),
            forwarded to ``calibration_curve``.

    Returns:
        ``(fig, ax)``; ``fig`` is ``None`` when ``ax`` was supplied by the caller.

    Raises:
        AssertionError: If ``targets`` and ``model_probs`` differ in length and
            neither has exactly one entry.
    """
    if not isinstance(targets, list):
        targets = [targets]
    if not isinstance(model_probs, list):
        model_probs = [model_probs]

    # Broadcast a single target (or a single model) against the longer list.
    if len(model_probs) != len(targets):
        assert len(model_probs) == 1 or len(targets) == 1, "Number of models and targets must be equal or 1"
        if len(model_probs) == 1:
            model_probs = model_probs * len(targets)
        else:
            targets = targets * len(model_probs)

    if labels is None:
        # Without this the zip below fails with "'NoneType' object is not iterable".
        labels = [None] * len(model_probs)
    elif isinstance(labels, str):
        # A bare string would otherwise be zipped character by character.
        labels = [labels]

    if ax is None:
        fig, ax = plt.subplots(figsize=(6.4, 4.8))
        ax.plot([0, 1], [0, 1], "k:", label="Perfect calibration")
    else:
        fig = None

    n_bins = kwargs.get("n_bins", 20)
    strategy = kwargs.get("strategy", "uniform")

    # zip stops at the shortest input, so a short ``labels`` list limits how
    # many curves are drawn.
    for target, probs, label in zip(targets, model_probs, labels):
        prob_true, prob_pred = calibration_curve(
            target,
            probs,
            n_bins=n_bins,
            strategy=strategy,
        )
        ax.plot(prob_pred, prob_true, marker="o", markersize=3, label=label)

    ax.set_xlabel("Mean predicted probability")
    ax.set_ylabel("Fraction of positives")
    ax.legend()
    return fig, ax

In [None]:
fig, ax = plot_calibration_curve(
    targets=median_ensemble_test["y"],
    model_probs=[median_ensemble_test["ensemble_probs"], median_ensemble_test["probs 1"]],
    labels=["Ensemble", "Single model"],
    n_bins=20,
)
# fig.savefig("calibration_curve_ensemble_and_single_model_uncalibrated.pdf", bbox_inches="tight")
fig, ax = plot_histogram(arrays=[median_ensemble_test["ensemble_probs"], median_ensemble_test["probs 1"]], labels=["Ensemble", "Constituent model 1"], bins=30)
fig.savefig("histogram_ensemble_and_single_model.pdf", bbox_inches="tight")

In [None]:
# Calibration curves and probability histograms of the five constituent models
# and the ensemble, all uncalibrated.
constituent_ids = range(1, 6)
uncalibrated_probs = [median_ensemble_test[f"probs {i}"] for i in constituent_ids]
uncalibrated_probs.append(median_ensemble_test["ensemble_probs"])
# Labels are generated from the same ids so that each model gets its own
# number (the fifth model used to be labelled "Constituent model 4").
uncalibrated_labels = [f"Constituent model {i}" for i in constituent_ids] + ["Ensemble"]

fig, ax = plot_calibration_curve(
    targets=median_ensemble_test["y"],
    model_probs=uncalibrated_probs,
    labels=uncalibrated_labels,
    n_bins=20,
)
fig.savefig("calibration_curve_ensemble_and_all_models_uncalibrated.pdf", bbox_inches="tight")
plot_histogram(
    arrays=uncalibrated_probs,
    labels=uncalibrated_labels,
    alpha=0.3
)

In [None]:
is_male = median_ensemble_test["gender"] == "M"
is_female = median_ensemble_test["gender"] == "K"
plot_calibration_curve(
    targets=[median_ensemble_test["y"][is_male], median_ensemble_test["y"][is_female]],
    model_probs=[median_ensemble_test["ensemble_probs"][is_male], median_ensemble_test["ensemble_probs"][is_female]],
    labels=["Male", "Female"],
    n_bins=20,
)
plot_histogram(
    arrays=[median_ensemble_test["ensemble_probs"][is_male], median_ensemble_test["ensemble_probs"][is_female]],
    labels=["Male", "Female"],
)

In [None]:
condition = median_ensemble_test["h"]

plot_calibration_curve(
    targets=[median_ensemble_test["y"][condition], median_ensemble_test["y"][~condition]],
    model_probs=[median_ensemble_test["ensemble_probs"][condition], median_ensemble_test["ensemble_probs"][~condition]],
    labels=["Call-taker recognition", "No call-taker recognition"],
    n_bins=20,
)
plot_histogram(
    arrays=[median_ensemble_test["ensemble_probs"][condition], median_ensemble_test["ensemble_probs"][~condition]],
    labels=["Call-taker recognition", "No call-taker recognition"],
)

In [None]:
plot_histogram(
    arrays=[median_ensemble_test["ensemble_probs"][condition]],
    labels=["Call-taker recognition"],
    yscale="linear"
)

In [None]:
condition = median_ensemble_test["h"] == median_ensemble_test["ensemble_preds"]

plot_calibration_curve(
    targets=[median_ensemble_test["y"][condition], median_ensemble_test["y"][~condition]],
    model_probs=[median_ensemble_test["ensemble_probs"][condition], median_ensemble_test["ensemble_probs"][~condition]],
    labels=["Model/call-taker agreement", "Model/call-taker disagreement"],
    n_bins=20,
)
plot_histogram(
    arrays=[median_ensemble_test["ensemble_probs"][condition], median_ensemble_test["ensemble_probs"][~condition]],
    labels=["Model/call-taker agreement", "Model/call-taker disagreement"],
)

In [None]:
is_old = median_ensemble_test["age"] >= 65
plot_calibration_curve(
    targets=[median_ensemble_test["y"][is_old], median_ensemble_test["y"][~is_old]],
    model_probs=[median_ensemble_test["ensemble_probs"][is_old], median_ensemble_test["ensemble_probs"][~is_old]],
    labels=["65+", "18-65"],
    n_bins=20,
)
plot_histogram(
    arrays=[median_ensemble_test["ensemble_probs"][is_old], median_ensemble_test["ensemble_probs"][~is_old]],
    labels=["65+", "18-65"],
)

In [None]:
# Split the test set by the ground-truth label itself.
# NOTE(review): `~condition` assumes the "y" column has boolean dtype - confirm.
condition = median_ensemble_test["y"]

# The labels describe the actual split (target value); they previously read
# "Model/call-taker agreement/disagreement", copied from an earlier cell.
plot_calibration_curve(
    targets=[median_ensemble_test["y"][condition], median_ensemble_test["y"][~condition]],
    model_probs=[median_ensemble_test["ensemble_probs"][condition], median_ensemble_test["ensemble_probs"][~condition]],
    labels=["Positive target", "Negative target"],
    n_bins=20,
)
plot_histogram(
    arrays=[median_ensemble_test["ensemble_probs"][condition], median_ensemble_test["ensemble_probs"][~condition]],
    labels=["Positive target", "Negative target"],
)

# Calibration

## Platt scaling

In [None]:
# Ensemble calibrator: an unpenalised logistic regression fitted on the
# validation probabilities, then applied to both test and validation data.
# NOTE(review): newer scikit-learn releases spell this `penalty=None`; confirm
# against the version pinned in environment.yml.
logistic = LogisticRegression(penalty="none", fit_intercept=True)
logistic.fit(median_ensemble_val["ensemble_probs"].to_numpy()[:, np.newaxis], median_ensemble_val["y"].to_numpy())
ensemble_probs_logistic = logistic.predict_proba(median_ensemble_test["ensemble_probs"].to_numpy()[:, np.newaxis])[:, 1]
ensemble_probs_logistic_val = logistic.predict_proba(median_ensemble_val["ensemble_probs"].to_numpy()[:, np.newaxis])[:, 1]

# One calibrator per constituent model. The loop uses its own name
# (`logistic_i`) so that `logistic` keeps referring to the ensemble calibrator
# in the cells below that inspect and plot it.
individual_probs_logistic = []
individual_probs_logistic_val = []
for i in range(1, 6):
    logistic_i = LogisticRegression(penalty="none", fit_intercept=True)
    logistic_i.fit(median_ensemble_val[f"probs {i}"].to_numpy()[:, np.newaxis], median_ensemble_val["y"].to_numpy())
    individual_probs_logistic += [logistic_i.predict_proba(median_ensemble_test[f"probs {i}"].to_numpy()[:, np.newaxis])[:, 1]]
    individual_probs_logistic_val += [logistic_i.predict_proba(median_ensemble_val[f"probs {i}"].to_numpy()[:, np.newaxis])[:, 1]]

In [None]:
logistic.coef_, logistic.intercept_, logistic.n_iter_

In [None]:
# Plot logistic sigmoid fit
x = np.linspace(0, 1, 200)
y = logistic.predict_proba(x[:, np.newaxis])[:, 1]

fig, ax = plt.subplots(figsize=(6.4, 4.8))
ax.plot([0, 1], [0, 1], "k:", label="Perfect calibration")
ax.plot(x, y, label="Logistic sigmoid fit")

In [None]:
ensemble_probs_logistic.min(), ensemble_probs_logistic.max()

In [None]:
fig, ax = plot_calibration_curve(
    targets=median_ensemble_val["y"],
    model_probs=[ensemble_probs_logistic_val, *individual_probs_logistic_val],
    labels=["Ensemble", *["Constituent model"] * 5],
    n_bins=20,
)
plot_histogram(arrays=[ensemble_probs_logistic_val], labels=["Ensemble"])

In [None]:
fig, ax = plot_calibration_curve(
    targets=median_ensemble_test["y"],
    # model_probs=[ensemble_probs_logistic, *individual_probs_logistic],
    model_probs=[ensemble_probs_logistic, *individual_probs_logistic],
    labels=["Ensemble"], #, *["Constituent model"] * 5],
    n_bins=20,
    strategy="uniform",
    # n_bins=1000,
    # strategy="quantile",
)
fig.savefig("calibration_curve_ensemble_logistic.pdf", bbox_inches="tight")

## Isotonic

In [None]:
# Isotonic-regression calibrators, fitted on validation data only and applied
# to both the test and the validation probabilities.
isotonic_options = dict(y_min=0, y_max=1, increasing=True, out_of_bounds="clip")
val_targets = median_ensemble_val["y"].to_numpy()

# Ensemble calibrator.
isotonic = IsotonicRegression(**isotonic_options)
isotonic.fit(median_ensemble_val["ensemble_probs"].to_numpy(), val_targets)
ensemble_probs_isotonic = isotonic.transform(median_ensemble_test["ensemble_probs"].to_numpy())
ensemble_probs_isotonic_val = isotonic.transform(median_ensemble_val["ensemble_probs"].to_numpy())

# One separate calibrator per constituent model.
individual_probs_isotonic = []
individual_probs_isotonic_val = []
for i in range(1, 6):
    column = f"probs {i}"
    isotonic_i = IsotonicRegression(**isotonic_options)
    isotonic_i.fit(median_ensemble_val[column].to_numpy(), val_targets)
    individual_probs_isotonic.append(isotonic_i.transform(median_ensemble_test[column].to_numpy()))
    individual_probs_isotonic_val.append(isotonic_i.transform(median_ensemble_val[column].to_numpy()))

In [None]:
median_ensemble_val["ensemble_probs"]

In [None]:
plot_histogram(arrays=[ensemble_probs_isotonic, ensemble_probs_isotonic_val], labels=["Ensemble test", "Ensemble validation"])

In [None]:
x = np.linspace(0, 1, 200)
y = isotonic.transform(x)

fig, ax = plt.subplots(figsize=(6.4, 4.8))
ax.plot([0, 1], [0, 1], "k:", label="Perfect calibration")
# ax.plot(isotonic.X_thresholds_, isotonic.y_thresholds_, "-", marker="o", markersize=3, label="Isotonic")
ax.plot(x, y, label="Isotonic fit")

val_pos = median_ensemble_val["ensemble_probs"][median_ensemble_val["y"] == 1]
val_neg = median_ensemble_val["ensemble_probs"][median_ensemble_val["y"] == 0]
plot_histogram(arrays=[val_neg, val_pos], labels=["Ensemble probs (val-neg)", "Ensemble probs (val-pos)"])

In [None]:
ensemble_probs_isotonic

In [None]:
fig, ax = plot_calibration_curve(
    targets=median_ensemble_val["y"],
    model_probs=[ensemble_probs_isotonic_val, *individual_probs_isotonic_val],
    labels=["Ensemble"],#, *["Constituent model"] * 5],
    n_bins=10,
    strategy="uniform",
    # n_bins=1000,
    # strategy="quantile",
)
plot_histogram(arrays=[ensemble_probs_isotonic_val], labels=["Ensemble"])

In [None]:
fig, ax = plot_calibration_curve(
    targets=median_ensemble_test["y"],
    model_probs=[ensemble_probs_isotonic, *individual_probs_isotonic],
    labels=["Ensemble"],#, *["Constituent model"] * 5],
    # n_bins=10,
    # strategy="uniform",
    n_bins=1000,
    strategy="quantile",
)
plot_histogram(arrays=[ensemble_probs_isotonic], labels=["Ensemble"])

## MLP

In [None]:
# MLP calibrators: a small regressor mapping an uncalibrated probability to the
# binary target, fitted on validation data only.
# Predictions are clipped to [0, 1] on both sides: a regressor's output is not
# bounded, and `calibration_curve` rejects probabilities outside that interval.
mlp = MLPRegressor(hidden_layer_sizes=(8, 8), activation="tanh", solver="adam", max_iter=3000, random_state=0)
# NOTE(review): `fit` appears to reset `out_activation_` for regressors, so
# this assignment likely has no effect - confirm against the installed
# scikit-learn before relying on a sigmoid output.
mlp.out_activation_ = "sigmoid"
mlp.fit(median_ensemble_val["ensemble_probs"].to_numpy()[:, np.newaxis], median_ensemble_val["y"].to_numpy()[:, np.newaxis])
ensemble_probs_mlp = mlp.predict(median_ensemble_test["ensemble_probs"].to_numpy()[:, np.newaxis]).clip(0, 1)
ensemble_probs_mlp_val = mlp.predict(median_ensemble_val["ensemble_probs"].to_numpy()[:, np.newaxis]).clip(0, 1)

# One separate calibrator per constituent model.
individual_probs_mlp = []
individual_probs_mlp_val = []
for i in range(1, 6):
    mlp_i = MLPRegressor(hidden_layer_sizes=(8, 8), activation="tanh", solver="adam", max_iter=3000, random_state=0)
    mlp_i.out_activation_ = "sigmoid"  # NOTE(review): see the note on `mlp` above
    mlp_i.fit(median_ensemble_val[f"probs {i}"].to_numpy()[:, np.newaxis], median_ensemble_val["y"].to_numpy()[:, np.newaxis])
    individual_probs_mlp += [mlp_i.predict(median_ensemble_test[f"probs {i}"].to_numpy()[:, np.newaxis]).clip(0, 1)]
    individual_probs_mlp_val += [mlp_i.predict(median_ensemble_val[f"probs {i}"].to_numpy()[:, np.newaxis]).clip(0, 1)]

In [None]:
plot_histogram(arrays=[ensemble_probs_mlp, ensemble_probs_mlp_val], labels=["Ensemble test", "Ensemble validation"])

In [None]:
fig, ax = plot_calibration_curve(
    targets=median_ensemble_val["y"],
    model_probs=[ensemble_probs_mlp_val, *individual_probs_mlp_val],
    labels=["Ensemble"],#, *["Constituent model"] * 5],
    n_bins=15,
    strategy="uniform",
    # n_bins=1000,
    # strategy="quantile",
)
plot_histogram(arrays=[ensemble_probs_mlp_val], labels=["Ensemble"])

In [None]:
fig, ax = plot_calibration_curve(
    targets=median_ensemble_test["y"],
    model_probs=[ensemble_probs_mlp, *individual_probs_mlp],
    labels=["Ensemble"],#, *["Constituent model"] * 5],
    # n_bins=10,
    # strategy="uniform",
    n_bins=1000,
    strategy="quantile",
)
plot_histogram(arrays=[ensemble_probs_mlp], labels=["Ensemble"])

## Plot-making

In [None]:
# All calibration fits: evaluate each fitted calibrator on a grid of 200 points
# in [0, 1] and plot the resulting mapping from raw to calibrated probability.

# Logistic sigmoid
# NOTE(review): this uses whichever estimator is currently bound to `logistic`;
# verify that it is the ensemble calibrator and not a per-model one.
x_logistic = np.linspace(0, 1, 200)
y_logistic = logistic.predict_proba(x_logistic[:, np.newaxis])[:, 1]

# Isotonic regression
x_isotonic = np.linspace(0, 1, 200)
y_isotonic = isotonic.transform(x_isotonic)


# MLP (computed here, but its curve is commented out of the figure below)
x_mlp = np.linspace(0, 1, 200)
y_mlp = mlp.predict(x_mlp[:, np.newaxis]).clip(0)

fig, ax = plt.subplots(figsize=(6.4, 4.8))
# The identity line stands for leaving the probabilities unchanged.
ax.plot([0, 1], [0, 1], label="Uncalibrated")
ax.plot(x_logistic, y_logistic, label="Logistic fit")
# ax.plot(isotonic.X_thresholds_, isotonic.y_thresholds_, "-", marker="o", markersize=3, label="Isotonic fit")
ax.plot(x_isotonic, y_isotonic, "-", label="Isotonic fit")
# ax.plot(x_mlp, y_mlp, label="MLP")
ax.set_xlabel("Predicted probability")
ax.set_ylabel("Calibrated probability")
ax.legend()
fig.savefig("calibration_fits_ensemble.pdf", bbox_inches="tight")

In [None]:
# All calibration curves
arrays = [median_ensemble_test["ensemble_probs"], ensemble_probs_logistic, ensemble_probs_isotonic]#, ensemble_probs_mlp]
labels = ["Ensemble uncalibrated", "Ensemble logistic calibration", "Ensemble isotonic calibration", "Ensemble MLP calibration"]

fig, ax = plot_calibration_curve(
    targets=median_ensemble_test["y"],
    model_probs=arrays[0:1],
    labels=labels[0:1], #, *["Constituent model"] * 5],
    n_bins=20,
    strategy="uniform",
    # n_bins=1000,
    # strategy="quantile",
)

plot_calibration_curve(
    targets=median_ensemble_test["y"],
    model_probs=arrays[1:],
    labels=labels[1:], #, *["Constituent model"] * 5],
    # n_bins=20,
    # strategy="uniform",
    n_bins=1000,
    strategy="quantile",
    ax=ax,
)
fig.savefig("calibration_curves_ensemble.pdf", bbox_inches="tight")
plot_histogram(arrays=arrays, labels=labels)

In [None]:
# All calibration curves (transformed bin centers)
# > Not great
arrays = [median_ensemble_test["ensemble_probs"], ensemble_probs_logistic, ensemble_probs_isotonic]#, ensemble_probs_mlp]
labels = ["Ensemble uncalibrated", "Ensemble logistic calibration", "Ensemble isotonic calibration", "Ensemble MLP calibration"]

fig, ax = plot_calibration_curve(
    targets=median_ensemble_test["y"],
    model_probs=median_ensemble_test["ensemble_probs"],
    labels=labels[0:1], #, *["Constituent model"] * 5],
    n_bins=20,
    strategy="uniform",
    # n_bins=1000,
    # strategy="quantile",
)

uncalibrated_bin_centers = ax.lines[1].get_xdata()
fraction_of_positives = ax.lines[1].get_ydata()

logistic_bin_centers = logistic.predict_proba(uncalibrated_bin_centers[:, np.newaxis])[:, 1]
isotonic_bin_centers = isotonic.transform(uncalibrated_bin_centers)

ax.plot(logistic_bin_centers, fraction_of_positives, label="Logistic fit")
ax.plot(isotonic_bin_centers, fraction_of_positives, label="Isotonic fit")
ax.legend()

In [None]:
# Brier scores on the test set for each probability source. "average" is the
# constant predictor that always outputs the empirical positive rate.
# (The MLP calibration is intentionally left out here.)
test_targets = median_ensemble_test["y"]
base_rate_probs = np.full(len(median_ensemble_test), test_targets.mean())

candidate_probs = {
    "uncalibrated": median_ensemble_test["ensemble_probs"],
    "logistic": ensemble_probs_logistic,
    "isotonic": ensemble_probs_isotonic,
    "average": base_rate_probs,
}
brier_scores = {
    name: sklearn.metrics.brier_score_loss(test_targets, probs)
    for name, probs in candidate_probs.items()
}
print("Brier scores: ", brier_scores)

# Brier skill score: 1 - BS / BS_reference, for two choices of reference.
calibrated_names = ("logistic", "isotonic")

# - uncalibrated reference
brier_skill_scores_uncalibrated = {
    name: 1 - brier_scores[name] / brier_scores["uncalibrated"]
    for name in calibrated_names
}
print("BSS uncalibrated reference: ", brier_skill_scores_uncalibrated)

# - average target reference
brier_skill_scores_mean = {
    name: 1 - brier_scores[name] / brier_scores["average"]
    for name in calibrated_names
}
print("BSS mean reference: ", brier_skill_scores_mean)


In [None]:
# F1-score of model and call-taker as function of model output probabilities

def plot_f1_score_vs_probabilities(targets, model_probs, model_preds, human_preds, model_label: str, target_label: str = "Call-taker", num_bins: int = 10):
    """Plot per-bin counts and per-bin F1-scores of the model and the call-taker.

    The samples are binned by ``model_probs`` using quadratically spaced edges
    ``(0, 4, 9, ..., (num_bins - 1)**2) / num_bins**2``; probabilities at or
    above the last edge fall in no F1 bin.

    Args:
        targets: Ground-truth labels.
        model_probs: Model probabilities used for binning.
        model_preds: Hard model predictions scored against ``targets``.
        human_preds: Hard call-taker predictions scored against ``targets``.
        model_label: Legend label for the model bars.
        target_label: Legend label for the call-taker bars.
        num_bins: Controls the number and spacing of the bin edges.

    Returns:
        ``((fig1, ax1), (fig2, ax2))``: the count histogram and the F1 bar chart.
    """
    # Build the quadratic edges; the first one (1) is replaced by 0.
    squared_steps = [step * step for step in range(1, num_bins)]  # np.linspace(0, 1, num_bins + 1)
    squared_steps[0] = 0.0
    requested_edges = np.array(squared_steps) / num_bins**2
    counts, bin_edges = np.histogram(model_probs, bins=requested_edges)
    print("Bin edges: ", bin_edges)

    lower_edges = bin_edges[:-1]
    upper_edges = bin_edges[1:]
    bin_widths = upper_edges - lower_edges

    # Figure 1: number of samples per probability bin.
    fig1 = plt.figure(figsize=(6.4, 4.8))
    ax1 = fig1.gca()
    ax1.bar(lower_edges, counts, width=bin_widths, align="edge", alpha=0.5, label=model_label)
    ax1.set_yscale("log")
    ax1.set_xlabel("Predicted probability")
    ax1.set_ylabel("Count")

    # F1 of model and call-taker on the samples of each half-open bin [lower, upper).
    per_bin_scores = []
    for lower, upper in zip(lower_edges, upper_edges):
        in_bin = (model_probs >= lower) & (model_probs < upper)
        bin_targets = targets[in_bin]
        model_f1 = sklearn.metrics.f1_score(bin_targets, model_preds[in_bin])
        calltaker_f1 = sklearn.metrics.f1_score(bin_targets, human_preds[in_bin])
        per_bin_scores.append((model_f1, calltaker_f1))

    f1s_model = np.array([model_f1 for model_f1, _ in per_bin_scores])
    f1s_calltaker = np.array([calltaker_f1 for _, calltaker_f1 in per_bin_scores])

    # Figure 2: overlaid F1 bars of model and call-taker.
    fig2 = plt.figure(figsize=(6.4, 4.8))
    ax2 = fig2.gca()
    for scores, series_label in ((f1s_model, model_label), (f1s_calltaker, target_label)):
        ax2.bar(lower_edges, scores, width=bin_widths, align="edge", alpha=0.5, label=series_label)

    ax2.set_ylabel("F1-score")
    ax2.set_xlabel("Predicted probability")
    ax2.legend(loc="upper left")
    return (fig1, ax1), (fig2, ax2)

In [None]:
plot_f1_score_vs_probabilities(median_ensemble_test["y"], median_ensemble_test["ensemble_probs"], median_ensemble_test["ensemble_preds"], median_ensemble_test["h"], "Uncalibrated ensemble", "Call-taker", num_bins=10)

In [None]:
(fig1, ax1), (fig2, ax2) = plot_f1_score_vs_probabilities(median_ensemble_test["y"], ensemble_probs_isotonic, median_ensemble_test["ensemble_preds"], median_ensemble_test["h"], "Calibrated ensemble (isotonic)", "Call-taker", num_bins=10)
ax1.set_xlim(0, 0.64)
ax2.set_xlim(0, 0.64)

fig1.savefig("predicted_probability_histogram.pdf", bbox_inches="tight")
fig2.savefig("f1_score_vs_predicted_probability_ensemble_calltaker.pdf", bbox_inches="tight")


In [None]:
ensemble_probs_isotonic_val.shape, median_ensemble_val["y"].shape, median_ensemble_val["ensemble_preds"].shape

In [None]:
plot_f1_score_vs_probabilities(
    median_ensemble_val["y"],
    ensemble_probs_isotonic_val,
    median_ensemble_val["ensemble_preds"],
    median_ensemble_val["h"],
    "Calibrated ensemble (isotonic) [val]",
    "Call-taker [val]",
    num_bins=10
)

In [None]:
# Evaluate model performance on subset of test set with probability score higher than the threshold (computed on the 
# validation set) where model F1 drops below call-taker F1

def compute_metrics(targets, labels):
    """Compute and return recall, precision, F1, FPR and FOR.

    Args:
        targets: Array-like of ground-truth labels (bool or 0/1).
        labels: Array-like of predicted labels (bool or 0/1), same length as
            ``targets``.

    Returns:
        Tuple ``(recall, precision, f1, fpr, for_)`` of floats. A metric whose
        denominator is zero is returned as ``nan``.
    """
    # Convert up front: comparing a plain list with ``== 1`` yields a single
    # bool instead of an element-wise mask, which silently zeroed every count.
    targets = np.asarray(targets)
    labels = np.asarray(labels)

    tp = int(np.sum((targets == 1) & (labels == 1)))
    fp = int(np.sum((targets == 0) & (labels == 1)))
    tn = int(np.sum((targets == 0) & (labels == 0)))
    fn = int(np.sum((targets == 1) & (labels == 0)))

    def _ratio(numerator, denominator):
        # An undefined ratio (zero or nan denominator) becomes nan without
        # emitting a numpy RuntimeWarning.
        if denominator == 0 or denominator != denominator:
            return float("nan")
        return numerator / denominator

    recall = _ratio(tp, tp + fn)
    precision = _ratio(tp, tp + fp)
    f1 = _ratio(2 * (precision * recall), precision + recall)
    fpr = _ratio(fp, fp + tn)
    for_ = _ratio(fn, fn + tn)
    return recall, precision, f1, fpr, for_


In [None]:
compute_metrics(median_ensemble_test["y"], median_ensemble_test["ensemble_preds"])

In [None]:
compute_metrics(median_ensemble_test["y"], ensemble_probs_isotonic > 0.0748)

In [None]:
cond = ensemble_probs_isotonic >= 0.05
compute_metrics(median_ensemble_test["y"][cond], median_ensemble_test["ensemble_preds"][cond])

In [None]:
raise Exception("Stop here")

In [None]:
# F1-score ratio of model and call-taker as function of model output probabilities

# Quadratically spaced bin edges (0, 4, 9, ..., (num_bins - 1)**2) / num_bins**2:
# narrow bins near 0, and nothing above the last edge is binned.
num_bins = 10
bin_edges = [i**2 for i in range(1, num_bins, 1)] #np.linspace(0, 1, num_bins + 1)
bin_edges[0] = 0.0
bin_edges = np.array(bin_edges) / (num_bins)**2
hist, bin_edges = np.histogram(ensemble_probs_isotonic, bins=bin_edges)
print("Bin edges: ", bin_edges)

bin_widths = bin_edges[1:] - bin_edges[:-1]

# Count histogram. The bins are not equally wide, so every bar gets its own
# width (a single width taken from the first bin mis-drew all other bars).
fig = plt.figure(figsize=(6.4, 4.8))
ax = fig.gca()
ax.bar(bin_edges[:-1], hist, width=bin_widths, align="edge", alpha=0.5, label="Isotonic")
ax.set_yscale("log")

# F1 of the model and of the call-taker within each half-open bin [low, high).
f1s_model = []
f1s_calltaker = []
x = ensemble_probs_isotonic #[ensemble_probs_isotonic < cutoff]
for low, high in zip(bin_edges[:-1], bin_edges[1:]):
    cond = (x >= low) & (x < high)
    
    f1_score = sklearn.metrics.f1_score(median_ensemble_test["y"][cond], median_ensemble_test["ensemble_preds"][cond])
    f1s_model.append(f1_score)
    
    f1_score_calltaker = sklearn.metrics.f1_score(median_ensemble_test["y"][cond], median_ensemble_test["h"][cond])
    f1s_calltaker.append(f1_score_calltaker)

f1s_model = np.array(f1s_model)
f1s_calltaker = np.array(f1s_calltaker)

# Only used by the commented-out ratio line below.
f1s_ratio = f1s_model / f1s_calltaker

# From here on `fig` / `ax` refer to the F1 figure; only this one is saved.
fig = plt.figure(figsize=(6.4, 4.8))
ax = fig.gca()

ax.bar(bin_edges[:-1], f1s_model, width=bin_widths, align="edge", alpha=0.5, label="Calibrated ensemble (isotonic)")
ax.bar(bin_edges[:-1], f1s_calltaker, width=bin_widths, align="edge", alpha=0.5, label="Call taker")

# ax2 = ax.twinx()
# ax2.plot(bin_edges[:-1] + 0.5 * bin_widths, f1s_ratio, label="Model/call-taker F1-score ratio", color="black", linestyle="dashed")

ax.set_xlim(0, bin_edges[-2])

ax.set_ylabel("F1-score")
# ax2.set_ylabel("F1-score ratio")

ax.set_xlabel("Predicted probability")
ax.legend(loc="upper left")
# ax2.legend()

fig.savefig("f1_score_ratio.pdf", bbox_inches="tight")

In [None]:
# What is the fraction of calls where the model and call-taker agree as a function of the model probability?
z_twosided = scipy.stats.norm.ppf(.975)

agreement = median_ensemble_test["ensemble_preds"] == median_ensemble_test["h"]

# - calibrated (isotonic)
hist_agreement_isotonic, _ = np.histogram(ensemble_probs_isotonic[agreement], bins=bin_edges, range=(0, 1))
hist_isotonic, _ = np.histogram(ensemble_probs_isotonic, bins=bin_edges, range=(0, 1))
frac_agreement_isotonic = hist_agreement_isotonic / hist_isotonic

print("bin_edges[:-1]: ", bin_edges[:-1])
print("hist_isotonic: ", hist_isotonic)
print("frac_agreement_isotonic: ", frac_agreement_isotonic)


frac_agreement_isotonic_ci = z_twosided * np.sqrt(frac_agreement_isotonic * (1 - frac_agreement_isotonic) / hist_isotonic)


fig = plt.figure(figsize=(6.4, 4.8))
ax = fig.gca()

# ax.plot(bin_edges_uncalibrated[:-1], frac_agreement_uncalibrated, label="Uncalibrated")
# ax.fill_between(
#     bin_edges_uncalibrated[:-1],
#     (frac_agreement_uncalibrated - frac_agreement_uncalibrated_ci),
#     (frac_agreement_uncalibrated + frac_agreement_uncalibrated_ci),
#     alpha=0.3,
# )

ax.plot(bin_edges[:-1], frac_agreement_isotonic, label="Isotonic")
# ax.fill_between(
#     bin_edges[:-1],
#     (frac_agreement_isotonic - frac_agreement_isotonic_ci),
#     (frac_agreement_isotonic + frac_agreement_isotonic_ci),
#     alpha=0.3,
# )

# ax.set_xlim(0, bin_edges[-2])
ax.set_ylim(0, 1)

ax.set_xlabel("Predicted probability")
ax.set_ylabel("Fraction of calls with agreement")

ax.legend()

fig.savefig("model_calltaker_agreement_fraction.pdf", bbox_inches="tight")

In [None]:
# NOTE: `compute_metrics` is defined earlier in this notebook. This cell used to
# hold an incomplete `def compute_metrics(targets, labels)` line (a SyntaxError),
# so it is intentionally left without code.

In [None]:
# What is the fraction of calls where the model and call-taker agree as a function of the model probability?
z_twosided = scipy.stats.norm.ppf(.975)

correct_agreement = median_ensemble_test["ensemble_preds"] == median_ensemble_test["h"]
correct_agreement = correct_agreement == median_ensemble_test["y"]

wrong_agreement = median_ensemble_test["ensemble_preds"] == median_ensemble_test["h"]
wrong_agreement = wrong_agreement != median_ensemble_test["y"]

# - uncalibrated
bins = 20
bin_edges_uncalibrated = np.linspace(0, 1, bins + 1)

hist_agreement_uncalibrated, _ = np.histogram(median_ensemble_test["ensemble_probs"][correct_agreement], bins=bin_edges_uncalibrated, range=(0, 1))
hist_uncalibrated, _ = np.histogram(median_ensemble_test["ensemble_probs"], bins=bin_edges_uncalibrated, range=(0, 1))
frac_agreement_uncalibrated = hist_agreement_uncalibrated / hist_uncalibrated

frac_agreement_uncalibrated_ci = z_twosided * np.sqrt(frac_agreement_uncalibrated * (1 - frac_agreement_uncalibrated) / hist_uncalibrated)

# - calibrated (isotonic)
bins = 20
bin_edges_isotonic = np.array([0, 0.05, 0.11, 0.18, 0.26, 0.40, 1])
# bin_edges_isotonic = np.linspace(0, 0.4, bins + 1)
# bin_edges_isotonic = np.logspace(-3, 0, bins + 1)
# bin_edges_isotonic = np.quantile(ensemble_probs_isotonic, np.linspace(0, 1, bins + 1))

hist_agreement_isotonic, _ = np.histogram(ensemble_probs_isotonic[correct_agreement], bins=bin_edges_isotonic, range=(0, 1))
hist_isotonic, _ = np.histogram(ensemble_probs_isotonic, bins=bin_edges_isotonic, range=(0, 1))
frac_agreement_isotonic = hist_agreement_isotonic / hist_isotonic

frac_agreement_isotonic_ci = z_twosided * np.sqrt(frac_agreement_isotonic * (1 - frac_agreement_isotonic) / hist_isotonic)


fig = plt.figure(figsize=(6.4, 4.8))
ax = fig.gca()

ax.plot(bin_edges_uncalibrated[:-1], frac_agreement_uncalibrated, label="Uncalibrated")
ax.fill_between(
    bin_edges_uncalibrated[:-1],
    (frac_agreement_uncalibrated - frac_agreement_uncalibrated_ci),
    (frac_agreement_uncalibrated + frac_agreement_uncalibrated_ci),
    alpha=0.3,
)

ax.plot(bin_edges_isotonic[:-1], frac_agreement_isotonic, label="Isotonic")
ax.fill_between(
    bin_edges_isotonic[:-1],
    (frac_agreement_isotonic - frac_agreement_isotonic_ci),
    (frac_agreement_isotonic + frac_agreement_isotonic_ci),
    alpha=0.3,
)

ax.set_xlim(0, 1)
ax.set_ylim(0, 1)

ax.set_xlabel("Predicted probability")
ax.set_ylabel("Fraction of calls with agreement")

ax.legend()

fig.savefig("wrong_agreement_fraction.pdf", bbox_inches="tight")

In [None]:
# Inspect the per-bin agreement counts next to the per-bin totals.
# (`hist_agg` is not defined anywhere in this notebook; these are the names
# the preceding cell actually produces.)
hist_agreement_isotonic, hist_isotonic

In [None]:
arrays = [median_ensemble_test["ensemble_probs"], ensemble_probs_logistic, ensemble_probs_isotonic]#, ensemble_probs_mlp]
labels = ["Ensemble uncalibrated", "Ensemble logistic calibration", "Ensemble isotonic calibration", "Ensemble MLP calibration"]


In [None]:
median_ensemble_test["ensemble_probs"]

In [None]:
uncalibrated_probs_pos = median_ensemble_test["ensemble_probs"][median_ensemble_test["h"] == 1]
uncalibrated_probs_neg = median_ensemble_test["ensemble_probs"][median_ensemble_test["h"] == 0]

calibrated_probs_pos = ensemble_probs_isotonic[median_ensemble_test["y"] == 1]
calibrated_probs_neg = ensemble_probs_isotonic[median_ensemble_test["y"] == 0]

fig = plt.figure(figsize=(6.4, 4.8))
ax = fig.gca()
ax.violinplot([calibrated_probs_neg, calibrated_probs_pos], positions=[0, 1], showmedians=True, bw_method=0.2)
ax.set_xticks([0, 1])
ax.set_xticklabels(["Negative", "Positive"])
# ax.set_xlabel("Predicted probability")
# ax.violinplot([uncalibrated_probs_neg, uncalibrated_probs_pos], positions=[0, 1], showmedians=True)

In [None]:
# Reliability curve of the ensemble probabilities against the call-taker
# decision "h" (instead of the target "y").
# `calibration_curve` returns (prob_true, prob_pred), in that order. The mean
# predicted probability goes on the x-axis, as in `plot_calibration_curve`.
prob_true, prob_pred = calibration_curve(
    median_ensemble_test["h"],
    median_ensemble_test["ensemble_probs"],
    n_bins=30,
    strategy="uniform",
)
plt.plot(prob_pred, prob_true)

In [None]:
dispatcher_correct = median_ensemble_test["h"] == median_ensemble_test["y"]
model_correct = median_ensemble_test["ensemble_preds"] == median_ensemble_test["y"]

fig = plt.figure(figsize=(6.4, 4.8))
ax = fig.gca()
ax.hist(median_ensemble_test["ensemble_probs"][dispatcher_correct], bins=30, alpha=0.5, label="Dispatcher correct")

In [None]:
ax.violinplot?

In [None]:
median_ensemble_test["y"].astype(int).to_numpy()

In [None]:
raise Exception("Stop here")

# Ensemble of ensembles

In [None]:
all_ensembles_test

In [None]:
all_ensembles_val

In [None]:
all_ensembles_test_probs = [all_ensembles_test[f"ensemble {i} probs"] for i in range(1, 12)]
labels = [f"Ensemble {i}" for i in range(1, 12)]

plot_calibration_curve(
    targets=median_ensemble_test["y"],
    model_probs=all_ensembles_test_probs,
    labels=labels,
    n_bins=20,
)

In [None]:
all_ensembles_test_probs = np.stack([all_ensembles_test[f"ensemble {i} probs"] for i in range(1, 12)])
all_ensembles_test_preds = np.stack([all_ensembles_test[f"ensemble {i} preds"] for i in range(1, 12)])
all_ensembles_val_probs = np.stack([all_ensembles_val[f"ensemble {i} probs"] for i in range(1, 12)])
all_ensembles_val_preds = np.stack([all_ensembles_val[f"ensemble {i} preds"] for i in range(1, 12)])
all_ensembles_test_preds.shape

In [None]:
med_ensemble_test = median_ensemble_test["ensemble_probs"]
# majority_vote_test = np.mean(all_ensembles_test_preds, axis=0) > 0.5
super_ensemble_probs_test = scipy.stats.hmean(all_ensembles_test_probs, axis=0)
mean_ensemble_probs_test = np.mean(all_ensembles_test_probs, axis=0)

med_ensemble_val = median_ensemble_val["ensemble_probs"]
# majority_vote_val = np.mean(all_ensembles_test_preds, axis=0) > 0.5
super_ensemble_probs_val = scipy.stats.hmean(all_ensembles_val_probs, axis=0)
mean_ensemble_probs_val = np.mean(all_ensembles_val_probs, axis=0)

In [None]:
# Compare the median ensemble with the harmonic-mean and arithmetic-mean
# aggregates over all ensembles. Both plots share one label list so the
# histogram legend names the same series as the calibration plot (the
# histogram used to call the median ensemble "Majority vote").
aggregate_probs = [med_ensemble_test, super_ensemble_probs_test, mean_ensemble_probs_test]
aggregate_labels = ["Median ensemble", "Harmonic mean of all ensembles", "Mean of all ensembles"]

plot_calibration_curve(
    targets=median_ensemble_test["y"],
    model_probs=aggregate_probs,
    labels=aggregate_labels,
    n_bins=20,
)
plot_histogram(arrays=aggregate_probs, labels=aggregate_labels)

## Platt scaling

In [None]:
logistic = LogisticRegression(penalty="none", fit_intercept=True)
logistic.fit(super_ensemble_probs_val[:, np.newaxis], median_ensemble_val["y"].to_numpy())
ensemble_probs_logistic = logistic.predict_proba(super_ensemble_probs_test[:, np.newaxis])[:, 1]

In [None]:
logistic.coef_, logistic.intercept_, logistic.n_iter_

In [None]:
ensemble_probs_logistic.min(), ensemble_probs_logistic.max()

In [None]:
plot_calibration_curve(
    targets=median_ensemble_test["y"],
    model_probs=[ensemble_probs_logistic],
    labels=["Ensemble"],
    n_bins=20,
)
plot_histogram(arrays=[ensemble_probs_logistic], labels=["Ensemble"])

In [None]:
is_old = median_ensemble_test["age"] >= 65
plot_calibration_curve(
    targets=[median_ensemble_test["y"][is_old], median_ensemble_test["y"][~is_old]],
    model_probs=[ensemble_probs_logistic[is_old], ensemble_probs_logistic[~is_old]],
    labels=["65+", "18-65"],
    n_bins=20,
)
plot_histogram(
    arrays=[ensemble_probs_logistic[is_old], ensemble_probs_logistic[~is_old]],
    labels=["65+", "18-65"],
)

## Isotonic

In [None]:
isotonic = IsotonicRegression(y_min=0, y_max=1, increasing=True, out_of_bounds="clip")
isotonic.fit(super_ensemble_probs_val, median_ensemble_val["y"])
ensemble_probs_isotonic = isotonic.transform(super_ensemble_probs_test)

In [None]:
# `ensemble_probs_isotonic` holds the calibrated probabilities of the test
# set, so the series is labelled as test data, not validation data.
plot_histogram(arrays=[ensemble_probs_isotonic], labels=["Ensemble test"])

In [None]:
fig, ax = plt.subplots(figsize=(6.4, 4.8))
ax.plot([0, 1], [0, 1], "k:", label="Perfect calibration")
ax.plot(isotonic.X_thresholds_, isotonic.y_thresholds_, "-", marker="o", markersize=3, label="Isotonic")

In [None]:
plot_calibration_curve(
    targets=median_ensemble_test["y"],
    model_probs=[ensemble_probs_isotonic],
    labels=["Ensemble"],
    n_bins=20,
)
plot_histogram(arrays=[ensemble_probs_isotonic], labels=["Ensemble"])