In [None]:
# default_exp metrics

# Metrics

> Calibration error metrics for probabilistic classifiers: PEACE, ECE, ECEv2, and a class-wise wrapper.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
# export

from riverreliability import utils

import numpy as np
import scipy.stats
import scipy.integrate

import riverreliability.beta
import sklearn.metrics
import sklearn.datasets
import sklearn.model_selection
import sklearn.svm
import sklearn.preprocessing
import sklearn.utils

## Probabilistic classification: toy example

In [None]:
# Seed NumPy's global RNG so the toy example below is repeatable across runs.
np.random.seed(42)

In [None]:
# Synthetic 3-class problem: 5000 samples, 5 features of which 3 are informative.
X, y = sklearn.datasets.make_classification(n_samples=5000, n_features=5, n_informative=3, n_classes=3)
# Shuffled 80/20 train/test split.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2, shuffle=True)

In [None]:
# NOTE: despite the name `logreg`, this is a support vector classifier
# (with probability estimates enabled), not a logistic regression.
logreg = sklearn.svm.SVC(probability=True)

In [None]:
# Train on the training split only; the test split is used for the metrics below.
logreg.fit(X_train, y_train)

SVC(probability=True)

In [None]:
# Per-class probabilities for the test split; the arg-max column index is used
# directly as the predicted label (assumes labels are 0..n_classes-1 in column order).
y_probs = logreg.predict_proba(X_test)
y_preds = y_probs.argmax(axis=1)

# Sanity check of the classifier before looking at its calibration.
print(f"Accuracy: {sklearn.metrics.accuracy_score(y_test, y_preds)}")
print(f"Balanced accuracy: {sklearn.metrics.balanced_accuracy_score(y_test, y_preds)}")

Accuracy: 0.868
Balanced accuracy: 0.8682246668791995


## Metric definition

In [None]:
# export

def peace(y_probs, y_preds, y_true, samples=1000, bins="fd", **bin_args):
    """Compute the posterior expected balanced accuracy-based calibration error (PEACE).

    Within every bin, the absolute distance between the bin's mean confidence and
    each candidate accuracy value on an evenly spaced grid over [0, 1] is weighted
    by a posterior density and integrated with Simpson's rule. When the predictions
    in a bin cover more than one label, the density comes from
    `riverreliability.beta.beta_avg_pdf` (parameters estimated from the bin's
    confusion matrix); otherwise a Beta(correct + 1, incorrect + 1) density is used.
    The per-bin values are combined by `utils.binning`.

    Parameters:
    y_probs (np.array): probability assigned to each prediction (one value per sample)
    y_preds (np.array): predicted class labels
    y_true (np.array): true class labels
    samples (int): number of samples for numerical integration
    bins (str or int): binning rule, forwarded to `utils.get_bin_indices`
    **bin_args: extra keyword arguments forwarded to `utils.get_bin_indices` (e.g. `n_bins`)

    Returns:
    exp_ce (float): expected calibration error

    """

    sklearn.utils.check_consistent_length(y_preds, y_true)
    classes = sklearn.utils.multiclass.unique_labels(y_preds, y_true)

    # `scipy.integrate.simps` is a deprecated alias that newer SciPy releases no
    # longer ship; prefer `simpson` and only fall back to `simps` on old installs.
    simpson = getattr(scipy.integrate, "simpson", None) or scipy.integrate.simps

    # define the bin function
    def bin_func(y_probs_bin, y_preds_bin, y_true_bin):

        xs = np.linspace(0, 1, samples)
        conf = y_probs_bin.mean()

        if len(np.unique(y_preds_bin)) > 1:
            # estimate beta parameters from the confusion matrix of this bin
            confusion = sklearn.metrics.confusion_matrix(y_true_bin, y_preds_bin, labels=classes)
            params = riverreliability.beta.get_beta_parameters(confusion)
            ys = abs(xs - conf) * riverreliability.beta.beta_avg_pdf(xs, params, fft=True)
        else:
            # a single predicted label in the bin: plain Beta posterior over the
            # accuracy with add-one counts of correct / incorrect predictions
            n_correct = np.count_nonzero(y_preds_bin == y_true_bin)
            n_wrong = np.count_nonzero(y_preds_bin != y_true_bin)
            ys = abs(xs - conf) * scipy.stats.beta.pdf(xs, n_correct + 1, n_wrong + 1)

        # approximate the integral using Simpson's rule
        # (`x` is passed by keyword because it is keyword-only in recent SciPy)
        return simpson(ys, x=xs)

    # compute the full result
    bin_indices = utils.get_bin_indices(y_probs, bins=bins, lower=0, upper=1, **bin_args)
    return utils.binning(y_probs, y_preds, y_true, bin_indices, bin_func)

Call the metric with the probability assigned to each prediction (for example `y_probs.max(axis=1)`), the predicted labels, and the true labels.

In [None]:
# The confidence passed in is the row-wise maximum of the class probabilities;
# n_bins reaches utils.get_bin_indices through peace's **bin_args.
peace(y_probs.max(axis=1), y_preds, y_test, bins="count", n_bins=15)

0.0709577286056104

In [None]:
# Same call with the "fd" binning rule, which is also peace's default.
peace(y_probs.max(axis=1), y_preds, y_test, bins="fd")

0.05893617924357199

In [None]:
# Here `bins` is an integer rather than a rule name.
peace(y_probs.max(axis=1), y_preds, y_test, bins=15)

0.03765348930383863

In [None]:
# export

def ece(y_probs, y_preds, y_true, balanced=False, bins="fd", **bin_args):
    """Compute the expected calibration error (ECE).

    Each bin contributes the absolute gap between its mean confidence and its
    accuracy; with `balanced=True` the balanced accuracy is used in place of the
    plain accuracy. The per-bin gaps are combined by `utils.binning`.

    Parameters:
    y_probs (np.array): predicted class probabilities
    y_preds (np.array): predicted class labels
    y_true (np.array): true class labels
    balanced (bool): use balanced accuracy inside each bin when True
    bins (str or int): binning rule, forwarded to `utils.get_bin_indices`
    **bin_args: extra keyword arguments forwarded to `utils.get_bin_indices`

    Returns:
    exp_ce (float): expected calibration error

    """

    sklearn.utils.check_consistent_length(y_preds, y_true)

    def _accuracy_gap(y_probs_bin, y_preds_bin, y_true_bin):
        # |fraction of correct predictions - mean confidence| for one bin
        hits = y_preds_bin == y_true_bin
        return abs(hits.mean() - y_probs_bin.mean())

    def _balanced_accuracy_gap(y_probs_bin, y_preds_bin, y_true_bin):
        # |balanced accuracy - mean confidence| for one bin
        score = sklearn.metrics.balanced_accuracy_score(y_true_bin, y_preds_bin)
        return abs(score - y_probs_bin.mean())

    if balanced:
        per_bin = _balanced_accuracy_gap
    else:
        per_bin = _accuracy_gap

    # bin the confidences, then aggregate the per-bin gaps
    indices = utils.get_bin_indices(y_probs, bins=bins, lower=0, upper=1, **bin_args)
    return utils.binning(y_probs, y_preds, y_true, indices, per_bin)

In [None]:
# Plain (unbalanced) ECE: `balanced` is left at its default of False.
ece(y_probs.max(axis=1), y_preds, y_test, bins="count")

0.03872284153605901

In [None]:
# export

def ece_v2(y_probs, y_preds, y_true, bins="fd", **bin_args):
    """Compute the expected calibration error based on the expected posterior balanced accuracy (ECEv2).

    Each bin contributes the absolute gap between its mean confidence and the
    value returned by `riverreliability.beta.balanced_accuracy_expected` for the
    bin's confusion matrix. The per-bin gaps are combined by `utils.binning`.

    Parameters:
    y_probs (np.array): predicted class probabilities
    y_preds (np.array): predicted class labels
    y_true (np.array): true class labels
    bins (str or int): binning rule, forwarded to `utils.get_bin_indices`
    **bin_args: extra keyword arguments forwarded to `utils.get_bin_indices`

    Returns:
    exp_ce (float): expected calibration error

    """

    sklearn.utils.check_consistent_length(y_preds, y_true)
    # label set shared by every bin so all confusion matrices have the same layout
    labels = sklearn.utils.multiclass.unique_labels(y_preds, y_true)

    def _posterior_gap(y_probs_bin, y_preds_bin, y_true_bin):
        bin_confusion = sklearn.metrics.confusion_matrix(y_true_bin, y_preds_bin, labels=labels)
        expected_acc = riverreliability.beta.balanced_accuracy_expected(bin_confusion, fft=True)
        return abs(expected_acc - y_probs_bin.mean())

    # bin the confidences, then aggregate the per-bin gaps
    return utils.binning(
        y_probs,
        y_preds,
        y_true,
        utils.get_bin_indices(y_probs, bins=bins, lower=0, upper=1, **bin_args),
        _posterior_gap,
    )

In [None]:
# No `bins` argument is given, so the default "fd" rule is used.
ece_v2(y_probs.max(axis=1), y_preds, y_test)

0.04830056103874785

In [None]:
# export

def class_wise_error(y_probs, y_preds, y_true, base_error, *base_error_args, **base_error_kwargs):
    """Compute classwise-error as proposed in "Beyond temperature scaling: Obtaining well-calibrated
    multiclass probabilities with Dirichlet calibration" (Kull, 2019).

    For every class, the samples predicted as that class are selected and
    `base_error` is evaluated on them using that class's probability column.
    The per-class values are summed and divided by the number of classes;
    classes that are never predicted contribute zero.

    Parameters:
    y_probs (np.array): predicted class probabilities, indexed as [sample, class]
    y_preds (np.array): predicted class labels
    y_true (np.array): true class labels
    base_error (callable): function that returns ECE for given probabilities, label predictions and true labels
    base_error_[kw]args ([kw]args): [Keyword ]arguments that should be passed to the base_ece callable.

    Objects exposing `to_numpy` (e.g. pandas Series/DataFrame) are accepted for
    all three array arguments and converted to arrays first.

    Returns:
    exp_ce (float): class-wise expected calibration error

    """

    # Convert each input independently. Positional boolean indexing below needs
    # plain arrays, and every argument must be checked on its own.
    if hasattr(y_probs, "to_numpy"):
        y_probs = y_probs.to_numpy()
    if hasattr(y_preds, "to_numpy"):
        y_preds = y_preds.to_numpy()
    if hasattr(y_true, "to_numpy"):
        y_true = y_true.to_numpy()

    sklearn.utils.check_consistent_length(y_preds, y_true)
    classes = sklearn.utils.multiclass.unique_labels(y_preds, y_true)

    result = 0.
    # NOTE(review): column i of y_probs is taken to belong to classes[i]; this
    # assumes every class with a column occurs in y_preds or y_true - confirm for callers.
    for i, c in enumerate(classes):
        selector = y_preds == c
        if not selector.any():
            # class never predicted: nothing to evaluate
            continue

        result += base_error(y_probs[selector, i], y_preds[selector], y_true[selector], *base_error_args, **base_error_kwargs)

    return result/len(classes)

In [None]:
# The full probability matrix is passed here; class_wise_error picks one column
# per class and forwards bins=15 to peace as a keyword argument.
class_wise_error(y_probs, y_preds, y_test, base_error=peace, bins=15)

0.05637736547369911

In [None]:
# Same as above, with bins="count" and n_bins=15 forwarded to peace.
class_wise_error(y_probs, y_preds, y_test, base_error=peace, bins="count", n_bins=15)

0.0758478434092897