# Intuition on AURC

This notebook contains examples of how to use the calibration and failure prediction modules.
We measure metrics of interest on:
- CIFAR-10 ResNet-18

We use the following metrics:
- Accuracy
- F1 score (micro,macro)
- ECE
- Brier score
- Negative log-likelihood
- AURC

As a bonus, we plot the risk-coverage curves.

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
## necessary installs
# Installs third-party packages by shelling out to `pip3`, one package per command.
# NOTE(review): versions are not pinned, and `!pip3` may target a different
# environment than the running kernel — consider `%pip install` with pinned versions.
!pip3 install numpy
!pip3 install scipy
!pip3 install scikit-learn
!pip3 install matplotlib
!pip3 install pandas
!pip3 install plotly

^C
[31mERROR: Operation cancelled by user[0m
You should consider upgrading via the '/home/jordy/.virtualenvs/SOTA/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/jordy/.virtualenvs/SOTA/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/jordy/.virtualenvs/SOTA/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/jordy/.virtualenvs/SOTA/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/jordy/.virtualenvs/SOTA/bin/python -m pip install --upgrade pip' command.[0m




## Measuring calibration on CIFAR-10


In [1]:
"""
This testing script loads actual probabilistic predictions from a ResNet fine-tuned on CIFAR

There are a number of logits-groundtruth pickles available @ https://github.com/markus93/NN_calibration/tree/master/logits
[Seems to have moved from Git-LFS to sharepoint]
https://tartuulikool-my.sharepoint.com/:f:/g/personal/markus93_ut_ee/EmW0xbhcic5Ou0lRbTrySOUBF2ccSsN7lo6lvSfuG1djew?e=l0TErb

See https://github.com/markus93/NN_calibration/blob/master/logits/Readme.txt to decode the [model_dataset] filenames

As a bonus, one could consider temperature scaling and measuring after calibration.
"""
import numpy as np
from scipy.special import softmax
import pickle
from sklearn.model_selection import train_test_split


# Open file with pickled variables
def unpickle_probs(file, verbose=0, normalize=False):
    """Load validation/test predictions and labels from a pickle file.

    The pickle must hold a pair. When the first element is a tuple, the pair is
    read as ``((val_preds, val_labels), (test_preds, test_labels))``. Otherwise it
    is read as ``(predictions, labels)`` and divided with ``train_test_split``
    using ``test_size=len(labels) - 5000`` and a fixed ``random_state=15``.

    Args:
        file: Path of the pickle file to read.
        verbose: When truthy, print the shape of each of the four arrays.
        normalize: When true, apply softmax over the last axis of both
            prediction arrays (use this when the file stores logits).

    Returns:
        ``((val_preds, val_labels), (test_preds, test_labels))`` where both label
        arrays have been flattened with ``ravel()``.
    """
    # NOTE: unpickling can execute arbitrary code; only open files from a trusted source.
    with open(file, "rb") as handle:
        first, second = pickle.load(handle)

    if not isinstance(first, tuple):
        # Single (predictions, labels) pair: carve out the validation and test parts here.
        parts = train_test_split(first, second.reshape(-1, 1), test_size=len(second) - 5000, random_state=15)
        y_probs_val, y_probs_test, y_val, y_test = parts
    else:
        # The file already stores separate validation and test pairs.
        (y_probs_val, y_val), (y_probs_test, y_test) = first, second

    if normalize:
        y_probs_val, y_probs_test = (softmax(scores, -1) for scores in (y_probs_val, y_probs_test))

    if verbose:
        report = (
            ("y_probs_val:", y_probs_val),
            ("y_true_val:", y_val),
            ("y_probs_test:", y_probs_test),
            ("y_true_test:", y_test),
        )
        for label, array in report:
            print(label, array.shape)

    return ((y_probs_val, y_val.ravel()), (y_probs_test, y_test.ravel()))

In [2]:
import pandas as pd
import json
from collections import OrderedDict
from metrics import accuracy, brier_loss, nll, f1_micro, f1_macro, aurc_logits, ece_logits  # AUROC_logits

# Default metric suite used by apply_metrics; each entry is called as metric(y_true, y_probs).
METRICS = [accuracy, brier_loss, nll, f1_micro, f1_macro, ece_logits, aurc_logits]


def apply_metrics(y_true, y_probs, metrics=METRICS):
    """Evaluate every metric on the predictions and report the results.

    Each callable in ``metrics`` is invoked as ``metric(y_true, y_probs)`` and its
    result is stored under the callable's ``__name__`` with any ``_logits``
    substring removed. A metric that raises is skipped after its exception has
    been printed, so the remaining metrics still run. The collected results are
    printed as indented JSON before being returned.

    Returns:
        OrderedDict mapping metric name to value, in evaluation order.
    """
    scores = OrderedDict()
    for fn in metrics:
        try:
            value = fn(y_true, y_probs)
            scores[fn.__name__.replace("_logits", "")] = value
        except Exception as err:
            # Best effort: report the failure and continue with the next metric.
            print(err)
    print(json.dumps(scores, indent=4))
    return scores


# Load the pickled validation/test splits (verbose=1 prints the array shapes).
val_split, test_split = unpickle_probs("../data/resnet110_c10_logits.p", verbose=1)
p_val, y_val = val_split
p_test, y_test = test_split

# Score the test split with the default metric suite (also printed as JSON).
output = apply_metrics(y_test, p_test)

# One-column table (metric name -> value), rendered as LaTeX.
df = pd.DataFrame.from_dict(output, orient="index", columns=["resnet110_c10"])
print(df.to_latex())

y_probs_val: (5000, 10)
y_true_val: (5000, 1)
y_probs_test: (10000, 10)
y_true_test: (10000, 1)


Using the latest cached version of the module from /home/jordy/.cache/huggingface/modules/evaluate_modules/metrics/jordyvl--ece/e1bf3aa6b59d6093d30bcd8fd77af2c6f1a958ca907e20911c16ab85c3a2caf8 (last modified on Tue Jan 10 19:22:35 2023) since it couldn't be found locally at jordyvl--ece, or remotely on the Hugging Face Hub.


{
    "accuracy": 0.9356,
    "brier_loss": 0.11018865559514414,
    "nll": 1.0624524426782735,
    "f1_micro": 0.9356,
    "f1_macro": 0.9356251148224297,
    "ece": 0.05030814408063887,
    "aurc": 0.008406399931071139
}
\begin{tabular}{lr}
\toprule
{} &  resnet110\_c10 \\
\midrule
accuracy   &       0.935600 \\
brier\_loss &       0.110189 \\
nll        &       1.062452 \\
f1\_micro   &       0.935600 \\
f1\_macro   &       0.935625 \\
ece        &       0.050308 \\
aurc       &       0.008406 \\
\bottomrule
\end{tabular}



## AURC (manual)

In [16]:
def g(x):
    """Confidence score: maximum softmax probability (MSP) along the last axis.

    ``x`` holds unnormalised scores; softmax is applied over the last axis and
    the largest resulting probability per row is returned.
    """
    return np.max(softmax(x, axis=-1), -1)  # maximum softmax probability

def entropy(x):
    """Shannon entropy (in nats) of ``softmax(x)`` along the last axis.

    Uses the identity ``H = log(sum_i exp(x_i)) - sum_i x_i exp(x_i) / sum_i exp(x_i)``.
    The scores are first shifted by their row-wise maximum. The entropy does not
    change under such a shift, but the shift keeps ``exp`` from overflowing, which
    would otherwise turn the result into NaN for large scores.

    Args:
        x: Array-like of unnormalised scores; the last axis indexes classes.

    Returns:
        Entropy per row (the last axis is reduced away).
    """
    x = np.asarray(x, dtype=float)
    shifted = x - np.max(x, axis=-1, keepdims=True)  # largest entry becomes 0, so exp() <= 1
    exp_shifted = np.exp(shifted)
    A = np.sum(exp_shifted, axis=-1)  # sum of exp(x_i - max)
    B = np.sum(shifted * exp_shifted, axis=-1)  # sum of (x_i - max) * exp(x_i - max)
    return np.log(A) - B / A


#g = lambda x: -entropy(x)  # negative entropy to not have to deal with sign


def coverage_at_threshold(f_X, g, tau=0.5):
    """Fraction of samples whose confidence ``g(f_X)`` is at least ``tau``."""
    accepted = g(f_X) >= tau  # True where the sample would be kept
    return np.mean(accepted)


def risk_at_threshold(f_X, g, Y, tau=0.5, loss="0-1"):
    """Rate of samples that are both accepted at ``tau`` and misclassified.

    A sample is accepted when ``g(f_X) >= tau`` and misclassified when the argmax
    of ``f_X`` over the last axis differs from ``Y``. The mean is taken over all
    samples.

    NOTE(review): the result is not divided by the coverage, so it is a joint
    error rate rather than the error rate among accepted samples — confirm intent.
    NOTE(review): ``loss`` is accepted but never used.
    """
    accepted = g(f_X) >= tau
    misclassified = f_X.argmax(-1) != Y
    return np.logical_and(accepted, misclassified).mean()


# for t in [0.2, 0.5, 0.8, 0.9, 0.95, 0.97, 0.99]:
#     print(f"Risk at {t}: {risk_at_threshold(p_test, g, y_test, tau=t)}")
#     print(f"Coverage at {t}: {coverage_at_threshold(p_test, g, tau=t)}")

# now defining AURC as a curve


def manual_AURC(f_X, g, Y, verbose=True):
    """Area under the risk-coverage (RC) curve, built one sample at a time.

    Samples are ordered by confidence ``g(f_X)`` (ascending). Starting from full
    coverage, the least confident remaining sample is dropped at every step and
    the selective risk (error rate among the samples still kept) is recomputed.
    A curve point is recorded at the first step and whenever the dropped sample's
    confidence differs from that of the previously dropped one. The area is the
    trapezoid sum over the recorded points, each weighted by the fraction of
    samples dropped since the previous point.

    Args:
        f_X: Score array; the argmax over the last axis is the predicted class.
        g: Callable mapping ``f_X`` to one confidence value per sample.
        Y: True class indices, one per sample.
        verbose: When true (the default, matching the historical behaviour),
            print every ``(risk, coverage)`` pair. Pass ``False`` for large
            inputs to avoid flooding the output.

    Returns:
        The AURC value (``0`` when fewer than two samples are given, since no
        interval is formed).
    """
    incorrect = f_X.argmax(-1) != Y  # instance-level mask
    g_X = g(f_X)
    idx_sorted = np.argsort(g_X)  # ascending; the curve is built from right (coverage 1) to left

    # will keep these as intermediate absolute values to facilitate calculation
    N = len(idx_sorted)
    coverage = N
    error_sum = incorrect.sum()  # vectorised count of all errors (order does not matter)

    # well-defined starting point: risk = 1-accuracy (loss), coverage=100%; threshold=0
    coverages = [1]
    risks = [incorrect.mean()]
    weights = []  # fraction of the data dropped between consecutive recorded points

    tmp_weight = 0
    for tau in range(N - 1):  # the last sample is never dropped, so the denominator stays >= 1
        coverage = coverage - 1
        error_sum = error_sum - incorrect[idx_sorted[tau]]
        selective_risk = error_sum / (N - 1 - tau)
        tmp_weight += 1
        if tau == 0 or g_X[idx_sorted[tau]] != g_X[idx_sorted[tau - 1]]:  # unique or starting threshold
            coverages.append(coverage / N)
            risks.append(selective_risk)
            weights.append(tmp_weight / N)
            tmp_weight = 0

    # well-defined ending (if not already done): last known risk for 0 coverage (threshold=100%)
    if tmp_weight > 0:
        coverages.append(0)
        risks.append(risks[-1])
        weights.append(tmp_weight / N)

    if verbose:
        print(f"risk at coverages:{list(zip(risks, coverages))}")

    # trapezoid rule over the recorded points
    aurc = sum([(risks[i] + risks[i + 1]) * 0.5 * weights[i] for i in range(len(weights))])

    return aurc


# Run the manual AURC on the test split with confidence function `g`; the call
# prints every (risk, coverage) pair and the returned area is shown as the cell output.
manual_AURC(p_test, g, y_test)

risk at coverages:[(0.0644, 1), (0.06430643064306431, 0.9999), (0.0642128425685137, 0.9998), (0.06411923577073123, 0.9997), (0.06402561024409764, 0.9996), (0.0639319659829915, 0.9995), (0.06393836301781068, 0.9994), (0.06384469128389873, 0.9993), (0.06375100080064051, 0.9992), (0.06365729156240617, 0.9991), (0.06356356356356356, 0.999), (0.06356992691961157, 0.9989), (0.06347617140568683, 0.9988), (0.06338239711625113, 0.9987), (0.06338874424193872, 0.9986), (0.06329494241362044, 0.9985), (0.06320112179487179, 0.9984), (0.06310728238004608, 0.9983), (0.06301342416349429, 0.9982), (0.06301973750125238, 0.9981), (0.06292585170340681, 0.998), (0.06283194708888666, 0.9979), (0.06273802365203447, 0.9978), (0.06264408138719053, 0.9977), (0.06255012028869286, 0.9976), (0.06255639097744362, 0.9975), (0.06256266292360137, 0.9974), (0.06246866539657074, 0.9973), (0.0623746490172483, 0.9972), (0.06228061377996189, 0.9971), (0.062286860581745235, 0.997), (0.062192797672785634, 0.9969), (0.06219903

0.008259328966633058

In [None]:
# let's create a version where the integration points (weights) are defined in advance
# np.unique(np.floor(a/TOL).astype(int))*TOL

def crude_AURC(f_X, g, Y, NUM_AUC_VALUES=50):
    """Approximate the AURC on a small, pre-defined grid of confidence thresholds.

    Instead of one curve point per sample, at most ``NUM_AUC_VALUES`` thresholds
    are taken from the sorted confidences at evenly spaced ranks (the lowest one
    is the minimum confidence, i.e. full coverage). For every threshold the
    coverage (fraction with ``g(f_X) >= tau``) and the selective risk (error rate
    among those samples) are computed. The curve is closed at coverage 0 with the
    last known risk and integrated over coverage with the trapezoid rule.

    NOTE(review): the original cell was an unfinished stub (undefined
    ``bin_boundaries``, no return value); the threshold grid used here is one
    reasonable completion — confirm it matches the intended experiment.

    Args:
        f_X: Score array; the argmax over the last axis is the predicted class.
        g: Callable mapping ``f_X`` to one confidence value per sample.
        Y: True class indices, one per sample.
        NUM_AUC_VALUES: Maximum number of thresholds (duplicates are merged).

    Returns:
        The approximate AURC.

    Raises:
        ValueError: If there are no samples or ``NUM_AUC_VALUES < 1``.
    """
    incorrect = f_X.argmax(-1) != Y  # instance-level mask
    g_X = g(f_X)
    N = len(g_X)
    if N == 0:
        raise ValueError("crude_AURC needs at least one sample")
    if NUM_AUC_VALUES < 1:
        raise ValueError("NUM_AUC_VALUES must be >= 1")

    # Thresholds are actual confidence values at evenly spaced ranks, so every
    # threshold accepts at least one sample; np.unique merges ties and sorts ascending.
    sorted_conf = np.sort(g_X)
    positions = np.linspace(0, N, NUM_AUC_VALUES, endpoint=False).astype(int)
    thresholds = np.unique(sorted_conf[positions])

    coverages, risks = [], []
    for tau in thresholds:  # ascending thresholds -> coverage goes from 1 downwards
        accepted = g_X >= tau
        coverages.append(accepted.mean())
        risks.append(incorrect[accepted].mean())

    # well-defined ending: last known risk for 0 coverage
    coverages.append(0.0)
    risks.append(risks[-1])

    # trapezoid rule; the widths are the coverage drops between consecutive thresholds
    aurc = sum(
        (risks[i] + risks[i + 1]) * 0.5 * (coverages[i] - coverages[i + 1])
        for i in range(len(risks) - 1)
    )
    return aurc
    

In [None]:
# experiment with many randomly sampled dirichlet distribution values; try with different N

# Running example

#### The original data comes from [Gupta & Ramdas 2021](https://openreview.net/pdf?id=WqoBaaPHS-)

In [9]:
import numpy as np

# Toy predictions: 6 samples x 4 classes; every row is a probability vector (sums to 1).
f = np.array(
    [
        [0.1, 0.0, 0.6, 0.3],
        [0.6, 0.0, 0.1, 0.3],
        [0.2, 0.7, 0.0, 0.1],
        [0.0, 0.1, 0.1, 0.8],
        [0.0, 0.1, 0.8, 0.1],
        [0.9, 0.1, 0.0, 0.0],
    ]
)

# Largest probability of each row of `f`; note the extra leading axis (shape (1, 6)).
confidence = np.array([[0.6, 0.6, 0.7, 0.8, 0.8, 0.9]])

# 1-indexed class labels: `predicted_y` equals the row-wise argmax of `f` plus one.
predicted_y = np.array([3, 1, 2, 4, 3, 1])
# presumably the ground-truth classes, also 1-indexed — verify against the cited paper
correct_y = np.array([3, 4, 2, 1, 4, 1])
y_correct = correct_y - 1  # 0-indexing

## Risk-coverage curves


In [19]:
# Collect the AURC value and the cached curve data for each set of predictions.
# NOTE(review): `scaled_test_p` and `multi_aurc_plot` are not defined or imported in the
# cells visible here — confirm they exist, otherwise this cell fails on a fresh kernel.
aurcs = []
caches = []
for probs in [p_test, scaled_test_p]:
    # The result is indexed with "aurc" and "cache" below, so with get_cache=True
    # aurc_logits is expected to return a mapping that holds both.
    res = aurc_logits(y_test, probs, plot=False, get_cache=True)  # resnet
    aurc, cache = res["aurc"], res["cache"]
    aurcs.append(aurc)
    caches.append(cache)

# presumably draws one risk-coverage curve per cache, labelled in list order — TODO confirm
# NOTE(review): the labels say ResNet18, while the predictions above are loaded from
# "resnet110_c10_logits.p" — confirm which model name is correct.
multi_aurc_plot(caches, ["CIFAR10-ResNet18", "CIFAR10-ResNet18+T"], aurcs=aurcs)