# Computing Calibration Metrics
We focus on evaluating models' confidence in predictions before and after quantization in a zero-shot setting.
In an ideal scenario, we expect the model's performance and confidence to remain consistent after quantization, preserving the initial calibration level.
We evaluate the performance of LLMs post-compression using accuracy (Acc.) and calibration error (CE).

In this notebook, we provide code for computing model confidence in answers, calibration errors and entropy.
To run the notebook, you need to have predictions of models obtained with evaluation-harness framework.

Running this code does not require GPU.


In [1]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import stats

In [2]:
def ace(y_true: np.array, y_pred: np.array, num_ranges: int = 15) -> float:
    """
     Measure the Adaptive Calibration Error (ACE) by [2], an version of the static calibration error that uses ranges
     instead of bins. Every range contains the same number of predictions.

    Parameters
     ----------
     y_true: np.array
         True labels for each input.
     y_pred: np.array
         Categorical probability distribution for each input.
     num_ranges: int
         Number of ranges. Default is 10.

     Returns
     -------
     float
         Adaptive Calibration Error.
    """
    assert len(y_pred.shape) == 2, "y_pred must be a matrix!"
    assert (
        y_true.shape[0] == y_pred.shape[0]
    ), "Shapes of y_true and y_pred do not match!"

    N = len(y_true)
    num_classes = y_pred.shape[1]
    confs = np.sort(np.max(y_pred, axis=1))
    step = int(np.floor(N / num_ranges))  # Inputs per range
    thresholds = np.repeat(
        np.array([confs[i] for i in range(0, step * num_ranges, step)])[np.newaxis, ...], N, axis=0
    )  # Get the thresholds corresponding to ranges

    max_preds = np.repeat(
        np.max(y_pred, axis=1)[..., np.newaxis], num_ranges, axis=1
    )  # Repeat all maximum predictions
    b = (max_preds <= thresholds).astype(
        int
    )  # Compare max predictions against thresholds
    bin_indices = np.argmax(b, axis=1)
    ace = 0

    for bin in range(num_ranges):
        bin_preds = y_pred[bin_indices == bin, :]
        bin_labels = y_true[bin_indices == bin]

        for k in range(num_classes):
            bin_class_preds = bin_preds[bin_labels == k, :]

            if bin_class_preds.shape[0] == 0:
                continue

            bin_class_acc = np.mean(
                (np.argmax(bin_class_preds, axis=1) == k).astype(int)
            )
            bin_class_conf = np.mean(np.max(bin_class_preds, axis=1))
            ace += abs(bin_class_acc - bin_class_conf)

    ace /= num_classes * num_ranges

    return ace
def sce(y_true: np.array, y_pred: np.array, num_bins: int = 15) -> float:
    """
    Measure the Static Calibration Error (SCE) by [2], an extension to the Expected Calibration Error to multiple
    classes.

    Parameters
    ----------
    y_true: np.array
        True labels for each input.
    y_pred: np.array
        Categorical probability distribution for each input.
    num_bins: int
        Number of bins. Default is 10.

    Returns
    -------
    float
        Static Calibration Error.
    """
    assert len(y_pred.shape) == 2, "y_pred must be a matrix!"
    assert (
        y_true.shape[0] == y_pred.shape[0]
    ), "Shapes of y_true and y_pred do not match!"

    N = len(y_true)
    num_classes = y_pred.shape[1]
    bins = np.arange(0, 1, 1 / num_bins)
    bin_indices = np.digitize(np.max(y_pred, axis=1), bins)
    sce = 0

    for bin in range(num_bins):
        bin_preds = y_pred[bin_indices == bin, :]
        bin_labels = y_true[bin_indices == bin]

        for k in range(num_classes):
            bin_class_preds = bin_preds[bin_labels == k, :]

            if bin_class_preds.shape[0] == 0:
                continue

            n_bk = bin_class_preds.shape[0]
            bin_class_acc = np.mean(
                (np.argmax(bin_class_preds, axis=1) == k).astype(float)
            )
            bin_class_conf = np.mean(np.max(bin_class_preds, axis=1))
            sce += n_bk / N * abs(bin_class_acc - bin_class_conf)

    sce /= num_classes

    return sce

def mce(y_true: np.array, y_pred: np.array, num_bins: int = 15) -> float:
    """
    Measure the Maximum Calibration Error based on SCE metric

    Parameters
    ----------
    y_true: np.array
        True labels for each input.
    y_pred: np.array
        Categorical probability distribution for each input.
    num_bins: int
        Number of bins. Default is 10.

    Returns
    -------
    float
        Static Calibration Error.
    """
    assert len(y_pred.shape) == 2, "y_pred must be a matrix!"
    assert (
        y_true.shape[0] == y_pred.shape[0]
    ), "Shapes of y_true and y_pred do not match!"

    N = len(y_true)
    num_classes = y_pred.shape[1]
    bins = np.arange(0, 1, 1 / num_bins)
    bin_indices = np.digitize(np.max(y_pred, axis=1), bins)
    mce = -1

    for bin in range(num_bins):
        bin_preds = y_pred[bin_indices == bin, :]
        bin_labels = y_true[bin_indices == bin]

        for k in range(num_classes):
            bin_class_preds = bin_preds[bin_labels == k, :]

            if bin_class_preds.shape[0] == 0:
                continue

            n_bk = bin_class_preds.shape[0]
            bin_class_acc = np.mean(
                (np.argmax(bin_class_preds, axis=1) == k).astype(float)
            )
            bin_class_conf = np.mean(np.max(bin_class_preds, axis=1))
            mce = max(mce, abs(bin_class_acc - bin_class_conf))

    return mce


def mce_binary(y_true: np.array, y_pred: np.array, num_bins: int = 100) -> float:
    """

    Calculate the Expected Calibration Error: for each bin, the absolute difference between
    the mean fraction of positives and the average predicted probability is taken. The ECE is
    the weighed mean of these differences.

    Parameters
    ----------
    y: np.ndarray
        The true labels.
    y_pred: np.ndarray
        The predicted probabilities
    num_bins: int
        The number of bins to use.
    Returns
    -------
    ece: float
        The expected calibration error.
    """
    n = len(y_pred)
    bins = np.arange(0.0, 1.0, 1.0 / num_bins)
    y_pred = np.max(y_pred, axis=-1)
    bins_per_prediction = np.digitize(y_pred, bins)

    df = pd.DataFrame({"y_pred": y_pred, "y": y_true, "pred_bins": bins_per_prediction})
    # print(df)
    grouped_by_bins = df.groupby("pred_bins")
    # calculate the mean y and predicted probabilities per bin
    binned = grouped_by_bins.mean()

    # calculate the number of items per bin
    binned_counts = grouped_by_bins["y"].count()

    # calculate the proportion of data per bin
    binned["weight"] = binned_counts / n

    weighed_diff = max(binned["y_pred"] - binned["y"])
    return weighed_diff
def ece(y_true: np.array, y_pred: np.array, n_bins: int = 100) -> float:
    """

    Calculate the Expected Calibration Error: for each bin, the absolute difference between
    the mean fraction of positives and the average predicted probability is taken. The ECE is
    the weighed mean of these differences.

    Parameters
    ----------
    y: np.ndarray
        The true labels.
    y_pred: np.ndarray
        The predicted probabilities
    n_bins: int
        The number of bins to use.
    Returns
    -------
    ece: float
        The expected calibration error.
    """
    n = len(y_pred)
    bins = np.arange(0.0, 1.0, 1.0 / n_bins)
    y_pred = np.max(y_pred, axis=-1)
    bins_per_prediction = np.digitize(y_pred, bins)

    df = pd.DataFrame({"y_pred": y_pred, "y": y_true, "pred_bins": bins_per_prediction})
    # print(df)
    grouped_by_bins = df.groupby("pred_bins")
    # calculate the mean y and predicted probabilities per bin
    binned = grouped_by_bins.mean()

    # calculate the number of items per bin
    binned_counts = grouped_by_bins["y"].count()

    # calculate the proportion of data per bin
    binned["weight"] = binned_counts / n

    weighed_diff = abs(binned["y_pred"] - binned["y"]) * binned["weight"]
    return weighed_diff.sum()

In [16]:
!ls

1_Quantization_AutoGPTQ.ipynb  3_Calibration_Error_Metrics.ipynb
2_LLMs_eval.ipynb	       lm-evaluation-harness


In [24]:
directory_path = Path('./lm-evaluation-harness/')
all_models=[]
for subdir in directory_path.iterdir():
    if subdir.is_dir():
        print(subdir)
        print_ = False
        if "iproskurina" in str(subdir):
            print_=True
        if str(subdir).startswith("bigscience"):
            print_=True
        if "bigscience" in str(subdir):
            print_=True
        if print_:
            all_models.append(subdir)
print(all_models)

lm-evaluation-harness/.git
lm-evaluation-harness/bigscience
lm-evaluation-harness/docs
lm-evaluation-harness/iproskurina
lm-evaluation-harness/lm_eval
lm-evaluation-harness/lm_eval.egg-info
lm-evaluation-harness/results
lm-evaluation-harness/scripts
lm-evaluation-harness/templates
lm-evaluation-harness/tests
[PosixPath('lm-evaluation-harness/bigscience'), PosixPath('lm-evaluation-harness/iproskurina')]


In [58]:
combined_names = [f"{dataset}_{metric}" for dataset in dataset_names for metric in metrics]
combined_names.append("model")
data_loaded_computed = {key: [] for key in combined_names}
all_results=dict()
# print(all_models) [PosixPath('lm-evaluation-harness/bigscience'), PosixPath('lm-evaluation-harness/iproskurina')]
for subdir in tqdm(all_models, desc='Processing models'):
    path_subdir = directory_path / subdir #lm-evaluation-harness/lm-evaluation-harness/bigscience
    data_loaded_computed['model'].append(str(subdir))
    all_results[str(subdir)]={} 
    all_dict_ace={}
    
    for file in path_subdir.iterdir():
        print(path_subdir.iterdir())

Processing models:   0%|          | 0/2 [00:00<?, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: 'lm-evaluation-harness/lm-evaluation-harness/bigscience'

In [51]:
# dataset_names = ['hellaswag', 'piqa', 'arc', 'openbookqa', 'truthfulqa', 'xstory']
dataset_names = ['hellaswag']
metrics = ['conf', 'conf_true', 'c_pos', 'c_neg', 'ace', 'mce', 'entropy']
performance_metric_name = {
    "boolq": ['acc'],
    "truthfulqa": ['mc1', 'mc2'],
    "xstory": ['acc']
}
combined_names = [f"{dataset}_{metric}" for dataset in dataset_names for metric in metrics]
combined_names.append("model")
data_loaded_computed = {key: [] for key in combined_names}
all_results=dict()
for subdir in tqdm(all_models, desc='Processing models'):
    if 'opt' in str(subdir) or '7b' not in str(subdir):
        continue
    path_subdir = directory_path / subdir
    data_loaded_computed['model'].append(str(subdir))
    p_filename = f"{subdir}.json" # wrong
    performance_file = directory_path / p_filename
    all_results[str(subdir)]={} 
    all_dict_ace={}
    if performance_file.exists():
        with open(performance_file, 'r') as file:
            d_perf = json.load(file)

    for _file in path_subdir.iterdir():
        if _file.is_file() and 'write' in str(_file):
            key_n = str(_file).split("/")[-1].split()[-1]
            dataset_name = key_n.split("_")[0]

            with open(_file, 'r') as file:
                qa_data = json.load(file)

            entropies_, conf_, conf_pos, conf_neg, conf_true, true_, probs, pred_ = [], [], [], [], [], [], [], []
            shape_p = len([key for key in qa_data[0] if key.startswith('logit_')])

            for data_i in qa_data:
                true_label = data_i['truth']
                logits = [data_i[key] for key in data_i if key.startswith('logit_')]
                probabilities = np.exp(logits - np.max(logits)) / np.sum(np.exp(logits - np.max(logits)))
                entropy = -np.sum(probabilities * np.log2(probabilities))
                if probabilities.shape[0] == shape_p:
                    entropies_.append(entropy)
                    try:
                        truth_ = 0 if "yes" in true_label else 1
                    except:
                        truth_=int(true_label)
                    pred_i = probabilities[truth_]
                    conf_true.append(pred_i)
                    true_.append(truth_)
                    probs_i = probabilities.tolist()
                    probs.append(probs_i)
                    max_ = np.argmax(probabilities)
                    pred_.append(max_)
                    conf_.append(probabilities[max_])
                    (conf_pos if max_ == truth_ else conf_neg).append(probabilities[max_])
            y_true=np.array(true_)
            y_pred=np.array(probs)
            metrics_data = {
                'c_pos': np.mean(conf_pos),
                'c_neg': np.mean(conf_neg),
                'conf': np.mean(conf_),
                'conf_true': np.mean(conf_true),
                'ace': ace(y_true=y_true, y_pred=y_pred) if len(y_pred[0]) > 2 else ece(y_true=y_true, y_pred=y_pred),
                'mce': mce(y_true=y_true, y_pred=y_pred, num_bins=100) if len(y_pred[0]) > 2 else mce_binary(y_true=y_true, y_pred=y_pred, num_bins=100),
                'entropy': np.mean(entropies_)
            }

            for key, value in metrics_data.items():
                metrics_data[key] = round(value, 4)
                data_loaded_computed[f"{dataset_name}_{key}"].append(metrics_data[key])

            all_dict_ace[dataset_name] = metrics_data['ace'] * 100
            all_results[str(subdir)]=all_dict_ace
all_results

Processing models: 100%|██████████| 2/2 [00:00<00:00, 4707.41it/s]


{}

# Testing $H_0$ hypothesis

In [23]:
with open('./bigscience-bloom-7b1/hellaswag_write_out_info.json', 'r') as file:
    qa_data = json.load(file)
with open('./iproskurina-bloom-7b1-gptq-4bit/hellaswag_write_out_info.json', 'r') as file:
    qa_data_8bit = json.load(file)

FileNotFoundError: [Errno 2] No such file or directory: './bigscience-bloom-7b1/hellaswag_write_out_info.json'

In [72]:
pred_full=[]
pred_quantized=[]
for data in qa_data:
    true_label=data['truth']
    logit_keys = [key for key in data if key.startswith('logit_')]
    logits = [data[key] for key in logit_keys]
    probabilities = np.exp(logits - np.max(logits)) / np.sum(np.exp(logits - np.max(logits)))
    # entropy = -np.sum(probabilities * np.log2(probabilities))
    # entropies_.append(entropy)
    try:
        truth_ = 0 if "yes" in true_label else 1
    except:
        truth_=int(true_label)
    pred_i = probabilities[truth_]
    pred_full.append(pred_i)
for data in qa_data_8bit:
    true_label=data['truth']
    logit_keys = [key for key in data if key.startswith('logit_')]
    logits = [data[key] for key in logit_keys]
    probabilities = np.exp(logits - np.max(logits)) / np.sum(np.exp(logits - np.max(logits)))
    # entropy = -np.sum(probabilities * np.log2(probabilities))
    # entropies_.append(entropy)
    try:
        truth_ = 0 if "yes" in true_label else 1
    except:
        truth_=int(true_label)
    pred_i = probabilities[truth_]
    pred_quantized.append(pred_i)

In [73]:
# to compute stat.significance between 2 predictions, we use the t-test
t_stat, p_value = stats.ttest_rel(pred_full, pred_quantized)
alpha = 0.01
if p_value < alpha:
    print("There is a significant difference between the arrays.")
else:
    print("There is no significant difference between the arrays.")
print(p_value)

There is a significant difference between the arrays.
4.1747311644578645e-28
