## Uncertainty estimation

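This notebook inspects repeated experiments (10 runs with varied random seeds and 5 runs with varied data-splitting seeds) to quantify the robustness of our experiments and methods. Table entries are reported as `mean ± std` over the random-seed runs, followed by `mean ± std` over the data-splitting runs, separated by `|`.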

In [1]:
import os
import sys

sys.path.insert(0, os.path.abspath('..'))
from pathlib import Path
from typing import Dict

import numpy as np
import pandas as pd
import torch
from progiter import ProgIter
import torch.nn.functional as F
from quapy.error import nkld
from numpy.linalg import LinAlgError
import quapy as qp
from quapy.method.aggregative import CC, ACC, PCC, PACC, EMQ, KDEyCS, KDEyHD, KDEyML, DMy
import IPython
import plotly.express as px
from src.prev.calibration import CalibrationMethod, calc_calibration_metrics, calibrate_logits_fast
from src.prev.data_loading import get_values, Kind, Split, all_tasks, binary_tasks
from src.prev.scaling import scale_prevalences_ir
from src.prev.quantification import adjust_priors_qp, absolute_error, compute_w_hat_and_mu_hat, IdentityClassifier
from src.prev.thresholding import ThresholdingMethod, find_best_thresholds
from src.prev.metrics import Metric, compute_all_metrics, compute_metric
from src.prev.plotting import multiplot

# data and results are looked up in the parent folder of the current working directory
current_path = os.getcwd()
_project_root = Path(current_path).parent
DATA_PATH = _project_root / 'data'
RESULT_PATH = _project_root / 'results'
# stop right away if one of the two folders is missing
assert DATA_PATH.exists() and RESULT_PATH.exists()
# fix the global torch random state
torch.manual_seed(seed=0)
# we only inspect a subset of imbalance ratios for performance reasons
IRS = [1.0, 5.0, 10.]

In [2]:
# load training/prediction data for 10 runs with varied random seed ...
_seed_runs = ['mic23_predictions_original_0', 'mic23_predictions_reproduce_0', 'mic23_predictions_reproduce_1',
              'mic23_predictions_reproduce_2', 'mic23_predictions_reproduce_10', 'mic23_predictions_reproduce_11',
              'mic23_predictions_reproduce_12', 'mic23_predictions_reproduce_13', 'mic23_predictions_reproduce_14',
              'mic23_predictions_reproduce_15']
# ... and 5 runs with varied data splitting seeds
_split_runs = ['mic23_predictions_datasplit_seed_3', 'mic23_predictions_datasplit_seed_31',
               'mic23_predictions_datasplit_seed_314', 'mic23_predictions_datasplit_seed_3141',
               'mic23_predictions_datasplit_seed_31415']
# data[repeat][task] holds the values returned by get_values for that task and prediction folder;
# repeat is the position of the prediction folder in the concatenated list (seed runs first)
data = {}
for repeat, proj in enumerate(_seed_runs + _split_runs):
    task_values = {}
    data[repeat] = task_values
    for t in ProgIter(all_tasks, desc='Loading data'):
        task_values[t] = get_values(t, DATA_PATH, proj=proj)

Loading data 100.00% 30/30... rate=4.95 Hz, eta=0:00:00, total=0:00:06
Loading data 100.00% 30/30... rate=4.67 Hz, eta=0:00:00, total=0:00:06
Loading data 100.00% 30/30... rate=4.76 Hz, eta=0:00:00, total=0:00:06
Loading data 100.00% 30/30... rate=4.77 Hz, eta=0:00:00, total=0:00:06
Loading data 100.00% 30/30... rate=4.69 Hz, eta=0:00:00, total=0:00:06
Loading data 100.00% 30/30... rate=4.75 Hz, eta=0:00:00, total=0:00:06
Loading data 100.00% 30/30... rate=4.58 Hz, eta=0:00:00, total=0:00:06
Loading data 100.00% 30/30... rate=4.68 Hz, eta=0:00:00, total=0:00:06
Loading data 100.00% 30/30... rate=4.63 Hz, eta=0:00:00, total=0:00:06
Loading data 100.00% 30/30... rate=4.62 Hz, eta=0:00:00, total=0:00:06
Loading data 100.00% 30/30... rate=4.54 Hz, eta=0:00:00, total=0:00:06
Loading data 100.00% 30/30... rate=4.47 Hz, eta=0:00:00, total=0:00:06
Loading data 100.00% 30/30... rate=4.44 Hz, eta=0:00:00, total=0:00:06
Loading data 100.00% 30/30... rate=4.31 Hz, eta=0:00:00, total=0:00:06
Loadin

## Uncertainty with respect to quantification


In [80]:
# quapy quantifiers to evaluate, keyed by the column name used in the result frame
# (note that the "HDy" column is computed with quapy's DMy class)
_quantifiers = {"CC": CC, "ACC": ACC, "PCC": PCC, "PACC": PACC, "EMQ": EMQ, "HDy": DMy,
                'KDEyCS': KDEyCS, 'KDEyHD': KDEyHD, 'KDEyML': KDEyML}
uncertainty_quantification_results = []
for repeat in data:
    for t in ProgIter(all_tasks):
        task_data = data[repeat][t]
        for ir in IRS:
            # re-sample APP_TEST according to IR (the development splits are left untouched)
            try:
                app_test_logits, app_test_classes = scale_prevalences_ir(logits=task_data[Kind.LOGITS][Split.APP_TEST],
                                                                         classes=task_data[Kind.LABELS][Split.APP_TEST],
                                                                         ir=ir)
            except Exception:
                # report which task / IR failed before propagating the error
                print(f'{t=}, {ir=}')
                raise
            mod_data = {Kind.LOGITS: {Split.DEV_CAL: task_data[Kind.LOGITS][Split.DEV_CAL],
                                      Split.DEV_TEST: task_data[Kind.LOGITS][Split.DEV_TEST],
                                      Split.APP_TEST: app_test_logits},
                        Kind.LABELS: {Split.DEV_CAL: task_data[Kind.LABELS][Split.DEV_CAL],
                                      Split.DEV_TEST: task_data[Kind.LABELS][Split.DEV_TEST],
                                      Split.APP_TEST: app_test_classes}}
            # estimate prevalence using BBSE (based on the argmax predictions on DEV_TEST and APP_TEST)
            try:
                _, bbse_prior = compute_w_hat_and_mu_hat(mod_data[Kind.LABELS][Split.DEV_TEST],
                                                         torch.argmax(mod_data[Kind.LOGITS][Split.DEV_TEST], dim=1),
                                                         torch.argmax(mod_data[Kind.LOGITS][Split.APP_TEST], dim=1))
            except LinAlgError:
                # BBSE failed with a linear algebra error; record a missing estimate for this setting
                bbse_prior = None
            # true class prevalences and size of the re-sampled APP_TEST
            prior = (torch.bincount(app_test_classes) / len(app_test_classes)).numpy()
            d_size = len(app_test_classes)
            _info = {'ir': ir, 'task': t, 'repeat': repeat, "BBSE": bbse_prior, "prior": prior, "d_size": d_size}
            # convert data to qp format (the softmax outputs serve as instances)
            dev_data = qp.data.LabelledCollection(torch.softmax(mod_data[Kind.LOGITS][Split.DEV_TEST], dim=1),
                                                  mod_data[Kind.LABELS][Split.DEV_TEST])
            app_data = qp.data.LabelledCollection(torch.softmax(mod_data[Kind.LOGITS][Split.APP_TEST], dim=1),
                                                  mod_data[Kind.LABELS][Split.APP_TEST])
            dset = qp.data.base.Dataset(training=dev_data, test=app_data)
            # compute estimated prevalences with methods from qp
            for method_name, method in _quantifiers.items():
                identity_class = IdentityClassifier(len(prior))
                model = method(identity_class)
                try:  # data[10][all_tasks[20]] is corrupted
                    model.fit(dset.training)
                    _info[method_name] = model.quantify(dset.test.instances)
                except ValueError:
                    # keep the row but mark the estimate of this method as missing
                    _info[method_name] = None
            uncertainty_quantification_results.append(_info)
# one row per (repeat, task, ir); persisted so that the evaluation cell can be run without recomputation
quantification_df = pd.DataFrame(uncertainty_quantification_results)
quantification_df.to_pickle(RESULT_PATH / '24_uncertainty_quantification.pkl')

 100.00% 30/30... rate=0.07 Hz, eta=0:00:00, total=0:06:52
 100.00% 30/30... rate=0.07 Hz, eta=0:00:00, total=0:06:47
 100.00% 30/30... rate=0.07 Hz, eta=0:00:00, total=0:06:47
 100.00% 30/30... rate=0.07 Hz, eta=0:00:00, total=0:06:58
 100.00% 30/30... rate=0.07 Hz, eta=0:00:00, total=0:06:53
 100.00% 30/30... rate=0.07 Hz, eta=0:00:00, total=0:07:04
 100.00% 30/30... rate=0.07 Hz, eta=0:00:00, total=0:07:08
 100.00% 30/30... rate=0.07 Hz, eta=0:00:00, total=0:07:03
 100.00% 30/30... rate=0.07 Hz, eta=0:00:00, total=0:07:03
 100.00% 30/30... rate=0.07 Hz, eta=0:00:00, total=0:07:03


  distributions = counts/counts.sum(axis=1)[:,np.newaxis]
  distributions = counts/counts.sum(axis=1)[:,np.newaxis]
  distributions = counts/counts.sum(axis=1)[:,np.newaxis]


 100.00% 30/30... rate=0.07 Hz, eta=0:00:00, total=0:06:47
 100.00% 30/30... rate=0.07 Hz, eta=0:00:00, total=0:06:49


  distributions = counts/counts.sum(axis=1)[:,np.newaxis]
  distributions = counts/counts.sum(axis=1)[:,np.newaxis]
  distributions = counts/counts.sum(axis=1)[:,np.newaxis]


 100.00% 30/30... rate=0.07 Hz, eta=0:00:00, total=0:07:05
 100.00% 30/30... rate=0.07 Hz, eta=0:00:00, total=0:07:02
 100.00% 30/30... rate=0.07 Hz, eta=0:00:00, total=0:06:58


In [19]:
# missing estimates (None) are turned into NaN
quantification_df = pd.read_pickle(RESULT_PATH / '24_uncertainty_quantification.pkl').fillna(value=np.nan)
_methods = ['BBSE', "CC", "ACC", "PCC", "PACC", "EMQ", "HDy", 'KDEyCS', 'KDEyHD', 'KDEyML']
_metrics = metrics = {"Absolute error": absolute_error, "Normalized KLD": nkld}


def _apply_metric(row, met, method):
    """Scores the estimate of ``method`` in ``row`` against the true prevalences stored in the 'prior' column."""
    if met == "Normalized KLD":
        # the smoothing factor of the normalized KLD is derived from the sample size
        return _metrics[met](row['prior'], row[method], eps=1 / row['d_size'])
    return _metrics[met](row['prior'], row[method])


table_entries = []
for ir in IRS:
    for method in _methods:
        # per-repeat averages, collected separately for the two sources of variation
        train_uncertainty = {met: [] for met in _metrics}
        data_uncertainty = {met: [] for met in _metrics}
        for repeat in data:
            sub_df = quantification_df[(quantification_df.ir == ir) & (quantification_df.repeat == repeat)]
            # the first 10 repeats vary the random seed, the remaining ones the data splitting seed
            vals = data_uncertainty if repeat >= 10 else train_uncertainty
            for met in _metrics:
                # apply metric to all tasks (of this IR and repeat) and average over tasks
                metric_series = sub_df.apply(lambda row: _apply_metric(row, met, method), axis=1)
                vals[met].append(metric_series.mean())
        _info = {'ir': ir, 'method': method}
        for met in _metrics:
            # mean ± std over all repeats: random-seed runs first, data-split runs second
            _info[met] = ' | '.join(f'{np.mean(v):.3f} ± {np.std(v):.3f}'
                                    for v in (train_uncertainty[met], data_uncertainty[met]))
        table_entries.append(_info)
quantification_table_df = pd.DataFrame(table_entries)
print('Uncertainty with respect to quantification: Absolute error')
quantification_table_df.pivot(index='method', columns='ir', values='Absolute error')

Uncertainty with respect to quantification: Absolute error


ir,1.0,5.0,10.0
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ACC,0.085 ± 0.026 | 0.073 ± 0.021,0.104 ± 0.039 | 0.133 ± 0.023,0.116 ± 0.043 | 0.150 ± 0.031
BBSE,0.127 ± 0.134 | 0.084 ± 0.031,0.087 ± 0.053 | 0.104 ± 0.074,0.101 ± 0.079 | 0.099 ± 0.062
CC,0.238 ± 0.072 | 0.306 ± 0.028,0.190 ± 0.066 | 0.259 ± 0.040,0.220 ± 0.062 | 0.288 ± 0.044
EMQ,0.363 ± 0.084 | 0.397 ± 0.019,0.234 ± 0.076 | 0.288 ± 0.039,0.193 ± 0.076 | 0.256 ± 0.042
HDy,0.071 ± 0.021 | 0.082 ± 0.019,0.102 ± 0.031 | 0.115 ± 0.013,0.118 ± 0.043 | 0.132 ± 0.025
KDEyCS,0.065 ± 0.017 | 0.060 ± 0.021,0.097 ± 0.028 | 0.105 ± 0.015,0.111 ± 0.040 | 0.128 ± 0.020
KDEyHD,0.060 ± 0.015 | 0.054 ± 0.015,0.094 ± 0.030 | 0.102 ± 0.010,0.106 ± 0.040 | 0.121 ± 0.019
KDEyML,0.152 ± 0.078 | 0.200 ± 0.046,0.168 ± 0.091 | 0.201 ± 0.058,0.171 ± 0.091 | 0.199 ± 0.070
PACC,0.086 ± 0.022 | 0.087 ± 0.027,0.102 ± 0.032 | 0.109 ± 0.008,0.109 ± 0.040 | 0.135 ± 0.015
PCC,0.139 ± 0.022 | 0.162 ± 0.018,0.210 ± 0.029 | 0.241 ± 0.027,0.277 ± 0.035 | 0.313 ± 0.030


In [4]:
# same summary as above for the second metric: one row per method, one column per imbalance ratio
print('Uncertainty with respect to quantification: Normalized KLD')
quantification_table_df.pivot(index='method', columns='ir', values='Normalized KLD')

Uncertainty with respect to quantification: Normalized KLD


ir,1.0,5.0,10.0
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ACC,0.020 ± 0.018 | 0.013 ± 0.012,0.022 ± 0.020 | 0.033 ± 0.016,0.030 ± 0.016 | 0.042 ± 0.016
BBSE,0.019 ± 0.018 | 0.014 ± 0.013,0.013 ± 0.011 | 0.021 ± 0.016,0.020 ± 0.013 | 0.020 ± 0.014
CC,0.066 ± 0.041 | 0.113 ± 0.021,0.059 ± 0.040 | 0.102 ± 0.021,0.063 ± 0.037 | 0.106 ± 0.022
EMQ,0.153 ± 0.063 | 0.191 ± 0.025,0.126 ± 0.055 | 0.160 ± 0.025,0.103 ± 0.047 | 0.136 ± 0.026
HDy,0.012 ± 0.010 | 0.026 ± 0.016,0.019 ± 0.011 | 0.025 ± 0.005,0.029 ± 0.017 | 0.032 ± 0.007
KDEyCS,0.009 ± 0.010 | 0.008 ± 0.007,0.015 ± 0.006 | 0.017 ± 0.005,0.025 ± 0.013 | 0.031 ± 0.006
KDEyHD,0.008 ± 0.010 | 0.005 ± 0.004,0.013 ± 0.007 | 0.020 ± 0.008,0.020 ± 0.012 | 0.031 ± 0.004
KDEyML,0.065 ± 0.049 | 0.106 ± 0.035,0.070 ± 0.054 | 0.105 ± 0.034,0.075 ± 0.053 | 0.101 ± 0.039
PACC,0.026 ± 0.015 | 0.016 ± 0.013,0.025 ± 0.013 | 0.023 ± 0.008,0.031 ± 0.018 | 0.041 ± 0.007
PCC,0.013 ± 0.005 | 0.027 ± 0.009,0.026 ± 0.009 | 0.042 ± 0.011,0.047 ± 0.013 | 0.064 ± 0.012


## Uncertainty with respect to re-calibration

In [5]:
def get_estimated_prevalences(task_data: Dict[Kind, Dict[Split, torch.Tensor]]) -> torch.Tensor:
    """
    Estimates the class prevalences of the application test set with ``adjust_priors_qp``.

    :param task_data: nested dictionary ``task_data[Kind][Split]`` holding logits and labels of one task
    :return: the result of ``adjust_priors_qp``
        (NOTE(review): other cells wrap this result in ``torch.Tensor(...)``, so it may not be a tensor - confirm)
    """
    logits, labels = task_data[Kind.LOGITS], task_data[Kind.LABELS]
    # adjust_priors_qp receives class probabilities (softmax over dim 1) and the labels of both splits
    dev_test_probs = torch.softmax(logits[Split.DEV_TEST], dim=1)
    app_test_probs = torch.softmax(logits[Split.APP_TEST], dim=1)
    return adjust_priors_qp(dev_test_probs, labels[Split.DEV_TEST], app_test_probs, labels[Split.APP_TEST])

In [4]:
# no repeated experiments for adapted train weights (due to high computational costs)
_skipped_calibrations = [CalibrationMethod.ADAPTED_TRAIN_WEIGHTS,
                         CalibrationMethod.ADAPTED_TRAIN_WEIGHTS_AND_AFFINE_SCALING_REWEIGHTED,
                         CalibrationMethod.ADAPTED_TRAIN_WEIGHTS_AND_TEMPERATURE_SCALING_REWEIGHTED,
                         CalibrationMethod.ADAPTED_TRAIN_WEIGHTS_ACC,
                         CalibrationMethod.ADAPTED_TRAIN_WEIGHTS_AND_AFFINE_SCALING_REWEIGHETD_ACC,
                         CalibrationMethod.ADAPTED_TRAIN_WEIGHTS_AND_TEMPERATURE_SCALING_REWEIGHTED_ACC]
calibration_ir_results = []
# the progress bar still counts the skipped methods
for cal_method in ProgIter(list(CalibrationMethod)):
    if cal_method in _skipped_calibrations:
        continue
    for ir in IRS:
        for repeat in data:
            # do the calibration on all tasks
            for t in all_tasks:
                # STEP 1: re-sample APP_TEST according to IR (the development splits are left untouched)
                task_data = data[repeat][t]
                app_test_logits, app_test_classes = scale_prevalences_ir(logits=task_data[Kind.LOGITS][Split.APP_TEST],
                                                                         classes=task_data[Kind.LABELS][Split.APP_TEST],
                                                                         ir=ir)
                mod_data = {Kind.LOGITS: {Split.DEV_CAL: task_data[Kind.LOGITS][Split.DEV_CAL],
                                          Split.DEV_TEST: task_data[Kind.LOGITS][Split.DEV_TEST],
                                          Split.APP_TEST: app_test_logits},
                            Kind.LABELS: {Split.DEV_CAL: task_data[Kind.LABELS][Split.DEV_CAL],
                                          Split.DEV_TEST: task_data[Kind.LABELS][Split.DEV_TEST],
                                          Split.APP_TEST: app_test_classes}}
                # STEP 2: determine prior knowledge
                prior = None  # by default, we know nothing
                if cal_method in [CalibrationMethod.AFFINE_REWEIGHTED,
                                  CalibrationMethod.TEMPERATURE_SCALING_REWEIGHTED]:
                    # use the exact class counts of the re-sampled APP_TEST
                    prior = torch.bincount(app_test_classes)
                    # scaling for convergence stability
                    prior = prior / prior.sum()
                elif cal_method in [CalibrationMethod.AFFINE_ACC]:
                    # use prevalences estimated from the predictions instead of the exact ones
                    prior = get_estimated_prevalences(task_data=mod_data)
                # STEP 3: re-calibrate
                calibrated_logits = calibrate_logits_fast(data=mod_data, calibration=cal_method, prior=prior)
                # STEP 4: calculate calibration metrics
                # suppress plotting from the metrics reloaded
                with IPython.utils.io.capture_output():
                    dev_metrics = calc_calibration_metrics(logits=calibrated_logits[Split.DEV_TEST],
                                                           labels=task_data[Kind.LABELS][Split.DEV_TEST])
                    app_metrics = calc_calibration_metrics(logits=calibrated_logits[Split.APP_TEST],
                                                           labels=app_test_classes)
                # going from dev to test
                diff_metrics = {m: app_metrics[m] - dev_metrics[m] for m in dev_metrics}
                _info = {'calibration': cal_method.name, 'ir': ir, 'task': t, 'repeat': repeat}
                _info.update({f'dev_{m}': v for m, v in dev_metrics.items()})
                _info.update({f'app_{m}': v for m, v in app_metrics.items()})
                _info.update({f'diff_{m}': v for m, v in diff_metrics.items()})
                calibration_ir_results.append(_info)
# one row per (calibration, ir, repeat, task); persisted so that the evaluation cell can be run separately
calibration_df = pd.DataFrame(calibration_ir_results)
calibration_df.to_csv(RESULT_PATH / '24_uncertainty_calibration.csv')

 100.00% 12/12... rate=0.06 Hz, eta=0:00:00, total=0:03:36


In [6]:
calibration_df = pd.read_csv(RESULT_PATH / '24_uncertainty_calibration.csv')
# calibration metrics on the application test set that are summarized below
_metrics = ['app_cwce', 'app_bs']
table_entries = []
for ir in IRS:
    for method in calibration_df.calibration.unique():
        sub_df = calibration_df[(calibration_df.calibration == method) & (calibration_df.ir == ir)]
        # per-repeat task averages, collected separately for the two sources of variation
        train_uncertainty = {met: [] for met in _metrics}
        data_uncertainty = {met: [] for met in _metrics}
        for repeat in sub_df['repeat'].unique():
            repeat_df = sub_df[sub_df['repeat'] == repeat]
            # the first 10 repeats vary the random seed, the remaining ones the data splitting seed
            vals = data_uncertainty if repeat >= 10 else train_uncertainty
            for met in _metrics:
                vals[met].append(repeat_df[met].mean())
        _info = {'ir': ir, 'method': method}
        for met in _metrics:
            # mean ± std over repeats: random-seed runs first, data-split runs second
            _info[met] = ' | '.join(f'{np.mean(v):.3f} ± {np.std(v):.3f}'
                                    for v in (train_uncertainty[met], data_uncertainty[met]))
        table_entries.append(_info)
df = pd.DataFrame(table_entries)
# the result file stores enum names; replace them by the enum values for display
enum_map = {elem.name: elem.value for elem in CalibrationMethod}
print('Uncertainty with respect to re-calibration: CWCE')
df.replace(enum_map).pivot(index='method', columns='ir', values='app_cwce')

Uncertainty with respect to re-calibration: CWCE


ir,1.0,5.0,10.0
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Affine,0.078 ± 0.004 | 0.077 ± 0.003,0.077 ± 0.002 | 0.076 ± 0.002,0.108 ± 0.004 | 0.108 ± 0.001
Affine (dep. prev.),0.026 ± 0.002 | 0.024 ± 0.001,0.018 ± 0.001 | 0.016 ± 0.002,0.013 ± 0.001 | 0.012 ± 0.001
Affine (est. prev.),0.041 ± 0.009 | 0.040 ± 0.008,0.041 ± 0.011 | 0.047 ± 0.010,0.041 ± 0.012 | 0.049 ± 0.011
No re-calibration,0.071 ± 0.005 | 0.073 ± 0.007,0.094 ± 0.008 | 0.098 ± 0.007,0.122 ± 0.009 | 0.127 ± 0.008
TempScal,0.058 ± 0.005 | 0.059 ± 0.004,0.099 ± 0.006 | 0.098 ± 0.007,0.131 ± 0.007 | 0.131 ± 0.007
TempScal (dep. prev.),0.047 ± 0.003 | 0.045 ± 0.005,0.088 ± 0.007 | 0.089 ± 0.008,0.111 ± 0.008 | 0.113 ± 0.009


In [7]:
# same summary for the second calibration metric (relies on df and enum_map of the re-calibration table cell)
print('Uncertainty with respect to re-calibration: Brier Score')
df.replace(enum_map).pivot(index='method', columns='ir', values='app_bs')

Uncertainty with respect to re-calibration: Brier Score


ir,1.0,5.0,10.0
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Affine,0.124 ± 0.008 | 0.127 ± 0.003,0.086 ± 0.006 | 0.088 ± 0.002,0.078 ± 0.005 | 0.079 ± 0.002
Affine (dep. prev.),0.112 ± 0.007 | 0.115 ± 0.002,0.072 ± 0.005 | 0.073 ± 0.002,0.049 ± 0.004 | 0.050 ± 0.002
Affine (est. prev.),0.117 ± 0.009 | 0.119 ± 0.005,0.078 ± 0.009 | 0.086 ± 0.010,0.057 ± 0.008 | 0.066 ± 0.012
No re-calibration,0.121 ± 0.008 | 0.126 ± 0.003,0.095 ± 0.007 | 0.100 ± 0.005,0.089 ± 0.008 | 0.094 ± 0.006
TempScal,0.118 ± 0.008 | 0.122 ± 0.002,0.094 ± 0.006 | 0.096 ± 0.004,0.088 ± 0.006 | 0.090 ± 0.005
TempScal (dep. prev.),0.116 ± 0.007 | 0.119 ± 0.002,0.092 ± 0.006 | 0.095 ± 0.004,0.085 ± 0.006 | 0.087 ± 0.005


## Uncertainty with respect to decision rules

In [8]:
def thresholds_accross_ir(task_data, calibration: CalibrationMethod = CalibrationMethod.NONE,
                          thresholding: ThresholdingMethod = ThresholdingMethod.ARGMAX):
    """
    Computes the values of metrics for optimal thresholds computed for the application test set and
    thresholds set on the development test set for a given task across imbalance ratios.

    For every imbalance ratio in ``IRS`` the application test set is re-sampled, the logits are re-calibrated
    and two sets of predictions are scored on the re-sampled application test set: predictions obtained with
    thresholds optimized on that very set, and predictions obtained with the thresholds selected by
    ``thresholding`` (a fixed 0.5 or thresholds optimized on the development test set).

    :param task_data: nested dictionary ``task_data[Kind][Split]`` holding logits and labels of one task
    :param calibration: calibration method (only NONE, AFFINE_REWEIGHTED and AFFINE_ACC are supported)
    :param thresholding: thresholding method to be used (ARGMAX or DEV_TEST)
    :return results: dictionary of lists with one entry per imbalance ratio (in the order of ``IRS``);
        the ``Metric`` keys hold the values for the thresholds optimized on the application test set, the
        ``'reference ' + metric.value`` keys hold the values for the thresholds selected by ``thresholding``
    :raises ValueError: if the calibration or thresholding method is not supported
    """
    # reject unsupported settings before any expensive computation is started
    if calibration not in (CalibrationMethod.AFFINE_REWEIGHTED, CalibrationMethod.AFFINE_ACC,
                           CalibrationMethod.NONE):
        raise ValueError('calibration method not supported')
    if thresholding not in (ThresholdingMethod.ARGMAX, ThresholdingMethod.DEV_TEST):
        raise ValueError('invalid thresholding method')
    # initialize the results dictionary
    results = {m: [] for m in Metric}
    results.update({"reference " + m.value: [] for m in Metric})
    # extract the minority class for F1 computation; DEV_CAL is not touched by the re-sampling below,
    # so this is done once instead of once per imbalance ratio
    val_prevalences = torch.bincount(task_data[Kind.LABELS][Split.DEV_CAL])
    min_class = torch.argmin(val_prevalences).item()
    max_class = torch.argmax(val_prevalences).item()
    # catches case where for balanced task, min class was used as max class in scaling
    if min_class == max_class:
        min_class = 1
    for ir in IRS:
        # scale prevalences in the deployment set according to imbalance ratio
        app_test_logits, app_test_classes = scale_prevalences_ir(logits=task_data[Kind.LOGITS][Split.APP_TEST],
                                                                 classes=task_data[Kind.LABELS][Split.APP_TEST],
                                                                 ir=ir)
        # create a data dictionary with the modified deployment test set
        mod_data = {Kind.LOGITS: {Split.DEV_CAL: task_data[Kind.LOGITS][Split.DEV_CAL],
                                  Split.DEV_TEST: task_data[Kind.LOGITS][Split.DEV_TEST],
                                  Split.APP_TEST: app_test_logits},
                    Kind.LABELS: {Split.DEV_CAL: task_data[Kind.LABELS][Split.DEV_CAL],
                                  Split.DEV_TEST: task_data[Kind.LABELS][Split.DEV_TEST],
                                  Split.APP_TEST: app_test_classes}}
        # exact class prevalences of the app test set with modified prevalence
        exact_prevalence = torch.bincount(app_test_classes) / len(app_test_classes)
        # estimated version of these prevalences
        estimated_prevalence = torch.Tensor(
            adjust_priors_qp(torch.softmax(mod_data[Kind.LOGITS][Split.DEV_TEST], dim=1),
                             mod_data[Kind.LABELS][Split.DEV_TEST],
                             torch.softmax(mod_data[Kind.LOGITS][Split.APP_TEST], dim=1),
                             mod_data[Kind.LABELS][Split.APP_TEST]))
        if calibration == CalibrationMethod.AFFINE_REWEIGHTED:
            # define exact prior to use if calibrating using real priors
            prior = exact_prevalence
        elif calibration == CalibrationMethod.AFFINE_ACC:
            prior = estimated_prevalence
        else:
            # CalibrationMethod.NONE (everything else was rejected above)
            prior = None
        calibrated_logits = calibrate_logits_fast(data=mod_data, calibration=calibration, prior=prior)
        if thresholding == ThresholdingMethod.ARGMAX:
            # use 0.5 as threshold (argmax)
            thresholds = {m: 0.5 for m in Metric}
        else:
            # ThresholdingMethod.DEV_TEST: find optimal thresholds on dev test
            thresholds = find_best_thresholds(labels=mod_data[Kind.LABELS][Split.DEV_TEST],
                                              logits=calibrated_logits[Split.DEV_TEST], min_class=min_class,
                                              priors=exact_prevalence.numpy(),
                                              est_priors=estimated_prevalence.numpy())
        # find optimal thresholds on app test
        optimal_thresholds = find_best_thresholds(labels=mod_data[Kind.LABELS][Split.APP_TEST],
                                                  logits=calibrated_logits[Split.APP_TEST], min_class=min_class,
                                                  priors=exact_prevalence.numpy(),
                                                  est_priors=estimated_prevalence.numpy())
        # compute predictions on app test using the two sets of thresholds: a sample is flagged (True)
        # when its calibrated probability of class 0 is below the threshold of the respective metric
        class0_probs = F.softmax(calibrated_logits[Split.APP_TEST], dim=1)[:, 0]
        new_app_test_preds = {key: class0_probs < threshold for key, threshold in thresholds.items()}
        optimal_app_test_preds = {key: class0_probs < threshold for key, threshold in optimal_thresholds.items()}
        # NOTE(review): both calls below pass exact_prevalence as estimated_priors although an estimated
        # prevalence is available - confirm that this is intended before relying on the EC (est. prev.) values
        # compute metrics values for predictions made using optimal app test thresholds
        optimal_metrics = compute_all_metrics(mod_data[Kind.LABELS][Split.APP_TEST],
                                              mod_data[Kind.LOGITS][Split.APP_TEST], optimal_app_test_preds,
                                              min_class=min_class, exact_priors=exact_prevalence,
                                              estimated_priors=exact_prevalence)
        # compute metrics values for predictions made using the other thresholds
        dev_threshold_metrics = compute_all_metrics(mod_data[Kind.LABELS][Split.APP_TEST],
                                                    mod_data[Kind.LOGITS][Split.APP_TEST], new_app_test_preds,
                                                    min_class=min_class, exact_priors=exact_prevalence,
                                                    estimated_priors=exact_prevalence)
        # append the computed metrics values to the results
        for k, value in optimal_metrics.items():
            results[k].append(value)
        for k, value in dev_threshold_metrics.items():
            results['reference ' + k.value].append(value)
    return results

In [9]:
decision_results_list = []
_decision_calibrations = [CalibrationMethod.NONE, CalibrationMethod.AFFINE_REWEIGHTED, CalibrationMethod.AFFINE_ACC]
_decision_thresholdings = [ThresholdingMethod.DEV_TEST, ThresholdingMethod.ARGMAX]
for cal in _decision_calibrations:
    for thresholding in _decision_thresholdings:
        # one progress bar over the binary tasks per (calibration, thresholding) combination
        for t in ProgIter(binary_tasks):
            for repeat in data:
                ir_results = thresholds_accross_ir(task_data=data[repeat][t], calibration=cal,
                                                   thresholding=thresholding)
                # identifying columns first, followed by the per-IR metric lists
                decision_results_list.append({'repeat': repeat, 'task': t, 'threshold': thresholding.value,
                                              'calibration': cal.value, **ir_results})
# persisted so that the evaluation cells can be run without recomputation
decision_rule_df = pd.DataFrame(decision_results_list)
decision_rule_df.to_pickle(RESULT_PATH / '24_uncertainty_decision_rule.pkl')

 100.00% 24/24... rate=0.02 Hz, eta=0:00:00, total=0:19:25
 100.00% 24/24... rate=0.04 Hz, eta=0:00:00, total=0:10:38
 100.00% 24/24... rate=0.02 Hz, eta=0:00:00, total=0:18:45
 100.00% 24/24... rate=0.04 Hz, eta=0:00:00, total=0:10:40
 100.00% 24/24... rate=0.02 Hz, eta=0:00:00, total=0:18:43
 100.00% 24/24... rate=0.04 Hz, eta=0:00:00, total=0:10:37


In [8]:
decision_rule_df = pd.read_pickle(RESULT_PATH / '24_uncertainty_decision_rule.pkl')
metrics = [Metric.ACCURACY, Metric.F1, Metric.MCC, Metric.BALANCED_ACC, Metric.EC_EST, Metric.EC_ADJUSTED]
metric_differences_list = []
for thresholding in [ThresholdingMethod.ARGMAX, ThresholdingMethod.DEV_TEST]:
    for cal in [CalibrationMethod.NONE, CalibrationMethod.AFFINE_REWEIGHTED]:
        for repeat in data:
            # rows of this repeat and thresholding method, once for the inspected calibration and
            # once for the calibration based on estimated prevalences
            same_run = (decision_rule_df['repeat'] == repeat) & (decision_rule_df['threshold'] == thresholding.value)
            sub_df = decision_rule_df[same_run & (decision_rule_df['calibration'] == cal.value)]
            estimated_sub_df = decision_rule_df[
                same_run & (decision_rule_df['calibration'] == CalibrationMethod.AFFINE_ACC.value)]
            # each cell holds one value per imbalance ratio, indexed by the position of the IR in IRS
            for ir_idx, ir in enumerate(IRS):
                _info = {'repeat': repeat, 'calibration': cal.value, 'threshold': thresholding.value, 'ir': ir}
                for metric in metrics:
                    # EC (est. prev.) is always taken from the runs calibrated with estimated prevalences
                    iter_over_df = estimated_sub_df if metric == Metric.EC_EST else sub_df
                    reference_key = 'reference ' + metric.value
                    # use absolute difference since metrics are oriented differently
                    # for example EC -> lower is better, ACC -> higher is better
                    _vals = [np.abs(row[metric][ir_idx] - row[reference_key][ir_idx])
                             for _, row in iter_over_df.iterrows()]
                    # average over tasks
                    _info[metric] = np.mean(_vals)
                metric_differences_list.append(_info)
metric_differences = pd.DataFrame(metric_differences_list)

In [9]:
decison_rule_tables = {}
# maps the Metric members (column labels) to their display names
enum_map = {elem: elem.value for elem in list(Metric)}
for thresholding in [ThresholdingMethod.ARGMAX, ThresholdingMethod.DEV_TEST]:
    for cal in [CalibrationMethod.NONE, CalibrationMethod.AFFINE_REWEIGHTED]:
        selected = metric_differences[(metric_differences.calibration == cal.value) & (
                    metric_differences.threshold == thresholding.value)]
        table_entries = []
        for ir in IRS:
            sub_df = selected[selected.ir == ir]
            # per-repeat averages, collected separately for the two sources of variation
            train_uncertainty = {met: [] for met in metrics}
            data_uncertainty = {met: [] for met in metrics}
            for repeat in sub_df['repeat'].unique():
                repeat_df = sub_df[sub_df['repeat'] == repeat]
                # the first 10 repeats vary the random seed, the remaining ones the data splitting seed
                vals = data_uncertainty if repeat >= 10 else train_uncertainty
                for met in metrics:
                    vals[met].append(repeat_df[met].mean())
            _info = {'ir': ir}
            for met in metrics:
                # mean ± std over repeats: random-seed runs first, data-split runs second
                _info[met] = ' | '.join(f'{np.mean(v):.3f} ± {np.std(v):.3f}'
                                        for v in (train_uncertainty[met], data_uncertainty[met]))
            table_entries.append(_info)
        # metrics as rows, imbalance ratios as columns
        df = pd.DataFrame(table_entries).rename(columns=enum_map).T
        df = df.rename(columns=df.loc['ir']).drop('ir', axis=0)
        decison_rule_tables[(thresholding, cal)] = df

In [10]:
# table for argmax decisions without re-calibration
t, c = ThresholdingMethod.ARGMAX, CalibrationMethod.NONE
print(f'Decision rule uncertainty for {t} and {c} (top left Fig. 9):')
decison_rule_tables[t, c]

Decision rule uncertainty for ThresholdingMethod.ARGMAX and CalibrationMethod.NONE (top left Fig. 9):


Unnamed: 0,1.0,5.0,10.0
Accuracy,0.015 ± 0.003 | 0.015 ± 0.003,0.045 ± 0.007 | 0.045 ± 0.008,0.069 ± 0.009 | 0.068 ± 0.011
F1 Score,0.049 ± 0.009 | 0.058 ± 0.012,0.026 ± 0.005 | 0.030 ± 0.008,0.049 ± 0.007 | 0.053 ± 0.008
MCC,0.022 ± 0.004 | 0.025 ± 0.004,0.033 ± 0.005 | 0.036 ± 0.004,0.053 ± 0.006 | 0.056 ± 0.006
Bal. Accuracy,0.015 ± 0.003 | 0.015 ± 0.003,0.017 ± 0.004 | 0.016 ± 0.003,0.019 ± 0.004 | 0.020 ± 0.004
EC (est. prev.),0.007 ± 0.001 | 0.006 ± 0.001,0.006 ± 0.005 | 0.005 ± 0.003,0.004 ± 0.003 | 0.003 ± 0.002
EC (dep. prev.),0.015 ± 0.003 | 0.015 ± 0.003,0.045 ± 0.007 | 0.045 ± 0.008,0.069 ± 0.009 | 0.068 ± 0.011


In [11]:
# table for argmax decisions after affine re-calibration with the exact prevalences
t, c = ThresholdingMethod.ARGMAX, CalibrationMethod.AFFINE_REWEIGHTED
print(f'Decision rule uncertainty for {t} and {c} (top right Fig. 9):')
decison_rule_tables[t, c]

Decision rule uncertainty for ThresholdingMethod.ARGMAX and CalibrationMethod.AFFINE_REWEIGHTED (top right Fig. 9):


Unnamed: 0,1.0,5.0,10.0
Accuracy,0.005 ± 0.001 | 0.005 ± 0.001,0.003 ± 0.000 | 0.003 ± 0.000,0.002 ± 0.000 | 0.002 ± 0.001
F1 Score,0.014 ± 0.002 | 0.018 ± 0.003,0.097 ± 0.011 | 0.090 ± 0.008,0.103 ± 0.011 | 0.112 ± 0.008
MCC,0.013 ± 0.004 | 0.015 ± 0.003,0.052 ± 0.009 | 0.047 ± 0.006,0.066 ± 0.008 | 0.075 ± 0.007
Bal. Accuracy,0.005 ± 0.001 | 0.005 ± 0.001,0.071 ± 0.005 | 0.065 ± 0.002,0.100 ± 0.005 | 0.103 ± 0.005
EC (est. prev.),0.007 ± 0.001 | 0.006 ± 0.001,0.006 ± 0.005 | 0.005 ± 0.003,0.004 ± 0.003 | 0.003 ± 0.002
EC (dep. prev.),0.005 ± 0.001 | 0.005 ± 0.001,0.003 ± 0.000 | 0.003 ± 0.000,0.002 ± 0.000 | 0.002 ± 0.001


In [12]:
# table for thresholds optimized on the development test set without re-calibration
t, c = ThresholdingMethod.DEV_TEST, CalibrationMethod.NONE
print(f'Decision rule uncertainty for {t} and {c} (bottom left Fig. 9):')
decison_rule_tables[t, c]

Decision rule uncertainty for ThresholdingMethod.DEV_TEST and CalibrationMethod.NONE (bottom left Fig. 9):


Unnamed: 0,1.0,5.0,10.0
Accuracy,0.006 ± 0.001 | 0.007 ± 0.003,0.073 ± 0.010 | 0.072 ± 0.014,0.107 ± 0.012 | 0.103 ± 0.016
F1 Score,0.006 ± 0.002 | 0.007 ± 0.002,0.062 ± 0.006 | 0.062 ± 0.007,0.107 ± 0.006 | 0.110 ± 0.007
MCC,0.013 ± 0.003 | 0.017 ± 0.004,0.040 ± 0.006 | 0.043 ± 0.007,0.065 ± 0.007 | 0.066 ± 0.010
Bal. Accuracy,0.006 ± 0.001 | 0.007 ± 0.003,0.008 ± 0.003 | 0.009 ± 0.002,0.011 ± 0.002 | 0.012 ± 0.003
EC (est. prev.),0.008 ± 0.003 | 0.010 ± 0.002,0.009 ± 0.009 | 0.005 ± 0.002,0.007 ± 0.004 | 0.006 ± 0.003
EC (dep. prev.),0.006 ± 0.001 | 0.007 ± 0.003,0.004 ± 0.001 | 0.004 ± 0.000,0.003 ± 0.001 | 0.004 ± 0.001


In [13]:
# table for thresholds optimized on the development test set after affine re-calibration with the exact prevalences
t, c = ThresholdingMethod.DEV_TEST, CalibrationMethod.AFFINE_REWEIGHTED
print(f'Decision rule uncertainty for {t} and {c} (bottom right Fig. 9):')
decison_rule_tables[t, c]

Decision rule uncertainty for ThresholdingMethod.DEV_TEST and CalibrationMethod.AFFINE_REWEIGHTED (bottom right Fig. 9):


Unnamed: 0,1.0,5.0,10.0
Accuracy,0.006 ± 0.002 | 0.007 ± 0.003,0.072 ± 0.007 | 0.067 ± 0.009,0.101 ± 0.012 | 0.098 ± 0.010
F1 Score,0.007 ± 0.002 | 0.007 ± 0.001,0.061 ± 0.005 | 0.063 ± 0.007,0.107 ± 0.005 | 0.110 ± 0.009
MCC,0.014 ± 0.004 | 0.018 ± 0.004,0.039 ± 0.005 | 0.044 ± 0.007,0.063 ± 0.007 | 0.066 ± 0.008
Bal. Accuracy,0.006 ± 0.002 | 0.007 ± 0.003,0.008 ± 0.003 | 0.007 ± 0.002,0.010 ± 0.002 | 0.008 ± 0.002
EC (est. prev.),0.008 ± 0.003 | 0.010 ± 0.002,0.009 ± 0.009 | 0.005 ± 0.002,0.007 ± 0.004 | 0.006 ± 0.003
EC (dep. prev.),0.006 ± 0.002 | 0.007 ± 0.002,0.005 ± 0.001 | 0.004 ± 0.001,0.004 ± 0.001 | 0.004 ± 0.001


## Uncertainty with respect to performance assessment

In [14]:
def metrics_across_ir(task_data, ir: float, calibration: CalibrationMethod = CalibrationMethod.NONE):
    """ Computes metric values on the development test split and on the IR-rescaled deployment split of a task.

    :param task_data: nested mapping indexed as task_data[Kind][Split] that provides logits and labels
    :param ir: imbalance ratio passed to scale_prevalences_ir for rescaling the deployment test split
    :param calibration: either CalibrationMethod.NONE or CalibrationMethod.AFFINE_REWEIGHTED
    :return: dictionary holding, per metric, the deployment value (keyed by the metric) and the development
        test value (keyed by 'reference ' + metric.value)
    :raises ValueError: if any other calibration method is requested
    """
    logits_key, labels_key = Kind.LOGITS, Kind.LABELS
    # rescale the deployment test split to the requested imbalance ratio
    app_test_logits, app_test_classes = scale_prevalences_ir(logits=task_data[logits_key][Split.APP_TEST],
                                                             classes=task_data[labels_key][Split.APP_TEST],
                                                             ir=ir)
    # development splits are taken over unchanged, the deployment split is swapped for its rescaled version
    mod_data = {kind: {split: task_data[kind][split] for split in (Split.DEV_CAL, Split.DEV_TEST)}
                for kind in (logits_key, labels_key)}
    mod_data[logits_key][Split.APP_TEST] = app_test_logits
    mod_data[labels_key][Split.APP_TEST] = app_test_classes
    logits, labels = mod_data[logits_key], mod_data[labels_key]

    # the least frequent class of the calibration split is handed to the metrics as min_class
    cal_counts = torch.bincount(labels[Split.DEV_CAL])
    min_class = torch.argmin(cal_counts).item()
    if min_class == torch.argmax(cal_counts).item():
        # argmin and argmax coincide (balanced task), fall back to class 1
        min_class = 1

    # exact class prevalences of the rescaled deployment split
    exact_prevalence = torch.bincount(labels[Split.APP_TEST]) / len(labels[Split.APP_TEST])

    # prevalence estimate for the deployment split, obtained from adjust_priors_qp with method=ACC
    dev_test_probs = torch.softmax(logits[Split.DEV_TEST], dim=1)
    app_test_probs = torch.softmax(logits[Split.APP_TEST], dim=1)
    estimated_prevalence = torch.Tensor(adjust_priors_qp(dev_test_probs, labels[Split.DEV_TEST],
                                                         app_test_probs, labels[Split.APP_TEST], method=ACC))

    # re-calibration (if requested) first uses the exact deployment prevalences as prior
    if calibration not in (CalibrationMethod.AFFINE_REWEIGHTED, CalibrationMethod.NONE):
        raise ValueError(f'invalid calibration method: {calibration}')
    prior = exact_prevalence if calibration == CalibrationMethod.AFFINE_REWEIGHTED else None
    calibrated_logits = calibrate_logits_fast(data=mod_data, calibration=calibration, prior=prior)

    # argmax decision rule on both evaluated splits
    new_app_test_preds = torch.argmax(calibrated_logits[Split.APP_TEST], dim=1)
    dev_test_preds = torch.argmax(calibrated_logits[Split.DEV_TEST], dim=1)

    # deployment metrics receive the exact prevalences for both prior arguments,
    # development metrics receive the estimated prevalences as estimated priors
    dep_metrics = compute_all_metrics(labels[Split.APP_TEST], calibrated_logits[Split.APP_TEST],
                                      new_app_test_preds,
                                      min_class=min_class, exact_priors=exact_prevalence,
                                      estimated_priors=exact_prevalence)
    dev_metrics = compute_all_metrics(labels[Split.DEV_TEST], calibrated_logits[Split.DEV_TEST],
                                      dev_test_preds,
                                      min_class=min_class, exact_priors=exact_prevalence,
                                      estimated_priors=estimated_prevalence)

    if calibration == CalibrationMethod.AFFINE_REWEIGHTED:
        # EC with estimated prevalences is recomputed: here the re-calibration itself uses the estimate as prior
        calibrated_logits_est = calibrate_logits_fast(data=mod_data, calibration=calibration,
                                                      prior=estimated_prevalence)
        estimated_app_test_preds = torch.argmax(calibrated_logits_est[Split.APP_TEST], dim=1)
        dep_metrics[Metric.EC_EST] = compute_metric(Metric.EC_ADJUSTED, labels[Split.APP_TEST],
                                                    estimated_app_test_preds, exact_priors=exact_prevalence,
                                                    min_class=min_class)
        estimated_dev_test_preds = torch.argmax(calibrated_logits_est[Split.DEV_TEST], dim=1)
        dev_metrics[Metric.EC_EST] = compute_metric(Metric.EC_EST, labels[Split.DEV_TEST],
                                                    estimated_dev_test_preds,
                                                    min_class=min_class, estimated_priors=estimated_prevalence)

    # pair every deployment value with its development ("reference") counterpart
    results = {}
    for met, dep_value in dep_metrics.items():
        results[met] = dep_value
        results['reference ' + met.value] = dev_metrics[met]
    return results

In [90]:
# evaluate metrics_across_ir for every calibration method, binary task, repeat and IR, one record per combination
performance_assessment_list = []
for cal in (CalibrationMethod.NONE, CalibrationMethod.AFFINE_REWEIGHTED):
    for t in ProgIter(binary_tasks):
        performance_assessment_list.extend(
            {'repeat': repeat, 'task': t, 'calibration': cal.value, 'ir': ir,
             **metrics_across_ir(task_data=run_data[t], ir=ir, calibration=cal)}
            for repeat, run_data in data.items()
            for ir in IRS
        )
# collect all records in one frame and store it for the evaluation cell
performance_assessment_df = pd.DataFrame(performance_assessment_list)
performance_assessment_df.to_pickle(RESULT_PATH / '24_uncertainty_metric.pkl')

 100.00% 24/24... rate=0.58 Hz, eta=0:00:00, total=0:00:41
 100.00% 24/24... rate=0.41 Hz, eta=0:00:00, total=0:00:58


In [15]:
# reload the stored metric records
performance_assessment_df = pd.read_pickle(RESULT_PATH / '24_uncertainty_metric.pkl')
metrics = [Metric.ACCURACY, Metric.F1, Metric.MCC, Metric.BALANCED_ACC, Metric.EC_EST, Metric.EC_ADJUSTED]
# absolute gap between each metric column and its 'reference ...' column, stored under the metric's display name
for metric in metrics:
    gap = performance_assessment_df[metric] - performance_assessment_df['reference ' + metric.value]
    performance_assessment_df[metric.value] = np.abs(gap)

# maps the Metric members to their display names for the table index
metric_names = {member: member.value for member in Metric}
performance_assesment_tables = {}
for cal in (CalibrationMethod.NONE, CalibrationMethod.AFFINE_REWEIGHTED):
    matches_cal = performance_assessment_df.calibration == cal.value
    rows = []
    for ir in IRS:
        ir_df = performance_assessment_df[matches_cal & (performance_assessment_df.ir == ir)]
        # per-repeat mean gaps: repeats with index below 10 go to the first group, all others to the second
        train_uncertainty = {met: [] for met in metrics}
        data_uncertainty = {met: [] for met in metrics}
        for repeat in ir_df['repeat'].unique():
            repeat_df = ir_df[ir_df['repeat'] == repeat]
            target = train_uncertainty if repeat < 10 else data_uncertainty
            for met in metrics:
                target[met].append(repeat_df[met.value].mean())
        # one cell per metric: "mean ± std" of the first group | "mean ± std" of the second group
        row = {'ir': ir}
        row.update({
            met: (f'{np.mean(train_uncertainty[met]):.3f} ± {np.std(train_uncertainty[met]):.3f} | '
                  f'{np.mean(data_uncertainty[met]):.3f} ± {np.std(data_uncertainty[met]):.3f}')
            for met in metrics
        })
        rows.append(row)
    # metrics become rows, the IR values become the column labels
    table = pd.DataFrame(rows).rename(columns=metric_names).T
    table = table.rename(columns=table.loc['ir']).drop(index='ir')
    performance_assesment_tables[cal] = table

In [16]:
# Table of |deployment - reference| metric gaps without re-calibration (corresponds to figure 10, top row).
performance_assesment_tables[CalibrationMethod.NONE]

Unnamed: 0,1.0,5.0,10.0
Accuracy,0.012 ± 0.002 | 0.012 ± 0.001,0.058 ± 0.010 | 0.061 ± 0.007,0.071 ± 0.012 | 0.074 ± 0.010
F1 Score,0.015 ± 0.002 | 0.016 ± 0.001,0.124 ± 0.008 | 0.122 ± 0.009,0.218 ± 0.013 | 0.210 ± 0.015
MCC,0.024 ± 0.004 | 0.025 ± 0.003,0.055 ± 0.008 | 0.053 ± 0.009,0.115 ± 0.012 | 0.106 ± 0.012
Bal. Accuracy,0.012 ± 0.002 | 0.012 ± 0.001,0.014 ± 0.003 | 0.014 ± 0.001,0.017 ± 0.003 | 0.015 ± 0.001
EC (est. prev.),0.012 ± 0.002 | 0.013 ± 0.002,0.011 ± 0.003 | 0.013 ± 0.002,0.012 ± 0.002 | 0.014 ± 0.002
EC (dep. prev.),0.012 ± 0.002 | 0.012 ± 0.001,0.012 ± 0.003 | 0.013 ± 0.001,0.013 ± 0.003 | 0.014 ± 0.002


In [17]:
# Table of |deployment - reference| metric gaps with AFFINE_REWEIGHTED calibration (corresponds to figure 10, bottom row).
performance_assesment_tables[CalibrationMethod.AFFINE_REWEIGHTED]

Unnamed: 0,1.0,5.0,10.0
Accuracy,0.011 ± 0.002 | 0.012 ± 0.003,0.145 ± 0.007 | 0.145 ± 0.004,0.214 ± 0.008 | 0.212 ± 0.006
F1 Score,0.012 ± 0.002 | 0.013 ± 0.002,0.039 ± 0.004 | 0.041 ± 0.007,0.041 ± 0.005 | 0.047 ± 0.008
MCC,0.023 ± 0.003 | 0.023 ± 0.005,0.031 ± 0.002 | 0.029 ± 0.005,0.029 ± 0.006 | 0.038 ± 0.010
Bal. Accuracy,0.011 ± 0.002 | 0.012 ± 0.003,0.011 ± 0.002 | 0.011 ± 0.002,0.011 ± 0.002 | 0.012 ± 0.003
EC (est. prev.),0.021 ± 0.008 | 0.026 ± 0.010,0.018 ± 0.005 | 0.031 ± 0.013,0.020 ± 0.007 | 0.029 ± 0.013
EC (dep. prev.),0.011 ± 0.002 | 0.012 ± 0.003,0.006 ± 0.001 | 0.006 ± 0.001,0.004 ± 0.001 | 0.005 ± 0.001


# Task-level details and robustness of prevalence estimation and re-calibration

In [3]:
# Load the stored quantification results; fillna(value=np.nan) replaces every missing entry with NaN.
# NOTE(review): read_pickle can execute arbitrary code while unpickling - only load trusted result files.
quantification_df = pd.read_pickle(RESULT_PATH / '24_uncertainty_quantification.pkl').fillna(value=np.nan)

In [4]:
# Keep only the IR 10 rows and the columns needed for the KDEyHD analysis.
# A single .loc selection followed by .copy() yields an independent frame, so the column
# assignments below neither raise SettingWithCopyWarning nor depend on view/copy semantics.
rel_quant_df = quantification_df.loc[quantification_df['ir'] == 10.0,
                                     ['ir', 'task', 'prior', 'd_size', 'KDEyHD']].copy()
# row-wise absolute_error between the stored prior and the KDEyHD estimate
rel_quant_df['Absolute error'] = rel_quant_df.apply(lambda row: absolute_error(row['prior'], row['KDEyHD']), axis=1)
# number of classes = length of the prior entry
rel_quant_df['n_classes'] = rel_quant_df['prior'].apply(len)

In [5]:
# read the stored calibration results and discard the 'Unnamed: 0' column
calibration_df = pd.read_csv(RESULT_PATH / '24_uncertainty_calibration.csv').drop(columns='Unnamed: 0')
# restrict to the rows of the 'AFFINE_ACC' calibration
calibration_df = calibration_df[calibration_df['calibration'] == 'AFFINE_ACC']

In [6]:
# per task: shape of the DEV_CAL logits of repeat 0 (shape[0] -> 'd_size', shape[1] -> 'n_classes')
cal_logits = {t: data[0][t][Kind.LOGITS][Split.DEV_CAL] for t in all_tasks}
metadata = {t: {'d_size': logits.shape[0], 'n_classes': logits.shape[1]} for t, logits in cal_logits.items()}
# one row per task, with the task name as a regular 'task' column
lookup_df = pd.DataFrame(metadata).transpose().reset_index().rename(columns={'index': "task"})

In [7]:
# attach d_size and n_classes of each task (left join on 'task'), then keep the IR 10 rows only
calibration_df = calibration_df.merge(lookup_df[['task', 'd_size', 'n_classes']], how='left', on='task')
calibration_df = calibration_df[calibration_df['ir'] == 10.0]

In [8]:
# per-task summary of the absolute error: mean, standard deviation and the first d_size / n_classes entry
quant_by_task = rel_quant_df.groupby("task")
quant_result = quant_by_task.agg(
    Mean=pd.NamedAgg(column="Absolute error", aggfunc="mean"),
    Std=pd.NamedAgg(column="Absolute error", aggfunc="std"),
    d_size=pd.NamedAgg(column="d_size", aggfunc="first"),
    n_classes=pd.NamedAgg(column="n_classes", aggfunc="first"),
).reset_index()  # turn the "task" group key back into a regular column
# tasks with more than two classes are flagged as multiclass
quant_result['multiclass'] = quant_result['n_classes'] > 2

In [9]:
# scatter of the per-task mean absolute error against d_size, with Std as error bars, colored by multiclass flag
fig_quant = px.scatter(
    quant_result,
    x='d_size',
    y='Mean',
    color='multiclass',
    error_y='Std',
    log_x=False,
    title='<b>Absolute error vs dataset size at IR 10 for KDEyHD</b>',
    labels={'d_size': '<b>Dataset size</b>', 'Mean': '<b>Absolute error</b>'},
)
fig_quant.layout.font.family = "NewComputerModern10"
fig_quant.update_layout(font_size=20.0, template='plotly_white', legend_itemsizing='constant', width=1800, height=500)

In [10]:
# per-task summary of app_cwce: mean, standard deviation and the first d_size / n_classes entry
calib_by_task = calibration_df.groupby("task")
calib_result = calib_by_task.agg(
    app_cwce=pd.NamedAgg(column="app_cwce", aggfunc="mean"),
    Std=pd.NamedAgg(column="app_cwce", aggfunc="std"),
    d_size=pd.NamedAgg(column="d_size", aggfunc="first"),
    n_classes=pd.NamedAgg(column="n_classes", aggfunc="first"),
).reset_index()  # turn the "task" group key back into a regular column
# tasks with more than two classes are flagged as multiclass
calib_result['multiclass'] = calib_result['n_classes'] > 2

In [11]:
# Scatter of the per-task mean app_cwce against d_size, with Std as error bars, colored by multiclass flag.
# The title now names what is plotted (CWCE of the 'AFFINE_ACC' rows at IR 10); it previously repeated the
# title of the quantification figure ("Absolute error ... for KDEyHD").
fig_calib = px.scatter(calib_result, x='d_size', y='app_cwce',
                 title='<b>CWCE vs dataset size at IR 10 for AFFINE_ACC</b>',
                 labels={'d_size': '<b>Dataset size</b>', 'app_cwce': '<b>CWCE</b>'}, log_x=True, error_y="Std", color='multiclass')
fig_calib['layout']['font']['family'] = "NewComputerModern10"
fig_calib.update_layout(font_size=20.0, template='plotly_white', legend_itemsizing='constant', width=1800, height=500)

In [12]:
# stack the quantification and calibration scatter plots into one figure with two rows
fig = multiplot(
    2, 1, [fig_quant, fig_calib],
    sub_y_axis_titles={0: 'Absolute Error', 1: 'CWCE'},
    sub_x_axis_titles={
        0: 'D<sub>dep</sub> set size<br>(a) Prevalence estimation performance',
        1: 'D<sub>cal</sub> set size<br>(b) Re-calibration performance',
    },
    ir_axes=[1, 2], ir_values=[10, 10],
    height=900, width=1200, shared_xaxes=False, vertical_spacing=0.175,
)
# both x axes are shown on a log scale
fig.update_layout(xaxis1={'type': 'log'}, xaxis2={'type': 'log'})
# NOTE(review): names are assigned by trace position - assumes trace 0 holds the binary tasks and
# trace 1 the multiclass tasks; confirm against the trace order produced by multiplot
fig.data[0].name = "Binary"
fig.data[1].name = "Multiclass"


In [13]:
# display the combined figure
fig

In [15]:
# export the combined figure as PNG and as PDF into the results folder
name = "fig_E11"
for suffix in ("png", "pdf"):
    fig.write_image(RESULT_PATH / f"{name}.{suffix}")