In [2]:
import json
import pandas as pd
import config
import numpy as np

def calculate_ece(confidences, correct):
    """Compute the Expected Calibration Error for confidences on a 0-10 scale.

    The range [0, 10] is split into 10 equal-width bins. For every non-empty
    bin, the mean confidence (rescaled to [0, 1] by dividing by 10) is compared
    against the mean of ``correct`` in that bin; the absolute gaps are summed,
    each weighted by the fraction of all samples that fall into the bin.

    Args:
        confidences: 1-D array-like of confidence scores, expected in [0, 10].
        correct: 1-D array-like aligned with ``confidences``; each entry is the
            correctness of the matching prediction (e.g. 0/1 or bool).

    Returns:
        The ECE as a float. Returns 0.0 for empty input.

    Note:
        Values outside [0, 10] (and NaNs) match no bin, so they contribute no
        gap, but they still count towards the total ``n`` used for weighting.
    """
    M = 10  # number of bins (M + 1 = 11 bin edges: 0, 1, ..., 10)
    confidences = np.asarray(confidences, dtype=float)
    correct = np.asarray(correct, dtype=float)

    n = len(confidences)
    if n == 0:
        return 0.0

    ece = 0.0
    bins = np.linspace(0.0, 10.0, M + 1)

    for i in range(M):
        bin_lower, bin_upper = bins[i], bins[i + 1]
        if i == 0:
            # The first bin is closed on the left so that a confidence of
            # exactly 0 is counted; all other bins are (lower, upper].
            in_bin = (confidences >= bin_lower) & (confidences <= bin_upper)
        else:
            in_bin = (confidences > bin_lower) & (confidences <= bin_upper)
        bin_size = np.sum(in_bin)
        if bin_size > 0:
            conf_avg = np.mean(confidences[in_bin])
            norm_conf_avg = conf_avg / 10.0  # rescale 0-10 confidence to [0, 1]
            acc_avg = np.mean(correct[in_bin])
            ece += (bin_size / n) * np.abs(acc_avg - norm_conf_avg)

    return ece


def calculate_coefficient(run_name, model):
    """Correlate entropy-based uncertainty with stated confidence for one model.

    Reads ``<config.output_dir>/sequences/<run_name>/<model basename>_conf_uncertainty.csv``,
    coerces the entropy and confidence columns to numeric, discards rows where
    any of them is missing, and computes three correlations (pandas
    ``Series.corr`` with its default method).

    Args:
        run_name: Name of the run directory under ``sequences``.
        model: Model identifier; only the part after the last ``/`` is used
            to build the file name.

    Returns:
        A tuple ``(correlation_norm, correlation_norm_exp, correlation_unnorm)``:
        the negated correlation of predictive entropy with confidence, the
        correlation of predictive entropy with ``exp(-confidence)``, and the
        negated correlation of unnormalised entropy with confidence.
    """
    frame = pd.read_csv(f'{config.output_dir}/sequences/{run_name}/{model.split("/")[-1]}_conf_uncertainty.csv')

    # Target column -> source column; values that cannot be parsed become NaN.
    numeric_columns = {
        'pred_entropy': 'predictive_entropy_over_concepts',
        'unnorm_entropy': 'unnormalised_entropy_over_concepts',
        'confidence_list': 'confidence_list',
    }
    for target, source in numeric_columns.items():
        frame[target] = pd.to_numeric(frame[source], errors='coerce')

    # Normalise literal 'N/A' markers, then keep only rows with all three values.
    frame = frame.replace('N/A', pd.NA)
    frame = frame.dropna(subset=list(numeric_columns))

    confidence = frame['confidence_list']
    pred_entropy = frame['pred_entropy']
    unnorm_entropy = frame['unnorm_entropy']

    # Entropy and confidence are expected to move in opposite directions, so
    # the direct correlations are negated.
    correlation_norm = -pred_entropy.corr(confidence)
    correlation_norm_exp = pred_entropy.corr(np.exp(-confidence))
    correlation_unnorm = -unnorm_entropy.corr(confidence)

    return correlation_norm, correlation_norm_exp, correlation_unnorm


In [6]:
import warnings
# Silence RuntimeWarnings for the whole kernel while the correlations are computed.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Maps each run identifier (the directory name passed to calculate_coefficient)
# to the task label printed in the report.
run_name = {
    "deft-monkey-51": "coqa",
    "decent-field-17": "trivia_qa"
}

# Model identifiers to evaluate, grouped by family.
models = [
    "meta-llama/Llama-2-7b-hf",
    "meta-llama/Llama-2-7b-chat-hf",
    "meta-llama/Llama-2-13b-hf",
    "meta-llama/Llama-2-13b-chat-hf",
    "meta-llama/Llama-3.1-8B",
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "meta-llama/Llama-3.2-3B",
    "meta-llama/Llama-3.2-3B-Instruct",
    "meta-llama/Meta-Llama-3-8B",
    "meta-llama/Meta-Llama-3-8B-Instruct",

    "Qwen/Qwen2.5-14B",
    "Qwen/Qwen2.5-14B-Instruct",
    "Qwen/Qwen2.5-3B",
    "Qwen/Qwen2.5-3B-Instruct",

    "Qwen/Qwen3-4B-Base",
    "Qwen/Qwen3-4B-Instruct-2507",
    "Qwen/Qwen3-8B-Base",
    "Qwen/Qwen3-14B-Base",

    "mistralai/Mistral-7B-v0.1",
    "mistralai/Mistral-7B-Instruct-v0.1",
    "mistral-community/Mistral-7B-v0.2",
    "mistralai/Mistral-7B-Instruct-v0.2",
    "mistralai/Mistral-7B-v0.3",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "mistralai/Mistral-Nemo-Base-2407",
    "mistralai/Mistral-Nemo-Instruct-2407",

    "mistralai/Mixtral-8x7B-v0.1",
    "mistralai/Mixtral-8x7B-Instruct-v0.1"
]

# Print, per task, the negated entropy/confidence correlation of every model.
for run_id, task in run_name.items():
    print(f"Task: {task}")
    for model in models:
        correlation_norm, correlation_norm_exp, correlation_unnorm = calculate_coefficient(run_id, model)
        # Show the last path component, the same part calculate_coefficient
        # uses to build the CSV file name.
        print(f"Model: {model.split('/')[-1]:<35}\t\t{correlation_norm}")
    print("-" * 80)

Task: coqa
Model: Llama-2-7b-hf                      		0.0561081563066567
Model: Llama-2-7b-chat-hf                 		0.06329734579658147
Model: Llama-2-13b-hf                     		-0.1476137415084034
Model: Llama-2-13b-chat-hf                		0.06381314925818411
Model: Llama-3.1-8B                       		-0.014993740494461947
Model: Meta-Llama-3.1-8B-Instruct         		0.16069363725679345
Model: Llama-3.2-3B                       		0.00041257492381514816
Model: Llama-3.2-3B-Instruct              		0.24495199899299497
Model: Meta-Llama-3-8B                    		0.00779846309196965
Model: Meta-Llama-3-8B-Instruct           		-0.005993927385860379
Model: Qwen2.5-14B                        		0.004234559052747409
Model: Qwen2.5-14B-Instruct               		0.14069474788850225
Model: Qwen2.5-3B                         		-0.21067532350485618
Model: Qwen2.5-3B-Instruct                		-0.03240709438383913
Model: Qwen3-4B-Base                      		-0.6403238717956943
Model: Qwen3-4B-Inst