In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import torch
import pandas as pd
from sklearn.metrics import roc_auc_score

In [2]:
def calculate_ece(confidences, accuracies, n_bins=10) -> float:
    """
    Calculate the expected calibration error (ECE) given a list of confidence scores (0-1) and accuracy scores (0 or 1).
    """
    df = pd.DataFrame({"conf": confidences, "acc": accuracies}).dropna()

    confidences = torch.tensor(df["conf"].tolist())
    accuracies = torch.tensor(df["acc"].tolist())
    bin_boundaries = torch.linspace(0, 1, n_bins + 1)
    bin_lowers = bin_boundaries[:-1]
    bin_uppers = bin_boundaries[1:]

    ece = torch.zeros(1)
    for bin_lower, bin_upper in zip(bin_lowers, bin_uppers):
        # Calculated |confidence - accuracy| in each bin
        in_bin = confidences.gt(bin_lower.item()) * confidences.le(bin_upper.item())
        prop_in_bin = in_bin.float().mean()
        if prop_in_bin.item() > 0:
            accuracy_in_bin = accuracies[in_bin].float().mean()
            avg_confidence_in_bin = confidences[in_bin].mean()
            ece += torch.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin
    
    return ece.item()

In [3]:
nan_indices = set()

for file in os.listdir("../results"):
    if file.endswith("csv"):
        df = pd.read_csv("../results/" + file)

        # Find indices where either column has NaN
        indices_with_nan = df[
            df["extracted_answer"].isna() | df["verbal_numerical_confidence"].isna()
        ].index

        # Combine indices across all files
        nan_indices.update(indices_with_nan)

# Convert to sorted list if needed
nan_indices = sorted(nan_indices)
len(nan_indices)

12032

# Drop NA Across All Models

In [4]:
stats_df = pd.DataFrame()
for file in os.listdir("../results"):
    if file.endswith("csv"):
        df = pd.read_csv("../results/" + file).drop(index=nan_indices).reset_index(drop=True)
        acc_no_na = df
        new_row = pd.DataFrame({
            "Model": [file.replace("mmlu_pro_", "").replace("_eval_all_None.csv", "").strip().capitalize()],
            "Missing answer": [df["extracted_answer"].isna().sum()],
            "Missing verbal numerical confidence": [df["verbal_numerical_confidence"].isna().sum()],
            "Missing logit perplexity confidence": [df["logit_perplexity_confidence"].isna().sum()],
            "Missing verbal linguistic confidence": [df["verbal_linguistic_confidence"].isna().sum()],
            "Accuracy": [(df["extracted_answer"] == df["correct_answer"]).mean()],
            "Accuracy without na": [(acc_no_na["extracted_answer"] == acc_no_na["correct_answer"]).mean()],
            "ECE verbal numerical confidence": [calculate_ece(df["verbal_numerical_confidence"].values, (df["extracted_answer"] == df["correct_answer"]))],
            "ECE logit perplexity confidence": [calculate_ece(df["logit_perplexity_confidence"].values, (df["extracted_answer"] == df["correct_answer"]))],
            "ECE verbal linguistic confidence": [calculate_ece(df["verbal_linguistic_confidence"].values, (df["extracted_answer"] == df["correct_answer"]))],
            "AUROC verbal numerical confidence": roc_auc_score((df["extracted_answer"] == df["correct_answer"]).values, df["verbal_numerical_confidence"].fillna(0).values),
            "AUROC logit perplexity confidence": roc_auc_score((df["extracted_answer"] == df["correct_answer"]), df["logit_perplexity_confidence"].fillna(0).values),
            "AUROC verbal linguistic confidence": roc_auc_score((df["extracted_answer"] == df["correct_answer"]), df["verbal_linguistic_confidence"].fillna(0).values),
            "Mean verbal numerical confidence": [df["verbal_numerical_confidence"].mean()],
            "Mean logit perplexity confidence": [df["logit_perplexity_confidence"].mean()],
            "Mean verbal linguistic confidence": [df["verbal_linguistic_confidence"].mean()],
            "Std verbal numerical confidence": [df["verbal_numerical_confidence"].std()],
            "Std logit perplexity confidence": [df["logit_perplexity_confidence"].std()],
            "Std verbal linguistic confidence": [df["verbal_linguistic_confidence"].std()],
        })
        stats_df = pd.concat([stats_df, new_row], ignore_index=True)
    
display(stats_df.sort_values(by="Model", ignore_index=True))

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

# Drop NA on an Individual Basis

In [3]:
stats_df = pd.DataFrame()
for file in os.listdir("../results"):
    if file.endswith("csv"):
        df = pd.read_csv("../results/" + file)
        acc_no_na = df[["extracted_answer", "correct_answer"]].dropna()
        new_row = pd.DataFrame({
            "Model": [file.replace("mmlu_pro_", "").replace("_eval_all_None.csv", "").strip().capitalize()],
            "Missing answer": [df["extracted_answer"].isna().sum()],
            "Missing verbal numerical confidence": [df["verbal_numerical_confidence"].isna().sum()],
            "Missing logit perplexity confidence": [df["logit_perplexity_confidence"].isna().sum()],
            "Missing verbal linguistic confidence": [df["verbal_linguistic_confidence"].isna().sum()],
            "Accuracy": [(df["extracted_answer"] == df["correct_answer"]).mean()],
            "Accuracy without na": [(acc_no_na["extracted_answer"] == acc_no_na["correct_answer"]).mean()],
            "ECE verbal numerical confidence": [calculate_ece(df["verbal_numerical_confidence"].values, (df["extracted_answer"] == df["correct_answer"]))],
            "ECE logit perplexity confidence": [calculate_ece(df["logit_perplexity_confidence"].values, (df["extracted_answer"] == df["correct_answer"]))],
            "ECE verbal linguistic confidence": [calculate_ece(df["verbal_linguistic_confidence"].values, (df["extracted_answer"] == df["correct_answer"]))],
            "AUROC verbal numerical confidence": roc_auc_score((df["extracted_answer"] == df["correct_answer"]).values, df["verbal_numerical_confidence"].fillna(0).values),
            "AUROC logit perplexity confidence": roc_auc_score((df["extracted_answer"] == df["correct_answer"]), df["logit_perplexity_confidence"].fillna(0).values),
            "AUROC verbal linguistic confidence": roc_auc_score((df["extracted_answer"] == df["correct_answer"]), df["verbal_linguistic_confidence"].fillna(0).values),
            "Mean verbal numerical confidence": [df["verbal_numerical_confidence"].mean()],
            "Mean logit perplexity confidence": [df["logit_perplexity_confidence"].mean()],
            "Mean verbal linguistic confidence": [df["verbal_linguistic_confidence"].mean()],
            "Std verbal numerical confidence": [df["verbal_numerical_confidence"].std()],
            "Std logit perplexity confidence": [df["logit_perplexity_confidence"].std()],
            "Std verbal linguistic confidence": [df["verbal_linguistic_confidence"].std()],
        })
        stats_df = pd.concat([stats_df, new_row], ignore_index=True)
    
display(stats_df.sort_values(by="Model", ignore_index=True))



Unnamed: 0,Model,Missing answer,Missing verbal numerical confidence,Missing logit perplexity confidence,Missing verbal linguistic confidence,Accuracy,Accuracy without na,ECE verbal numerical confidence,ECE logit perplexity confidence,ECE verbal linguistic confidence,AUROC verbal numerical confidence,AUROC logit perplexity confidence,AUROC verbal linguistic confidence,Mean verbal numerical confidence,Mean logit perplexity confidence,Mean verbal linguistic confidence,Std verbal numerical confidence,Std logit perplexity confidence,Std verbal linguistic confidence
0,Gemma-2-27b-it,30,0,0,0,0.531666,0.532995,0.379636,0.352674,0.0,0.612014,0.52369,0.5,0.909806,0.88434,0.0,0.141204,0.046461,0.0
1,Gemma-3-27b-it,12032,12032,12032,0,0.0,,0.0,0.0,0.0,,,,,,0.0,,,0.0
2,Qwen2.5-7b-instruct,58,22,0,0,0.505402,0.50785,0.390313,0.366994,0.0,0.64294,0.611824,0.5,0.896391,0.872396,0.0,0.094847,0.054149,0.0
3,Qwen3-0.6b,2306,21,0,0,0.200299,0.247789,0.691891,0.631562,0.0,0.546083,0.538223,0.5,0.885019,0.831861,0.0,0.264321,0.059801,0.0
4,Qwen3-0.6b-base,3329,2296,0,0,0.197224,0.272665,0.736976,0.640971,0.0,0.60587,0.488874,0.5,0.971261,0.838195,0.0,0.151414,0.081134,0.0
5,Qwen3-1.7b,264,55,0,0,0.390209,0.398963,0.491705,0.548449,0.0,0.598064,0.61103,0.5,0.875566,0.938659,0.0,0.24126,0.029157,0.0
6,Qwen3-1.7b-think,231,345,0,0,0.549618,0.560376,0.326778,0.363263,0.0,0.724315,0.711066,0.5,0.885424,0.91288,0.0,0.173317,0.0284,0.0
7,Qwen3-14b,44,14,0,0,0.629322,0.631632,0.267041,0.294918,0.0,0.70623,0.697831,0.5,0.896331,0.924239,0.0,0.109715,0.035738,0.0
8,Qwen3-14b-base,213,105,0,0,0.581699,0.592182,0.338958,0.282598,0.0,0.632375,0.58174,0.5,0.924956,0.864297,0.0,0.074858,0.053334,0.0
9,Qwen3-14b-think,113,170,0,0,0.756566,0.763739,0.129329,0.119671,0.0,0.78677,0.748882,0.5,0.894385,0.876237,0.0,0.120138,0.04426,0.0
