# Collect uncertainties and evaluate

In [None]:
import os
import pandas as pd
from copy import deepcopy
import json

In [None]:
# Grid of experiment settings; the evaluation cells below iterate over every
# combination of these values and look for the matching log files.
hyper_parameters = dict(
    dataset=['AmbigInst', 'AmbigQA'],
    ensembling_method=['clarification_zeroshot', 'no_ensembling'],
    variation_model=['gpt_4o', 'no_model'],
    target_model=['phi_4', 'llama4_maverick'],
    embedding_model=['all_mpnet_base_v2'],
)

In [None]:
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, f1_score

def evaluate_uncertainty_predictiveness(uncertainties, labels, eval_measure):
    """Score how well uncertainty values separate ambiguous from unambiguous items.

    Only the requested measure is computed. The callers invoke this function
    once per (metric, measure) pair, so evaluating every measure on each call
    would repeat the scikit-learn computations needlessly and would let a
    failure in one measure block all the others.

    Args:
        uncertainties: Sequence of uncertainty scores, one per item.
        labels: Sequence of labels aligned with ``uncertainties``. Items equal
            to 1 are treated as ambiguous and items equal to 0 as unambiguous.
        eval_measure: One of ``'AUC_ROC'``, ``'AUC_PR'``, ``'Best_F1'``,
            ``'Avg_uncertainty_amb'`` or ``'Avg_uncertainty_unambiguous'``.

    Returns:
        The value of the requested measure. The two class averages are NaN
        when no item of the corresponding class is present.

    Raises:
        KeyError: If ``eval_measure`` is not one of the supported names.
    """
    uncertainties = np.array(uncertainties)
    labels = np.array(labels)

    def best_f1():
        # Highest F1 over all thresholds of the precision-recall curve; the
        # small constant avoids a 0/0 where precision and recall are both 0.
        precision, recall, _ = precision_recall_curve(labels, uncertainties)
        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
        return np.max(f1_scores)

    def class_mean(label_value):
        # Mean uncertainty of the items carrying the given label.
        selected = labels == label_value
        return uncertainties[selected].mean() if np.any(selected) else float('nan')

    # Each entry is evaluated lazily so that only the requested one runs.
    measures = {
        'AUC_ROC': lambda: roc_auc_score(labels, uncertainties),
        'AUC_PR': lambda: average_precision_score(labels, uncertainties),
        'Best_F1': best_f1,
        'Avg_uncertainty_amb': lambda: class_mean(1),
        'Avg_uncertainty_unambiguous': lambda: class_mean(0),
    }
    return measures[eval_measure]()


In [None]:
# Evaluate every logged configuration that does not depend on an embedding model.
# evals_wo_embeddings maps each evaluation measure to a list of rows (one per
# configuration) holding the configuration keys plus one score per metric.
# uncertainties_wo_embeddings keeps the raw per-question values and labels.
evals_wo_embeddings = {'AUC_ROC': [],
        'AUC_PR': [],
        'Best_F1': [],
        'Avg_uncertainty_amb': [],
        'Avg_uncertainty_unambiguous': []}
uncertainties_wo_embeddings = []
for dataset in hyper_parameters['dataset']:
    # The labels depend only on the dataset, so they are read once per dataset
    # instead of once per configuration.
    dataset_df = pd.read_json(f"data/eval/{dataset}.json")
    dataset_df = dataset_df[["question_id", "is_ambig"]]
    for ensembling_method in hyper_parameters['ensembling_method']:
        for variation_model in hyper_parameters['variation_model']:
            for target_model in hyper_parameters['target_model']:
                config = {'dataset': dataset,
                            'ensembling_method': ensembling_method,
                            'variation_model': variation_model,
                            'target_model': target_model}
                uncertainties_path = os.path.join(f"data/logs/{dataset}/{ensembling_method}/{variation_model}-variations",
                                                f"{target_model}-uncertainties_wo_embeddings.json")
                if not os.path.exists(uncertainties_path):
                    # Combinations without a log file are skipped.
                    continue
                metrics_df = pd.read_json(uncertainties_path)
                metrics = [metric for metric in metrics_df.columns if metric != "question_id"]
                # Match labels on the question_id *column* of both frames.
                # DataFrame.join(on=...) would compare question_id with the row
                # index of dataset_df, which is only correct when the ids happen
                # to equal the row positions.
                labeled_uncertainties = metrics_df.merge(dataset_df, on="question_id", how="left", validate="one_to_one")
                labels = labeled_uncertainties["is_ambig"]
                for eval_measure, rows in evals_wo_embeddings.items():
                    scores = {metric: evaluate_uncertainty_predictiveness(labeled_uncertainties[metric], labels, eval_measure)
                              for metric in metrics}
                    rows.append({**config, **scores})
                raw_values = {metric: labeled_uncertainties[metric].to_list() for metric in metrics}
                uncertainties_wo_embeddings.append({**config, 'is_ambig': labels.to_list(), **raw_values})


In [None]:
# Evaluate every logged configuration that depends on an embedding model.
# evals_w_embeddings maps each evaluation measure to a list of rows (one per
# configuration) holding the configuration keys plus one score per metric.
# uncertainties_w_embeddings keeps the raw per-question values and labels.
evals_w_embeddings = {'AUC_ROC': [],
        'AUC_PR': [],
        'Best_F1': [],
        'Avg_uncertainty_amb': [],
        'Avg_uncertainty_unambiguous': []}
uncertainties_w_embeddings = []
for dataset in hyper_parameters['dataset']:
    # The labels depend only on the dataset, so they are read once per dataset
    # instead of once per configuration.
    dataset_df = pd.read_json(f"data/eval/{dataset}.json")
    dataset_df = dataset_df[["question_id", "is_ambig"]]
    for ensembling_method in hyper_parameters['ensembling_method']:
        for variation_model in hyper_parameters['variation_model']:
            for target_model in hyper_parameters['target_model']:
                for embedding_model in hyper_parameters['embedding_model']:
                    config = {'dataset': dataset,
                                'ensembling_method': ensembling_method,
                                'variation_model': variation_model,
                                'target_model': target_model,
                                'embedding_model': embedding_model}
                    uncertainties_path = os.path.join(f"data/logs/{dataset}/{ensembling_method}/{variation_model}-variations",
                                                    f"{target_model}-uncertainties_w_embeddings-{embedding_model}.json")
                    if not os.path.exists(uncertainties_path):
                        # Combinations without a log file are skipped.
                        continue
                    metrics_df = pd.read_json(uncertainties_path)
                    metrics = [metric for metric in metrics_df.columns if metric != "question_id"]
                    # Match labels on the question_id *column* of both frames.
                    # DataFrame.join(on=...) would compare question_id with the
                    # row index of dataset_df, which is only correct when the
                    # ids happen to equal the row positions.
                    labeled_uncertainties = metrics_df.merge(dataset_df, on="question_id", how="left", validate="one_to_one")
                    labels = labeled_uncertainties["is_ambig"]
                    for eval_measure, rows in evals_w_embeddings.items():
                        scores = {metric: evaluate_uncertainty_predictiveness(labeled_uncertainties[metric], labels, eval_measure)
                                  for metric in metrics}
                        rows.append({**config, **scores})
                    raw_values = {metric: labeled_uncertainties[metric].to_list() for metric in metrics}
                    uncertainties_w_embeddings.append({**config, 'is_ambig': labels.to_list(), **raw_values})


In [None]:
# Turn each measure's list of per-configuration rows into its own DataFrame,
# keyed by the measure name.
evals_wo_embeddings_df = {
    measure: pd.DataFrame(rows) for measure, rows in evals_wo_embeddings.items()
}
evals_w_embeddings_df = {
    measure: pd.DataFrame(rows) for measure, rows in evals_w_embeddings.items()
}

In [None]:
# Display the AUC-ROC scores of the configurations evaluated without embeddings
# (one row per configuration, one column per uncertainty metric).
evals_wo_embeddings_df['AUC_ROC']

In [None]:
# Display the AUC-ROC scores of the configurations evaluated with embeddings
# (one row per configuration, one column per uncertainty metric).
evals_w_embeddings_df['AUC_ROC']

In [None]:
# Tabulate the raw records: one row per configuration, where the 'is_ambig'
# and metric columns each hold a list of per-question values.
uncertainties_w_embeddings_df = pd.DataFrame(uncertainties_w_embeddings)
uncertainties_wo_embeddings_df = pd.DataFrame(uncertainties_wo_embeddings)

In [None]:
# Display the raw per-configuration uncertainties computed with embeddings.
uncertainties_w_embeddings_df

In [None]:
# Display the raw per-configuration uncertainties computed without embeddings.
uncertainties_wo_embeddings_df 

In [None]:
def get_evaluation(evaluation_df, query_dict, uncertainty_metric, formula_part):
    """Look up one metric value for the single row matching a configuration.

    Args:
        evaluation_df: DataFrame with one row per configuration.
        query_dict: Mapping of column name to the value that column must equal.
        uncertainty_metric: Metric name; first half of the column to read.
        formula_part: Second half of the column to read; the column looked up
            is ``f"{uncertainty_metric}_{formula_part}"``.

    Returns:
        The content of that column in the matching row.

    Raises:
        AssertionError: If the query does not match exactly one row.
    """
    conditions = [evaluation_df[column] == wanted for column, wanted in query_dict.items()]
    matching_rows = evaluation_df[np.logical_and.reduce(conditions)]
    assert len(matching_rows) == 1, f"{len(matching_rows)}, {query_dict}"
    only_row = matching_rows.iloc[0]
    return only_row[f"{uncertainty_metric}_{formula_part}"]

def get_labels(uncertainties_df, query_dict):
    """Return the 'is_ambig' entry of the single row matching a configuration.

    Args:
        uncertainties_df: DataFrame with one row per configuration and an
            'is_ambig' column.
        query_dict: Mapping of column name to the value that column must equal.

    Returns:
        The content of the 'is_ambig' column in the matching row.

    Raises:
        AssertionError: If the query does not match exactly one row.
    """
    conditions = [uncertainties_df[column] == wanted for column, wanted in query_dict.items()]
    matching_rows = uncertainties_df[np.logical_and.reduce(conditions)]
    assert len(matching_rows) == 1, f"{len(matching_rows)}, {query_dict}"
    only_row = matching_rows.iloc[0]
    return only_row['is_ambig']

# Main results

In [None]:
def generate_main_table(evals_w_embeddings_df, evals_wo_embeddings_df, dataset, target_model, embedding_model):
    """Build the main results table for one dataset / target model pair.

    Args:
        evals_w_embeddings_df: Mapping of evaluation measure name to the
            DataFrame of embedding-based configurations.
        evals_wo_embeddings_df: Mapping of evaluation measure name to the
            DataFrame of embedding-free configurations.
        dataset: Dataset name to select.
        target_model: Target model name to select.
        embedding_model: Embedding model to select for embedding-based methods.

    Returns:
        DataFrame with one row per uncertainty method and the columns
        'Uncertainty Method', 'AUC_ROC' and 'AUC_PR'.
    """
    eval_metric_names = ('AUC_ROC', 'AUC_PR')

    # (ensembling method, variation model, formula part) shared by several methods.
    single_prompt = ('no_ensembling', 'no_model', 'total')
    clarified = ('clarification_zeroshot', 'gpt_4o', 'disagreement')

    # (method name, metric column prefix, setup, reads embedding-based results).
    # The order fixes the row order of the table: embedding-free methods first.
    method_specs = (
        ('semantic entropy', 'discrete_semantic_entropy', single_prompt, False),
        ('kernel language entropy', 'kernel_language_entropy', single_prompt, False),
        ('input clarification ensembling', 'discrete_semantic_entropy', clarified, False),
        ('predictive kernel entropy', 'predictive_kernel_entropy', single_prompt, True),
        ('spectral uncertainty', 'von_neuman_entropy', clarified, True),
    )

    table_lines = []
    for method, metric, setup, uses_embeddings in method_specs:
        ensembling_method, variation_model, formula_part = setup
        query = {'dataset': dataset,
                 'target_model': target_model,
                 'ensembling_method': ensembling_method,
                 'variation_model': variation_model}
        if uses_embeddings:
            query['embedding_model'] = embedding_model
            results_by_measure = evals_w_embeddings_df
        else:
            results_by_measure = evals_wo_embeddings_df

        row = {'Uncertainty Method': method}
        for eval_metric in eval_metric_names:
            row[eval_metric] = get_evaluation(results_by_measure[eval_metric], query, metric, formula_part)
        table_lines.append(row)

    return pd.DataFrame(table_lines)

In [None]:
# Main results table: AmbigQA, target model phi_4, all_mpnet_base_v2 embeddings.
generate_main_table(evals_w_embeddings_df, evals_wo_embeddings_df, 'AmbigQA', 'phi_4', 'all_mpnet_base_v2')

In [None]:
# Main results table: AmbigInst, target model phi_4, all_mpnet_base_v2 embeddings.
generate_main_table(evals_w_embeddings_df, evals_wo_embeddings_df, 'AmbigInst', 'phi_4', 'all_mpnet_base_v2')

In [None]:
# Main results table: AmbigQA, target model llama4_maverick, all_mpnet_base_v2 embeddings.
generate_main_table(evals_w_embeddings_df, evals_wo_embeddings_df, 'AmbigQA', 'llama4_maverick', 'all_mpnet_base_v2')

In [None]:
# Main results table: AmbigInst, target model llama4_maverick, all_mpnet_base_v2 embeddings.
generate_main_table(evals_w_embeddings_df, evals_wo_embeddings_df, 'AmbigInst', 'llama4_maverick', 'all_mpnet_base_v2')

# KDE plots

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def generate_kde_data(uncertainties_w_embeddings_df, uncertainties_wo_embeddings_df, dataset, target_model, embedding_model):    
    """Collect per-method uncertainty values and labels for the KDE plots.

    Args:
        uncertainties_w_embeddings_df: DataFrame of raw uncertainties for the
            embedding-based configurations (one row per configuration).
        uncertainties_wo_embeddings_df: DataFrame of raw uncertainties for the
            embedding-free configurations (one row per configuration).
        dataset: Dataset name to select.
        target_model: Target model name to select.
        embedding_model: Embedding model to select for embedding-based methods;
            when None, the embedding model is left out of the row query.

    Returns:
        Tuple ``(is_ambig, method_uncertainties, method_names)`` where the last
        two are parallel lists with one entry per method.
    """
    # (ensembling method, variation model, formula part) shared by several methods.
    single_prompt = ('no_ensembling', 'no_model', 'total')
    clarified = ('clarification_zeroshot', 'gpt_4o', 'disagreement')

    # (display name, metric column prefix, setup, reads embedding-based results).
    # The order fixes the order of the returned lists.
    method_specs = (
        ('Semantic Entropy', 'discrete_semantic_entropy', single_prompt, False),
        ('Kernel Language Entropy', 'kernel_language_entropy', single_prompt, False),
        ('Predictive Kernel Entropy', 'predictive_kernel_entropy', single_prompt, True),
        ('Input Clarification Ensembling (aleatoric)', 'discrete_semantic_entropy', clarified, False),
        ('Spectral Uncertainty (aleatoric)', 'von_neuman_entropy', clarified, True),
    )

    method_names = []
    method_uncertainties = []
    for display_name, metric, setup, uses_embeddings in method_specs:
        ensembling_method, variation_model, formula_part = setup
        source_df = uncertainties_w_embeddings_df if uses_embeddings else uncertainties_wo_embeddings_df
        query = {'dataset': dataset,
                 'target_model': target_model,
                 'ensembling_method': ensembling_method,
                 'variation_model': variation_model}
        if uses_embeddings and embedding_model is not None:
            query['embedding_model'] = embedding_model

        method_names.append(display_name)
        method_uncertainties.append(get_evaluation(source_df, query, metric, formula_part))
        # NOTE(review): only the labels of the last method survive the loop;
        # this presumes every method's values are in the same question order.
        is_ambig = get_labels(source_df, query)

    return is_ambig, method_uncertainties, method_names


def plot_uncertainty_distributions(is_ambig, uncertainty_methods, method_names):
    """Plot, per method, the KDE of uncertainties for ambiguous vs. non-ambiguous items.

    The grid has three panels per row and grows with the number of methods.
    With six or fewer methods the layout is the 2x3 grid on a 12x8 figure.

    Args:
        is_ambig: Sequence of truthy/falsy labels, aligned with every entry of
            ``uncertainty_methods``.
        uncertainty_methods: Sequence with one sequence of uncertainty values
            per method.
        method_names: Panel titles, parallel to ``uncertainty_methods``.

    Returns:
        None. The figure is shown with ``plt.show()``. Note that the matplotlib
        style and rcParams are changed globally.
    """
    plt.style.use('ggplot')

    plt.rcParams.update({
        'font.size': 10,
        'axes.titlesize': 12,
        'axes.labelsize': 10,
        'legend.fontsize': 9,
        'xtick.labelsize': 9,
        'ytick.labelsize': 9,
        'axes.edgecolor': 'black',
        'axes.linewidth': 0.8,
        'grid.color': '#e0e0e0',
        'grid.linestyle': '--',
    })
    n_methods = len(uncertainty_methods)

    # Size the grid from the number of methods so that more than six methods
    # no longer run past the available axes. At least two rows are kept so the
    # layout for six or fewer methods is unchanged.
    # TODO: consider a 3x2 grid with an 8x12 figure instead.
    n_cols = 3
    n_rows = max(2, (n_methods + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 4 * n_rows))
    axes = axes.flatten()

    for i, (method_uncertainties, name) in enumerate(zip(uncertainty_methods, method_names)):
        ax = axes[i]
        ambig_values = [u for u, a in zip(method_uncertainties, is_ambig) if a]
        nonambig_values = [u for u, a in zip(method_uncertainties, is_ambig) if not a]

        sns.kdeplot(ambig_values, ax=ax, label='Ambiguous', fill=True, alpha=0.4, linewidth=1.5, color='#E24A33')
        sns.kdeplot(nonambig_values, ax=ax, label='Non-ambiguous', fill=True, alpha=0.4, linewidth=1.5, color='#348ABD')

        ax.set_title(name)
        ax.set_xlabel('Uncertainty')
        ax.set_ylabel('Density')
        ax.legend()

    # Remove the panels that no method was drawn into.
    for unused_ax in axes[n_methods:]:
        fig.delaxes(unused_ax)

    plt.tight_layout(rect=[0, 0, 1, 0.95])

    plt.show()


In [None]:
# Gather labels and per-method uncertainty values for AmbigQA / phi_4.
is_ambig, method_uncertainties, method_names = generate_kde_data(uncertainties_w_embeddings_df, uncertainties_wo_embeddings_df, 'AmbigQA', 'phi_4', 'all_mpnet_base_v2')

In [None]:
# Plot the distributions for the values currently held in is_ambig,
# method_uncertainties and method_names.
plot_uncertainty_distributions(is_ambig, method_uncertainties, method_names)

In [None]:
# Gather labels and per-method uncertainty values for AmbigInst / phi_4.
is_ambig, method_uncertainties, method_names = generate_kde_data(uncertainties_w_embeddings_df, uncertainties_wo_embeddings_df, 'AmbigInst', 'phi_4', 'all_mpnet_base_v2')

In [None]:
# Plot the distributions for the values currently held in is_ambig,
# method_uncertainties and method_names.
plot_uncertainty_distributions(is_ambig, method_uncertainties, method_names)

In [None]:
# Gather labels and per-method uncertainty values for AmbigQA / llama4_maverick.
is_ambig, method_uncertainties, method_names = generate_kde_data(uncertainties_w_embeddings_df, uncertainties_wo_embeddings_df, 'AmbigQA', 'llama4_maverick', 'all_mpnet_base_v2')

In [None]:
# Plot the distributions for the values currently held in is_ambig,
# method_uncertainties and method_names.
plot_uncertainty_distributions(is_ambig, method_uncertainties, method_names)

In [None]:
# Gather labels and per-method uncertainty values for AmbigInst / llama4_maverick.
is_ambig, method_uncertainties, method_names = generate_kde_data(uncertainties_w_embeddings_df, uncertainties_wo_embeddings_df, 'AmbigInst', 'llama4_maverick', 'all_mpnet_base_v2')

In [None]:
# Plot the distributions for the values currently held in is_ambig,
# method_uncertainties and method_names.
plot_uncertainty_distributions(is_ambig, method_uncertainties, method_names)

# Pairwise distances

In [None]:
import pickle
from tqdm import tqdm
import seaborn as sns

In [None]:
def _cross_variant_distances(variant_ids, embeddings):
    """Return Euclidean distances for every answer pair whose variants differ.

    Args:
        variant_ids: 1-D array holding the variant id of each answer.
        embeddings: 2-D array with one flattened embedding per row, aligned
            with ``variant_ids``.

    Returns:
        1-D array of distances, ordered like the upper triangle (i < j) of the
        pair matrix; empty when there are no such pairs.
    """
    first, second = np.triu_indices(len(variant_ids), k=1)
    cross_variant = variant_ids[first] != variant_ids[second]
    first, second = first[cross_variant], second[cross_variant]
    return np.linalg.norm(embeddings[first] - embeddings[second], axis=1)


def pairwise_distance_histograms(dataset, model):
    """Plot the cumulative distribution of distances between answers to different variants.

    For every question, the Euclidean distance between the embeddings of each
    pair of answers that belong to different variants is collected. The
    pooled distances are shown as a cumulative KDE.

    Args:
        dataset: Dataset name, used in the log paths. 'AmbigInst' is drawn in
            orange-red and every other dataset in blue.
        model: Target model name, used in the log file names.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    log_dir = f'data/logs/{dataset}/clarification_zeroshot/gpt_4o-variations'
    # NOTE: unpickling can execute arbitrary code; only load trusted log files.
    with open(f'{log_dir}/{model}-embeddings-all_mpnet_base_v2.pkl', 'rb') as f:
        embeddings = pickle.load(f)
    input_df = pd.read_json(f'{log_dir}/{model}-answers.json')
    input_df['embedding'] = embeddings
    input_df = input_df[['question_id', 'variant_id', 'answer_id', 'embedding']]

    # Group once instead of re-filtering the full frame per question, and
    # compute each question's pair distances in one vectorised step.
    distances = []
    questions = input_df.groupby('question_id', sort=True)
    for _, question_df in tqdm(questions, total=questions.ngroups):
        variant_ids = question_df['variant_id'].to_numpy()
        vectors = np.stack([np.ravel(embedding) for embedding in question_df['embedding']])
        distances.extend(_cross_variant_distances(variant_ids, vectors))

    plt.figure(figsize=(12, 6))
    sns.kdeplot(
        distances,
        bw_adjust=0.2,
        cumulative=True,    # Enable CDF
        linewidth=2,
        color='#E24A33' if dataset == "AmbigInst" else '#348ABD',    # Orange-red for AmbigInst, blue otherwise
        label='Pairwise Distance CDF',
        fill=True,
        alpha=0.3
    )
    plt.xlabel('Distance', fontsize=12)
    plt.xlim(-0.05,1.6)
    plt.ylabel('Cumulative Probability', fontsize=12)
    plt.grid(True, linestyle='--', linewidth=0.5, color='gray', alpha=0.4)
    plt.legend(frameon=False)
    plt.tight_layout()
    plt.show()

In [None]:
# Cumulative pairwise-distance plot for the phi_4 answers on AmbigQA.
pairwise_distance_histograms('AmbigQA', 'phi_4')

In [None]:
# Cumulative pairwise-distance plot for the phi_4 answers on AmbigInst.
pairwise_distance_histograms('AmbigInst', 'phi_4')

In [None]:
# Cumulative pairwise-distance plot for the llama4_maverick answers on AmbigQA.
pairwise_distance_histograms('AmbigQA', 'llama4_maverick')

In [None]:
# Cumulative pairwise-distance plot for the llama4_maverick answers on AmbigInst.
pairwise_distance_histograms('AmbigInst', 'llama4_maverick')