In [8]:
import pandas as pd
from omegaconf import OmegaConf
from lib.uncertainty import Simulate
from prettytable import PrettyTable

In [None]:
# Run directories to analyse. Each name follows the pattern
# <date>_<time>_<dataset>_<retriever strategy>; later cells split on '_' to
# recover those four fields.
EXPERIMENTS = [
    # --- fever ---
    '2026-01-26_20-22_fever_dense',
    '2026-01-26_20-23_fever_sparse',
    '2026-01-26_23-23_fever_hybrid',
    '2026-01-26_20-22_fever_similar',
    '2026-01-26_20-25_fever_oracle',
    '2026-01-26_20-55_fever_random',
    '2026-01-26_20-23_fever_probabilistic',  # Random
    '2026-01-27_17-16_fever_probabilistic',  # Similar

    # --- compendium ---
    '2026-01-26_21-44_compendium_dense',
    '2026-01-26_22-34_compendium_sparse',
    '2026-01-26_23-23_compendium_hybrid',
    '2026-01-26_22-22_compendium_similar',
    '2026-01-26_21-56_compendium_oracle',
    '2026-01-26_22-05_compendium_random',
    '2026-01-26_22-52_compendium_probabilistic',  # Random
    '2026-01-27_17-17_compendium_probabilistic',  # Similar

    # --- nq ---
    '2026-01-27_13-07_nq_dense',
    '2026-01-27_17-53_nq_sparse',
    '2026-01-27_18-41_nq_hybrid',
    '2026-01-27_17-51_nq_similar',
    '2026-01-27_17-51_nq_oracle',
    '2026-01-27_17-49_nq_random',
    # TODO: add the nq probabilistic (Random) run -- not started yet.
    # TODO: add the nq probabilistic (Similar) run -- scheduled.
]

# Display names for the dataset keys that appear as the third '_'-separated
# field of the experiment names. Every dataset listed in EXPERIMENTS needs an
# entry here, otherwise a lookup for that run raises KeyError.
dataset = {
    'fever': 'Fever',
    'compendium': 'Compendium',
    'nq': 'NQ',
}

# Display names for the retriever-strategy keys that appear as the fourth
# '_'-separated field of the experiment names. Every strategy listed in
# EXPERIMENTS needs an entry here, otherwise a lookup for that run raises
# KeyError.
strategy = {
    'oracle': 'Oracle',
    'random': 'Random',
    'similar': 'Similar',
    'dense': 'Dense',
    'sparse': 'Sparse',
    'hybrid': 'Hybrid',
    'probabilistic': 'Probabilistic',
}

In [10]:
# Load each run's results.json into a DataFrame and tag every row with the
# metadata encoded in the run directory name
# (<date>_<time>_<dataset>_<retriever strategy>).
# Result: results_all maps experiment name -> tagged DataFrame.
results_all = {}

for experiment in EXPERIMENTS:
    # Only the first four '_'-separated fields are used. Descriptive names are
    # used instead of `id`, which would shadow the built-in for the rest of
    # the notebook session.
    run_date, run_time, dataset_key, strategy_key = experiment.split('_')[:4]

    results = pd.read_json(f'../results/{experiment}/results.json').assign(
        # NOTE(review): every row is marked as a correct query unconditionally;
        # presumably the query stage is not evaluated in these runs -- confirm.
        correct_query=True,
        date=run_date,
        time=run_time,
        dataset=dataset_key,
        retriever_strategy=strategy_key,
    )

    results_all[experiment] = results

In [11]:
# Run the uncertainty computation once per loaded experiment.
# success_rates maps experiment name -> whatever compute_uncertainty returns
# for that experiment's DataFrame.
simulate = Simulate()

success_rates = {
    run_name: simulate.compute_uncertainty(run_frame)
    for run_name, run_frame in results_all.items()
}

In [12]:
# Summary table: one row per experiment with the mean ± std success rate of
# the query ('q'), retriever ('r') and generator ('g') stages, plus the number
# of rows loaded for that run.
t = PrettyTable(field_names=[
    'Dataset',
    'Retriever Strategy',
    'Query',
    'Retriever',
    'Generator',
    '# Tests',
])

for experiment, success_rate in success_rates.items():
    # Experiment names are <date>_<time>_<dataset>_<retriever strategy>.
    # Unpack into named fields rather than `id`, which would shadow the built-in.
    _run_date, _run_time, dataset_key, strategy_key = experiment.split('_')[:4]

    # Same "mean ± std" formatting for each of the three stages.
    stage_cells = [
        f"{success_rate[stage]['mean']:.2%} ± {success_rate[stage]['std']:.4%}"
        for stage in ('q', 'r', 'g')
    ]

    t.add_row([
        dataset_key,
        strategy_key,
        *stage_cells,
        len(results_all[experiment]),
    ])

t

Dataset,Retriever Strategy,Query,Retreiver,Generator,# Tests
fever,dense,99.99% ± 0.0099%,65.78% ± 0.4768%,78.83% ± 0.4078%,10000
fever,sparse,99.99% ± 0.0099%,51.07% ± 0.5032%,76.58% ± 0.4214%,10000
fever,hybrid,99.99% ± 0.0099%,77.83% ± 0.4168%,88.15% ± 0.3220%,10000
fever,similar,99.99% ± 0.0099%,0.01% ± 0.0118%,58.71% ± 0.4855%,10000
fever,oracle,99.99% ± 0.0099%,99.98% ± 0.0119%,91.33% ± 0.2816%,10000
fever,random,99.99% ± 0.0099%,3.08% ± 0.1741%,4.28% ± 0.1989%,10000
fever,probabilistic,99.99% ± 0.0099%,80.41% ± 0.3981%,73.86% ± 0.4439%,10000
fever,probabilistic,99.99% ± 0.0099%,79.80% ± 0.4027%,84.70% ± 0.3596%,10000
compendium,dense,99.83% ± 0.1658%,82.42% ± 1.5547%,76.71% ± 1.7234%,598
compendium,sparse,99.83% ± 0.1658%,58.29% ± 2.0206%,55.94% ± 2.0271%,598


In [13]:
# Inspect a single run: how the values of the 'answer' column are distributed
# in the fever / dense experiment.
df = results_all['2026-01-26_20-22_fever_dense']
answer_column = df['answer']
answer_column.value_counts()

answer
SUPPORTS    7219
REFUTES     2781
Name: count, dtype: int64

In [14]:
# Distribution of the 'generated_answer' column for the same run, for
# comparison with the 'answer' column counts.
generated_column = df['generated_answer']
generated_column.value_counts()

generated_answer
SUPPORTS           5949
REFUTES            2326
NOT ENOUGH INFO    1725
Name: count, dtype: int64