In [None]:
import pandas as pd
from omegaconf import OmegaConf
from lib.uncertainty import Simulate
from prettytable import PrettyTable

In [None]:
# Result directories to analyse, one per run. Each name follows the pattern
# '<date>_<time>_<dataset>_<retriever strategy>'; the cells below split the
# name on '_' to recover those four fields.
EXPERIMENTS = [
    '2026-01-26_20-22_fever_dense',
    '2026-01-26_20-23_fever_sparse',
    '2026-01-26_20-23_fever_hybrid',
    '2026-01-26_20-25_fever_oracle',
    '2026-01-26_20-55_fever_random',
    '2026-01-26_20-23_fever_probabilistic',
    '2026-01-26_21-44_compendium_dense',
    '2026-01-26_22-34_compendium_sparse',
    '2026-01-26_22-42_compendium_hybrid',
    '2026-01-26_21-56_compendium_oracle',
    '2026-01-26_22-05_compendium_random',
    '2026-01-26_22-52_compendium_probabilistic'
]

# Display names for the dataset keys that appear in the experiment names.
dataset = {
    'fever': 'Fever',
    'compendium': 'Compendium'
}

# Display names for the retriever-strategy keys that appear in the experiment
# names. 'hybrid' and 'probabilistic' are included because EXPERIMENTS contains
# runs with those strategies; 'similar' is kept for backward compatibility.
strategy = {
    'oracle': 'Oracle',
    'random': 'Random',
    'similar': 'Similar',
    'dense': 'Dense',
    'sparse': 'Sparse',
    'hybrid': 'Hybrid',
    'probabilistic': 'Probabilistic'
}

In [None]:
# Load each experiment's results.json into a DataFrame, keyed by experiment
# name, and tag every row with the metadata encoded in that name.
results_all = {}

for experiment in EXPERIMENTS:
    # Names look like '<date>_<time>_<dataset>_<retriever strategy>'.
    # Unpack into named fields instead of binding `id`, which would shadow
    # the builtin for every later cell in the notebook.
    name_parts = experiment.split('_')
    if len(name_parts) < 4:
        raise ValueError(
            f"Experiment name {experiment!r} does not match "
            "'<date>_<time>_<dataset>_<strategy>'"
        )
    run_date, run_time, dataset_key, strategy_key = name_parts[:4]

    results_all[experiment] = pd.read_json(f'../results/{experiment}/results.json').assign(
        # Constant flag set on every row.
        # NOTE(review): presumably read by Simulate.compute_uncertainty — confirm.
        correct_query=True,
        date=run_date,
        time=run_time,
        dataset=dataset_key,
        retriever_strategy=strategy_key,
    )

In [None]:
# One Simulate instance is shared across all experiments.
simulate = Simulate()

# Map each experiment name to the value compute_uncertainty returns for its
# results frame; entries follow the insertion order of results_all.
success_rates = {
    name: simulate.compute_uncertainty(frame)
    for name, frame in results_all.items()
}

In [None]:
# Summary table: one row per experiment, with the 'q', 'r' and 'g' entries of
# its success-rate record shown as "mean ± std" percentages under the Query,
# Retriever and Generator columns, plus the number of rows loaded for that run.
t = PrettyTable(field_names=['Dataset', 'Retriever Strategy', 'Query', 'Retriever', 'Generator', '# Tests'])

for experiment, success_rate in success_rates.items():
    # Experiment names are '<date>_<time>_<dataset>_<retriever strategy>'.
    # Use a descriptive local rather than `id`, which shadows the builtin.
    name_parts = experiment.split('_')
    t.add_row([
        name_parts[2],  # dataset key
        name_parts[3],  # retriever strategy key
        f"{success_rate['q']['mean']:.2%} ± {success_rate['q']['std']:.4%}",
        f"{success_rate['r']['mean']:.2%} ± {success_rate['r']['std']:.4%}",
        f"{success_rate['g']['mean']:.2%} ± {success_rate['g']['std']:.4%}",
        len(results_all[experiment])
    ])

# Bare last expression so the notebook renders the table.
t

Dataset,Retriever Strategy,Query,Retreiver,Generator,# Tests
fever,dense,99.99% ± 0.0099%,65.59% ± 0.4775%,78.61% ± 0.4093%,10000
fever,similar,99.99% ± 0.0099%,0.01% ± 0.0118%,58.60% ± 0.4856%,10000
fever,oracle,99.99% ± 0.0099%,99.98% ± 0.0119%,91.34% ± 0.2814%,10000
fever,random,99.99% ± 0.0099%,3.41% ± 0.1829%,4.31% ± 0.1995%,10000
fever,sparse,99.99% ± 0.0099%,61.52% ± 0.4892%,82.41% ± 0.3789%,10000
compendium,sparse,99.83% ± 0.1658%,65.61% ± 1.9451%,61.26% ± 1.9902%,598
compendium,dense,99.83% ± 0.1658%,82.42% ± 1.5547%,74.88% ± 1.7679%,598
compendium,oracle,99.83% ± 0.1658%,99.58% ± 0.2590%,90.05% ± 1.2186%,598
compendium,random,99.83% ± 0.1658%,0.25% ± 0.1958%,3.80% ± 0.7628%,598
compendium,similar,99.83% ± 0.1658%,0.25% ± 0.1958%,39.32% ± 1.9605%,598
