In [14]:
import pandas as pd
from omegaconf import OmegaConf
from lib.uncertainty import Simulate
from prettytable import PrettyTable

In [25]:
# Result directories to analyse. Each name is
# <date>_<time>_<dataset>_<retriever strategy>, which later cells split on '_'.
EXPERIMENTS = [
    '2026-01-20_11-24_fever_dense',
    '2026-01-20_12-06_fever_similar',
    '2026-01-20_12-23_fever_oracle',
    '2026-01-20_12-34_fever_random',
    '2026-01-23_13-52_fever_sparse',
    '2026-01-23_16-14_compendium_sparse',
]

# Capitalised label for each dataset token found in the experiment names.
dataset = dict(fever='Fever', compendium='Compendium')

# Capitalised label for each retriever-strategy token.
strategy = dict(
    oracle='Oracle',
    random='Random',
    similar='Similar',
    dense='Dense',
    sparse='Sparse',
)

In [26]:
# Load results.json for every experiment and tag each row with the metadata
# encoded in the experiment name. Keys of results_all are the full names.
results_all = {}

for experiment in EXPERIMENTS:
    # Names look like <date>_<time>_<dataset>_<retriever strategy>.
    # Descriptive names are used instead of `id`, which would shadow the
    # builtin id() for every later cell in the notebook; unpacking also fails
    # loudly if a name has fewer than four '_'-separated fields.
    run_date, run_time, dataset_name, retriever_name = experiment.split('_')[:4]

    # Only results.json is read here; the run's config.yaml is not loaded.
    results = pd.read_json(f'../results/{experiment}/results.json').assign(
        correct_query=True,  # every row is flagged as having a correct query
        date=run_date,
        time=run_time,
        dataset=dataset_name,
        retriever_strategy=retriever_name,
    )

    results_all[experiment] = results

In [None]:
simulate = Simulate()

# Output of compute_uncertainty for each loaded experiment, keyed by the
# experiment name used in results_all.
success_rates = {
    name: simulate.compute_uncertainty(frame)
    for name, frame in results_all.items()
}

In [31]:
# Summary table: one row per experiment with mean ± std for each stage.
# Header typo fixed ('Retreiver' -> 'Retriever').
t = PrettyTable(field_names=['Dataset', 'Retriever Strategy', 'Query', 'Retriever', 'Generator', '# Tests'])

for experiment, success_rate in success_rates.items():
    # Fields 2 and 3 of the experiment name are the dataset and the strategy.
    # Named `name_parts` rather than `id` so the builtin id() is not shadowed.
    name_parts = experiment.split('_')
    t.add_row([
        name_parts[2],
        name_parts[3],
        # 'q', 'r', 'g' fill the Query, Retriever and Generator columns in order.
        *(
            f"{success_rate[stage]['mean']:.2%} ± {success_rate[stage]['std']:.4%}"
            for stage in ('q', 'r', 'g')
        ),
        len(results_all[experiment]),
    ])

t

Dataset,Retriever Strategy,Query,Retreiver,Generator,# Tests
fever,dense,99.99% ± 0.0099%,65.59% ± 0.4775%,78.61% ± 0.4093%,10000
fever,similar,99.99% ± 0.0099%,0.01% ± 0.0118%,58.60% ± 0.4856%,10000
fever,oracle,99.99% ± 0.0099%,99.98% ± 0.0119%,91.34% ± 0.2814%,10000
fever,random,99.99% ± 0.0099%,3.41% ± 0.1829%,4.31% ± 0.1995%,10000
fever,sparse,99.99% ± 0.0099%,61.52% ± 0.4892%,82.41% ± 0.3789%,10000
compendium,sparse,99.83% ± 0.1658%,65.61% ± 1.9451%,61.26% ± 1.9902%,598


In [19]:
# results_all is keyed by the full experiment name, so the lookup must use
# that name; a bare 'dense' key raises KeyError on a fresh kernel.
cond = simulate.load_conditionals(results_all['2026-01-20_11-24_fever_dense'])

cond['r']['b']['q1'].mean

0.6559688062387522