In [1]:
import os
from statistics import mean

import dill
from dotenv import load_dotenv

from src.fact_reasoner.comprehensiveness import ComprehensivenessResult


def load_results(filename: str) -> tuple[dict, list[list[ComprehensivenessResult]]]:
    """Load one serialised experiment file from the results directory.

    The directory is taken from the ``RESULTS_PATH`` environment variable
    (after ``load_dotenv()`` has been called), and ``filename`` is appended
    to it.

    Args:
        filename: Name of the file inside the results directory.

    Returns:
        The ``"args"`` and ``"results"`` entries of the loaded object, in
        that order.

    Raises:
        KeyError: If ``RESULTS_PATH`` is not set in the environment.
        FileNotFoundError: If the resolved path does not exist.
    """
    load_dotenv()
    RESULTS_PATH = os.environ["RESULTS_PATH"]
    save_file_path = f"{RESULTS_PATH}/{filename}"

    if os.path.exists(save_file_path):
        # NOTE: dill.load, like pickle, can execute arbitrary code while
        # deserialising -- only point this at result files you trust.
        with open(save_file_path, "rb") as handle:
            payload = dill.load(handle)
        return payload["args"], payload["results"]

    raise FileNotFoundError(f"File {save_file_path} doesn't exist!")


def compute_score_mean(
    all_results: list[list[ComprehensivenessResult]], score_key: str | None = None
):
    scores = []
    for results in all_results:
        for result in results:
            if score_key is None:
                scores.append(result["comprehensiveness_eval_main_score"])
            else:
                scores.append(result["comprehensiveness_eval_results"][score_key])  # type: ignore
    return mean(scores)

## Overall Results

In [2]:
# Experiment tag: only result files whose name contains
# f"_{EXPERIMENT_VERSION}_" are included in the overall summary below.
EXPERIMENT_VERSION = "v12"

In [3]:
import pathlib

# Resolve the results directory from the environment (.env is loaded first)
# and keep only the regular files that sit directly inside it.
load_dotenv()
RESULTS_PATH = os.environ["RESULTS_PATH"]
result_files = list(filter(pathlib.Path.is_file, pathlib.Path(RESULTS_PATH).iterdir()))

In [4]:
# For each dataset, the (row label, score key) pairs that are reported for it.
# Datasets that are not listed contribute no rows.
metrics_by_dataset = {
    "wiki_contradict_humaneval": [
        ("WikiContradict Base", "score_satisfies_criteria"),
    ],
    "conflict_bank": [
        ("ConflictBank Lax", "lax_score"),
        ("ConflictBank Moderate", "moderate_score"),
        ("ConflictBank Strict", "strict_score"),
    ],
}

summary_results = []
for result_file in result_files:
    # Skip files that belong to a different experiment version.
    if f"_{EXPERIMENT_VERSION}_" not in result_file.name:
        continue

    args, all_results = load_results(result_file.name)

    # Fields shared by every summary row produced from this file.
    base_data = dict(
        model=args["model_name"],
        variant=args["variant"],
        relevance_threshold=args["relevance_threshold"],
        confidence_threshold=args["confidence_threshold"],
        use_tools=not args["disable_tools"],
    )

    summary_results.extend(
        {
            **base_data,
            "result": label,
            "value": compute_score_mean(all_results, score_key),
        }
        for label, score_key in metrics_by_dataset.get(args["dataset"], [])
    )

In [5]:
import pandas as pd

# Turn the long-format summary rows into one row per configuration with one
# column per reported score, then append the aggregate columns.
df = (
    pd.DataFrame(summary_results)
    .pivot(
        index=[
            "model",
            "variant",
            "relevance_threshold",
            "confidence_threshold",
            "use_tools",
        ],
        columns="result",
        values="value",
    )
    .reset_index()
    # Case-insensitive ordering by model, then by variant.
    .sort_values(by=["model", "variant"], key=lambda col: col.str.lower())
    .assign(
        **{
            # Average of the three ConflictBank scores.
            "ConflictBank Mean": lambda frame: frame[
                ["ConflictBank Lax", "ConflictBank Moderate", "ConflictBank Strict"]
            ].mean(axis=1),
            # Evaluated after the entry above, so it can read that new column.
            "Mean": lambda frame: frame[
                ["ConflictBank Mean", "WikiContradict Base"]
            ].mean(axis=1),
        }
    )
    .reset_index(drop=True)
)

In [6]:
# Show the summary table with floats rendered to two decimal places. The
# option is scoped to this `with` block, and `df` itself is not modified.
with pd.option_context("display.precision", 2):
    display(df)

result,model,variant,relevance_threshold,confidence_threshold,use_tools,ConflictBank Lax,ConflictBank Moderate,ConflictBank Strict,WikiContradict Base,ConflictBank Mean,Mean
0,gpt-oss-120b,e2e,3.5,2.0,True,0.7,0.63,0.58,0.74,0.63,0.69
1,gpt-oss-120b,e2e-base,3.5,2.0,True,0.67,0.68,0.59,0.76,0.65,0.7
2,gpt-oss-120b,nli,3.5,2.0,False,0.66,0.63,0.55,0.6,0.62,0.61
3,gpt-oss-120b,qa,3.5,2.0,False,0.95,0.69,0.6,0.74,0.75,0.75
4,gpt-oss-20b,e2e,3.5,2.0,True,0.88,0.62,0.55,0.78,0.68,0.73
5,gpt-oss-20b,e2e-base,3.5,2.0,True,0.88,0.69,0.57,0.78,0.71,0.74
6,gpt-oss-20b,nli,3.5,2.0,False,0.66,0.68,0.56,0.51,0.63,0.57
7,gpt-oss-20b,qa,3.5,2.0,False,0.93,0.79,0.64,0.74,0.79,0.77
8,llama-3.3-70b-instruct,e2e,3.5,2.0,True,0.98,0.66,0.61,0.82,0.75,0.79
9,llama-3.3-70b-instruct,e2e-base,3.5,2.0,True,0.99,0.85,0.61,0.81,0.81,0.81


## ELI5

In [7]:
import pathlib

import pandas as pd


def display_eli5_results_for_version(version: str):
    """Display a table of mean ELI5 comprehensiveness scores for one experiment.

    Scans the directory named by the ``RESULTS_PATH`` environment variable,
    loads every file whose name contains ``_{version}_`` and keeps the runs
    whose ``dataset`` argument is ``eli5_base`` or ``eli5_v2``. The rows are
    pivoted so each (evaluated_model, model, variant) combination gets one
    row with one column per dataset plus their mean, and the table is shown
    with two-decimal precision.

    Args:
        version: Experiment tag; matched as the substring ``_{version}_`` of
            each result file name.

    Raises:
        KeyError: If ``RESULTS_PATH`` is not set in the environment.
        ValueError: If no matching file yields an ELI5 summary row.
    """
    # Value of the run's ``dataset`` argument -> column label in the table.
    dataset_labels = {
        "eli5_base": "ELI5 Base Comprehensiveness",
        "eli5_v2": "ELI5 V2 Comprehensiveness",
    }

    load_dotenv()
    RESULTS_PATH = os.environ["RESULTS_PATH"]
    result_files = [p for p in pathlib.Path(RESULTS_PATH).iterdir() if p.is_file()]

    summary_results = []
    for result_file in result_files:
        if f"_{version}_" not in result_file.name:
            continue
        args, all_results = load_results(result_file.name)

        # Check the dataset before reading the ELI5-specific fields, so that
        # an unrelated file sharing the tag cannot break the summary.
        label = dataset_labels.get(args["dataset"])
        if label is None:
            continue

        summary_results.append(
            {
                "evaluated_model": args["evaluated_model_name"],
                "model": args["model_name"],
                "variant": args["variant"],
                "result": label,
                "value": compute_score_mean(all_results),
            }
        )

    # Pivoting an empty frame fails with an opaque pandas error; say what is
    # actually wrong instead.
    if not summary_results:
        raise ValueError(
            f"No ELI5 results matching '_{version}_' found in {RESULTS_PATH}"
        )

    df = pd.DataFrame(summary_results)
    df = (
        df.pivot(
            index=[
                "evaluated_model",
                "model",
                "variant",
            ],
            columns="result",
            values="value",
        )
        .reset_index()
        .sort_values(
            by=["evaluated_model", "model", "variant"], key=lambda col: col.str.lower()
        )
    )

    # Only one of the two datasets may have been evaluated so far. Add the
    # missing column as NaN so the selection below cannot raise KeyError;
    # mean() skips NaN, so the mean then equals the available score.
    score_columns = list(dataset_labels.values())
    for column in score_columns:
        if column not in df.columns:
            df[column] = float("nan")

    df["Comprehensiveness Mean"] = df[score_columns].mean(axis=1)
    df = df.reset_index(drop=True)

    with pd.option_context("display.precision", 2):
        display(df)

In [8]:
# Summarise the ELI5 runs whose result file name contains
# "_eval_ELI5_gpt-oss-20b_".
display_eli5_results_for_version("eval_ELI5_gpt-oss-20b")

result,evaluated_model,model,variant,ELI5 Base Comprehensiveness,ELI5 V2 Comprehensiveness,Comprehensiveness Mean
0,gpt-oss-120b,gpt-oss-20b,qa,0.6,0.71,0.65
1,gpt-oss-20b,gpt-oss-20b,qa,0.58,0.69,0.63
2,llama-3.3-70b-instruct,gpt-oss-20b,qa,0.52,0.67,0.59
3,llama-4-maverick-17b-128e-instruct-fp8,gpt-oss-20b,qa,0.54,0.67,0.6
4,Qwen2.5-72B-Instruct,gpt-oss-20b,qa,0.5,0.66,0.58


In [9]:
# Summarise the ELI5 runs whose result file name contains
# "_eval_ELI5_llama-4-maverick-17b-128e-instruct-fp8_".
display_eli5_results_for_version("eval_ELI5_llama-4-maverick-17b-128e-instruct-fp8")

result,evaluated_model,model,variant,ELI5 Base Comprehensiveness,ELI5 V2 Comprehensiveness,Comprehensiveness Mean
0,gpt-oss-120b,llama-4-maverick-17b-128e-instruct-fp8,e2e,0.63,0.8,0.72
1,gpt-oss-20b,llama-4-maverick-17b-128e-instruct-fp8,e2e,0.59,0.77,0.68
2,llama-3.3-70b-instruct,llama-4-maverick-17b-128e-instruct-fp8,e2e,0.61,0.78,0.69
3,llama-4-maverick-17b-128e-instruct-fp8,llama-4-maverick-17b-128e-instruct-fp8,e2e,0.62,0.75,0.68
4,Qwen2.5-72B-Instruct,llama-4-maverick-17b-128e-instruct-fp8,e2e,0.52,0.71,0.61


In [11]:
# Summarise the ELI5 runs whose result file name contains
# "_eval_ELI5_llama-3-3-70b-instruct_".
# NOTE(review): the tag spells the model "llama-3-3" (hyphen), whereas the
# model column in the output reads "llama-3.3-70b-instruct" -- presumably a
# file-naming convention; confirm before changing.
display_eli5_results_for_version("eval_ELI5_llama-3-3-70b-instruct")

result,evaluated_model,model,variant,ELI5 Base Comprehensiveness,ELI5 V2 Comprehensiveness,Comprehensiveness Mean
0,gpt-oss-120b,llama-3.3-70b-instruct,e2e-base,0.47,0.69,0.58
1,gpt-oss-20b,llama-3.3-70b-instruct,e2e-base,0.45,0.64,0.54
2,llama-3.3-70b-instruct,llama-3.3-70b-instruct,e2e-base,0.5,0.68,0.59
3,llama-4-maverick-17b-128e-instruct-fp8,llama-3.3-70b-instruct,e2e-base,0.51,0.62,0.56
4,Qwen2.5-72B-Instruct,llama-3.3-70b-instruct,e2e-base,0.45,0.59,0.52
