In [7]:
# merge responses
import json

def merge_dataset_with_response(dataset_path, response_path, output_path):
    """Join dataset entries with model responses and write the result as JSON.

    Both input files are read as JSON and iterated as sequences of objects.
    Each output entry starts with the keys "question_id", "Question",
    "response" and "is_hallucination" (always initialised to False), followed
    by every remaining key of the dataset entry in its original order.

    Args:
        dataset_path: Path to the dataset JSON; every entry must provide
            "question_id" and "Question".
        response_path: Path to the responses JSON; every entry must provide
            "question_id" and "response".
        output_path: Destination of the merged JSON. Missing parent
            directories are created.

    Raises:
        KeyError: If a required key is missing from a dataset or response
            entry.
    """
    # Local import keeps this cell self-contained without touching the
    # notebook's import lines.
    import os

    with open(dataset_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    with open(response_path, 'r', encoding='utf-8') as f:
        responses = json.load(f)

    # If a question_id occurs more than once in the responses, the last one wins.
    response_map = {item["question_id"]: item["response"] for item in responses}

    merged_data = []
    for item in dataset:
        # Fixed leading keys, in the order expected in the output file.
        # Questions without a recorded response get an empty string.
        new_item = {
            "question_id": item["question_id"],
            "Question": item["Question"],
            "response": response_map.get(item["question_id"], ""),
            "is_hallucination": False,
        }

        # Carry over the remaining dataset fields without overriding the
        # four keys set above.
        for key, value in item.items():
            if key not in new_item:
                new_item[key] = value

        merged_data.append(new_item)

    # Opening a file for writing does not create its directory; make sure the
    # target directory exists so the merge is not lost at the final step.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged_data, f, indent=2, ensure_ascii=False)

    print(f"Merged file saved to: {output_path}")


# Example call (left commented out):
# merge_dataset_with_response("deepl_input_translated_sorted.json", "Gemini_2.0_Flash_deepl_response.json", "merged_for_human_eval.json")

In [None]:
# Merge ../deepl_input_translated_sorted.json with its response file; the merged
# JSON is written to the current working directory.
# NOTE(review): this cell reads responses from "human_eval/responses/" and writes
# to the working directory, while the other merge cells read from "responses/"
# and write into "human_eval/" — confirm which directory this is meant to run from.
merge_dataset_with_response("../deepl_input_translated_sorted.json", "human_eval/responses/Gemini_2.0_Flash_deepl_response.json", "Gemini_2.0_Flash_deepl_translated_human_eval.json")

Merged file saved to: Gemini_2.0_Flash_deepl_translated_human_eval.json


In [None]:
# Merge ../HalluQA.json with its response file and write the result under human_eval/.
# NOTE(review): the recorded output of this cell prints a path without the
# "human_eval/" prefix, so it appears to come from an earlier run with a
# different output_path — re-run the cell to refresh it.
merge_dataset_with_response("../HalluQA.json", "responses/Gemini_2.0_Flash_response.json", "human_eval/Gemini_2.0_Flash_CN_human_eval.json")

Merged file saved to: Gemini_2.0_Flash_CN_human_eval.json


In [None]:
# Merge ../HalluQA_EN_human.json with its response file and write the result under human_eval/.
# NOTE(review): the recorded output of this cell prints a path without the
# "human_eval/" prefix, so it appears to come from an earlier run with a
# different output_path — re-run the cell to refresh it.
merge_dataset_with_response("../HalluQA_EN_human.json", "responses/Gemini_2.0_Flash_EN_human_response.json", "human_eval/Gemini_2.0_Flash_EN_human_eval.json")

Merged file saved to: Gemini_2.0_Flash_EN_human_eval.json


In [2]:
# calculate hallucination rate
import json
import numpy as np
from tqdm import tqdm

def calculate_hallucination_rate(json_file_path):
    """Compute the share of entries flagged as hallucinations in a JSON file.

    An entry counts as hallucinated only when its "is_hallucination" value is
    the boolean True; a missing key or any other value (including truthy
    strings or numbers) counts as not hallucinated.

    Args:
        json_file_path: Path to a JSON file containing a list of objects.

    Returns:
        The hallucinated fraction as a float in [0, 1], or NaN when the file
        contains no entries.
    """
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    total = len(data)
    hallucinated = sum(1 for item in data if item.get("is_hallucination") is True)

    # A rate over zero entries is undefined: report NaN instead of crashing
    # with ZeroDivisionError on an empty file.
    rate = hallucinated / total if total else float("nan")
    print(f"Hallucination rate: {rate:.2%} ({hallucinated}/{total})")
    return rate


def bootstrap_hallucination_rate(json_file_path, n_iter=1000, ci=95, seed=42):
    """Estimate the hallucination rate and a percentile bootstrap interval.

    Entries are turned into 0/1 labels (1 only when "is_hallucination" is the
    boolean True). The labels are resampled with replacement `n_iter` times,
    each resample having the same size as the data, and the mean of every
    resample is recorded.

    Args:
        json_file_path: Path to a JSON file containing a list of objects.
        n_iter: Number of bootstrap resamples.
        ci: Confidence level in percent (e.g. 95).
        seed: Seed for the random generator used for resampling.

    Returns:
        (mean, (lower, upper)): the mean of the bootstrap rates and the lower
        and upper percentile bounds of the interval.
    """
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Convert once up front instead of letting every resample re-convert the list.
    labels = np.array([1 if item.get("is_hallucination") is True else 0 for item in data])

    # A private legacy generator yields the same stream as np.random.seed(seed)
    # followed by np.random.choice, so results are unchanged, but the global
    # NumPy random state used by other cells is no longer reset as a side effect.
    rng = np.random.RandomState(seed)
    rates = []

    for _ in tqdm(range(n_iter), desc="Bootstrapping"):
        sample = rng.choice(labels, size=len(labels), replace=True)
        rates.append(np.mean(sample))

    # Symmetric percentile interval: cut (100 - ci) / 2 percent from each tail.
    tail = (100 - ci) / 2
    lower = np.percentile(rates, tail)
    upper = np.percentile(rates, 100 - tail)
    mean = np.mean(rates)

    print(f"Bootstrap Hallucination Rate: {mean:.2%}")
    print(f"{ci}% Confidence Interval: [{lower:.2%}, {upper:.2%}]")
    return mean, (lower, upper)


In [3]:
# Point estimate of the hallucination rate, then the bootstrap mean and confidence
# interval for the same file. The second call is the cell's last expression, so its
# returned (mean, (lower, upper)) tuple is also displayed as the cell output.
calculate_hallucination_rate("human_eval/Gemini_2.0_Flash_CN_human_eval.json")
bootstrap_hallucination_rate("human_eval/Gemini_2.0_Flash_CN_human_eval.json")

Hallucination rate: 26.22% (118/450)


Bootstrapping: 100%|██████████| 1000/1000 [00:00<00:00, 48227.58it/s]

Bootstrap Hallucination Rate: 26.28%
95% Confidence Interval: [22.22%, 30.44%]





(np.float64(0.2627866666666666),
 (np.float64(0.2222222222222222), np.float64(0.30444444444444446)))