# Configure

In [None]:
!pip install openai tqdm

In [None]:
import os
import openai

# Prefer a key exported as the OPENAI_API_KEY environment variable; fall back
# to the value pasted below. Previously the pasted value was never used, so
# the key was None whenever the environment variable was not set.
OPENAI_API_KEY = "your_openai_api_key_here"
openai.api_key = os.getenv("OPENAI_API_KEY", OPENAI_API_KEY)

# Evaluation: Compute EM/F1 and LLM-as-a-Judge Accuracy with GPT-4o
This cell evaluates model predictions using **GPT-4o acting as an LLM-as-a-Judge**.

For each question, the cell:
- Sends the question, the ground truth and the prediction to the judge model,
- Records the judge's verdict as **True** (correct) or **False** (incorrect) based on prompt instructions,
- Computes standard QA metrics including:

  - **LLM-as-a-Judge Accuracy**: the fraction of predictions the judge marks as **True** against the ground truth,
  - **Exact Match (EM)**: strict equality after normalization,
  - **F1 Score**: token-level overlap between prediction and gold answer.

The results are stored in a JSON file for further analysis.


In [None]:
import pandas as pd
import json
import re
from collections import Counter
from tqdm.notebook import tqdm
import openai

# Ground-truth file; read below for its 'question' and 'answer' columns.
INPUT_CSV_GT = "/content/uniqa_1000.csv"
# Prediction file; read below for its 'question' and 'rag_ans' columns.
INPUT_CSV_PRED = "/content/bm25_uniqa_results_notebook.csv"
# Destination of the per-question results written at the end of the cell.
OUTPUT_JSON = "b_u.json"
# Chat model used as the judge.
MODEL_NAME = "gpt-4o"
def normalize_answer(s: str) -> str:
    """Normalize an answer for EM/F1 comparison.

    Lowercases the text, removes punctuation, removes the English articles
    "a", "an" and "the", and collapses runs of whitespace.

    ``None`` and float NaN (what pandas yields for an empty CSV cell by
    default) are treated as an empty answer instead of being turned into the
    literal tokens "none" / "nan". Any other non-string value is converted
    with ``str()`` first.
    """
    # NaN is the only float value that is not equal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    text = str(s).lower()
    # Punctuation goes first so that abbreviations such as "U.S.A." collapse
    # to one token before the article pattern is applied.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())

def compute_exact(a_gold: str, a_pred: str) -> int:
    """Return 1 when gold and prediction are identical after normalization, else 0."""
    gold_norm = normalize_answer(a_gold)
    pred_norm = normalize_answer(a_pred)
    return 1 if gold_norm == pred_norm else 0

def compute_f1(a_gold: str, a_pred: str) -> float:
    """Return the token-level F1 between a gold answer and a prediction.

    Both strings are normalized with ``normalize_answer`` and split on
    whitespace; F1 is the harmonic mean of token precision and recall, with
    repeated tokens matched by multiplicity.

    When either side has no tokens, the score is 1.0 if both are empty and
    0.0 otherwise, so that an empty prediction for an empty gold answer
    agrees with ``compute_exact`` instead of scoring 0.
    """
    gold_toks = normalize_answer(a_gold).split()
    pred_toks = normalize_answer(a_pred).split()
    if not gold_toks or not pred_toks:
        # No overlap can be computed; agreement means "both are empty".
        return float(gold_toks == pred_toks)
    common = Counter(gold_toks) & Counter(pred_toks)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_toks)
    recall = num_same / len(gold_toks)
    return 2 * precision * recall / (precision + recall)

# System message for the judge: defines when a prediction counts as True/False.
# NOTE(review): the first instruction marks every "no answer" prediction as
# False, while the evaluation loop below separately tracks accuracy on rows
# whose ground truth is empty. With this rule those rows can presumably never
# be judged True; confirm that this is intended.
SYSTEM_PROMPT = '''
### Role
You are an expert language model evaluator. Your task is to evaluate the model prediction given a ground truth.
### Instruction
- If the prediction is no answer -> It is False
- If the prediction contains the ground truth -> It is True
- If the prediction contains variations of the ground truth -> It is True
- If the prediction contradicts the ground truth -> It is False
### Note
- Ignore any extra explanatory text beyond the prediction itself.
- Ignore the difference in language between the prediction and the ground truth.
'''.strip()

# Per-question user message; filled in with str.format() using the
# `question`, `ground_truth` and `prediction` fields.
USER_PROMPT = '''
### Question
{question}
### Ground Truth
{ground_truth}
### Prediction
{prediction}

Respond with exactly one word: True or False.
'''.strip()

def evaluate_pair(question: str, ground_truth: str, prediction: str) -> bool:
    """Ask the judge model whether ``prediction`` is correct for ``question``.

    The three arguments are inserted into ``USER_PROMPT`` and sent, together
    with ``SYSTEM_PROMPT``, to ``MODEL_NAME`` at temperature 0.

    Returns:
        True only when the reply is the single word "true", ignoring case and
        any surrounding whitespace or punctuation (so "True." and "**True**"
        are accepted). Every other reply, including an empty one, is False.

    Raises:
        Errors raised by the OpenAI client are not caught here.
    """
    prompt = USER_PROMPT.format(
        question=question,
        ground_truth=ground_truth,
        prediction=prediction,
    )
    resp = openai.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )
    # Guard against a reply without text content rather than crashing on
    # `.strip()`; such a reply is counted as "not true".
    reply = resp.choices[0].message.content or ""
    # Drop leading/trailing non-word characters before comparing.
    verdict = re.sub(r'^\W+|\W+$', '', reply).lower()
    return verdict == "true"

# Load ground truth and predictions.
# keep_default_na=False keeps empty cells as "" instead of NaN, which
#   * lets `.strip()` below work on missing predictions,
#   * makes the empty-ground-truth ("no answer") check below reachable, and
#   * stops literal answers such as "None", "NA" or "null" from being read
#     as missing values.
df_gt   = pd.read_csv(INPUT_CSV_GT, dtype=str, keep_default_na=False)
df_pred = pd.read_csv(INPUT_CSV_PRED, dtype=str, keep_default_na=False)
# question -> predicted answer (for duplicate questions the last row wins).
pred_map = dict(zip(df_pred['question'], df_pred['rag_ans']))

results = []
sum_em, sum_f1 = 0, 0
num_no_ans_gt = 0
sum_correct_no_ans = 0

for _, row in tqdm(df_gt.iterrows(), total=len(df_gt)):
    q  = row['question']
    gt = row['answer']
    # Questions without a prediction are evaluated with an empty prediction.
    pr = pred_map.get(q, "").strip()
    is_correct = evaluate_pair(q, gt, pr)

    em = compute_exact(gt, pr)
    f1 = compute_f1(gt, pr)
    sum_em  += em
    sum_f1  += f1

    # Rows whose ground truth is empty are tracked separately.
    if isinstance(gt, str) and gt == "":
        num_no_ans_gt += 1
        if is_correct:
            sum_correct_no_ans += 1

    results.append({
        'question':      q,
        'ground_truth':  gt,
        'prediction':    pr,
        'is_correct':    is_correct,
        'exact_match':   em,
        'f1':            f1,
    })

n = len(results)
num_correct = sum(r['is_correct'] for r in results)
# Guard the averages so an empty ground-truth file reports zeros instead of
# raising ZeroDivisionError.
overall_accuracy = num_correct / n if n else 0.0
avg_em           = sum_em / n if n else 0.0
avg_f1           = sum_f1 / n if n else 0.0
no_answer_acc    = (sum_correct_no_ans / num_no_ans_gt) if num_no_ans_gt else 0.0

print(f"Overall accuracy:        {overall_accuracy:.3%} ({num_correct}/{n})")
print(f"Average Exact Match:     {avg_em:.3%}")
print(f"Average F1 score:        {avg_f1:.3%}")
print(f"'No answer' accuracy:    {no_answer_acc:.3%} ({sum_correct_no_ans}/{num_no_ans_gt})")

with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
print(f"Saved detailed results to {OUTPUT_JSON}")

# Ambiguous QA Evaluation using GPT-4o (LLM-as-a-Judge with EM/F1 Metrics)
This cell performs evaluation of ambiguous question answering predictions using **GPT-4o as an LLM-as-a-Judge**.

Predictions are classified as either **CORRECT** or **INCORRECT** according to structured evaluation criteria that distinguish:
- Clarification questions (requests for missing information),
- Direct answers (attempts to answer despite ambiguity).

In addition to the LLM judgment, the following quantitative metrics are computed:
- **LLM-as-a-Judge Accuracy**: the proportion of predictions judged as correct by GPT-4o,
- **Exact Match (EM)**: binary match after normalization against the gold answer,
- **F1 Score**: token-level overlap between the prediction and the reference answer.

Final results, including per-question judgments and scores, are saved as a CSV file for downstream analysis and reporting.

In [None]:
import os
import json
import re
import pandas as pd
from tqdm import tqdm
from collections import Counter
import openai

# Chat model used as the judge for the ambiguous-QA evaluation.
MODEL_NAME = "gpt-4o"

# System message for the judge: rules for labelling a prediction CORRECT or
# INCORRECT.
# NOTE(review): rule 3 compares a direct answer against "the correct answer",
# but USER_TEMPLATE below has no placeholder for a reference answer, so the
# judge only sees the question, the required info and the prediction. Confirm
# whether the gold answer should be added to the prompt.
SYSTEM_PROMPT = """
You are an evaluator for an ambiguous question answering system.

Evaluation Rules:
1. You are given: the question, a list of required info items, and the model's prediction.
2. If the prediction is a clarification question:
    - CORRECT if it explicitly asks for at least one of the required info items.
    - INCORRECT if it does not ask for any of the required info items.
3. If the prediction is a direct answer:
    - CORRECT if it matches or clearly paraphrases the correct answer.
    - INCORRECT otherwise.
4. If the prediction refuses to answer, says "I don't know", or says "Not enough information to answer":
    - Always INCORRECT.
5. Ignore the language used in the prediction or the answer; evaluate purely based on content (language-agnostic).
6. Output exactly one word: CORRECT or INCORRECT.
""".strip()

# Per-row user message; filled in with str.format() using the `question`,
# `info` and `prediction` fields.
USER_TEMPLATE = """
### Question
{question}

### Required Info
{info}

### Model Prediction
{prediction}
""".strip()

def normalize_answer(s: str) -> str:
    """Normalize an answer for EM/F1 comparison.

    Lowercases the text, removes punctuation, removes the English articles
    "a", "an" and "the", and collapses runs of whitespace.

    ``None`` and float NaN (what pandas yields for an empty CSV cell by
    default) are treated as an empty answer instead of becoming the literal
    tokens "none" / "nan". Any other non-string value is converted with
    ``str()`` first.
    """
    # NaN is the only float value that is not equal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    s = str(s).lower()
    # Punctuation must be removed before articles: otherwise the "a" in an
    # abbreviation such as "u.s.a." sits between two word boundaries and is
    # deleted as an article, leaving "us".
    s = re.sub(r'[^\w\s]', '', s)
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    return ' '.join(s.split())

def compute_exact(a_gold: str, a_pred: str) -> int:
    """Return 1 when gold and prediction are identical after normalization, else 0."""
    normalized_gold = normalize_answer(a_gold)
    normalized_pred = normalize_answer(a_pred)
    if normalized_gold == normalized_pred:
        return 1
    return 0

def compute_f1(a_gold: str, a_pred: str) -> float:
    """Return the token-level F1 between a gold answer and a prediction.

    Both strings are normalized with ``normalize_answer`` and split on
    whitespace; F1 is the harmonic mean of token precision and recall, with
    repeated tokens matched by multiplicity.

    When either side has no tokens, the score is 1.0 if both are empty and
    0.0 otherwise, so the result agrees with ``compute_exact`` for answers
    that normalize to nothing (for example a lone article).
    """
    gold_toks = normalize_answer(a_gold).split()
    pred_toks = normalize_answer(a_pred).split()
    if not gold_toks or not pred_toks:
        # No overlap can be computed; agreement means "both are empty".
        return float(gold_toks == pred_toks)
    common = Counter(gold_toks) & Counter(pred_toks)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_toks)
    recall = num_same / len(gold_toks)
    return 2 * precision * recall / (precision + recall)

def evaluate_row(row):
    """Judge one row with the LLM and return its verdict as a string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            "question", "info" and "gpt_answer".

    Returns:
        The judge's reply, upper-cased and with surrounding whitespace and
        punctuation removed (expected to be "CORRECT" or "INCORRECT"), or
        "ERROR" when the API call fails or returns no text.
    """
    info = row["info"]
    if not isinstance(info, str):
        if info is None or (isinstance(info, float) and info != info):
            # Missing value (None / NaN): present it as an empty list rather
            # than the literal "null" / "NaN".
            info = "[]"
        else:
            try:
                info = json.dumps(info, ensure_ascii=False)
            except (TypeError, ValueError):
                # Not JSON-serializable: fall back to an empty list.
                info = "[]"

    prompt = USER_TEMPLATE.format(
        question=row["question"],
        info=info,
        prediction=row["gpt_answer"]
    )

    try:
        response = openai.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt}
            ],
            temperature=0
        )
        content = response.choices[0].message.content
        if content is None:
            raise ValueError("judge returned no content")
        # Strip surrounding non-word characters so replies such as "Correct."
        # or "**CORRECT**" are still recognized by the accuracy computation.
        result = re.sub(r'^\W+|\W+$', '', content).upper()
    except Exception as e:
        # Best-effort boundary: report the failure (message is in Vietnamese,
        # "Evaluation error") and mark the row so it is excluded from the
        # summary statistics instead of aborting the whole run.
        print(f"Lỗi đánh giá: {row['question']} → {e}")
        result = "ERROR"

    return result

# Input files and the destination CSV for the ambiguous-QA evaluation.
GT_CSV   = "/content/UniQA_ambious.csv"
PRED_CSV = "/content/results_with_gpt-am.csv"
OUT_CSV  = "ambiguous_eval_with_openai.csv"

# keep_default_na=False keeps empty cells as "" instead of NaN. NaN is truthy,
# so a missing gold answer would otherwise pass the `if row["answer"]` guard
# below and be scored against the literal string "nan". It also stops answers
# such as "None" or "NA" from being read as missing values.
df_gt = pd.read_csv(GT_CSV, dtype=str, keep_default_na=False)
df_pred = pd.read_csv(PRED_CSV, dtype=str, keep_default_na=False)

# Inner join on the question text. If both files share a column besides
# "question" (e.g. "answer" or "info"), the ground-truth column keeps its name
# and the prediction-side copy gets a "_pred" suffix. With the default
# "_x"/"_y" suffixes, "answer" would vanish and be replaced by "" below.
df = pd.merge(df_gt, df_pred, on="question", how="inner", suffixes=("", "_pred"))
if "answer" not in df.columns:
    # No gold answers available: EM/F1 are reported as 0 for every row.
    df["answer"] = ""

judge_list = []
em_list = []
f1_list = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    judge_list.append(evaluate_row(row))

    gold = row["answer"]
    pred = row["gpt_answer"]
    # Rows without a gold answer get EM = F1 = 0.
    em_list.append(compute_exact(gold, pred) if gold else 0)
    f1_list.append(compute_f1(gold, pred) if gold else 0)

df["judge"] = judge_list
df["exact_match"] = em_list
df["f1_score"] = f1_list

# Rows whose judge call failed are excluded from the summary statistics.
valid = df[df["judge"] != "ERROR"]
accuracy = valid["judge"].eq("CORRECT").mean()
avg_em = valid["exact_match"].mean()
avg_f1 = valid["f1_score"].mean()

df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")

print("\nEvaluation Summary:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Average Exact Match:     {avg_em:.4f}")
print(f"Average F1 Score:        {avg_f1:.4f}")
print(f"Saved: {OUT_CSV}")
