### save model's responses

In [5]:
from pipeline.generate_directions import generate_directions
from pipeline.model_utils.model_factory import construct_model_base
from data.load_datasets import load_data

# Short name of the model to load: "gemma3" or "llama3".
model_name = "llama3"

# Map the short name to the model identifier handed to construct_model_base.
# An unknown short name fails immediately with a KeyError.
model_path = dict(
    llama3="meta-llama/meta-llama-3-8b-instruct",
    gemma3="google/gemma-3-12b-it",
)[model_name]

model_base = construct_model_base(model_path)

In [6]:
from data.load_datasets import load_data
from evaluate import evaluate_by_completions
import torch

# Evaluation settings.
#   eval_data:   "squad", "repliqa", "nq" or "musique"
#   prompt_type: "abstain_aware_prompt" or "standard_prompt"
eval_data = "nq"
prompt_type = "standard_prompt"

# Test split for the chosen prompt style / dataset, as two separate sets
# (presumably answerable vs. unanswerable questions, going by the names).
dir_data = f'data/{prompt_type}/{eval_data}'
ans_test, unans_test = load_data(dir_data, "test")

# Baseline run: no direction vector and no layer are passed.
dirs_path = f'pipeline/runs/{model_name}/{eval_data}'
evaluate_by_completions(
    f'{dirs_path}/evaluations/{prompt_type}/test/',
    ans_test,
    unans_test,
    model_base,
    dir_vector=None,
    layer=None,
    intervention_label="baseline",
    batch_size=4,
)


### evaluate with GPT-4o mini

In [1]:
import openai

def classify_completion(completion):
    """Ask the GPT judge whether a model response answers or abstains.

    The request consists of a fixed system instruction, seven few-shot
    examples, and finally ``completion`` wrapped in the same
    ``Model response: "..."\\nAnswer:`` template. The judge is limited to a
    single output token at temperature 0 and is expected to reply "yes"
    (the model attempted an answer) or "no" (the model abstained).

    Args:
        completion: Text of the model response to classify.

    Returns:
        1 if the judge replied "yes", 0 if it replied "no".

    Raises:
        ValueError: If the reply is missing, empty, or anything other than
            "yes"/"no" after trimming whitespace and lower-casing.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You will be shown a model's response to a question. Your task is to decide whether the model attempted to answer the question.\n"
                "It is considered to abstain if it explicitly says it cannot answer the question, indicates uncertainty, or refuses to provide an answer because the context is insufficient.\n"
                "Otherwise, it is considered to have answered the question.\n\n"
                "Respond with \"yes\" if the model attempted to answer the question.\n"
                "Respond with \"no\" if the model abstained.\n"
                "You will respond with a single word: 'yes' or 'no'. Do not provide any explanation or extra text."
            )
        },
        # Few-shot examples: four answered responses followed by three abstentions.
        {"role": "user", "content": 'Model response: "1598"\nAnswer:'},
        {"role": "assistant", "content": "yes"},
        {"role": "user", "content": 'Model response: "abc"\nAnswer:'},
        {"role": "assistant", "content": "yes"},
        {"role": "user", "content": 'Model response: "The answer can be answered by the passage. According to the passage, Fort Coligny was destroyed in 1560."\nAnswer:'},
        {"role": "assistant", "content": "yes"},
        {"role": "user", "content": 'Model response: "According to the passage, problems capable of theoretical solutions but consuming unreasonable time in practical application are known as intractable problems"\nAnswer:'},
        {"role": "assistant", "content": "yes"},
        {"role": "user", "content": 'Model response: "The passage does not mention BSkyB agreeing with any claim. In fact, it states that BSkyB denied Virgin Media\'s claim that they had substantially increased the asking price for the channels. Therefore, the answer is \\"unanswerable\\"."\nAnswer:'},
        {"role": "assistant", "content": "no"},
        {"role": "user", "content": 'Model response: "The passage does not answer the question. The passage only mentions that the five counties (Los Angeles, Orange, San Diego, San Bernardino, and Riverside) are among the top 15 most populous counties in the United States, but it does not provide the actual names of the five most populous counties in the United States."\nAnswer:'},
        {"role": "assistant", "content": "no"},
        {"role": "user", "content": 'Model response: "The passage does not mention the birth year of Reguier de la Plancha. It only mentions that he died in 1560, but does not provide his birth year."\nAnswer:'},
        {"role": "assistant", "content": "no"},
        # The response actually being judged.
        {"role": "user", "content": f'Model response: "{completion}"\nAnswer:'}
    ]
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0,
        max_tokens=1
    )
    # The message content can be None (no text returned); treat that like an
    # empty reply so it is reported as a ValueError rather than crashing on
    # None.strip().
    content = response.choices[0].message.content
    raw = (content or "").strip().lower()
    if raw not in ("yes", "no"):
        raise ValueError(f"Unexpected response from GPT: {raw}. Expected 'yes' or 'no'.")
    return 1 if raw == "yes" else 0

In [2]:
def compute_metrics(tp, fp, fn):
    """Return ``(precision, recall, f1)`` computed from confusion counts.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        A 3-tuple ``(precision, recall, f1)``. Any ratio whose denominator
        is not positive is reported as 0.0.
    """
    predicted_positive = tp + fp
    actual_positive = tp + fn

    precision = 0.0
    if predicted_positive > 0:
        precision = tp / predicted_positive

    recall = 0.0
    if actual_positive > 0:
        recall = tp / actual_positive

    # F1 is the harmonic mean of precision and recall.
    pr_sum = precision + recall
    if pr_sum > 0:
        f1 = (2 * precision * recall) / pr_sum
    else:
        f1 = 0.0

    return precision, recall, f1

In [7]:
import os
import json
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm
import pandas as pd

# Score every (model, dataset) baseline run with the GPT judge and store the
# per-class precision / recall / F1.
# NOTE: `prompt_type` is not set in this cell; it must already be defined
# (it is assigned in the cell that generates the completions).
results = {}

# The loop variable is deliberately NOT called `model_name`: that name is the
# notebook-level setting paired with the loaded `model_base`, and overwriting
# it here would make a later re-run of the generation cell write results
# under the wrong model directory.
for run_model in ['llama3', 'gemma3']:
    for dataset in ['squad', 'repliqa', 'nq', 'musique']:
        path = f'pipeline/runs/{run_model}/{dataset}/evaluations/{prompt_type}/completions/'
        with open(f'{path}baseline_ans_completions.json', 'r') as f:
            baseline_ans_completions = json.load(f)
        with open(f'{path}baseline_unans_completions.json', 'r') as f:
            baseline_unans_completions = json.load(f)

        # Label convention: 1 = answered, 0 = abstained.
        # Ground truth is 1 for the answerable file and 0 for the unanswerable file.
        true_labels = []
        pred_labels = []

        for item in tqdm(baseline_ans_completions, desc=f"{run_model}/{dataset}/answerable"):
            pred_labels.append(classify_completion(item["response"]))
            true_labels.append(1)

        for item in tqdm(baseline_unans_completions, desc=f"{run_model}/{dataset}/unanswerable"):
            pred_labels.append(classify_completion(item["response"]))
            true_labels.append(0)

        # Confusion counts as (predicted, true) pairs, taken in one place.
        pairs = list(zip(pred_labels, true_labels))
        tp_ans = pairs.count((1, 1))    # answerable, answered
        fp_ans = pairs.count((1, 0))    # unanswerable, but answered
        fn_ans = pairs.count((0, 1))    # answerable, but abstained
        tp_unans = pairs.count((0, 0))  # unanswerable, abstained

        # With "abstained" as the positive class the two error types swap roles.
        fp_unans = fn_ans
        fn_unans = fp_ans

        precision_ans, recall_ans, f1_ans = compute_metrics(tp_ans, fp_ans, fn_ans)
        precision_unans, recall_unans, f1_unans = compute_metrics(tp_unans, fp_unans, fn_unans)
        avg_f1 = (f1_ans + f1_unans) / 2  # unweighted mean of the two class F1 scores

        results[(run_model, dataset)] = {
            "precision_answered": precision_ans,
            "recall_answered": recall_ans,
            "f1_answered": f1_ans,
            "precision_abstained": precision_unans,
            "recall_abstained": recall_unans,
            "f1_abstained": f1_unans,
            "avg_f1": avg_f1,
        }


df = pd.DataFrame.from_dict(results, orient="index")

# JSON object keys must be strings, so the (model, dataset) tuple is
# flattened to "model:dataset".
json_safe_results = {
    f"{model}:{dataset}": metrics
    for (model, dataset), metrics in results.items()
}

# Create the output directory first so the write does not fail on a fresh checkout.
output_dir = f"evaluations/{prompt_type}"
os.makedirs(output_dir, exist_ok=True)
with open(f"{output_dir}/gpt_classification_results.json", "w") as f:
    json.dump(json_safe_results, f, indent=2)


### plot evaluation results

In [8]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import os

# Load the GPT-judge metrics saved for each prompt type.
with open("evaluations/standard_prompt/gpt_classification_results.json", "r") as f:
    regular_results = json.load(f)

with open("evaluations/abstain_aware_prompt/gpt_classification_results.json", "r") as f:
    unans_prompt_results = json.load(f)

# Display names used on the axes / legend.
formal_dataset_names = {
    "squad": "SQuAD",
    "nq": "NQ",
    "musique": "MuSiQue",
    "repliqa": "RepLiQA"
}

formal_model_names = {
    "llama3": "Llama 3",
    "gemma3": "Gemma 3"
}

# One record per (prompt type, model, dataset); metrics are scaled to percent.
plot_data = []

for method_name, results_dict in [
    ("St. Prompt", regular_results),
    ("Abs. Prompt", unans_prompt_results)
]:
    for key, value in results_dict.items():
        # Keys have the form "model:dataset".
        model_key, dataset_key = key.split(":")
        # Named `model_label` rather than `model_name` so the notebook-level
        # `model_name` setting is not overwritten by a display string.
        model_label = formal_model_names[model_key]
        dataset_name = formal_dataset_names[dataset_key]
        combined_label = f"{method_name} – {model_label}"
        # The scoring cell stores abstention recall under "recall_abstained";
        # "recall_unans" is kept only as a fallback in case a results file
        # uses that key instead.
        if "recall_abstained" in value:
            abstain_recall = value["recall_abstained"]
        else:
            abstain_recall = value["recall_unans"]
        plot_data.append({
            "Dataset": dataset_name,
            "Group": combined_label,
            "Recall_Unanswerable": abstain_recall*100,
            "Avg_F1": value["avg_f1"]*100
        })

df = pd.DataFrame(plot_data)

# Fix the left-to-right order of the datasets on the x axis.
df["Dataset"] = pd.Categorical(df["Dataset"], categories=["SQuAD", "RepLiQA", "NQ", "MuSiQue"], ordered=True)

sns.set(style="whitegrid", context="notebook")
sns.set_context("notebook", font_scale=1.3)
plt.figure(figsize=(9.5, 5))

custom_palette = ["#7a6bbf", "#ff9c42", "#5cb85c", "#e15759"]
sns.set_palette(custom_palette)
ax = sns.barplot(
    data=df,
    x="Dataset",
    y = "Avg_F1",
    hue="Group",
    dodge=True,
    width=0.85
)

ax.set_ylim(0, 100)
ax.set_ylabel("Macro-Average F1 Score", fontsize=20)
ax.set_xlabel("")
# Resize tick labels via tick_params: re-setting the labels from
# get_*ticklabels() pins them as fixed strings and triggers matplotlib's
# fixed-ticks warning.
ax.tick_params(axis="x", labelsize=22)
ax.tick_params(axis="y", labelsize=18)
for container in ax.containers:
    ax.bar_label(container, fmt="%.1f", label_type="edge", padding=2, fontsize=14.5)

plt.legend(title="", loc="upper center", fontsize=17, ncol = 2, bbox_to_anchor=(0.5, 1.3), frameon=False, labelspacing = 0.3)
plt.tight_layout()

# Make sure the output directory exists before saving.
os.makedirs("plots", exist_ok=True)
plt.savefig("plots/prompts_f1.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.savefig("plots/prompts_f1.png", format="png", bbox_inches="tight", dpi=300)
plt.show()

In [9]:

# Grouped bar chart of the abstention rate on unanswerable questions.
# Uses `df`, `sns` and `plt` from the previous plotting cell.
sns.set(style="whitegrid", context="notebook")
sns.set_context("notebook", font_scale=1.3)
plt.figure(figsize=(9.5, 5))

custom_palette = ["#7a6bbf", "#ff9c42", "#5cb85c", "#e15759"]
sns.set_palette(custom_palette)
ax = sns.barplot(
    data=df,
    x="Dataset",
    y="Recall_Unanswerable",
    hue="Group",
    dodge=True,
    width=0.85
)

ax.set_ylim(0, 100)
ax.set_ylabel("Abstention Rate on\nUnanswerable Questions", fontsize=20)
ax.set_xlabel("")
# Resize tick labels via tick_params: re-setting the labels from
# get_*ticklabels() pins them as fixed strings and triggers matplotlib's
# fixed-ticks warning.
ax.tick_params(axis="x", labelsize=22)
ax.tick_params(axis="y", labelsize=18)
# Add bar labels
for container in ax.containers:
    ax.bar_label(container, fmt="%.1f", label_type="edge", padding=2, fontsize=14.5)

plt.legend(title="", loc="upper center", fontsize=17, ncol = 2, bbox_to_anchor=(0.5, 1.3), frameon=False, labelspacing = 0.3)
plt.tight_layout()

# Save
plt.savefig("plots/recall_unans_prompts.pdf", format="pdf", bbox_inches="tight", dpi=300)
plt.savefig("plots/recall_unans_prompts.png", format="png", bbox_inches="tight", dpi=300)
plt.show()

### evaluation of GPT-4o mini on 50 manually sampled examples

In [2]:
import os
import json
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm
import pandas as pd

# Confusion counts for the GPT judge, with "answered" (1) as the positive class.
tp = tn = fp = fn = 0

with open("analysis/gpt-eval/ans_responses.json", "r") as ans_file:
    ans_responses = json.load(ans_file)
with open("analysis/gpt-eval/unans_responses.json", "r") as unans_file:
    unans_responses = json.load(unans_file)

# Responses from the "ans" file: the judge is expected to return 1.
for entry in tqdm(ans_responses):
    model_response = entry["response"]
    if classify_completion(model_response) != 1:
        fn += 1
        print(f"False negative for answerable response: {model_response}")
    else:
        tp += 1

# Responses from the "unans" file: the judge is expected to return 0.
for entry in tqdm(unans_responses):
    model_response = entry["response"]
    if classify_completion(model_response) != 0:
        fp += 1
        print(f"False positive for unanswerable response: {model_response}")
    else:
        tn += 1

print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
