In [None]:
import os
os.environ["OPENAI_API_KEY"]

In [2]:
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
import re

from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

ds = load_dataset("ailsntua/QEvasion")
train = ds["train"]
test  = ds["test"]

print("Train:", len(train), "Test:", len(test))
print("Columns:", train.column_names)
train.features

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.90M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/259k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3448 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/308 [00:00<?, ? examples/s]

Train: 3448 Test: 308
Columns: ['title', 'date', 'president', 'url', 'question_order', 'interview_question', 'interview_answer', 'gpt3.5_summary', 'gpt3.5_prediction', 'question', 'annotator_id', 'annotator1', 'annotator2', 'annotator3', 'inaudible', 'multiple_questions', 'affirmative_questions', 'index', 'clarity_label', 'evasion_label']


{'title': Value('string'),
 'date': Value('string'),
 'president': Value('string'),
 'url': Value('string'),
 'question_order': Value('int64'),
 'interview_question': Value('string'),
 'interview_answer': Value('string'),
 'gpt3.5_summary': Value('string'),
 'gpt3.5_prediction': Value('string'),
 'question': Value('string'),
 'annotator_id': Value('string'),
 'annotator1': Value('string'),
 'annotator2': Value('string'),
 'annotator3': Value('string'),
 'inaudible': Value('bool'),
 'multiple_questions': Value('bool'),
 'affirmative_questions': Value('bool'),
 'index': Value('int64'),
 'clarity_label': Value('string'),
 'evasion_label': Value('string')}

In [4]:
print("Train:", len(train), "Test:", len(test))
print("Columns:", train.column_names)

print("Unique clarity labels (train):", sorted(set(train["clarity_label"])))
print("Unique clarity labels (test):",  sorted(set(test["clarity_label"])))

Train: 3448 Test: 308
Columns: ['title', 'date', 'president', 'url', 'question_order', 'interview_question', 'interview_answer', 'gpt3.5_summary', 'gpt3.5_prediction', 'question', 'annotator_id', 'annotator1', 'annotator2', 'annotator3', 'inaudible', 'multiple_questions', 'affirmative_questions', 'index', 'clarity_label', 'evasion_label']
Unique clarity labels (train): ['Ambivalent', 'Clear Non-Reply', 'Clear Reply']
Unique clarity labels (test): ['Ambivalent', 'Clear Non-Reply', 'Clear Reply']


In [7]:
from openai import OpenAI
client = OpenAI()

TARGET_LABELS = ["Clear Reply", "Ambivalent", "Clear Non-Reply"]

def build_prompt(question, answer):
    """
    Build a strict classification prompt for response clarity.
    We force the model to output exactly one label so we can parse it reliably.
    """
    return f"""You are an expert political discourse analyst.

Task: classify the clarity of a politician's answer to the given question.

Labels:
- Clear Reply: directly answers the question unambiguously.
- Ambivalent: appears relevant but is vague/hedged/multi-interpretable or partially answers.
- Clear Non-Reply: refuses to answer or does not address the question.

Question:
{question}

Answer:
{answer}

Return ONLY one label from:
Clear Reply, Ambivalent Reply, Clear Non-Reply
"""

def call_llm(prompt, model="gpt-4o-mini"):
    """
    Calls the LLM using OpenAI API.
    Note: Responses API requires max_output_tokens >= 16.
    """
    r = client.responses.create(
        model=model,
        input=prompt,
        temperature=0,
        max_output_tokens=16
    )
    return r.output_text.strip()

def normalize_label(raw_text):
    """
    Normalize model output into one of the target labels.
    Returns None if parsing fails.
    """
    if raw_text is None:
        return None

    # take first line, strip punctuation/junk
    t = raw_text.splitlines()[0].strip()
    t = re.sub(r"[^A-Za-z \-]", "", t).strip()

    # tolerant matching
    if "Clear Non" in t:
        return "Clear Non-Reply"
    if "Clear Reply" in t:
        return "Clear Reply"
    if "Ambivalent" in t:
        return "Ambivalent"
    return None

In [8]:
sample = test.select(range(10))

for i, ex in enumerate(sample):
    q = ex["question"]
    a = ex["interview_answer"]
    gold = ex["clarity_label"]

    raw = call_llm(build_prompt(q, a))
    pred = normalize_label(raw)

    print(f"\n--- Example {i} ---")
    print("GOLD:", gold)
    print("RAW :", raw)
    print("PRED:", pred)


--- Example 0 ---
GOLD: Ambivalent
RAW : Ambivalent
PRED: Ambivalent

--- Example 1 ---
GOLD: Ambivalent
RAW : Ambivalent
PRED: Ambivalent

--- Example 2 ---
GOLD: Ambivalent
RAW : Ambivalent
PRED: Ambivalent

--- Example 3 ---
GOLD: Ambivalent
RAW : Ambivalent
PRED: Ambivalent

--- Example 4 ---
GOLD: Ambivalent
RAW : Ambivalent
PRED: Ambivalent

--- Example 5 ---
GOLD: Ambivalent
RAW : Clear Non-Reply
PRED: Clear Non-Reply

--- Example 6 ---
GOLD: Ambivalent
RAW : Ambivalent
PRED: Ambivalent

--- Example 7 ---
GOLD: Ambivalent
RAW : Ambivalent
PRED: Ambivalent

--- Example 8 ---
GOLD: Ambivalent
RAW : Ambivalent
PRED: Ambivalent

--- Example 9 ---
GOLD: Ambivalent
RAW : Ambivalent
PRED: Ambivalent


In [9]:
subset = test.select(range(min(50, len(test))))

preds, golds, raws = [], [], []

for ex in tqdm(subset):
    q = ex["question"]
    a = ex["interview_answer"]
    gold = ex["clarity_label"]

    raw = call_llm(build_prompt(q, a))
    pred = normalize_label(raw)

    # One retry if the model output is not parseable
    if pred is None:
        raw2 = call_llm(build_prompt(q, a) + "\nAnswer with exactly one label. No other text.")
        pred = normalize_label(raw2)
        raw = raw + " | FALLBACK: " + raw2

    preds.append(pred)
    golds.append(gold)
    raws.append(raw)

acc = accuracy_score(golds, preds)
f1  = f1_score(golds, preds, average="macro")

print("ACC:", acc)
print("Macro-F1:", f1)
print("\nReport:\n", classification_report(golds, preds))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:46<00:00,  1.08it/s]

ACC: 0.7
Macro-F1: 0.4492784992784993

Report:
                  precision    recall  f1-score   support

     Ambivalent       0.86      0.80      0.83        40
Clear Non-Reply       0.17      0.67      0.27         3
    Clear Reply       1.00      0.14      0.25         7

       accuracy                           0.70        50
      macro avg       0.68      0.54      0.45        50
   weighted avg       0.84      0.70      0.72        50






In [10]:
preds, golds, raws = [], [], []

for ex in tqdm(test):
    q = ex["question"]
    a = ex["interview_answer"]
    gold = ex["clarity_label"]

    raw = call_llm(build_prompt(q, a))
    pred = normalize_label(raw)

    if pred is None:
        raw2 = call_llm(build_prompt(q, a) + "\nAnswer with exactly one label. No other text.")
        pred = normalize_label(raw2)
        raw = raw + " | FALLBACK: " + raw2

    preds.append(pred)
    golds.append(gold)
    raws.append(raw)

acc = accuracy_score(golds, preds)
f1  = f1_score(golds, preds, average="macro")

print("FINAL ACC:", acc)
print("FINAL Macro-F1:", f1)
print("\nFINAL Report:\n", classification_report(golds, preds))

df = pd.DataFrame({
    "question": [ex["question"] for ex in test],
    "answer":   [ex["interview_answer"] for ex in test],
    "gold": golds,
    "pred": preds,
    "raw": raws
})

out_path = "prompting_results_gpt4o-mini_zero-shot.csv"
df.to_csv(out_path, index=False)
print("Saved:", out_path)

df.head()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 308/308 [04:19<00:00,  1.19it/s]

FINAL ACC: 0.6038961038961039
FINAL Macro-F1: 0.43666614888071936

FINAL Report:
                  precision    recall  f1-score   support

     Ambivalent       0.70      0.78      0.74       206
Clear Non-Reply       0.21      0.61      0.31        23
    Clear Reply       0.86      0.15      0.26        79

       accuracy                           0.60       308
      macro avg       0.59      0.51      0.44       308
   weighted avg       0.71      0.60      0.58       308

Saved: prompting_results_gpt4o-mini_zero-shot.csv





Unnamed: 0,question,answer,gold,pred,raw
0,Inquiring about the status or information reg...,"Well, the world has made it clear that these t...",Ambivalent,Ambivalent,Ambivalent
1,Will you invite them to the White House to neg...,I think that anytime and anyplace that they ar...,Ambivalent,Ambivalent,Ambivalent
2,Why was it necessary for Japan to drop the thr...,I think that the purpose of the U.N. Security ...,Ambivalent,Ambivalent,Ambivalent Reply
3,When will we see this resolution?,I'll let Condi talk about the details of what ...,Ambivalent,Ambivalent,Ambivalent
4,Updating the figure of Iraqi deaths,"No, I don't consider it a credible report; nei...",Ambivalent,Ambivalent,Ambivalent


In [11]:
labels = ["Clear Reply", "Ambivalent", "Clear Non-Reply"]

cm = confusion_matrix(golds, preds, labels=labels)
cm_df = pd.DataFrame(cm, index=[f"gold:{l}" for l in labels], columns=[f"pred:{l}" for l in labels])
cm_df

Unnamed: 0,pred:Clear Reply,pred:Ambivalent,pred:Clear Non-Reply
gold:Clear Reply,12,59,8
gold:Ambivalent,2,160,44
gold:Clear Non-Reply,0,9,14


In [12]:
errors = df[df["gold"] != df["pred"]].copy()
print("Errors:", len(errors), "out of", len(df))

# Show a few mistakes (useful for analysis)
errors.head(25)[["gold", "pred", "question", "answer"]]

Errors: 122 out of 308


Unnamed: 0,gold,pred,question,answer
5,Ambivalent,Clear Non-Reply,Do you think the Republican leader in the Hous...,I wouldn't have exactly put it that way. But I...
11,Clear Reply,Ambivalent,Shape of the multinational force,"In terms of the troops, that's what the meetin..."
12,Ambivalent,Clear Non-Reply,Do you see any contradictory evidence in the c...,"No, I said—Mike, thanks. I was just speculatin..."
13,Clear Reply,Ambivalent,What have they achieved and what will they lea...,"Okay, I will start answering. Has it become be..."
17,Ambivalent,Clear Non-Reply,Would you campaign against Senator Joe Lieberm...,I'm going to stay out of Connecticut. []
18,Ambivalent,Clear Non-Reply,Would you veto the bill if it passes in the fo...,"First, we have been working throughout the sum..."
21,Clear Reply,Clear Non-Reply,Was this coordinated with you?,"No, it wasn't coordinated with me, and my pati..."
25,Ambivalent,Clear Non-Reply,What is the outlook in your view when you will...,"Well, to answer the first question, there's th..."
27,Ambivalent,Clear Non-Reply,Do you have to take those with something of a ...,"With respect to Europe, I'm deeply concerned, ..."
30,Clear Reply,Ambivalent,New Policy on Intercepting North Korean Ships,"Well, this is not simply a U.S. policy; this i..."


In [13]:
import random

random.seed(42)

train_df = pd.DataFrame(train)

shots = []
for label in ["Clear Reply", "Ambivalent", "Clear Non-Reply"]:
    candidates = train_df[train_df["clarity_label"] == label]
    # pick random rows
    sampled = candidates.sample(2, random_state=42).to_dict("records")
    shots.extend(sampled)

# shuffle so model doesn't learn ordering bias
random.shuffle(shots)

# Show what got selected (sanity)
for s in shots:
    print("\nLABEL:", s["clarity_label"])
    print("Q:", s["question"][:120])
    print("A:", s["interview_answer"][:120])


LABEL: Ambivalent
Q: Leverage to stop Assad and Putin in Aleppo
A: One of the great things about our democracy is, it expresses itself in all sorts of ways, and that includes people prote

LABEL: Clear Reply
Q:  What was the message you were trying to send with not only your decision not to attend the Sochi Games, but also with t
A: Well, first of all, I haven't attended Olympics in the past, and I suspect that me attending the Olympics, particularly 

LABEL: Ambivalent
Q:  Is there any point at which the United States would consider arming the rebels?
A: I was one of the first leaders, I think, around the world to say Asad had to go, in response to the incredible brutality

LABEL: Clear Non-Reply
Q: Request for confirmation of drone strikes in Yemen.
A: I will not have a discussion about operational issues.Ed Henry [FOX News].

LABEL: Clear Reply
Q: And could this signal your real losses for Democrats in the midterms?
A: We're going to win. I think we're going to win in Virginia.And 

In [18]:
def build_fewshot_prompt(shots, question, answer):
    """
    Few-shot prompt using labeled examples from the training split.
    """
    examples_block = ""
    for ex in shots:
        examples_block += (
            f"Question: {ex['question']}\n"
            f"Answer: {ex['interview_answer']}\n"
            f"Label: {ex['clarity_label']}\n\n"
        )

    return f"""You are an expert political discourse analyst.

Task: classify the clarity of a politician's answer to the given question.

Labels:
- Clear Reply: directly answers the question unambiguously.
- Ambivalent: appears relevant but is vague/hedged/multi-interpretable or partially answers.
- Clear Non-Reply: refuses to answer or does not address the question.

Here are labeled examples:
{examples_block}
Now classify this pair.

Question:
{question}

Answer:
{answer}

Return ONLY one label from:
Clear Reply, Ambivalent, Clear Non-Reply
"""

In [19]:
preds_fs, golds_fs, raws_fs = [], [], []

for ex in tqdm(test):
    q = ex["question"]
    a = ex["interview_answer"]
    gold = ex["clarity_label"]

    prompt = build_fewshot_prompt(shots, q, a)
    raw = call_llm(prompt)
    pred = normalize_label(raw)

    if pred is None:
        raw2 = call_llm(prompt + "\nAnswer with exactly one label. No other text.")
        pred = normalize_label(raw2)
        raw = raw + " | FALLBACK: " + raw2

    # final clamp (avoid None messing metrics)
    if pred is None:
        pred = "Ambivalent"

    preds_fs.append(pred)
    golds_fs.append(gold)
    raws_fs.append(raw)

acc_fs = accuracy_score(golds_fs, preds_fs)
f1_fs  = f1_score(golds_fs, preds_fs, average="macro")

print("FEW-SHOT ACC:", acc_fs)
print("FEW-SHOT Macro-F1:", f1_fs)
print("\nFEW-SHOT Report:\n", classification_report(golds_fs, preds_fs))

df_fs = pd.DataFrame({
    "question": [ex["question"] for ex in test],
    "answer":   [ex["interview_answer"] for ex in test],
    "gold": golds_fs,
    "pred": preds_fs,
    "raw": raws_fs
})
out_path = "prompting_results_gpt4o-mini_few-shot.csv"
df_fs.to_csv(out_path, index=False)
print("Saved:", out_path)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 308/308 [04:38<00:00,  1.11it/s]

FEW-SHOT ACC: 0.6883116883116883
FEW-SHOT Macro-F1: 0.5605262551210403

FEW-SHOT Report:
                  precision    recall  f1-score   support

     Ambivalent       0.75      0.83      0.79       206
Clear Non-Reply       0.38      0.48      0.42        23
    Clear Reply       0.60      0.39      0.47        79

       accuracy                           0.69       308
      macro avg       0.57      0.57      0.56       308
   weighted avg       0.68      0.69      0.68       308

Saved: prompting_results_gpt4o-mini_few-shot.csv



