In [None]:
# --- Imports & config ---
import os, json, pathlib, itertools
from typing import List, Dict, Any
from dataclasses import dataclass
import pandas as pd
from transformers import pipeline
import dspy
from dspy.evaluate import SemanticF1
import faiss


ROOT = pathlib.Path.cwd().parent if (pathlib.Path.cwd().name == "nlp-with-llms-2025-hw3") else pathlib.Path.cwd()
PRAG_DATA = ROOT / "PragmatiCQA" / "data" / "val.jsonl"            
SOURCES_DIR = ROOT / "PragmatiCQA-sources"                         

# lm = dspy.LM('ollama_chat/devstral', api_base='http://localhost:11434', api_key='')
with open("xai_key.txt") as f:
    api_key = f.read().strip()

lm = dspy.LM('xai/grok-3-mini', api_key=api_key)
dspy.configure(lm=lm)


In [None]:
# 4.3 First-Turn Evaluation
VAL_JSONL = pathlib.Path("PragmatiCQA-data/val.jsonl")
def load_first_turns(val_jsonl_path: pathlib.Path) -> List[Dict[str, Any]]:
    rows = []
    with open(val_jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            conv = json.loads(line)
            qas = conv.get("qas", [])
            if not qas: continue
            turn0 = qas[0]
            rows.append({
                "topic": conv.get("topic", ""),
                "question": turn0.get("q", ""),
                "gold": turn0.get("a", ""),
                "literal_spans": [x["text"] for x in turn0["a_meta"].get("literal_obj", [])],
                "pragmatic_spans": [x["text"] for x in turn0["a_meta"].get("pragmatic_obj", [])],
            })
    return rows

first_turns = load_first_turns(VAL_JSONL)
print("Loaded", len(first_turns), "examples")



Loaded 179 examples


In [40]:
INDEX_DIR = ROOT / "indexes"
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMB_MODEL_NAME)

@dataclass
class TopicIndex:
    faiss_index: faiss.Index
    texts: List[str]

_cache = {}
def load_index(topic: str) -> TopicIndex:
    if topic not in _cache:
        idx_path = INDEX_DIR / f"{topic}.faiss"
        pkl_path = INDEX_DIR / f"{topic}.pkl"
        if not idx_path.exists():
            raise FileNotFoundError(f"No index for topic: {topic}")
        index = faiss.read_index(str(idx_path))
        with open(pkl_path, "rb") as f:
            texts = pickle.load(f)
        _cache[topic] = TopicIndex(index, texts)
    return _cache[topic]

def retrieve_for_topic(question: str, topic: str, k=6) -> List[str]:
    ti = load_index(topic)
    q_vec = embedder.encode([question], normalize_embeddings=True)
    D, I = ti.faiss_index.search(np.array(q_vec, dtype=np.float32), k)
    return [ti.texts[i] for i in I[0] if i < len(ti.texts)]


In [41]:
qa = pipeline("question-answering", model="distilbert/distilbert-base-cased-distilled-squad")

def answer_with_qa(question: str, context: str) -> str:
    if not context.strip():
        return ""
    try:
        out = qa(question=question, context=context)
        return out["answer"].strip()
    except:
        return ""

Device set to use cpu


In [42]:
preds = []
for ex in first_turns:
    q, gold, topic = ex["question"], ex["gold"], ex["topic"]

    # Literal
    ctx_lit = " ".join(ex["literal_spans"])
    pred_lit = answer_with_qa(q, ctx_lit)

    # Pragmatic
    ctx_prag = " ".join(ex["pragmatic_spans"])
    pred_prag = answer_with_qa(q, ctx_prag)

    # Retrieved
    try:
        ctx_ret = "\n".join(retrieve_for_topic(q, topic, k=6))
    except FileNotFoundError:
        ctx_ret = ""
    pred_ret = answer_with_qa(q, ctx_ret)

    preds.append({
        "topic": topic,
        "question": q,
        "gold": gold,
        "pred_lit": pred_lit,
        "pred_prag": pred_prag,
        "pred_ret": pred_ret,
    })

preds_df = pd.DataFrame(preds)
preds_df.head()


Unnamed: 0,topic,question,gold,pred_lit,pred_prag,pred_ret
0,A Nightmare on Elm Street (2010 film),who is freddy krueger?,Freddy Kruger is the nightmare in nighmare on ...,Cannot GET /wiki/A%20N,Cannot GET /wiki/A%20N,
1,A Nightmare on Elm Street (2010 film),who was the star on this movie?,"Robert Englund IS Freddy Kruger, the bad guy f...",20Nightmare,20Nightmare,
2,A Nightmare on Elm Street (2010 film),What is the movie about?,"Ok, here goes, I'm getting ""Cannot get""..so, N...",20film,20film,
3,A Nightmare on Elm Street (2010 film),Who directed the new film?,It was Directed by: Samuel Bayer. Note that th...,2010%20film,2010%20film,
4,Batman,Is the Batman comic similar to the movies?,I would say the movie and comics has same stor...,Gotham City socialites,his parents were killed by a small-time crimin...,


In [None]:
from dspy.evaluate import SemanticF1
import pandas as pd

assert isinstance(preds, list) and len(preds) > 0 and isinstance(preds[0], dict), \

metric = SemanticF1()  

def prf1(q, gold, pred):
    s = metric.module(
        question=q or "",
        ground_truth=str(gold or ""),
        system_response=str(pred or "")
    )
    p, r = float(s.precision), float(s.recall)
    f1 = 2 * p * r / (p + r + 1e-9)
    return p, r, f1



p_lit, r_lit, f1_lit = zip(*[prf1(r["question"], r["gold"], r["pred_lit"])  for r in preds])
p_prg, r_prg, f1_prg = zip(*[prf1(r["question"], r["gold"], r["pred_prag"]) for r in preds])
p_ret, r_ret, f1_ret = zip(*[prf1(r["question"], r["gold"], r["pred_ret"])  for r in preds])

scores_df = pd.DataFrame({
    "topic":    [r.get("topic","")    for r in preds],
    "question": [r.get("question","") for r in preds],
    "p_lit": p_lit, "r_lit": r_lit, "f1_lit": f1_lit,
    "p_prg": p_prg, "r_prg": r_prg, "f1_prg": f1_prg,
    "p_ret": p_ret, "r_ret": r_ret, "f1_ret": f1_ret,
})
metric_cols = ["p_lit","r_lit","f1_lit",
               "p_prg","r_prg","f1_prg",
               "p_ret","r_ret","f1_ret"]


display(scores_df[metric_cols].astype(float).describe())

scores_df.to_csv("part4_3_metrics.csv", index=False)


Unnamed: 0,p_lit,r_lit,f1_lit,p_prg,r_prg,f1_prg,p_ret,r_ret,f1_ret
count,179.0,179.0,179.0,179.0,179.0,179.0,179.0,179.0,179.0
mean,0.817505,0.285407,0.406835,0.759777,0.270348,0.37485,0.089385,0.0,0.0
std,0.378368,0.193985,0.238026,0.418467,0.22644,0.269145,0.2861,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.2,0.309548,0.5,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.33,0.4,1.0,0.25,0.4,0.0,0.0,0.0
75%,1.0,0.333333,0.5,1.0,0.333333,0.5,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
