In [1]:
import os
import json
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional
import dspy


with open("xai_key.txt") as f:
    api_key = f.read().strip()

lm = dspy.LM('xai/grok-3-mini', api_key=api_key)
dspy.configure(lm=lm)


HW3_ROOT = os.getcwd()   
PRAG_DATA_DIR = os.path.join(HW3_ROOT, "PragmatiCQA", "data")
SOURCES_DIR   = os.path.join(HW3_ROOT, "PragmatiCQA-sources")

VAL_JSONL   = os.path.join(PRAG_DATA_DIR, "val.jsonl")
TRAIN_JSONL = os.path.join(PRAG_DATA_DIR, "train.jsonl")
TEST_JSONL  = os.path.join(PRAG_DATA_DIR, "test.jsonl")



TOP_K_RETRIEVE = 5 
SEED = 42

@dataclass
class FirstTurnExample:
    topic: str
    question: str
    gold_answer: str                  
    literal_spans: List[str]           
    pragmatic_spans: List[str]        
    conversation_id: str               

def read_jsonl(path: str) -> List[Dict]:
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            rows.append(json.loads(line))
    return rows


In [2]:

from typing import Any

def _extract_span_texts(objs: Optional[List[Dict[str, Any]]]) -> List[str]:
    """Safely extract the 'text' field from a_meta.literal_obj / pragmatic_obj entries."""
    if not objs:
        return []
    out = []
    for obj in objs:
        txt = obj.get("text")
        if isinstance(txt, str) and txt.strip():
            out.append(txt.strip())
    return out

def load_first_questions(val_jsonl_path: str) -> List[FirstTurnExample]:
    rows = read_jsonl(val_jsonl_path)
    examples: List[FirstTurnExample] = []

    for i, conv in enumerate(rows):
        topic = conv.get("topic", "").strip()
        qas = conv.get("qas") or []
        if not qas:
            continue  

        first = qas[0]
        question = (first.get("q") or "").strip()
        gold_answer = (first.get("a") or "").strip()

        a_meta = first.get("a_meta") or {}
        literal_objs = a_meta.get("literal_obj") or []
        pragmatic_objs = a_meta.get("pragmatic_obj") or []

        literal_spans = _extract_span_texts(literal_objs)
        pragmatic_spans = _extract_span_texts(pragmatic_objs)

        ex = FirstTurnExample(
            topic=topic or f"[UNKNOWN_TOPIC_{i}]",
            question=question,
            gold_answer=gold_answer,
            literal_spans=literal_spans,
            pragmatic_spans=pragmatic_spans,
            conversation_id=str(i),
        )
        examples.append(ex)

    return examples

val_first_q_examples = load_first_questions(VAL_JSONL)
print(f"[INFO] Loaded {len(val_first_q_examples)} first-question examples from val.jsonl")
if val_first_q_examples:
    e0 = val_first_q_examples[0]
    print("Sample:")
    print("topic:", e0.topic)
    print("question:", e0.question[:120], "...")
    print("gold_answer:", e0.gold_answer[:120], "...")
    print("#literal_spans:", len(e0.literal_spans), "| #pragmatic_spans:", len(e0.pragmatic_spans))


[INFO] Loaded 179 first-question examples from val.jsonl
Sample:
topic: A Nightmare on Elm Street (2010 film)
question: who is freddy krueger? ...
gold_answer: Freddy Kruger is the nightmare in nighmare on Elm street. Please note, and to be very clear, the system that loads up wi ...
#literal_spans: 1 | #pragmatic_spans: 1


In [3]:
# === Cell 3 (Unified): Topic resolver + FAISS retriever ===
from typing import List, Tuple, Optional
import os, re, glob, difflib
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# ----------------------------------------------------
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMB_MODEL_NAME)

_topic_to_index: dict[str, faiss.Index] = {}
_topic_to_chunks: dict[str, List[str]] = {}

# ----------------------------------------------------
def normalize_topic_to_dirname(topic: str) -> str:
    t = topic.strip()
    t = re.sub(r"\s*\([^)]*\)", "", t)          
    t = re.sub(r"[^0-9A-Za-z _-]+", " ", t)    
    t = re.sub(r"\s+", " ", t).strip()
    t = t.replace(" ", "_")
    return t

def list_available_topics_dirs() -> list:
    if not os.path.isdir(SOURCES_DIR):
        print(f"[ERROR] Sources dir missing: {SOURCES_DIR}")
        return []
    return [d for d in os.listdir(SOURCES_DIR) if os.path.isdir(os.path.join(SOURCES_DIR, d))]

def resolve_topic_dir(topic: str) -> Optional[str]:
    candidates = list_available_topics_dirs()
    if not candidates:
        return None

    norm = normalize_topic_to_dirname(topic)
    direct = os.path.join(SOURCES_DIR, norm)
    if os.path.isdir(direct):
        return direct

    match = difflib.get_close_matches(norm, candidates, n=1, cutoff=0.6)
    if match:
        print(f"[INFO] Fuzzy-resolved topic '{topic}' -> '{match[0]}'")
        return os.path.join(SOURCES_DIR, match[0])

    raw = topic.replace(" ", "_")
    raw_match = difflib.get_close_matches(raw, candidates, n=1, cutoff=0.6)
    if raw_match:
        print(f"[INFO] Fuzzy-resolved (raw) '{topic}' -> '{raw_match[0]}'")
        return os.path.join(SOURCES_DIR, raw_match[0])

    print(f"[WARN] Could not resolve topic folder for: {topic} (norm='{norm}')")
    return None

# ----------------------------------------------------
def build_faiss_index_for_topic(topic: str) -> None:
    topic_dir = resolve_topic_dir(topic)
    if not topic_dir:
        print(f"[WARN] Topic dir not found for topic: {topic}")
        return

    html_files = glob.glob(os.path.join(topic_dir, "*.html"))
    chunks: List[str] = []
    for f in html_files:
        with open(f, "r", encoding="utf-8", errors="ignore") as fh:
            text = fh.read()
        for para in text.split("\n"):
            para = para.strip()
            if len(para) > 30:
                chunks.append(para)

    if not chunks:
        print(f"[WARN] No chunks for topic '{topic}' (dir='{topic_dir}')")
        return

    X = embedder.encode(chunks, convert_to_numpy=True, show_progress_bar=False)
    faiss.normalize_L2(X)
    index = faiss.IndexFlatIP(X.shape[1]) 
    index.add(X)

    _topic_to_index[topic] = index
    _topic_to_chunks[topic] = chunks
    print(f"[INFO] Built FAISS index | topic='{topic}' | dir='{os.path.basename(topic_dir)}' | #chunks={len(chunks)}")

def retrieve_context(topic: str, question: str, top_k: int = TOP_K_RETRIEVE) -> List[str]:
    resolved_dir = resolve_topic_dir(topic)
    if not resolved_dir:
        print(f"[WARN] Could not resolve dir for topic: {topic}")

    if topic not in _topic_to_index:
        build_faiss_index_for_topic(topic)
    if topic not in _topic_to_index:
        return []

    index = _topic_to_index[topic]
    chunks = _topic_to_chunks[topic]

    q_emb = embedder.encode([question], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)
    D, I = index.search(q_emb, top_k)

    return [chunks[i] for i in I[0] if 0 <= i < len(chunks)]

sample_ex = val_first_q_examples[0]
print("Topic:", sample_ex.topic)
ctx = retrieve_context(sample_ex.topic, sample_ex.question, top_k=3)



Topic: A Nightmare on Elm Street (2010 film)
[INFO] Fuzzy-resolved topic 'A Nightmare on Elm Street (2010 film)' -> 'A Nightmare on Elm Street'
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\A Nightmare on Elm Street
[INFO] Fuzzy-resolved topic 'A Nightmare on Elm Street (2010 film)' -> 'A Nightmare on Elm Street'
[INFO] Built FAISS index | topic='A Nightmare on Elm Street (2010 film)' | dir='A Nightmare on Elm Street' | #chunks=33827


In [4]:
import dspy
from typing import List, Tuple, Optional
class SummarizeGoal(dspy.Signature):
    """Summarize the student's long-term goal or interests from prior Q/A history."""
    history = dspy.InputField(desc="List of prior (question, answer) pairs, oldest→newest.")
    summary = dspy.OutputField(desc="2-4 concise sentences summarizing goals/interests.")

class InferPragmaticNeed(dspy.Signature):
    """Infer the current pragmatic need behind the student's question."""
    question = dspy.InputField()
    history_summary = dspy.InputField()
    retrieved_glimpse = dspy.InputField(desc="Short excerpt(s) of the retrieved context.")
    need = dspy.OutputField(desc="Crisp statement of what extra info would be most helpful now.")

class GenerateCoopQuery(dspy.Signature):
    """Generate a cooperative follow-up retrieval query to fetch complementary context."""
    question = dspy.InputField()
    pragmatic_need = dspy.InputField()
    coop_query = dspy.OutputField(desc="A single focused query for the retriever (<= 160 chars).")

class ReasonCoT(dspy.Signature):
    """Deliberate on how to craft a cooperative answer grounded in the retrieved evidence."""
    question = dspy.InputField()
    history_summary = dspy.InputField()
    all_context = dspy.InputField(desc="Concise merged evidence snippets to ground the answer.")
    pragmatic_need = dspy.InputField()
    reasoning = dspy.OutputField(desc="Step-by-step plan: literal facts to include + helpful extras.")

class CooperativeAnswer(dspy.Signature):
    """Produce a cooperative answer grounded in retrieved evidence."""
    question = dspy.InputField()
    history_summary = dspy.InputField()
    all_context = dspy.InputField()
    reasoning = dspy.InputField()
    answer = dspy.OutputField(desc="Final cooperative answer. Cite evidence implicitly; avoid hallucinations; be concise and helpful.")


# ----------------------------------------------------
def _format_history(history: Optional[List[Tuple[str, str]]]) -> str:
    if not history:
        return "(no prior turns)"
    lines = []
    for i, (q, a) in enumerate(history, 1):
        lines.append(f"Turn {i} - Q: {q}\nTurn {i} - A: {a}")
    return "\n".join(lines)

def _shorten_chunks(chunks: List[str], max_chars: int = 1000) -> str:
    """Concatenate chunks with a soft character budget (for prompting)."""
    out, used = [], 0
    for c in chunks:
        c = c.strip()
        if not c:
            continue
        if used + len(c) + 2 > max_chars:
            break
        out.append(c)
        used += len(c) + 2
    return "\n---\n".join(out)


# ----------------------------------------------------
class MultiStepCoopQAModule(dspy.Module):
    def __init__(self, retriever_fn, top_k: int = 5, second_hop: bool = True):
        """
        retriever_fn: callable(topic: str, question: str, top_k: int) -> List[str]
        top_k: כמה קטעים לשלוף בכל שאילתה.
        second_hop: האם לבצע שליפה שניה עם ה-coop query.
        """
        super().__init__()
        self.retriever_fn = retriever_fn
        self.top_k = top_k
        self.second_hop = second_hop

        self.summarize = dspy.Predict(SummarizeGoal)
        self.infer_need = dspy.Predict(InferPragmaticNeed)
        self.gen_query = dspy.Predict(GenerateCoopQuery)
        self.reason = dspy.Predict(ReasonCoT)
        self.answer = dspy.Predict(CooperativeAnswer)

    def forward(
        self,
        *,
        topic: str,
        question: str,
        history: Optional[List[Tuple[str, str]]] = None,
        initial_context: Optional[List[str]] = None,
        top_k: Optional[int] = None,
    ):
        """
        API ראשי:
          - topic: שם הטופיק (לשליפה מתוך ה-HTMLים שלו)
          - question: השאלה הנוכחית
          - history: [(q,a), ...] (ב-4.4.1 לרוב None)
          - initial_context: אם כבר שלפת קטעים מראש (אפשר להשאיר None)
          - top_k: גובר על self.top_k אם סופק
        מחזיר: dict עם שלבי ביניים + answer סופי.
        """
        k = top_k or self.top_k

        ctx1 = initial_context
        if ctx1 is None:
            ctx1 = self.retriever_fn(topic, question, top_k=k)
        ctx1_short = _shorten_chunks(ctx1, max_chars=800) if ctx1 else "(no context retrieved)"

        history_txt = _format_history(history)

        summary = self.summarize(history=history_txt).summary
        need = self.infer_need(
            question=question,
            history_summary=summary,
            retrieved_glimpse=ctx1_short
        ).need

        coop_q = self.gen_query(
            question=question,
            pragmatic_need=need
        ).coop_query

        ctx_all = list(ctx1) if ctx1 else []
        if self.second_hop and coop_q and isinstance(coop_q, str) and len(coop_q.strip()) > 0:
            ctx2 = self.retriever_fn(topic, coop_q.strip(), top_k=max(2, k // 2))
            if ctx2:
                ctx_all.extend(ctx2)

        ctx_all_short = _shorten_chunks(ctx_all, max_chars=1600) if ctx_all else ctx1_short
        plan = self.reason(
            question=question,
            history_summary=summary,
            all_context=ctx_all_short,
            pragmatic_need=need
        ).reasoning

        final = self.answer(
            question=question,
            history_summary=summary,
            all_context=ctx_all_short,
            reasoning=plan
        ).answer

        return {
            "history_summary": summary,
            "pragmatic_need": need,
            "coop_query": coop_q,
            "context_1": ctx1[:k] if ctx1 else [],
            "context_all": ctx_all[: (k + max(2, k // 2)) ],
            "reasoning": plan,
            "answer": final,
        }


# ----------------------------------------------------
multi_step = MultiStepCoopQAModule(retriever_fn=retrieve_context, top_k=TOP_K_RETRIEVE, second_hop=True)

probe = multi_step(
    topic=val_first_q_examples[0].topic,
    question=val_first_q_examples[0].question,
    history=None,               
    initial_context=None,       
    top_k=3
)
print("Coop Query:", probe["coop_query"])
print("Answer (preview):", (probe["answer"] or "")[:300], "...")


[INFO] Fuzzy-resolved topic 'A Nightmare on Elm Street (2010 film)' -> 'A Nightmare on Elm Street'
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\A Nightmare on Elm Street
[INFO] Fuzzy-resolved topic 'A Nightmare on Elm Street (2010 film)' -> 'A Nightmare on Elm Street'
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\A Nightmare on Elm Street
Coop Query: Detailed biography of Freddy Krueger: origins in A Nightmare on Elm Street, creator, characteristics, and cultural significance.
Answer (preview): Freddy Krueger, whose full name is Frederick Charles "Freddy" Krueger, is a fictional character from the "A Nightmare on Elm Street" horror film franchise. He is often referred to as Fred Krueger and is portrayed as a vengeful dream-haunting antagonist with a burned appearance and a signature clawed ...


In [None]:
# === Cell 5: Run Multi-Step module on first questions and save outputs ===
import os, json, time
from tqdm import tqdm
from typing import Dict, Any, List, Optional

OUT_DIR = os.path.join(HW3_ROOT, "outputs")
os.makedirs(OUT_DIR, exist_ok=True)
PRED_JSONL = os.path.join(OUT_DIR, "val_firstq_multistep_predictions.jsonl")

def run_first_questions_multistep(
    examples: List[FirstTurnExample],
    module: MultiStepCoopQAModule,
    top_k: int = TOP_K_RETRIEVE,
    limit: Optional[int] = None,
    save_path: Optional[str] = None,
) -> List[Dict[str, Any]]:

    results: List[Dict[str, Any]] = []
    N = len(examples) if limit is None else min(limit, len(examples))
    t0 = time.time()

    for ex in tqdm(examples[:N], total=N, desc="MultiStep on first questions"):
        try:
            initial_ctx = retrieve_context(ex.topic, ex.question, top_k=top_k)

            out = module(
                topic=ex.topic,
                question=ex.question,
                history=None,             
                initial_context=initial_ctx,
                top_k=top_k
            )

            rec: Dict[str, Any] = {
                "conversation_id": ex.conversation_id,
                "topic": ex.topic,
                "question": ex.question,
                "pred_answer": out.get("answer", ""),
                "gold_answer": ex.gold_answer,
                "literal_spans": ex.literal_spans,
                "pragmatic_spans": ex.pragmatic_spans,
                "history_summary": out.get("history_summary", ""),
                "pragmatic_need": out.get("pragmatic_need", ""),
                "coop_query": out.get("coop_query", ""),
                "context_1": out.get("context_1", []),
                "context_all": out.get("context_all", []),
                "reasoning": out.get("reasoning", ""),
            }
        except Exception as e:
            rec = {
                "conversation_id": ex.conversation_id,
                "topic": ex.topic,
                "question": ex.question,
                "pred_answer": "",
                "gold_answer": ex.gold_answer,
                "literal_spans": ex.literal_spans,
                "pragmatic_spans": ex.pragmatic_spans,
                "error": str(e),
            }
        results.append(rec)

        if save_path:
            with open(save_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")

    dt = time.time() - t0
    print(f"[INFO] Done {N} examples in {dt:.1f}s ({dt/max(N,1):.2f}s/ex)")
    print(f"[INFO] Saved to: {save_path}" if save_path else "[INFO] Not saved (save_path=None)")
    return results

if os.path.exists(PRED_JSONL):
    os.remove(PRED_JSONL)

val_firstq_preds = run_first_questions_multistep(
    val_first_q_examples,
    multi_step,
    top_k=TOP_K_RETRIEVE,
    limit=None,                   
    save_path=PRED_JSONL
)

print(f"Total predictions: {len(val_firstq_preds)}")
for rec in val_firstq_preds[:2]:
    print("-"*80)
    print("Q:", rec["question"])
    print("Pred:", (rec["pred_answer"] or "")[:300], "...")
    print("Gold:", (rec["gold_answer"] or "")[:300], "...")
    print("Coop Query:", rec.get("coop_query"))


MultiStep on first questions:   0%|          | 0/179 [00:00<?, ?it/s]

[INFO] Fuzzy-resolved topic 'A Nightmare on Elm Street (2010 film)' -> 'A Nightmare on Elm Street'
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\A Nightmare on Elm Street
[INFO] Fuzzy-resolved topic 'A Nightmare on Elm Street (2010 film)' -> 'A Nightmare on Elm Street'
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\A Nightmare on Elm Street


MultiStep on first questions:   1%|          | 1/179 [00:26<1:19:09, 26.68s/it]

[INFO] Fuzzy-resolved topic 'A Nightmare on Elm Street (2010 film)' -> 'A Nightmare on Elm Street'
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\A Nightmare on Elm Street
[INFO] Fuzzy-resolved topic 'A Nightmare on Elm Street (2010 film)' -> 'A Nightmare on Elm Street'
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\A Nightmare on Elm Street


MultiStep on first questions:   1%|          | 2/179 [00:49<1:12:44, 24.66s/it]

[INFO] Fuzzy-resolved topic 'A Nightmare on Elm Street (2010 film)' -> 'A Nightmare on Elm Street'
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\A Nightmare on Elm Street
[INFO] Fuzzy-resolved topic 'A Nightmare on Elm Street (2010 film)' -> 'A Nightmare on Elm Street'
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\A Nightmare on Elm Street


MultiStep on first questions:   2%|▏         | 3/179 [01:16<1:14:41, 25.46s/it]

[INFO] Fuzzy-resolved topic 'A Nightmare on Elm Street (2010 film)' -> 'A Nightmare on Elm Street'
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\A Nightmare on Elm Street
[INFO] Fuzzy-resolved topic 'A Nightmare on Elm Street (2010 film)' -> 'A Nightmare on Elm Street'
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\A Nightmare on Elm Street


MultiStep on first questions:   2%|▏         | 4/179 [01:55<1:29:32, 30.70s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Built FAISS index | topic='Batman' | dir='Batman' | #chunks=122677
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:   3%|▎         | 5/179 [33:20<33:48:19, 699.42s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:   3%|▎         | 6/179 [33:48<22:38:25, 471.13s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:   4%|▍         | 7/179 [34:11<15:31:09, 324.82s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:   4%|▍         | 8/179 [34:35<10:52:10, 228.84s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:   5%|▌         | 9/179 [35:03<7:51:05, 166.27s/it] 

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:   6%|▌         | 10/179 [35:31<5:47:54, 123.52s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:   6%|▌         | 11/179 [35:57<4:22:30, 93.75s/it] 

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:   7%|▋         | 12/179 [36:24<3:23:48, 73.22s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:   7%|▋         | 13/179 [36:59<2:50:50, 61.75s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:   8%|▊         | 14/179 [37:23<2:18:29, 50.36s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:   9%|▉         | 17/179 [37:52<1:03:54, 23.67s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  10%|█         | 18/179 [38:17<1:04:08, 23.90s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  11%|█         | 19/179 [38:43<1:05:25, 24.53s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  11%|█         | 20/179 [39:06<1:03:31, 23.97s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  12%|█▏        | 21/179 [39:37<1:08:29, 26.01s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  12%|█▏        | 22/179 [40:03<1:08:00, 25.99s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  13%|█▎        | 23/179 [40:31<1:09:18, 26.65s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  13%|█▎        | 24/179 [41:01<1:11:24, 27.64s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  14%|█▍        | 25/179 [41:34<1:14:35, 29.06s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  15%|█▍        | 26/179 [42:11<1:19:56, 31.35s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  15%|█▌        | 27/179 [42:42<1:19:19, 31.31s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  16%|█▌        | 28/179 [43:11<1:16:58, 30.59s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  16%|█▌        | 29/179 [43:38<1:13:46, 29.51s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  17%|█▋        | 30/179 [44:05<1:11:39, 28.85s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  17%|█▋        | 31/179 [44:33<1:10:21, 28.52s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  18%|█▊        | 32/179 [44:59<1:08:12, 27.84s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  18%|█▊        | 33/179 [45:33<1:12:21, 29.74s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  19%|█▉        | 34/179 [45:58<1:08:16, 28.25s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  20%|█▉        | 35/179 [46:20<1:03:16, 26.37s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  20%|██        | 36/179 [46:45<1:01:43, 25.90s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  21%|██        | 37/179 [47:23<1:10:21, 29.73s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman
[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


MultiStep on first questions:  21%|██        | 38/179 [47:50<1:08:01, 28.95s/it]

[INFO] Using topic dir: c:\Users\gilic\hw3\nlp-with-llms-2025-hw3\PragmatiCQA-sources\Batman


In [None]:
import os, json
from statistics import mean
PRED_JSONL = os.path.join(HW3_ROOT, "outputs", "val_firstq_multistep_predictions.jsonl")
if 'val_firstq_preds' not in globals() or not isinstance(val_firstq_preds, list) or len(val_firstq_preds) == 0:
    val_firstq_preds = []
    if os.path.exists(PRED_JSONL):
        with open(PRED_JSONL, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    val_firstq_preds.append(json.loads(line))
        print(f"[INFO] Loaded {len(val_firstq_preds)} predictions from file.")
    else:
        raise RuntimeError("[ERROR] No predictions found in memory or on disk.")


EVAL_LIMIT = None 

pairs = []
for rec in val_firstq_preds[: (EVAL_LIMIT or len(val_firstq_preds))]:
    pred = (rec.get("pred_answer") or "").strip()
    gold = (rec.get("gold_answer") or "").strip()
    if pred == "" and gold == "":
        continue
    pairs.append((pred, gold))

print(f"[INFO] Evaluating {len(pairs)} pairs with SemanticF1...")

try:
    from dspy.evaluate import SemanticF1
except Exception:
    try:
        from dspy.evaluation import SemanticF1
    except Exception as e:
        raise ImportError(
            f"Could not import SemanticF1 from dspy. "
            f"Please ensure DSPy is up to date. Original error: {e}"
        )


metric = SemanticF1()
precisions, recalls, f1s = [], [], []
try:
    preds = [p for p, _ in pairs]
    gts   = [g for _, g in pairs]
    batch_scores = metric.batch(predictions=preds, references=gts)
    for s in batch_scores:
        precisions.append(float(s.get('precision', 0.0)))
        recalls.append(float(s.get('recall', 0.0)))
        f1s.append(float(s.get('f1', 0.0)))
except Exception:
    for pred, gold in pairs:
        s = metric(prediction=pred, reference=gold)
        precisions.append(float(s.get('precision', 0.0)))
        recalls.append(float(s.get('recall', 0.0)))
        f1s.append(float(s.get('f1', 0.0)))

def _avg(x): 
    return round(mean(x), 4) if x else 0.0

report = {
    "N": len(pairs),
    "precision_avg": _avg(precisions),
    "recall_avg": _avg(recalls),
    "f1_avg": _avg(f1s),
}

print("\n=== SemanticF1 Report (Multi-Step, first questions) ===")
for k, v in report.items():
    print(f"{k}: {v}")

METRICS_JSON = os.path.join(HW3_ROOT, "outputs", "val_firstq_multistep_metrics.json")
with open(METRICS_JSON, "w", encoding="utf-8") as f:
    json.dump(report, f, ensure_ascii=False, indent=2)
print(f"[INFO] Metrics saved to: {METRICS_JSON}")
