In [3]:
# If you don't have datasets installed:

from datasets import load_dataset
import json, re, os
from collections import defaultdict
from typing import List, Tuple, Dict

# --- config ---
SPLIT = "train"           # "train" | "validation" | "test"
K = 3                    # how many topics to examine
PRED_PATH = "scico_train_tanl_extraction.jsonl"  # path to your cached predictions JSONL
SHOW_FIRST_N_PARAS = 3   # per topic, show this many paragraphs verbosely
SHOW_FIRST_N_MENTIONS = 10  # truncate long lists for readability

In [16]:
# must match your extractor's detok to align strings
_punct = re.compile(r"\s+([,.;:%)\]])")
_openp = re.compile(r"([\[(])\s+")
def detok(tokens: List[str]) -> str:
    s = " ".join(tokens)
    s = _punct.sub(r"\1", s)
    s = _openp.sub(r"\1", s)
    return s

def norm(s: str) -> str:
    s = s.lower().strip()
    s = s.replace("’","'").replace("“",'"').replace("”",'"')
    s = re.sub(r"\s+", " ", s)
    return s

# bracketed: [ mention | type | rels ... ]
BRACKET_SPAN_RE = re.compile(r"\[\s*(?P<mention>.+?)\s*\|\s*[^|\]]+?(?:\s*\|[^\]]+)?\]")

def extract_pred_mentions(tanl_output: str) -> List[str]:
    out = []
    for m in BRACKET_SPAN_RE.finditer(tanl_output or ""):
        raw = m.group("mention")
        # be defensive if mention text accidentally contains a '|'
        out.append(raw.split("|")[0].strip())
    return out


In [17]:
scico = load_dataset("allenai/scico")[SPLIT]

# For fast lookup: (topic_id, para_idx) -> {"text":..., "gold_mentions":[...], "doc_id":...}
gold = {}
for row in scico:
    tid = int(row["id"])
    for pidx, toks in enumerate(row["tokens"]):
        text = detok(toks)
        # collect gold mentions for this paragraph
        gstrings = []
        for (pid, s, e, _cid) in row["mentions"]:
            if pid == pidx:
                gstrings.append(detok(row["tokens"][pid][s:e]))
        gold[(tid, pidx)] = {
            "text": text,
            "gold_mentions": gstrings,
            "doc_id": int(row["doc_ids"][pidx]),
        }


In [18]:
assert os.path.exists(PRED_PATH), f"Missing predictions file at {PRED_PATH}"

pred = {}  # (tid, pidx) -> record
with open(PRED_PATH, "r", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        key = (int(rec["topic_id"]), int(rec["para_idx"]))
        pred[key] = rec

# quick sanity: how many overlaps
overlap = sum(1 for k in gold if k in pred)
print(f"Gold paragraphs: {len(gold):,} | Predicted paragraphs: {len(pred):,} | Overlap: {overlap:,}")


Gold paragraphs: 10,660 | Predicted paragraphs: 10,660 | Overlap: 10,660


In [19]:
def compare_paragraph(tid: int, pidx: int, show_text: bool = False) -> Dict:
    g = gold.get((tid, pidx), None)
    pr = pred.get((tid, pidx), None)
    if g is None:
        return {"status": "missing-gold"}
    if pr is None:
        return {"status": "missing-pred"}

    text_g = g["text"]
    text_p = pr.get("text", "")
    same_text = (norm(text_g) == norm(text_p))

    # gold mentions (strings)
    gold_list = g["gold_mentions"]
    # predicted mentions (strings extracted from TANL brackets)
    pred_list = extract_pred_mentions(pr.get("tanl_output",""))

    # normalize for equality check (we keep raw for display)
    gold_norm = set(norm(x) for x in gold_list)
    pred_norm = set(norm(x) for x in pred_list)

    hits = [m for m in gold_list if norm(m) in pred_norm]
    miss = [m for m in gold_list if norm(m) not in pred_norm]

    out = {
        "status": "ok",
        "tid": tid,
        "para_idx": pidx,
        "doc_id": g["doc_id"],
        "texts_equal": same_text,
        "n_gold": len(gold_list),
        "n_pred": len(pred_list),
        "n_hits": len(hits),
        "n_miss": len(miss),
        "hits": hits,
        "miss": miss,
        "gold_sample": gold_list[:SHOW_FIRST_N_MENTIONS],
        "pred_sample": pred_list[:SHOW_FIRST_N_MENTIONS],
    }
    if show_text:
        out["text"] = text_g
        out["tanl_output"] = pred.get("tanl_output","")
    return out


In [20]:
# pick the first K topics from the dataset order
topic_ids = [int(scico[i]["id"]) for i in range(min(K, len(scico)))]

for t_idx, tid in enumerate(topic_ids, 1):
    row = next(r for r in scico if int(r["id"]) == tid)
    n_paras = len(row["tokens"])
    print("="*100)
    print(f"[{t_idx}/{len(topic_ids)}] Topic {tid} — {n_paras} paragraphs")
    print("-"*100)

    # summarize counts per topic
    topic_hits = topic_gold = topic_pred_mentions = 0
    for pidx in range(n_paras):
        g = gold.get((tid,pidx))
        pr = pred.get((tid,pidx))
        if g is None or pr is None:
            continue
        res = compare_paragraph(tid, pidx)
        topic_hits += res["n_hits"]
        topic_gold += res["n_gold"]
        topic_pred_mentions += res["n_pred"]

    recall = (topic_hits/topic_gold) if topic_gold else 0.0
    print(f"Topic-level mention recall: {topic_hits}/{topic_gold} = {recall:.3f}  | predicted mentions in topic: {topic_pred_mentions}")

    # show the first few paragraphs verbosely
    for pidx in range(min(SHOW_FIRST_N_PARAS, n_paras)):
        res = compare_paragraph(tid, pidx, show_text=True)
        print("\n--- Paragraph", pidx, f"(doc_id={res.get('doc_id','?')}) ---")
        if res["status"] != "ok":
            print(res["status"])
            continue

        print(f"texts_equal_to_pred_text: {res['texts_equal']}")
        print("\n[TEXT]")
        print(res["text"])
        print("\n[GOLD mentions] (n =", res["n_gold"], ") →", res["gold_sample"])
        print("[PRED mentions] (n =", res["n_pred"], ") →", res["pred_sample"])
        print("[HITS] →", res["hits"])
        print("[MISS] →", res["miss"])
        # optionally also show the full TANL output for this paragraph
        # print("\n[TANL OUTPUT]")
        # print(res["tanl_output"])


[1/3] Topic 521 — 66 paragraphs
----------------------------------------------------------------------------------------------------
Topic-level mention recall: 2/67 = 0.030  | predicted mentions in topic: 385

--- Paragraph 0 (doc_id=0) ---
texts_equal_to_pred_text: True

[TEXT]
With the rapid development of the network technologies, software development is becoming more and more complicated. Traditional software engineering management methods based on Client/Server structure have not been very competent for large-scale software development.

[GOLD mentions] (n = 1 ) → ['software']
[PRED mentions] (n = 5 ) → ['network technologies', 'software development', 'software engineering management methods', 'Client/Server structure', 'large-scale software development']
[HITS] → []
[MISS] → ['software']

--- Paragraph 1 (doc_id=1) ---
texts_equal_to_pred_text: True

[TEXT]
Conclusion: Selecting the appropriate integration architecture is a fundamental issue of any software development project. 

In [9]:
a = [
  "With",
  "the",
  "rapid",
  "development",
  "of",
  "the",
  "network",
  "technologies",
  ",",
  "software",
  "development",
  "is",
  "becoming",
  "more",
  "and",
  "more",
  "complicated",
  ".",
  "Traditional",
  "software",
  "engineering",
  "management",
  "methods",
  "based",
  "on",
  "Client/Server",
  "structure",
  "have",
  "not",
  "been",
  "very",
  "competent",
  "for",
  "large-scale",
  "software",
  "development",
  ".",
  "Conclusion",
  ":",
  "Selecting",
  "the",
  "appropriate",
  "integration",
  "architecture",
  "is",
  "a",
  "fundamental",
  "issue",
  "of",
  "any",
  "software",
  "development",
  "project",
  ".",
  "HIS-DF",
  "provides",
  "a",
  "unique",
  "methodological",
  "approach",
  "guiding",
  "the",
  "development",
  "of",
  "healthcare",
  "integration",
  "projects",
  ".",
  "Between",
  "Communication",
  "Activities",
  "and",
  "Success",
  "Indexes",
  "in",
  "Small",
  "and",
  "Medium",
  "Software",
  "Projects",
  ".",
  "In",
  "the",
  "field",
  "of",
  "software",
  "developing",
  ",",
  "project",
  "is",
  "almost",
  "the",
  "most",
  "common",
  "organizational",
  "form",
  ".",
  "But",
  "with",
  "the",
  "enhancement",
  "of",
  "software",
  "products",
  "'",
  "complexity",
  "and",
  "the",
  "constant",
  "change",
  "of",
  "customers",
  "'",
  "demand",
  ",",
  "the",
  "importance",
  "of",
  "communication",
  "in",
  "software",
  "development",
  "projects",
  "is",
  "increasingly",
  "prominent",
  ".",
  "Software",
  "reuse",
  "is",
  "a",
  "technology",
  "that",
  "is",
  "usually",
  "used",
  "in",
  "software",
  "developing",
  ".",
  "This",
  "paper",
  "discusses",
  "why",
  "object",
  "oriented",
  "programming",
  "(",
  "OOP",
  ")",
  "are",
  "suitable",
  "for",
  "supporting",
  "software",
  "reuse",
  "and",
  "states",
  "the",
  "ways",
  "of",
  "implementing",
  "software",
  "reuse",
  ".",
  "Reverse",
  "engineering",
  "a",
  "program",
  "constructs",
  "a",
  "high-level",
  "representation",
  "suitable",
  "for",
  "various",
  "software",
  "development",
  "purposes",
  "such",
  "as",
  "documentation",
  "or",
  "reengineering",
  ".",
  "Unfortunately",
  "however",
  ",",
  "there",
  "are",
  "no",
  "established",
  "guidelines",
  "to",
  "assess",
  "the",
  "adequacy",
  "of",
  "such",
  "a",
  "representation",
  ".",
  "Polymorphism",
  "is",
  "the",
  "ability",
  "of",
  "two",
  "classes",
  "to",
  "react",
  "differently",
  "to",
  "the",
  "same",
  "message",
  ".",
  "G.",
  "Software",
  "Development",
  "Software",
  "development",
  "is",
  "similar",
  "to",
  "most",
  "other",
  "types",
  "of",
  "construction",
  "processes",
  ".",
  "A",
  "complex",
  "problem",
  "is",
  "encountered",
  ";",
  "a",
  "solution",
  "is",
  "deduced",
  ";",
  "then",
  "construction",
  "of",
  "the",
  "solution",
  "occurs",
  ".",
  "The",
  "method",
  "used",
  "in",
  "the",
  "applicationdevelopment",
  "approach",
  "is",
  "a",
  "combination",
  "of",
  "interactive",
  "multimedia",
  "and",
  "educational",
  "psychology",
  ".",
  "When",
  "thesoftware",
  "development",
  ",",
  "we",
  "consider",
  "several",
  "aspects",
  "e.g",
  ":",
  "interfaces",
  ",",
  "interactivity",
  ",",
  "ease",
  "of",
  "use",
  ",",
  "and",
  "standalone",
  "software",
  "running",
  "on",
  "mobile",
  "phones",
  "and",
  "multimedia-based",
  ".",
  "Furthermore",
  ",",
  "to",
  "test",
  "the"]

In [5]:
import re

In [4]:
# --------- configuration ----------
SPLIT = "train"                        # "train" | "validation" | "test"
K = 30                                  # how many topics (rows) to examine step by step
PRED_PATH = "scico_train_tanl_extraction.jsonl"  # produced by the batch extractor

# fuzzy match thresholds
DELTA_POS = 2   # allow predicted start token to be within ±2 of gold start
DELTA_LEN = 2   # allow predicted length to differ by at most 2 tokens

# how much to print
SHOW_FIRST_N_PARAS = 4
SHOW_FIRST_N_PRED  = 12

In [6]:
# detokenize exactly like we did at extraction time
_punct = re.compile(r"\s+([,.;:%)\]])")
_openp = re.compile(r"([\[(])\s+")
def detok(tokens: List[str]) -> str:
    s = " ".join(tokens)
    s = _punct.sub(r"\1", s)
    s = _openp.sub(r"\1", s)
    return s

def norm(s: str) -> str:
    s = s.lower().strip()
    s = s.replace("’","'").replace("“",'"').replace("”",'"')
    s = re.sub(r"\s+", " ", s)
    return s

# extract the *mention phrase* (the bit before the first "|") from TANL brackets
BRACKET_SPAN_RE = re.compile(r"\[\s*(?P<mention>.+?)\s*\|\s*[^|\]]+?(?:\s*\|[^\]]+)?\]")

def extract_pred_mentions(tanl_output: str) -> List[str]:
    out = []
    for m in BRACKET_SPAN_RE.finditer(tanl_output or ""):
        raw = m.group("mention")
        out.append(raw.split("|")[0].strip())
    return out

# map a predicted mention string back to token spans by exact detok-equality over windows
def find_token_spans_for_string(par_tokens: List[str], mention: str, max_window: int = 20) -> List[Tuple[int,int]]:
    tgt = norm(mention)
    N = len(par_tokens)
    hits = []
    # cap window length to avoid quadratic blowup; mentions are short by nature
    for i in range(N):
        acc = []
        for j in range(i+1, min(N, i+max_window)+1):
            acc.append(par_tokens[j-1])
            cand = norm(detok(acc))
            if cand == tgt:
                hits.append((i, j))  # [start, end)
                break
            if len(cand) > len(tgt) + 10:
                break
    return hits

def fuzzy_hit(gold_span: Tuple[int,int], pred_span: Tuple[int,int], delta_pos=2, delta_len=2) -> bool:
    gs, ge = gold_span
    ps, pe = pred_span
    len_g = ge - gs
    len_p = pe - ps
    return abs(ps - gs) <= delta_pos and abs(len_p - len_g) <= delta_len


In [7]:
scico = load_dataset("allenai/scico")[SPLIT]

# Build gold maps:
#   (topic_id, para_idx) -> {
#       "tokens": [...],
#       "doc_id": int,
#       "gold_spans": [(start,end), ...],
#       "gold_strings": ["...", ...]
#   }
gold = {}
order_topic_ids = []

for i in range(len(scico)):
    row = scico[i]
    tid = int(row["id"])
    order_topic_ids.append(tid)
    for pid, toks in enumerate(row["tokens"]):
        spans = []
        strings = []
        for (ppid, s, e, _cid) in row["mentions"]:
            if ppid == pid:
                spans.append((int(s), int(e)))
                strings.append(detok(row["tokens"][pid][s:e+1]))
        gold[(tid, pid)] = {
            "tokens": row["tokens"][pid],
            "doc_id": int(row["doc_ids"][pid]),
            "gold_spans": spans,
            "gold_strings": strings,
        }

print(f"Loaded SciCo {SPLIT}: topics={len(scico)}, paragraphs={len(gold)}")


Loaded SciCo train: topics=221, paragraphs=10660


In [8]:
assert os.path.exists(PRED_PATH), f"Missing predictions JSONL at {PRED_PATH}"

pred = {}  # (tid, pid) -> record dict
with open(PRED_PATH, "r", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        key = (int(rec["topic_id"]), int(rec["para_idx"]))
        pred[key] = rec

print(f"Predicted paragraphs: {len(pred)}")


Predicted paragraphs: 10660


In [9]:
def compare_paragraph_fuzzy(tid: int, pid: int, delta_pos=2, delta_len=2):
    g = gold.get((tid, pid))
    p = pred.get((tid, pid))
    if g is None:
        return {"status": "missing-gold"}
    if p is None:
        return {"status": "missing-pred"}

    tokens = g["tokens"]
    gold_spans = g["gold_spans"]
    gold_strings = g["gold_strings"]

    # predicted raw mentions (strings from brackets)
    pred_strings = extract_pred_mentions(p.get("tanl_output",""))

    # map each predicted string to *all* matching token spans in this paragraph
    pred_spans = []
    for s in pred_strings:
        spans = find_token_spans_for_string(tokens, s, max_window=20)
        # keep all matches (rare to have >1, but possible)
        pred_spans.extend(spans)

    # for display: pair spans with strings (first match per string)
    pred_span_for_display = []
    seen = set()
    for s in pred_strings:
        spans = find_token_spans_for_string(tokens, s, max_window=20)
        if spans:
            if spans[0] not in seen:
                pred_span_for_display.append((s, spans[0]))
                seen.add(spans[0])

    # greedy “found” counting: each gold may match at most one predicted span
    hits = 0
    misses = []
    matched_pred = set()
    for gidx, gs in enumerate(gold_spans):
        found = False
        for ps in pred_spans:
            if ps in matched_pred:
                continue
            if fuzzy_hit(gs, ps, delta_pos=delta_pos, delta_len=delta_len):
                hits += 1
                matched_pred.add(ps)
                found = True
                break
        if not found:
            misses.append((gidx, gs, gold_strings[gidx] if gidx < len(gold_strings) else ""))

    out = {
        "status": "ok",
        "doc_id": g["doc_id"],
        "text": detok(tokens),
        "gold_spans": gold_spans,
        "gold_strings": gold_strings,
        "pred_strings": pred_strings,
        "pred_spans_display": pred_span_for_display,
        "n_gold": len(gold_spans),
        "n_pred": len(pred_spans),
        "hits": hits,
        "misses": misses,
    }
    return out


In [10]:
topic_ids = order_topic_ids[:K]

total_gold = total_hits = 0
macro_stats = []

for t_idx, tid in enumerate(topic_ids, 1):
    # number of paragraphs for this topic
    n_paras = sum(1 for (t,p) in gold.keys() if t == tid)
    print("="*110)
    print(f"[{t_idx}/{len(topic_ids)}] Topic {tid} — paragraphs: {n_paras}")
    print("-"*110)

    topic_gold = topic_hits = 0

    # show first few paragraphs verbosely
    for pid in range(min(SHOW_FIRST_N_PARAS, n_paras)):
        res = compare_paragraph_fuzzy(tid, pid)
        print(f"\n--- Paragraph {pid} (doc_id={res.get('doc_id','?')}) ---")
        if res["status"] != "ok":
            print(res["status"])
            continue

        print("[TEXT]")
        print(res["text"])

        print("\n[GOLD mentions] n=", res["n_gold"])
        print(res["gold_strings"][:SHOW_FIRST_N_PRED])

        print("\n[PRED mentions (strings)] n=", len(res["pred_strings"]))
        print(res["pred_strings"][:SHOW_FIRST_N_PRED])

        print("\n[PRED spans (first match per string) — for inspection]")
        show = []
        for s, (st,en) in res["pred_spans_display"][:SHOW_FIRST_N_PRED]:
            show.append(f"({st},{en}) :: {s}")
        print(show)

        print(f"\nHITS={res['hits']}  MISSES={len(res['misses'])}")
        if res["misses"]:
            print("Missed gold (idx, span, text):")
            for gidx, (s,e), gtext in res["misses"][:SHOW_FIRST_N_PRED]:
                print(f"  - #{gidx} ({s},{e}) :: {gtext}")

        topic_gold += res["n_gold"]
        topic_hits += res["hits"]

    # aggregate over *all* paragraphs in the topic for recall (not just printed ones)
    for pid in range(n_paras):
        res = compare_paragraph_fuzzy(tid, pid)
        if res["status"] != "ok":
            continue
        topic_gold += 0  # already added in verbose loop? Let's recompute cleanly below.

    # clean recompute over all paras for topic recall
    topic_gold = topic_hits = 0
    for pid in range(n_paras):
        r = compare_paragraph_fuzzy(tid, pid)
        if r["status"] != "ok":
            continue
        topic_gold += r["n_gold"]
        topic_hits += r["hits"]

    topic_recall = (topic_hits / topic_gold) if topic_gold else 0.0
    total_gold += topic_gold
    total_hits += topic_hits
    macro_stats.append((topic_hits, topic_gold))

    print("\n" + "-"*110)
    print(f"Topic {tid} recall: {topic_hits}/{topic_gold} = {topic_recall:.3f}")

micro_recall = (total_hits / total_gold) if total_gold else 0.0
macro_recall = sum(h/g for h,g in macro_stats if g>0) / max(1, sum(1 for _,g in macro_stats if g>0))
print("\n" + "="*110)
print(f"[MICRO] recall over first {K} topics: {micro_recall:.3f}  ({total_hits}/{total_gold})")
print(f"[MACRO] recall over first {K} topics: {macro_recall:.3f}")


[1/30] Topic 521 — paragraphs: 66
--------------------------------------------------------------------------------------------------------------

--- Paragraph 0 (doc_id=0) ---
[TEXT]
With the rapid development of the network technologies, software development is becoming more and more complicated. Traditional software engineering management methods based on Client/Server structure have not been very competent for large-scale software development.

[GOLD mentions] n= 1
['software development']

[PRED mentions (strings)] n= 5
['network technologies', 'software development', 'software engineering management methods', 'Client/Server structure', 'large-scale software development']

[PRED spans (first match per string) — for inspection]
['(6,8) :: network technologies', '(9,11) :: software development', '(19,23) :: software engineering management methods', '(25,27) :: Client/Server structure', '(33,36) :: large-scale software development']

HITS=1  MISSES=0

--- Paragraph 1 (doc_id=1) ---
[

In [7]:
from evaluate_tanl import *

k = -1
split = "validation"
pred_path = "scico_dev_tanl_extraction.jsonl"
log_path = "_.txt"
delta_pos = 2
delta_len = 2

_, gold, topic_order = load_scico_gold(split)
pred = load_predictions(pred_path)

# choose topic subset
topic_ids = topic_order if k < 0 else topic_order[:k]

# progress & logging
total_gold = total_hits = 0
macro_parts = []
with open(log_path, "w", encoding="utf-8") as log_fh:
    for idx, tid in enumerate(tqdm(topic_ids, desc="Evaluating topics")):
        th, tg, rec = eval_topic(
            tid, gold, pred, delta_pos, delta_len, log_fh=log_fh
        )
        total_hits += th
        total_gold += tg
        if tg > 0:
            macro_parts.append(th / tg)

micro_recall = (total_hits / total_gold) if total_gold else 0.0
macro_recall = (sum(macro_parts) / len(macro_parts)) if macro_parts else 0.0

print("=" * 80)
print(f"SciCo split: {split}")
print(f"Topics evaluated: {len(topic_ids)}  |  Total gold mentions: {total_gold}")
print(f"[MICRO] recall: {micro_recall:.4f}  ({total_hits}/{total_gold})")
print(f"[MACRO] recall: {macro_recall:.4f}")
print(f"Missed cases log: {log_path}")

Evaluating topics: 100%|██████████| 100/100 [00:28<00:00,  3.51it/s]

SciCo split: validation
Topics evaluated: 100  |  Total gold mentions: 4873
[MICRO] recall: 0.8211  (4001/4873)
[MACRO] recall: 0.8237
Missed cases log: _.txt



