# 02 — Information Extraction Demos (Rule-based + spaCy NER)

This notebook:
- Loads processed data (from Step 1)
- Runs **rule-based** regex extraction and **spaCy NER** on a small sample
- Shows qualitative, side-by-side examples
- Saves demo outputs for the report


In [2]:
import os, sys, json, yaml, time
from pathlib import Path

# Make the repo's local package importable. When the kernel's working directory
# is named "notebooks", the repository root is taken to be its parent; otherwise
# the working directory itself is used as the root.
if Path.cwd().name == "notebooks":
    REPO_ROOT = Path.cwd().resolve().parent
else:
    REPO_ROOT = Path.cwd()
SRC_DIR = REPO_ROOT.joinpath("src")

# Prepend src/ once so re-running the cell does not keep growing sys.path.
src_entry = str(SRC_DIR)
if src_entry not in sys.path:
    sys.path.insert(0, src_entry)

print("Repo root:", REPO_ROOT)
print("Using src dir:", SRC_DIR)

import pandas as pd
from IPython.display import display, HTML

from escalate_nlp_agent.extract.rule_based import run_rule_based
from escalate_nlp_agent.extract.spacy_ner import run_spacy_ner


Repo root: C:\Users\BAB AL SAFA\Documents\Vani\personal\escalate-nlp-agent
Using src dir: C:\Users\BAB AL SAFA\Documents\Vani\personal\escalate-nlp-agent\src


In [3]:
# Load the master config, then the dataset config it points to.
# The encoding is pinned to UTF-8: without it open() decodes with the locale's
# preferred encoding, so the same YAML could parse differently across machines.
CONFIG_PATH = REPO_ROOT / "configs" / "config.yaml"
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# cfg["dataset_config"] is joined onto REPO_ROOT to locate the dataset YAML.
DS_CFG_PATH = REPO_ROOT / cfg["dataset_config"]
with open(DS_CFG_PATH, "r", encoding="utf-8") as f:
    ds_cfg = yaml.safe_load(f)

print("Dataset:", ds_cfg["name"], "| id:", ds_cfg["id"])


Dataset: NLTK Reuters | id: reuters


In [4]:
import subprocess
import sys

import spacy

SPACY_MODEL = "en_core_web_sm"

# Verify the spaCy model is installed; download it only when it is missing.
try:
    spacy.load(SPACY_MODEL)
except OSError:
    # spaCy reports a missing model package as OSError; any other failure is a
    # real problem and is allowed to surface instead of triggering a download.
    # sys.executable pins the download to the interpreter running this kernel,
    # and the argument list form is safe for interpreter paths containing spaces.
    subprocess.check_call([sys.executable, "-m", "spacy", "download", SPACY_MODEL])


In [5]:
# Locate the processed training split; it must already exist on disk.
proc_dir = REPO_ROOT.joinpath(ds_cfg["outputs"]["processed_dir"])
train_path = proc_dir.joinpath("train.parquet")
assert train_path.exists(), f"Missing {train_path}. Run Step 1 preprocessing."

# Keep only the columns this demo uses.
df = pd.read_parquet(train_path).loc[:, ["id", "text", "title"]]
print(df.shape)
df.head(3)


(7616, 3)


Unnamed: 0,id,text,title
0,training/1,bahia cocoa review showers continued throughou...,
1,training/10,computer terminal systems &lt;cpml> completes ...,
2,training/100,n.z. trading bank deposit growth rises slightl...,


In [6]:
SAMPLE_N = 20  # fast demo size
SAMPLE_SEED = 42  # fixed seed so the same documents are drawn on every run

# Cap the request at the frame length: DataFrame.sample raises ValueError when
# asked for more rows than exist (sampling is without replacement by default).
sample = (
    df.sample(min(SAMPLE_N, len(df)), random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)
sample.head(3)


Unnamed: 0,id,text,title
0,training/12077,walker telecommunications corp &lt;wtel> 4th q...,
1,training/2714,furniture (ufurf) ups bench craft (sofa) stake...,
2,training/13263,caesars world board approves recapitalization ...,


In [7]:
# Paths to the two extractor configs (regex rules and spaCy NER).
RULE_CFG = REPO_ROOT / "configs" / "extract" / "rule_based.yaml"
SPCY_CFG = REPO_ROOT / "configs" / "extract" / "spacy_ner.yaml"

# UTF-8 is pinned explicitly; otherwise open() decodes with the locale's
# preferred encoding and the YAML could be read differently across machines.
with open(RULE_CFG, "r", encoding="utf-8") as f:
    rule_cfg = yaml.safe_load(f)
with open(SPCY_CFG, "r", encoding="utf-8") as f:
    spacy_cfg = yaml.safe_load(f)

# Show the full rule config and the configured spaCy model name.
rule_cfg, spacy_cfg["model"]


({'name': 'Rule-based Extraction',
  'patterns': {'dates': '\\b\\d{1,2}[/-]\\d{1,2}[/-]\\d{2,4}\\b',
   'numbers': '\\b\\d+(?:\\.\\d+)?\\b',
   'emails': '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}',
   'urls': 'https?://[^\\s]+'},
  'outputs': {'file': 'data/processed/{dataset}/extractions_rule_based.parquet'}},
 'en_core_web_sm')

In [8]:
# Time the rule-based extractor on the sample.
# perf_counter is a monotonic clock intended for measuring elapsed time;
# time.time() follows the wall clock, which can be adjusted while the cell runs.
t0 = time.perf_counter()
rb_out = run_rule_based(sample[["id","text"]].copy(), rule_cfg)  # copy: keep `sample` untouched
t1 = time.perf_counter()
print(f"Rule-based extraction: {len(rb_out)} rows, {t1 - t0:.2f}s")
rb_out.head(3)


Rule-based extraction: 20 rows, 0.01s


Unnamed: 0,id,extracted
0,training/12077,"{'dates': [], 'numbers': ['818', '25.2', '58',..."
1,training/2714,"{'dates': [], 'numbers': ['4', '2', '45.3', '2..."
2,training/13263,"{'dates': [], 'numbers': ['28.7', '37.8', '28'..."


In [11]:
# Time the spaCy NER extractor on the same sample.
# perf_counter is a monotonic clock intended for measuring elapsed time;
# time.time() follows the wall clock, which can be adjusted while the cell runs.
t0 = time.perf_counter()
ner_out = run_spacy_ner(sample[["id","text"]].copy(), spacy_cfg)  # copy: keep `sample` untouched
t1 = time.perf_counter()
print(f"spaCy NER extraction: {len(ner_out)} rows, {t1 - t0:.2f}s")
ner_out.head(3)


spaCy NER extraction: 20 rows, 0.82s


Unnamed: 0,id,entities
0,training/12077,"[{'label': 'ORG', 'text': 'walker telecommunic..."
1,training/2714,"[{'label': 'ORG', 'text': 'universal furniture..."
2,training/13263,"[{'label': 'ORG', 'text': 'caesars world board..."


In [10]:
def html_entities(entity_list, max_items=12):
    """Render NER entities as a row of inline HTML "chips".

    Labels and entity text are HTML-escaped, so characters such as ``<`` or
    ``&`` in the source documents are shown literally instead of being parsed
    as markup.

    Args:
        entity_list: Sliceable sequence of mappings, each providing ``'label'``
            and ``'text'`` keys.
        max_items: Maximum number of entities rendered; later ones are dropped.

    Returns:
        A single HTML string with one ``<span>`` per entity, joined by spaces
        (the empty string when there are no entities).
    """
    from html import escape  # function-scope import keeps this cell self-contained

    chip_style = "padding:2px 6px;margin:2px;border:1px solid #ccc;border-radius:8px;display:inline-block;"
    return " ".join(
        f"<span style='{chip_style}'>{escape(str(ent['label']))}: <b>{escape(str(ent['text']))}</b></span>"
        for ent in entity_list[:max_items]
    )

def html_rule_hits(extracted, max_items=6):
    """Render rule-based hits as an HTML bullet list, one bullet per category.

    Category names and hit values are HTML-escaped so extracted strings (URLs
    with ``&``, text with ``<``) are shown literally rather than parsed as markup.

    Args:
        extracted: Mapping of category name to a sliceable sequence of hits,
            e.g. ``{'dates': [...], 'numbers': [...]}``.
        max_items: Maximum number of hits listed per category.

    Returns:
        A ``<ul>...</ul>`` string; categories with no hits are omitted, so the
        result is ``"<ul></ul>"`` when nothing matched.
    """
    from html import escape  # function-scope import keeps this cell self-contained

    items = []
    for kind, hits in extracted.items():
        if not hits:
            continue  # skip empty categories entirely
        shown = ", ".join(escape(str(hit)) for hit in hits[:max_items])
        items.append(f"<li><b>{escape(str(kind))}</b>: {shown}</li>")
    return "<ul>" + "".join(items) + "</ul>"


In [12]:
# Attach both extractors' outputs to the sampled documents by "id". Left joins
# keep every sampled document even if an extractor returned no row for it.
docs = sample[["id","title","text"]]
with_rules = docs.merge(rb_out, on="id", how="left")
merged = with_rules.merge(ner_out, on="id", how="left", suffixes=("_rb","_ner"))

def show_row(r, max_text=600):
    """Display one merged document as an HTML card with both extractors' results.

    The id, title and document text are HTML-escaped before being placed in the
    card, so characters such as ``<`` and ``&`` in the corpus are shown literally
    instead of being interpreted as markup.

    Args:
        r: One row of the merged frame (supports ``r[key]`` and ``r.get(key)``)
            with ``id`` and ``text``, and optionally ``title``, ``extracted``
            and ``entities``.
        max_text: Maximum number of characters of the document text shown.

    Returns:
        None. The card is rendered through ``display(HTML(...))``.
    """
    from html import escape  # function-scope import keeps this cell self-contained

    # Truncate first, then escape, so the limit counts source characters.
    text = (r["text"] or "")[:max_text].replace("\n", " ")

    title = r.get("title")
    if not title or title != title:  # None, "", or NaN (NaN is the only value != itself)
        title = ""

    ents = r.get("entities") or []
    exts = r.get("extracted") or {}
    card = f"""
    <div style="border:1px solid #ddd;border-radius:10px;padding:10px;margin-bottom:12px">
      <div style="color:#666;font-size:14px">id: <b>{escape(str(r['id']))}</b></div>
      <div style="color:#333;font-size:16px;margin:6px 0"><b>{escape(str(title))}</b></div>
      <div style="white-space:normal;line-height:1.4;margin:8px 0">{escape(text)}</div>
      <div style="margin-top:8px"><b>Rule-based:</b> {html_rule_hits(exts)}</div>
      <div style="margin-top:4px"><b>spaCy NER:</b> {html_entities(ents)}</div>
    </div>
    """
    display(HTML(card))

# Render a card for each of the first five merged documents.
preview = merged.head(5)
for _, row in preview.iterrows():
    show_row(row)


In [13]:
# Count total hits per category for rule-based.
# Records are collected under their own name so `rb_counts` is only ever a DataFrame.
rb_records = [
    {"type": kind, "count": len(hits)}
    for extracted in rb_out["extracted"]
    for kind, hits in (extracted or {}).items()
]
# Explicit columns keep groupby("type") valid even when no records were collected
# (a column-less empty frame would raise KeyError).
rb_counts = (
    pd.DataFrame(rb_records, columns=["type", "count"])
    .groupby("type")["count"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
print("Rule-based total hits:")
display(rb_counts)

# Count entities by label.
ner_labels = [
    ent["label"]
    for ents in ner_out["entities"]
    for ent in (ents or [])
]
if ner_labels:
    # value_counts() on the one-column frame yields a count per label, sorted
    # descending; reset_index turns it into a (label, count) table.
    ent_counts = pd.DataFrame({"label": ner_labels}).value_counts().reset_index(name="count")
    print("NER label counts:")
    display(ent_counts)
else:
    # Keep an (empty) DataFrame so later cells can test `ent_counts.empty`.
    ent_counts = pd.DataFrame(columns=["label", "count"])
    print("No entities found in sample.")


Rule-based total hits:


Unnamed: 0,type,count
0,numbers,203
1,dates,0
2,emails,0
3,urls,0


NER label counts:


Unnamed: 0,label,count
0,ORG,78
1,DATE,75
2,GPE,34
3,PERSON,13
4,MONEY,5


In [14]:
def to_set_safe(v):
    """Best-effort conversion of ``v`` to a set.

    Returns the set of ``v``'s elements, or an empty set when ``v`` cannot be
    converted (for example it is not iterable or holds unhashable items).
    """
    collected = set()
    try:
        collected.update(v)
    except Exception:
        # Discard anything gathered before the failure: the result is all-or-nothing.
        return set()
    return collected

# Per-document comparison of unique regex hits against unique NER strings.
rows = []
for _, r in merged.iterrows():
    ex = r.get("extracted") or {}
    ents = r.get("entities") or []

    # Unique regex hits for the two categories reported below.
    numbers = to_set_safe(ex.get("numbers", []))
    dates_rb = to_set_safe(ex.get("dates", []))

    # Unique entity texts, bucketed in one pass by the labels of interest.
    ner_texts = {"DATE": set(), "MONEY": set(), "PERCENT": set()}
    for e in ents:
        bucket = ner_texts.get(e["label"])
        if bucket is not None:
            bucket.add(e["text"])
    dates_ner = ner_texts["DATE"]

    # Overlap is an exact string match between regex dates and NER DATE texts.
    has_date_overlap = not dates_rb.isdisjoint(dates_ner)
    rows.append({
        "id": r["id"],
        "rb_numbers": len(numbers),
        "rb_dates": len(dates_rb),
        "ner_dates": len(dates_ner),
        "ner_money": len(ner_texts["MONEY"]),
        "ner_percent": len(ner_texts["PERCENT"]),
        "any_overlap_dates": int(has_date_overlap)
    })

overlap_df = pd.DataFrame(rows)
display(overlap_df.head(10))
n_overlap = int(overlap_df["any_overlap_dates"].sum())
print("Docs with DATE overlap (regex vs NER):", n_overlap, "/", len(overlap_df))


Unnamed: 0,id,rb_numbers,rb_dates,ner_dates,ner_money,ner_percent,any_overlap_dates
0,training/12077,21,0,3,0,0,0
1,training/2714,7,0,1,0,0,0
2,training/13263,15,0,6,1,0,0
3,training/10215,2,0,0,0,0,0
4,training/12243,6,0,1,0,0,0
5,training/10720,7,0,2,0,0,0
6,training/12883,7,0,1,0,0,0
7,training/5286,4,0,4,0,0,0
8,training/13559,7,0,9,0,0,0
9,training/9652,26,0,1,0,0,0


Docs with DATE overlap (regex vs NER): 0 / 20


In [15]:
# Write the demo artefacts under reports/extraction_demos/, creating it if needed.
out_dir = REPO_ROOT / "reports" / "extraction_demos"
out_dir.mkdir(parents=True, exist_ok=True)

# Build each destination once so the write and the report below cannot drift apart.
sample_path = out_dir / f"{ds_cfg['id']}_sample_extractions.parquet"
rb_counts_path = out_dir / f"{ds_cfg['id']}_rulebased_counts.csv"
ner_counts_path = out_dir / f"{ds_cfg['id']}_ner_label_counts.csv"

# The NER counts file is only written when there is something to save.
has_ner_counts = isinstance(ent_counts, pd.DataFrame) and not ent_counts.empty

merged.to_parquet(sample_path, index=False)
rb_counts.to_csv(rb_counts_path, index=False)
if has_ner_counts:
    ent_counts.to_csv(ner_counts_path, index=False)

print("Saved:")
print(" -", sample_path)
print(" -", rb_counts_path)
if has_ner_counts:
    print(" -", ner_counts_path)


Saved:
 - C:\Users\BAB AL SAFA\Documents\Vani\personal\escalate-nlp-agent\reports\extraction_demos\reuters_sample_extractions.parquet
 - C:\Users\BAB AL SAFA\Documents\Vani\personal\escalate-nlp-agent\reports\extraction_demos\reuters_rulebased_counts.csv
 - C:\Users\BAB AL SAFA\Documents\Vani\personal\escalate-nlp-agent\reports\extraction_demos\reuters_ner_label_counts.csv
