# 03 — Summarization Demos (Extractive TextRank + Optional Abstractive BART)

This notebook:
- Loads processed data
- Runs **extractive** TextRank (sumy) on a small random sample and on the full training split
- (Optional) Runs **abstractive** BART for comparison
- Shows qualitative side-by-side examples and quick metrics
- Saves the generated summaries and quick metrics as artifacts for the report


In [1]:
import os, sys, time, json, yaml, math
from pathlib import Path

# Make the local package importable (src/ layout).
# If the kernel was started inside a folder named "notebooks", the repo root is
# its parent; otherwise the current working directory is used as the root.
if Path.cwd().name == "notebooks":
    REPO_ROOT = Path.cwd().resolve().parents[0]
else:
    REPO_ROOT = Path.cwd()
SRC_DIR = REPO_ROOT / "src"

# Prepend src/ only when it is absent, so re-running the cell does not add
# duplicate entries to sys.path.
if sys.path.count(str(SRC_DIR)) == 0:
    sys.path.insert(0, str(SRC_DIR))

print("Repo root:", REPO_ROOT)
print("Using src dir:", SRC_DIR)

import pandas as pd
import numpy as np
from IPython.display import display, HTML

# our summarizers
from escalate_nlp_agent.summarize.textrank import run_textrank
# abstractive is optional; we'll import lazily later


Repo root: C:\Users\BAB AL SAFA\Documents\Vani\personal\escalate-nlp-agent
Using src dir: C:\Users\BAB AL SAFA\Documents\Vani\personal\escalate-nlp-agent\src


In [2]:
def _read_yaml(path):
    """Parse the YAML file at ``path`` with ``yaml.safe_load`` and return the result.

    The file is opened explicitly as UTF-8 so that parsing does not depend on
    the platform's default text encoding.
    """
    with open(path, "r", encoding="utf-8") as fh:
        return yaml.safe_load(fh)


# Project-level config; its "dataset_config" entry is a path (relative to the
# repo root) to the dataset-specific config.
CONFIG_PATH = REPO_ROOT / "configs" / "config.yaml"
cfg = _read_yaml(CONFIG_PATH)

DS_CFG_PATH = REPO_ROOT / cfg["dataset_config"]
ds_cfg = _read_yaml(DS_CFG_PATH)

# Settings passed to run_textrank in the cells below.
TEXTRANK_CFG_PATH = REPO_ROOT / "configs" / "summarize" / "textrank.yaml"
textrank_cfg = _read_yaml(TEXTRANK_CFG_PATH)

print("Dataset:", ds_cfg["name"], "| id:", ds_cfg["id"])
# 3 is only the value shown when the config has no "sentences" key.
print("TextRank sentences:", textrank_cfg.get("sentences", 3))


Dataset: NLTK Reuters | id: reuters
TextRank sentences: 3


In [3]:
proc_dir = REPO_ROOT / ds_cfg["outputs"]["processed_dir"]
train_path = proc_dir / "train.parquet"
assert train_path.exists(), f"Missing {train_path}. Run Step 1 preprocessing."

# Ask the parquet reader for just the columns this notebook uses instead of
# loading every column and projecting afterwards.
df = pd.read_parquet(train_path, columns=["id", "title", "text"])
print("Train shape:", df.shape)

SAMPLE_N = 20     # number of documents used for the qualitative demo
SAMPLE_SEED = 7   # fixed seed so the same documents are drawn on every run
# Clamp the sample size: DataFrame.sample raises if asked for more rows than
# exist (without replacement), which would break on a very small split.
sample = (
    df.sample(n=min(SAMPLE_N, len(df)), random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)
sample.head(3)


Train shape: (7616, 3)


Unnamed: 0,id,title,text
0,training/11644,,overland express inc &lt;over> year loss shr l...
1,training/7076,,progressive bank inc &lt;psbk> qtly div seven ...
2,training/5830,,oecd january annual inflation steady at 2.1 pc...


In [4]:
# Time the summarizer with perf_counter(): it is a monotonic, high-resolution
# clock meant for measuring intervals, whereas time.time() is the wall clock
# and can be coarse or jump when the system time is adjusted.
t0 = time.perf_counter()
textrank_sample = run_textrank(sample[["id","text"]].copy(), textrank_cfg)
t1 = time.perf_counter()
print(f"TextRank on {len(textrank_sample)} docs: {t1 - t0:.2f}s")

# Left merge on "id" keeps every sampled document; a document with no matching
# row in the summarizer output ends up with a missing summary.
demo = sample.merge(textrank_sample, on="id", how="left")
demo.head(3)


TextRank on 20 docs: 0.07s


Unnamed: 0,id,title,text,summary
0,training/11644,,overland express inc &lt;over> year loss shr l...,overland express inc &lt;over> year loss shr l...
1,training/7076,,progressive bank inc &lt;psbk> qtly div seven ...,progressive bank inc &lt;psbk> qtly div seven ...
2,training/5830,,oecd january annual inflation steady at 2.1 pc...,oecd january annual inflation steady at 2.1 pc...


In [5]:
def show_pair(row, max_input=600, max_sum=600):
    """Display one document and its TextRank summary as an HTML card.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must provide "id", "text" and "summary" via ``row[...]``; "title" is
        read with ``row.get`` and may be absent.
    max_input : int
        Maximum number of characters of the original text to show.
    max_sum : int
        Maximum number of characters of the summary to show.

    Missing or non-string title/text/summary values (None, NaN, ...) are shown
    as empty strings. All interpolated values are HTML-escaped so characters
    such as ``<``, ``>`` and ``&`` in the data are displayed literally instead
    of being interpreted as markup.
    """
    from html import escape  # function-scope import keeps this cell self-contained

    def _as_text(value):
        # NaN is truthy, so `value or ""` would let it through and `.strip()`
        # would then fail; accept only real strings.
        return value if isinstance(value, str) else ""

    title = _as_text(row.get("title")).strip()
    text = _as_text(row["text"]).strip().replace("\n", " ")
    summ = _as_text(row["summary"]).strip().replace("\n", " ")

    # Truncate first, then escape, so an entity is never cut in half.
    title = escape(title)
    text = escape(text[:max_input])
    summ = escape(summ[:max_sum])
    doc_id = escape(str(row['id']))

    card = f"""
    <div style="border:1px solid #ddd;border-radius:10px;padding:12px;margin:10px 0">
      <div style="color:#666">id: <b>{doc_id}</b></div>
      <div style="font-weight:600;margin:6px 0">{title}</div>
      <div><b>Original:</b> <span style="color:#333">{text}</span></div>
      <div style="margin-top:6px"><b>Summary (TextRank):</b> <span style="color:#333">{summ}</span></div>
    </div>
    """
    display(HTML(card))

# Render a card for each of the first five rows of the demo frame.
preview_rows = demo.head(5)
for pos in range(len(preview_rows)):
    show_pair(preview_rows.iloc[pos])


In [7]:
def token_count(s):
    """Return the number of whitespace-separated tokens in ``s``.

    Anything that is not a ``str`` (None, NaN, numbers, ...) counts as 0 tokens.
    """
    if isinstance(s, str):
        return sum(1 for piece in s.split() if piece.strip())
    return 0

# Work on a new frame so `demo` itself is left untouched.
m = demo.assign(
    len_text=demo["text"].map(token_count),
    len_sum=demo["summary"].map(token_count),
)
# Summary length relative to text length; NaN where the text has no tokens.
m["compression"] = np.where(m["len_text"].gt(0), m["len_sum"] / m["len_text"], np.nan)

# Cast to built-in int/float so the dict can be serialized with json.
# Note: "pct_empty_summary" is the mean of a boolean mask, i.e. a fraction in
# [0, 1], not a value scaled to 0-100.
metrics = dict(
    n_docs=int(len(m)),
    avg_len_text=float(m["len_text"].mean()),
    avg_len_sum=float(m["len_sum"].mean()),
    avg_compression_ratio=float(m["compression"].mean()),
    pct_empty_summary=float(m["len_sum"].eq(0).mean()),
)
metrics


{'n_docs': 20,
 'avg_len_text': 101.3,
 'avg_len_sum': 46.25,
 'avg_compression_ratio': 0.6213364148700167,
 'pct_empty_summary': 0.0}

In [9]:
# Time the full-split run with perf_counter(): a monotonic, high-resolution
# clock meant for measuring intervals (time.time() is the wall clock and can
# be coarse or jump when the system time is adjusted).
t0 = time.perf_counter()
textrank_full = run_textrank(df[["id","text"]].copy(), textrank_cfg)
t1 = time.perf_counter()
print(f"TextRank (full {len(textrank_full)} docs): {t1 - t0:.2f}s")

# NOTE(review): this path is built from the dataset id rather than from
# ds_cfg["outputs"]["processed_dir"], which the loading cell uses — confirm
# that both point at the same directory.
out_parquet = REPO_ROOT / f"data/processed/{ds_cfg['id']}/summaries_textrank.parquet"
out_parquet.parent.mkdir(parents=True, exist_ok=True)
textrank_full.to_parquet(out_parquet, index=False)
print("Saved:", out_parquet)


TextRank (full 7616 docs): 6.49s
Saved: C:\Users\BAB AL SAFA\Documents\Vani\personal\escalate-nlp-agent\data\processed\reuters\summaries_textrank.parquet


In [10]:
# Folder that collects the demo artifacts for the report.
demo_dir = REPO_ROOT / "reports" / "summarization_demos"
demo_dir.mkdir(parents=True, exist_ok=True)

# Build each output path once and reuse it for writing and for the log lines.
sample_path = demo_dir / f"{ds_cfg['id']}_textrank_sample.parquet"
metrics_path = demo_dir / f"{ds_cfg['id']}_textrank_metrics.json"

# Sampled documents together with their TextRank summaries.
demo_out = demo[["id","title","text","summary"]]
demo_out.to_parquet(sample_path, index=False)

# Quick metrics computed earlier in the notebook, stored as indented JSON.
metrics_path.write_text(json.dumps(metrics, indent=2), encoding="utf-8")

print("Saved:")
for saved_path in (sample_path, metrics_path):
    print(" -", saved_path)


Saved:
 - C:\Users\BAB AL SAFA\Documents\Vani\personal\escalate-nlp-agent\reports\summarization_demos\reuters_textrank_sample.parquet
 - C:\Users\BAB AL SAFA\Documents\Vani\personal\escalate-nlp-agent\reports\summarization_demos\reuters_textrank_metrics.json
