In [1]:
# %% [Downstream Infinite Ripple Stability Bench] Infinite downstream with support≥2 + growth cap pool
import os, math, time, logging, json, re
from collections import defaultdict
from typing import Dict, List, Set, Any, Optional

import numpy as np
import pandas as pd
import requests

# --------------------
# Config (edit to taste)
# --------------------
CFG = dict(
    # Choose seeds: either set CLUSTER_ID, or leave None to auto-pick largest from clusters_snapshot.json
    CLUSTER_ID=None,                      # e.g., 13
    SNAP_PATH="clusters_snapshot.json",   # produced by the previous cell

    # iCite
    icite_batch=200,
    sleep_between=0.0,

    # Structural gates (GLOBAL)
    support_min=2,                        # strict global gate: candidate must cite ≥2 seeds (directly)

    # Growth guard (size-aware):
    # cap the number of nodes admitted at depth L based on the previous frontier size
    pool_factor=3.0,                      # cap = pool_factor * |frontier_prev| + pool_offset
    pool_offset=10,

    # dynamic explosion tolerance (logs only; cap already enforces control):
    alpha_base=1.5,                       # baseline allowed growth (kept) vs previous frontier (log only)
    alpha_small_frontier=6.0,             # when prev frontier is small (<= small_frontier_thresh), allow much higher growth
    small_frontier_thresh=12,

    # Entrez (years optional; coupling does not require years)
    entrez_email="your@email",
    entrez_api_key=os.environ.get("NCBI_API_KEY", None),

    # Safety limits
    max_depth=50,                         # hard ceiling; bench aims to run "until exhaustion" otherwise
)

# --------------------
# Logging
# --------------------
def _setup_logging():
    fmt = "[%(asctime)s] %(levelname)s %(message)s"
    datefmt = "%H:%M:%S"
    logging.basicConfig(level=logging.INFO, format=fmt, datefmt=datefmt, force=True)
    return logging.getLogger("downstream_bench")
log = _setup_logging()

# --------------------
# iCite helpers
# --------------------
def _to_int_list(val):
    if not val: return []
    out=[]
    if isinstance(val, list):
        for x in val:
            try: out.append(int(x))
            except: pass
    elif isinstance(val, (int,str)) and str(val).isdigit():
        out.append(int(val))
    return out

def icite_fetch(pmids_list: List[int], batch=200, pause=0.0) -> Dict[int, Dict[str, Any]]:
    """
    Return {pmid: {'references':[...], 'cited_by':[...], 'raw':...}}
    """
    out = {}
    pmids_int = [int(p) for p in pmids_list if str(p).isdigit()]
    if not pmids_int:
        return out
    total = int(math.ceil(len(pmids_int) / batch))
    for i in range(0, len(pmids_int), batch):
        sub = pmids_int[i:i+batch]
        r = requests.get("https://icite.od.nih.gov/api/pubs",
                         params={"pmids": ",".join(str(x) for x in sub), "legacy":"false"},
                         timeout=90)
        r.raise_for_status()
        data = r.json()
        recs = data.get("data", data if isinstance(data, list) else []) or []
        for rec in recs:
            pid = rec.get("pmid")
            if pid is None or not str(pid).isdigit(): 
                continue
            pid = int(pid)
            refs = _to_int_list(rec.get("citedPmids")) or _to_int_list(rec.get("references"))
            cby  = _to_int_list(rec.get("citedByPmids")) or _to_int_list(rec.get("citedBy")) \
                   or _to_int_list(rec.get("cited_in")) or _to_int_list(rec.get("citedIn"))
            out[pid] = {"references": refs, "cited_by": cby, "raw": rec}
        log.info(f"[iCite] chunk {i//batch+1}/{total} | items={len(recs)}")
        if pause:
            time.sleep(pause)
    return out

# --------------------
# Bibliographic scores (structural, normalized)
# --------------------
def biblio_coupling_score(refs_a: Set[int], refs_seed_union: Set[int]) -> float:
    if not refs_a or not refs_seed_union:
        return 0.0
    inter = len(refs_a & refs_seed_union)
    return inter / math.sqrt(max(1, len(refs_a)) * max(1, len(refs_seed_union)))

# --------------------
# Growth cap logic
# --------------------
def cap_for_frontier(prev_frontier_size: int) -> int:
    return int(CFG["pool_factor"] * max(1, prev_frontier_size) + CFG["pool_offset"])

def effective_alpha(prev_frontier_size: int) -> float:
    # Allow larger relative growth when the previous frontier is small
    if prev_frontier_size <= CFG["small_frontier_thresh"]:
        return CFG["alpha_small_frontier"]
    return CFG["alpha_base"]

# --------------------
# Snapshot loader
# --------------------
def load_clusters_snapshot(path="clusters_snapshot.json"):
    with open(path, "r", encoding="utf-8") as f:
        raw = json.load(f)
    return {int(k): list(map(int, v)) for k, v in raw.items()}

# --------------------
# Main bench
# --------------------
def bench_downstream_infinite():
    # 1) Seeds from snapshot (or largest)
    snap = load_clusters_snapshot(CFG["SNAP_PATH"])
    cid = CFG["CLUSTER_ID"] if CFG["CLUSTER_ID"] is not None else max(snap.items(), key=lambda kv: len(kv[1]))[0]
    seeds = sorted(set(int(x) for x in snap[cid]))
    log.info(f"=== Downstream Bench | cluster={cid} | seeds={len(seeds)} ===")

    # 2) iCite for seeds (we need seed references for coupling)
    ic_seeds = icite_fetch(seeds, batch=CFG["icite_batch"], pause=CFG["sleep_between"])
    seed_refs_union = set()
    for s in seeds:
        seed_refs_union |= set(ic_seeds.get(s, {}).get("references", []) or [])

    # 3) Ripple loop (downstream only)
    visited = set(seeds)
    frontier = set(seeds)
    depth = 0
    roll = []  # per-depth logs

    while depth < CFG["max_depth"]:
        depth += 1

        # discover downstream citers of all frontier nodes, minus visited
        discovered = set()
        for u in frontier:
            cby = ic_seeds.get(u, {}).get("cited_by", []) if depth == 1 else ic_map.get(u, {}).get("cited_by", [])
            discovered.update(int(x) for x in (cby or []))
        discovered -= visited

        if not discovered:
            log.info(f"[D d={depth}] discovered=0 → stop")
            break

        # fetch iCite for discovered to obtain their references and future citers
        ic_map = icite_fetch(sorted(discovered), batch=CFG["icite_batch"], pause=CFG["sleep_between"])

        # support gate (GLOBAL): how many seeds does a candidate directly cite?
        support = {}
        for c in discovered:
            refs = set(ic_map.get(c, {}).get("references", []) or [])
            support[c] = len(refs & set(seeds))
        keep_support = [c for c in discovered if support.get(c, 0) >= CFG["support_min"]]

        # bibliographic coupling vs union of seed refs
        rows=[]
        for c in keep_support:
            refs = set(ic_map.get(c, {}).get("references", []) or [])
            score = biblio_coupling_score(refs, seed_refs_union)
            rows.append((c, score, support[c]))
        rows.sort(key=lambda t: (-t[1], -t[2], t[0]))  # by coupling desc, then support desc

        # global growth cap pool (size based on previous frontier)
        pool_cap = cap_for_frontier(len(frontier))
        kept_capped = [c for (c,_,_) in rows[:pool_cap]]

        # logs
        disc_n = len(discovered)
        sup_n  = len(keep_support)
        kept_n = len(kept_capped)
        alpha_eff = effective_alpha(len(frontier))
        growth_ratio = kept_n / max(1, len(frontier))
        unstable_flag = (growth_ratio > alpha_eff)  # informational; we still cap

        log.info(f"[D d={depth}] disc={disc_n} → support_keep={sup_n} → after_cap={kept_n} | "
                 f"prev_frontier={len(frontier)} | cap={pool_cap} | growth×={growth_ratio:.2f} "
                 f"{'| [explosive]' if unstable_flag else ''}")

        # rollup row
        roll.append(dict(depth=depth, discovered=disc_n, support_keep=sup_n, after_cap=kept_n,
                         prev_frontier=len(frontier), cap=pool_cap, growth=growth_ratio,
                         explosive=bool(unstable_flag)))

        if kept_n == 0:
            log.info(f"[D d={depth}] frontier empty after cap → stop")
            break

        # prepare next iteration
        visited.update(kept_capped)
        frontier = set(kept_capped)

        # cache their icite for next depth (citers)
        # (ic_map already holds data for 'frontier'; reuse in next loop via 'ic_map' variable)

    # Final table
    df = pd.DataFrame(roll)
    if not df.empty:
        print("\n=== Downstream ripple (per-depth) ===")
        print(df.to_string(index=False))
        print("\n[summary] total kept (all depths) =", int(df["after_cap"].sum()))
    else:
        print("\n=== Downstream ripple (per-depth) ===\n(no expansions)")

# --------------------
# Run the bench
# --------------------
bench_downstream_infinite()


[12:15:32] INFO === Downstream Bench | cluster=13 | seeds=31 ===
[12:15:32] INFO [iCite] chunk 1/1 | items=31
[12:15:33] INFO [iCite] chunk 1/2 | items=200
[12:15:34] INFO [iCite] chunk 2/2 | items=102
[12:15:34] INFO [D d=1] disc=302 → support_keep=19 → after_cap=19 | prev_frontier=31 | cap=103 | growth×=0.61 
[12:15:34] INFO [iCite] chunk 1/1 | items=196
[12:15:34] INFO [D d=2] disc=196 → support_keep=0 → after_cap=0 | prev_frontier=19 | cap=67 | growth×=0.00 
[12:15:34] INFO [D d=2] frontier empty after cap → stop



=== Downstream ripple (per-depth) ===
 depth  discovered  support_keep  after_cap  prev_frontier  cap   growth  explosive
     1         302            19         19             31  103 0.612903      False
     2         196             0          0             19   67 0.000000      False

[summary] total kept (all depths) = 19


In [None]:
# %% [All-clusters downstream ripple with layer-relative support and TF-IDF logging]
import os, math, time, json, logging
from collections import defaultdict
from typing import Dict, List, Any, Set, Tuple
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from scipy.sparse import vstack

# --------------------
# Config
# --------------------
CFG = dict(
    SNAP_PATH="clusters_snapshot.json",         # {cluster_id: [pmids]}
    CLUSTERS_INCLUDE=None,                      # e.g. [13, 7, 5]; None => all
    MAX_CLUSTERS=None,                          # limit number of clusters for a quick pass
    icite_batch=200,
    efetch_batch=200,
    sleep_between=0.0,                          # add 0.05–0.2 if you want to be gentle
    support_min=2,                              # GLOBAL gate, now measured vs frontier
    pool_factor=3.0,                            # cap = pool_factor*|prev_frontier| + pool_offset
    pool_offset=10,
    max_depth=25,                               # downstream depth ceiling; frontier empties sooner in practice
    entrez_email="you@example",                 # fill yours
    entrez_api_key=os.environ.get("NCBI_API_KEY"),  # optional
    WRITE_CSV=False,
    OUT_PREFIX="downstream_all_clusters_layer_support",
)

# --------------------
# Logging
# --------------------
def _setup_logging():
    fmt = "[%(asctime)s] %(levelname)s %(message)s"
    datefmt = "%H:%M:%S"
    logging.basicConfig(level=logging.INFO, format=fmt, datefmt=datefmt, force=True)
    return logging.getLogger("bench_downstream_layer_support")
log = _setup_logging()

# --------------------
# Snapshot
# --------------------
def load_clusters_snapshot(path) -> Dict[int, List[int]]:
    with open(path, "r", encoding="utf-8") as f:
        raw = json.load(f)
    return {int(k): [int(x) for x in v] for k, v in raw.items()}

# --------------------
# iCite cache
# --------------------
ICACHE: Dict[int, Dict[str, Any]] = {}

def _to_int_list(val) -> List[int]:
    if not val: return []
    if isinstance(val, list):
        out=[]
        for x in val:
            try:
                xi = int(x)
                out.append(xi)
            except:
                continue
        return out
    try:
        return [int(val)]
    except:
        return []

def icite_fetch_missing(pmids: List[int], batch: int, pause: float) -> None:
    need = [int(p) for p in pmids if str(p).isdigit() and int(p) not in ICACHE]
    if not need: return
    total = math.ceil(len(need)/batch)
    for i in range(0, len(need), batch):
        sub = need[i:i+batch]
        r = requests.get(
            "https://icite.od.nih.gov/api/pubs",
            params={"pmids": ",".join(str(x) for x in sub), "legacy":"false"},
            timeout=90
        )
        r.raise_for_status()
        data = r.json()
        recs = data.get("data", data if isinstance(data, list) else []) or []
        for rec in recs:
            pid = rec.get("pmid")
            if pid is None: continue
            try: pid = int(str(pid))
            except: continue
            refs = _to_int_list(rec.get("citedPmids")) or _to_int_list(rec.get("references"))
            cby  = _to_int_list(rec.get("citedByPmids")) or _to_int_list(rec.get("citedBy")) \
                   or _to_int_list(rec.get("cited_in")) or _to_int_list(rec.get("citedIn"))
            ICACHE[pid] = {"references": refs or [], "cited_by": cby or [], "raw": rec}
        log.info(f"[iCite] chunk {i//batch+1}/{total} | items={len(recs)}")
        if pause: time.sleep(pause)

# --------------------
# EFetch title+abstract cache
# --------------------
TCACHE: Dict[int, Dict[str, Any]] = {}  # pmid -> {title, abstract, year}

def efetch_missing(pmids: List[int], email: str, api_key: str, batch: int, pause: float=0.0):
    need = [p for p in pmids if p not in TCACHE]
    if not need: return
    total = math.ceil(len(need)/batch)
    base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    for i in range(0, len(need), batch):
        sub = need[i:i+batch]
        params = {
            "db": "pubmed",
            "id": ",".join(str(x) for x in sub),
            "retmode": "xml",
            "email": email
        }
        if api_key: params["api_key"] = api_key
        r = requests.get(base, params=params, timeout=90)
        r.raise_for_status()
        root = ET.fromstring(r.text)

        for art in root.findall(".//PubmedArticle"):
            pmid_el = art.find(".//PMID")
            if pmid_el is None: continue
            try:
                pid = int(pmid_el.text.strip())
            except:
                continue
            title = ""
            t_el = art.find(".//ArticleTitle")
            if t_el is not None and t_el.text:
                title = t_el.text.strip()

            abs_parts = []
            for a in art.findall(".//Abstract/AbstractText"):
                # join labeled sections if present
                if a.text: abs_parts.append(a.text.strip())
            abstract = " ".join(abs_parts)

            year = None
            y_el = art.find(".//JournalIssue/PubDate/Year")
            if y_el is not None and y_el.text and y_el.text.isdigit():
                year = int(y_el.text)

            TCACHE[pid] = {"title": title, "abstract": abstract, "year": year}
        if pause: time.sleep(pause)

def text_for(pmids: List[int]) -> List[str]:
    out=[]
    for p in pmids:
        rec = TCACHE.get(p, {})
        t = (rec.get("title","") or "").strip()
        a = (rec.get("abstract","") or "").strip()
        out.append((t + " " + a).strip())
    return out

# --------------------
# Scoring, caps, TF-IDF metrics
# --------------------
def biblio_coupling_score(refs_a: Set[int], refs_frontier_union: Set[int]) -> float:
    if not refs_a or not refs_frontier_union: return 0.0
    inter = len(refs_a & refs_frontier_union)
    return inter / math.sqrt(max(1, len(refs_a)) * max(1, len(refs_frontier_union)))

def cap_for_frontier(prev_frontier_size: int) -> int:
    return int(CFG["pool_factor"] * max(1, prev_frontier_size) + CFG["pool_offset"])

def tfidf_metrics(seed_pmids: List[int], kept_pmids: List[int]) -> Dict[str, float]:
    # Nothing to score
    if not kept_pmids:
        return dict(
            cos_seed_centroid_med=np.nan, cos_seed_centroid_p90=np.nan,
            centroid_drift=np.nan, cohesion=np.nan
        )

    # Ensure we have text
    efetch_missing(seed_pmids + kept_pmids, CFG["entrez_email"], CFG["entrez_api_key"], CFG["efetch_batch"])
    seed_texts = text_for(seed_pmids)
    kept_texts = text_for(kept_pmids)

    # If all texts are empty, avoid vectorizer failure
    if not any(seed_texts) and not any(kept_texts):
        return dict(
            cos_seed_centroid_med=np.nan, cos_seed_centroid_p90=np.nan,
            centroid_drift=np.nan, cohesion=np.nan
        )

    # Build TF-IDF on seeds + kept (title+abstract), row-L2
    try:
        vect = TfidfVectorizer(max_features=50000, ngram_range=(1,2), lowercase=True)
        X = vect.fit_transform(seed_texts + kept_texts)
    except ValueError:
        # empty vocabulary fallback
        return dict(
            cos_seed_centroid_med=np.nan, cos_seed_centroid_p90=np.nan,
            centroid_drift=np.nan, cohesion=np.nan
        )

    from sklearn.preprocessing import normalize as sk_normalize
    X = sk_normalize(X, norm="l2", axis=1)

    n_seed = len(seed_texts)
    X_seed = X[:n_seed]
    X_keep = X[n_seed:]

    # ---- Centroids as ndarrays (no np.matrix) ----
    # Seed centroid
    seed_centroid_vec = np.asarray(X_seed.mean(axis=0)).ravel()
    nrm = np.linalg.norm(seed_centroid_vec)
    if nrm > 0:
        seed_centroid_vec = seed_centroid_vec / nrm

    # Cosine to seed centroid for kept rows (sparse @ dense -> dense 1D)
    cos = X_keep.dot(seed_centroid_vec)
    cos = np.asarray(cos).ravel()
    cos_med = float(np.median(cos)) if cos.size else np.nan
    cos_p90 = float(np.quantile(cos, 0.90)) if cos.size else np.nan

    # Layer centroid & drift (cosine between centroids)
    layer_centroid_vec = np.asarray(X_keep.mean(axis=0)).ravel()
    nrm = np.linalg.norm(layer_centroid_vec)
    if nrm > 0:
        layer_centroid_vec = layer_centroid_vec / nrm
    drift = float(layer_centroid_vec @ seed_centroid_vec) if seed_centroid_vec.size else np.nan

    # Cohesion: mean pairwise cosine among kept rows.
    # With L2 rows, sum-vector trick: (||Σx_i||^2 - n) / (n*(n-1))
    n = X_keep.shape[0]
    if n >= 2:
        sum_vec = np.asarray(X_keep.sum(axis=0)).ravel()
        num = float(sum_vec @ sum_vec) - n
        den = n * (n - 1)
        cohesion = num / den
    else:
        cohesion = np.nan

    return dict(
        cos_seed_centroid_med=cos_med,
        cos_seed_centroid_p90=cos_p90,
        centroid_drift=drift,
        cohesion=cohesion
    )


# --------------------
# Per-cluster downstream ripple
# --------------------
def run_downstream_for_cluster(cluster_id: int, seeds: List[int]) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    seeds = sorted(set(int(x) for x in seeds if str(x).isdigit()))
    if not seeds:
        return pd.DataFrame(), dict(cluster_id=cluster_id, seeds=0, total_kept=0, depth_reached=0)

    log.info(f"[cluster {cluster_id}] seeds={len(seeds)}")
    # Warm icite for seeds
    icite_fetch_missing(seeds, CFG["icite_batch"], CFG["sleep_between"])

    visited = set(seeds)
    frontier = set(seeds)
    depth = 0
    rows = []

    while depth < CFG["max_depth"]:
        depth += 1

        # 1) Discover downstream citers of current frontier
        discovered = set()
        need = [u for u in frontier if u not in ICACHE]
        if need:
            icite_fetch_missing(need, CFG["icite_batch"], CFG["sleep_between"])
        for u in frontier:
            discovered.update(ICACHE.get(u, {}).get("cited_by", []))
        discovered -= visited
        disc_n = len(discovered)
        if disc_n == 0:
            rows.append(dict(cluster_id=cluster_id, depth=depth,
                             discovered=0, support_keep=0, after_cap=0,
                             prev_frontier=len(frontier), cap=cap_for_frontier(len(frontier)),
                             growth=0.0, support_seed_med=0.0, support_frontier_med=0.0,
                             attach_ratio_med=0.0, cos_seed_centroid_med=np.nan,
                             cos_seed_centroid_p90=np.nan, centroid_drift=np.nan, cohesion=np.nan))
            break

        # Fetch icite for discovered
        icite_fetch_missing(sorted(discovered), CFG["icite_batch"], CFG["sleep_between"])

        # 2) Support vs FRONTIER (global gate)
        frontier_set = set(frontier)
        seeds_set    = set(seeds)
        keep = []
        sup_seed = []
        sup_front = []
        attach_ratio = []   # fraction of candidate refs that hit frontier

        # Precompute frontier refs union (for biblio ranking)
        frontier_refs_union = set()
        for f in frontier:
            frontier_refs_union.update(ICACHE.get(f, {}).get("references", []))

        for c in discovered:
            refs = set(ICACHE.get(c, {}).get("references", []))
            k_front = len(refs & frontier_set)
            k_seed  = len(refs & seeds_set)
            if len(refs) > 0:
                attach_ratio.append(k_front / len(refs))
            else:
                attach_ratio.append(0.0)
            if k_front >= CFG["support_min"]:
                keep.append(c)
            sup_front.append(k_front)
            sup_seed.append(k_seed)

        support_frontier_med = float(np.median(sup_front)) if sup_front else 0.0
        support_seed_med     = float(np.median(sup_seed)) if sup_seed else 0.0
        attach_ratio_med     = float(np.median(attach_ratio)) if attach_ratio else 0.0

        # 3) Rank kept by biblio coupling vs frontier refs union, then by higher frontier support
        scored = []
        for c in keep:
            refs = set(ICACHE.get(c, {}).get("references", []))
            sc = biblio_coupling_score(refs, frontier_refs_union)
            kf = len(refs & frontier_set)
            scored.append((c, sc, kf))
        scored.sort(key=lambda t: (-t[1], -t[2], t[0]))

        # 4) Cap
        cap = cap_for_frontier(len(frontier))
        kept = [c for (c, _, _) in scored[:cap]]

        # 5) Semantics: TF-IDF(title+abstract) metrics vs seed centroid
        tfm = tfidf_metrics(seeds, kept)

        growth = len(kept) / max(1, len(frontier))

        rows.append(dict(cluster_id=cluster_id, depth=depth,
                         discovered=disc_n,
                         support_keep=len(keep),
                         after_cap=len(kept),
                         prev_frontier=len(frontier),
                         cap=cap,
                         growth=growth,
                         support_seed_med=support_seed_med,
                         support_frontier_med=support_frontier_med,
                         attach_ratio_med=attach_ratio_med,
                         **tfm))

        if not kept:
            break

        visited.update(kept)
        frontier = set(kept)

    df = pd.DataFrame(rows)
    total_kept = int(df["after_cap"].sum()) if not df.empty else 0
    depth_reached = int(df["depth"].max()) if not df.empty else 0
    meta = dict(cluster_id=cluster_id, seeds=len(seeds),
                total_kept=total_kept, depth_reached=depth_reached)
    return df, meta

# --------------------
# Run across all clusters
# --------------------
def run_all_clusters_downstream():
    snap = load_clusters_snapshot(CFG["SNAP_PATH"])
    cluster_ids = sorted(snap.keys())
    if CFG["CLUSTERS_INCLUDE"] is not None:
        filt = set(CFG["CLUSTERS_INCLUDE"])
        cluster_ids = [c for c in cluster_ids if c in filt]
    if CFG["MAX_CLUSTERS"]:
        cluster_ids = cluster_ids[:CFG["MAX_CLUSTERS"]]

    log.info(f"=== Running downstream bench across {len(cluster_ids)} clusters (layer-relative support≥{CFG['support_min']}) ===")

    per_depth = []
    summaries = []

    for cid in cluster_ids:
        seeds = snap[cid]
        df, meta = run_downstream_for_cluster(cid, seeds)
        if not df.empty:
            per_depth.append(df)
        summaries.append(meta)

    depth_df = pd.concat(per_depth, ignore_index=True) if per_depth else pd.DataFrame()
    cluster_df = pd.DataFrame(summaries)

    print("\n=== Per-cluster summary (top 30 by total_kept) ===")
    if not cluster_df.empty:
        print(cluster_df.sort_values(["total_kept","depth_reached","seeds"], ascending=[False,False,False])
                    .head(30).to_string(index=False))
    else:
        print("(no clusters)")

    print("\n=== Depth-wise aggregate across clusters ===")
    if not depth_df.empty:
        agg = (depth_df
               .groupby("depth")
               .agg(n_clusters=("cluster_id","nunique"),
                    discovered_med=("discovered","median"),
                    discovered_p90=("discovered",lambda s: float(np.quantile(s,0.9))),
                    support_keep_med=("support_keep","median"),
                    support_keep_p90=("support_keep",lambda s: float(np.quantile(s,0.9))),
                    after_cap_med=("after_cap","median"),
                    after_cap_p90=("after_cap",lambda s: float(np.quantile(s,0.9))),
                    growth_med=("growth","median"),
                    growth_p90=("growth",lambda s: float(np.quantile(s,0.9))),
                    support_frontier_med=("support_frontier_med","median"),
                    attach_ratio_med=("attach_ratio_med","median"),
                    cos_seed_centroid_med=("cos_seed_centroid_med","median"),
                    cos_seed_centroid_p90=("cos_seed_centroid_p90","median"),
                    centroid_drift_med=("centroid_drift","median"),
                    cohesion_med=("cohesion","median"))
               .reset_index())
        print(agg.to_string(index=False))
    else:
        print("(no depth expansions)")

    print("\n=== Halt pattern (first depth with after_cap==0 per cluster) ===")
    if not depth_df.empty:
        first_zero = (depth_df.assign(zero = depth_df["after_cap"]==0)
                      .loc[lambda d: d["zero"]]
                      .sort_values(["cluster_id","depth"])
                      .groupby("cluster_id").first()["depth"]
                      .rename("first_zero_depth"))
        tmp = pd.merge(pd.DataFrame({"cluster_id": depth_df["cluster_id"].unique()}),
                       first_zero.reset_index(), how="left", on="cluster_id")
        dist = (tmp["first_zero_depth"]
                .value_counts(dropna=True)
                .sort_index()
                .rename_axis("depth")
                .reset_index(name="clusters"))
        if dist.empty:
            print("(none hit zero; all marched to max-depth)")
        else:
            print(dist.to_string(index=False))
    else:
        print("(no depth expansions)")

    if CFG["WRITE_CSV"]:
        cluster_df.to_csv(f"{CFG['OUT_PREFIX']}_clusters.csv", index=False)
        if not depth_df.empty:
            depth_df.to_csv(f"{CFG['OUT_PREFIX']}_depths.csv", index=False)
        print(f"\n[wrote CSV] {CFG['OUT_PREFIX']}*.csv")

# --------------------
# Go
# --------------------
run_all_clusters_downstream()


[12:37:35] INFO === Running downstream bench across 19 clusters (layer-relative support≥2) ===
[12:37:35] INFO [cluster 0] seeds=4
[12:37:36] INFO [iCite] chunk 1/1 | items=4
[12:37:36] INFO [iCite] chunk 1/1 | items=36
[12:37:36] INFO [cluster 1] seeds=4
[12:37:37] INFO [iCite] chunk 1/1 | items=4
[12:37:37] INFO [iCite] chunk 1/1 | items=28
[12:37:37] INFO [cluster 2] seeds=6
[12:37:37] INFO [iCite] chunk 1/1 | items=6
[12:37:38] INFO [iCite] chunk 1/1 | items=57
[12:37:39] INFO [iCite] chunk 1/1 | items=2
[12:37:39] INFO [cluster 3] seeds=6
[12:37:39] INFO [iCite] chunk 1/1 | items=6
[12:37:40] INFO [iCite] chunk 1/1 | items=60
[12:37:41] INFO [iCite] chunk 1/1 | items=10
[12:37:41] INFO [iCite] chunk 1/1 | items=7
[12:37:41] INFO [cluster 4] seeds=8
[12:37:42] INFO [iCite] chunk 1/1 | items=8
[12:37:42] INFO [iCite] chunk 1/1 | items=92
[12:37:44] INFO [iCite] chunk 1/1 | items=181
[12:37:45] INFO [iCite] chunk 1/1 | items=119
[12:37:46] INFO [iCite] chunk 1/1 | items=8
[12:37:46] 


=== Per-cluster summary (top 30 by total_kept) ===
 cluster_id  seeds  total_kept  depth_reached
          8     12          72              5
          4      8          66              4
         18      8          47              3
         16      5          37              4
         13     31          34              4
         17      6          30              3
          6     11          23              4
         14      5          12              4
         10      6          11              3
         15      6           7              4
          9      4           7              2
          7      4           6              3
         11     12           5              2
          3      6           4              3
         12      4           4              3
          2      6           3              2
          0      4           0              1
          1      4           0              1
          5      4           0              1

=== Depth-wise aggregate ac

In [12]:
# %% [All-in-one Downstream Ripple Analysis]
# This script combines all necessary functions into a single, reliable block.
# It starts from a set of publication clusters, explores their citation network
# downstream, and logs bibliographic and semantic metrics at each step.

import os
import math
import time
import json
import logging
import xml.etree.ElementTree as ET
from collections import Counter
from typing import Dict, List, Any

import numpy as np
import pandas as pd
import requests
from sklearn.feature_extraction.text import TfidfVectorizer

# =================================================================================
# >> CONFIGURATION <<
# All user-configurable settings are here.
# =================================================================================
CFG = dict(
    # --- Input ---
    CLUSTERS_PATH="clusters_snapshot.json", # Your input file of {cluster_id: [pmids]}

    # --- API Settings ---
    ENTREZ_EMAIL="you@example.com",  # IMPORTANT: Replace with your email for NCBI
    NCBI_API_KEY=os.environ.get("NCBI_API_KEY"), # Optional: set as environment variable for higher rate limits

    # --- Ripple Control ---
    SUPPORT_MIN=2,      # A paper must be cited by at least this many papers in the current frontier to be included.
    POOL_FACTOR=3.0,    # The max number of papers to keep per layer is calculated as:
    POOL_OFFSET=20,     # POOL_OFFSET + (POOL_FACTOR * size_of_previous_layer)
    MAX_DEPTH=25,       # Safety limit for how many layers to explore.

    # --- Output ---
    WRITE_CSV=True,     # Set to True to save the results as CSV files.
    OUT_PREFIX="downstream_analysis_results", # Filename prefix for the output CSVs.
)

# =================================================================================
# Logging Setup
# =================================================================================
def _setup_logging():
    """Sets up a simple, clean logger."""
    fmt = "[%(asctime)s] %(levelname)s: %(message)s"
    datefmt = "%H:%M:%S"
    # Using force=True to reconfigure logging in a Jupyter environment if the cell is re-run
    logging.basicConfig(level=logging.INFO, format=fmt, datefmt=datefmt, force=True)
    return logging.getLogger("DownstreamRipple")

log = _setup_logging()

# =================================================================================
# Data Caching & API Fetching
# =================================================================================
ICACHE: Dict[int, Dict[str, Any]] = {}  # Caches iCite data (citations, references)
TCACHE: Dict[int, Dict[str, Any]] = {}  # Caches EFetch data (title, abstract, year)

def icite_fetch_missing(pmids: List[int]):
    """Fetches citation data from NIH iCite for any PMIDs not in the cache."""
    pmids_to_fetch = [p for p in set(pmids) if p not in ICACHE]
    if not pmids_to_fetch:
        return

    log.info(f"Fetching iCite data for {len(pmids_to_fetch)} PMIDs...")
    batch_size = 200
    for i in range(0, len(pmids_to_fetch), batch_size):
        sub_list = pmids_to_fetch[i:i + batch_size]
        try:
            r = requests.get(
                "https://icite.od.nih.gov/api/pubs",
                params={"pmids": ",".join(map(str, sub_list)), "format": "json"},
                timeout=90
            )
            r.raise_for_status()
            data = r.json().get("data", [])
            for rec in data:
                pid = rec.get("pmid")
                if pid:
                    ICACHE[pid] = {
                        "cited_by": rec.get("cited_by", []),
                        "references": rec.get("references", []),
                        "citation_count": rec.get("citation_count", 0),
                        "year": rec.get("year"),
                    }
        except requests.RequestException as e:
            log.error(f"iCite request failed: {e}")
            continue

def efetch_missing(pmids: List[int]):
    """Fetches metadata (title, abstract) from PubMed EFetch for any PMIDs not in the cache."""
    pmids_to_fetch = [p for p in set(pmids) if p not in TCACHE]
    if not pmids_to_fetch:
        return

    log.info(f"Fetching PubMed metadata for {len(pmids_to_fetch)} PMIDs...")
    batch_size = 200
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    for i in range(0, len(pmids_to_fetch), batch_size):
        sub_list = pmids_to_fetch[i:i + batch_size]
        params = {
            "db": "pubmed",
            "id": ",".join(map(str, sub_list)),
            "retmode": "xml",
            "email": CFG["ENTREZ_EMAIL"]
        }
        if CFG["NCBI_API_KEY"]:
            params["api_key"] = CFG["NCBI_API_KEY"]

        try:
            r = requests.get(base_url, params=params, timeout=90)
            r.raise_for_status()
            root = ET.fromstring(r.text)
            for article in root.findall(".//PubmedArticle"):
                pmid_el = article.find(".//PMID")
                if pmid_el is None or not pmid_el.text: continue
                pid = int(pmid_el.text)

                title_el = article.find(".//ArticleTitle")
                title = "".join(title_el.itertext()).strip() if title_el is not None else ""

                abstract_els = article.findall(".//Abstract/AbstractText")
                abstract = " ".join("".join(el.itertext()).strip() for el in abstract_els)

                TCACHE[pid] = {"title": title, "abstract": abstract}
        except requests.RequestException as e:
            log.error(f"EFetch request failed: {e}")
            continue
        except ET.ParseError as e:
            log.error(f"XML parsing failed: {e}")
            continue

def get_full_meta(pmids: List[int]) -> Dict[int, Dict[str, Any]]:
    """A helper to ensure all data is fetched and combined from both caches."""
    icite_fetch_missing(pmids)
    efetch_missing(pmids)
    
    combined_meta = {}
    for pmid in set(pmids):
        icite_data = ICACHE.get(pmid, {})
        efetch_data = TCACHE.get(pmid, {})
        combined_meta[pmid] = {
            "title": efetch_data.get("title", ""),
            "abstract": efetch_data.get("abstract", ""),
            "year": icite_data.get("year"),
            "citations": icite_data.get("citation_count"),
        }
    return combined_meta

# =================================================================================
# Analysis & Scoring Helpers
# =================================================================================
def biblio_scores(pmids: List[int], meta: Dict[int, Dict[str, Any]], w_cpy=0.7, w_year=0.3) -> Dict[int, float]:
    """Scores papers based on a normalized mix of citations-per-year and publication year."""
    rows = []
    current_year = time.gmtime().tm_year
    for pmid in pmids:
        rec = meta.get(pmid, {})
        year = rec.get("year")
        citations = rec.get("citations")
        cpy = 0.0
        if year and citations is not None:
            try:
                age = max(1.0, current_year - int(year) + 1.0)
                cpy = float(citations) / age
            except (ValueError, TypeError):
                year = np.nan
        rows.append({"pmid": pmid, "cpy": cpy, "year": year})
    
    if not rows: return {}
    df = pd.DataFrame(rows).fillna(0)

    # Normalize CPY
    cpy_range = df["cpy"].max() - df["cpy"].min()
    df["cpy_n"] = (df["cpy"] - df["cpy"].min()) / cpy_range if cpy_range > 0 else 0.0
    
    # Normalize Year
    year_std = df["year"].std()
    df["year_n"] = 1 / (1 + np.exp(-(df["year"] - df["year"].mean()) / year_std)) if year_std > 0 else 0.5
    
    df["score"] = w_cpy * df["cpy_n"] + w_year * df["year_n"]
    return dict(zip(df["pmid"], df["score"]))

def tfidf_metrics(seed_pmids: List[int], kept_pmids: List[int], meta: Dict[int, Dict[str, Any]]) -> Dict[str, float]:
    """Calculates semantic similarity metrics using TF-IDF vectors."""
    if not kept_pmids:
        return {"cos_seed_centroid_med": np.nan, "centroid_drift": np.nan, "cohesion": np.nan}

    def get_text(pmid):
        rec = meta.get(pmid, {})
        return f"{rec.get('title', '')} {rec.get('abstract', '')}".strip()

    seed_texts = [get_text(p) for p in seed_pmids]
    kept_texts = [get_text(p) for p in kept_pmids]

    if not any(seed_texts) and not any(kept_texts):
        return {"cos_seed_centroid_med": np.nan, "centroid_drift": np.nan, "cohesion": np.nan}
    
    try:
        vectorizer = TfidfVectorizer(stop_words='english', max_features=50000, lowercase=True)
        X = vectorizer.fit_transform(seed_texts + kept_texts)
    except ValueError:
        return {"cos_seed_centroid_med": np.nan, "centroid_drift": np.nan, "cohesion": np.nan}

    X_seed, X_kept = X[:len(seed_texts)], X[len(seed_texts):]

    # Calculate centroids, ensuring they are standard numpy arrays
    seed_centroid = np.asarray(X_seed.mean(axis=0))
    seed_centroid_norm = np.linalg.norm(seed_centroid)
    if seed_centroid_norm > 0:
        seed_centroid /= seed_centroid_norm

    kept_centroid = np.asarray(X_kept.mean(axis=0))
    kept_centroid_norm = np.linalg.norm(kept_centroid)
    if kept_centroid_norm > 0:
        kept_centroid /= kept_centroid_norm

    # CORRECTED LINE 1: The dot product now returns a standard array, so .toarray() is removed.
    cos_sims = X_kept.dot(seed_centroid.T).flatten()
    
    # Centroid drift: how much has the topic shifted?
    drift = np.dot(kept_centroid.flatten(), seed_centroid.flatten())

    # Cohesion: mean pairwise cosine similarity within the new layer
    n = X_kept.shape[0]
    if n >= 2:
        # CORRECTED LINE 2: Ensure sum_vec is a standard array and remove .toarray().
        sum_vec = np.asarray(X_kept.sum(axis=0))
        sum_vec_sq_norm = np.dot(sum_vec, sum_vec.T)[0, 0]
        cohesion = (sum_vec_sq_norm - n) / (n * (n - 1))
    else:
        cohesion = np.nan

    return {
        "cos_seed_centroid_med": float(np.median(cos_sims)) if cos_sims.size > 0 else np.nan,
        "centroid_drift": float(drift),
        "cohesion": float(cohesion),
    }

# =================================================================================
# Main Ripple Logic
# =================================================================================
def run_downstream_for_cluster(cluster_id: int, seed_pmids: List[int]):
    """Performs the iterative downstream ripple for a single cluster."""
    log.info(f"--- Starting analysis for Cluster {cluster_id} (seeds={len(seed_pmids)}) ---")
    
    visited = set(seed_pmids)
    frontier = set(seed_pmids)
    
    history = []
    depth = 0
    while frontier and depth < CFG["MAX_DEPTH"]:
        depth += 1
        
        # 1. Discover new candidates
        icite_fetch_missing(list(frontier))
        candidates = Counter()
        for pmid in frontier:
            candidates.update(ICACHE.get(pmid, {}).get("cited_by", []))
        
        # Filter out already visited papers
        discovered = {p: count for p, count in candidates.items() if p not in visited}
        if not discovered:
            log.info(f"Depth {depth}: Frontier exhausted. No new papers discovered.")
            break

        # 2. Filter by support
        supported = {p for p, count in discovered.items() if count >= CFG["SUPPORT_MIN"]}
        
        # 3. Rank and Cap
        pool_cap = int(CFG["POOL_OFFSET"] + (CFG["POOL_FACTOR"] * len(frontier)))
        
        if len(supported) > pool_cap:
            meta = get_full_meta(list(supported))
            scores = biblio_scores(list(supported), meta)
            # Sort by score (desc), then by support count (desc) as a tie-breaker
            sorted_pmids = sorted(supported, key=lambda p: (scores.get(p, 0), discovered.get(p, 0)), reverse=True)
            kept = sorted_pmids[:pool_cap]
        else:
            kept = list(supported)

        # 4. Calculate metrics for the kept papers
        full_meta = get_full_meta(seed_pmids + kept)
        metrics = tfidf_metrics(seed_pmids, kept, full_meta)
        
        # 5. Log history and update for next iteration
        history.append({
            "cluster_id": cluster_id,
            "depth": depth,
            "prev_frontier_size": len(frontier),
            "discovered": len(discovered),
            "support_keep": len(supported),
            "after_cap": len(kept),
            "cap": pool_cap,
            **metrics,
        })
        log.info(f"Depth {depth}: Discovered={len(discovered)}, Kept={len(kept)}")

        if not kept:
            break
            
        visited.update(kept)
        frontier = set(kept)

    summary = {
        "cluster_id": cluster_id,
        "seeds": len(seed_pmids),
        "total_kept": sum(row['after_cap'] for row in history),
        "depth_reached": depth -1 if history else 0,
    }
    return pd.DataFrame(history), summary

# =================================================================================
# Main Execution Block
# =================================================================================
def run_all_clusters():
    """Loads clusters and runs the analysis for all of them."""
    # 1. Load Clusters
    try:
        with open(CFG["CLUSTERS_PATH"], "r") as f:
            clusters_raw = json.load(f)
        clusters = {int(k): [int(p) for p in v] for k, v in clusters_raw.items()}
        log.info(f"Successfully loaded {len(clusters)} clusters from '{CFG['CLUSTERS_PATH']}'.")
    except FileNotFoundError:
        log.error(f"FATAL: Input file not found at '{CFG['CLUSTERS_PATH']}'. Please check the path.")
        return
    except (json.JSONDecodeError, ValueError) as e:
        log.error(f"FATAL: Could not parse the clusters file. Ensure it is a valid JSON of {{'id': [pmid1, ...]}}. Error: {e}")
        return

    # 2. Run Analysis
    all_depths_data = []
    all_summaries = []
    for cid, pmids in sorted(clusters.items()):
        if not pmids:
            log.warning(f"Skipping Cluster {cid} as it has no seeds.")
            continue
        
        depth_df, summary = run_downstream_for_cluster(cid, pmids)
        if not depth_df.empty:
            all_depths_data.append(depth_df)
        all_summaries.append(summary)

    if not all_summaries:
        log.warning("Analysis finished, but no data was generated. Check if your seed clusters produced any results.")
        return

    # 3. Aggregate and Print Results
    depths_df = pd.concat(all_depths_data, ignore_index=True) if all_depths_data else pd.DataFrame()
    summaries_df = pd.DataFrame(all_summaries).sort_values("total_kept", ascending=False)
    
    print("\n" + "="*80)
    print("                    PER-CLUSTER SUMMARY (Top 30)")
    print("="*80)
    print(summaries_df.head(30).to_string(index=False))
    
    if not depths_df.empty:
        agg_df = depths_df.groupby("depth").agg(
            n_clusters=("cluster_id", "nunique"),
            discovered_med=("discovered", "median"),
            after_cap_med=("after_cap", "median"),
            cos_med=("cos_seed_centroid_med", "median"),
            drift_med=("centroid_drift", "median"),
            cohesion_med=("cohesion", "median"),
        ).reset_index()
        
        print("\n" + "="*80)
        print("                  DEPTH-WISE AGGREGATE METRICS (Median)")
        print("="*80)
        print(agg_df.to_string(index=False))

    # 4. Save to CSV if enabled
    if CFG["WRITE_CSV"]:
        try:
            summaries_path = f"{CFG['OUT_PREFIX']}_clusters.csv"
            depths_path = f"{CFG['OUT_PREFIX']}_depths.csv"
            
            summaries_df.to_csv(summaries_path, index=False)
            if not depths_df.empty:
                depths_df.to_csv(depths_path, index=False)
            
            log.info(f"Results successfully saved to '{summaries_path}' and '{depths_path}'.")
        except Exception as e:
            log.error(f"Could not write CSV files. Error: {e}")

    print("\n" + "="*80)
    log.info("Analysis complete.")


# --- Run the entire analysis ---
if __name__ == "__main__":
    run_all_clusters()

[13:49:50] INFO: Successfully loaded 19 clusters from 'clusters_snapshot.json'.
[13:49:50] INFO: --- Starting analysis for Cluster 0 (seeds=4) ---
[13:49:50] INFO: Fetching iCite data for 4 PMIDs...
[13:49:51] INFO: Fetching PubMed metadata for 4 PMIDs...
[13:49:51] INFO: Depth 1: Discovered=36, Kept=0
[13:49:51] INFO: --- Starting analysis for Cluster 1 (seeds=4) ---
[13:49:51] INFO: Fetching iCite data for 4 PMIDs...
[13:49:51] INFO: Fetching PubMed metadata for 4 PMIDs...
[13:49:52] INFO: Depth 1: Discovered=28, Kept=0
[13:49:52] INFO: --- Starting analysis for Cluster 2 (seeds=6) ---
[13:49:52] INFO: Fetching iCite data for 6 PMIDs...
[13:49:52] INFO: Fetching iCite data for 3 PMIDs...
[13:49:53] INFO: Fetching PubMed metadata for 9 PMIDs...
[13:49:53] INFO: Depth 1: Discovered=57, Kept=3
[13:49:53] INFO: Depth 2: Discovered=2, Kept=0
[13:49:53] INFO: --- Starting analysis for Cluster 3 (seeds=6) ---
[13:49:53] INFO: Fetching iCite data for 6 PMIDs...
[13:49:54] INFO: Fetching iCit


                    PER-CLUSTER SUMMARY (Top 30)
 cluster_id  seeds  total_kept  depth_reached
          8     12          72              4
          4      8          66              3
         18      8          47              2
         16      5          37              3
         13     31          34              3
         17      6          30              2
          6     11          23              3
         14      5          12              3
         10      6          11              2
          9      4           7              1
         15      6           7              3
          7      4           6              2
         11     12           5              1
         12      4           4              2
          3      6           4              2
          2      6           3              1
          0      4           0              0
          1      4           0              0
          5      4           0              0

                  DEPTH-WISE 

In [13]:
# %% [Folding Benchmark: D->U and U->D]
# This script performs the folding analysis on a select group of high-performing
# clusters to measure semantic stability and topical evolution.
# It relies on the functions and caches defined in the 'All-in-one' script above.

def _perform_fold_step(frontier_pmids: set, direction: str):
    """
    Performs a single ripple step (either Upstream or Downstream) with the
    global support rule and pool cap.
    
    Args:
        frontier_pmids (set): The set of PMIDs to ripple from.
        direction (str): Either 'D' for downstream (cited_by) or 'U' for upstream (references).
        
    Returns:
        list: A list of PMIDs in the new, capped layer.
    """
    if not frontier_pmids:
        return []

    # 1. Discover candidates and count layer-relative support
    # This part is crucial: we count how many papers in the *current frontier*
    # point to each candidate.
    support_counts = Counter()
    
    # Ensure all PMIDs have their citation data fetched
    icite_fetch_missing(list(frontier_pmids))
    
    for pmid in frontier_pmids:
        if direction == 'D':
            # Find papers that CITE the frontier papers
            candidates = ICACHE.get(pmid, {}).get("cited_by", [])
        else: # 'U'
            # Find papers that the frontier papers REFERENCE
            candidates = ICACHE.get(pmid, {}).get("references", [])
        support_counts.update(candidates)

    # 2. Gate: Keep only candidates with support >= SUPPORT_MIN
    supported_pmids = {p for p, count in support_counts.items() if count >= CFG["SUPPORT_MIN"]}
    
    # 3. Cap: If the set is too large, rank by biblio score and keep the top K
    pool_cap = int(CFG["POOL_OFFSET"] + (CFG["POOL_FACTOR"] * len(frontier_pmids)))
    
    if len(supported_pmids) > pool_cap:
        log.info(f"  > Capping {len(supported_pmids)} supported papers to {pool_cap}...")
        meta = get_full_meta(list(supported_pmids))
        scores = biblio_scores(list(supported_pmids), meta)
        kept_pmids = sorted(supported_pmids, key=lambda p: scores.get(p, 0), reverse=True)[:pool_cap]
    else:
        kept_pmids = list(supported_pmids)
        
    return kept_pmids

def run_folding_benchmark():
    """
    Main function to run the D->U and U->D benchmarks on specified clusters.
    """
    # Load the full cluster set to get seed PMIDs
    try:
        with open(CFG["CLUSTERS_PATH"], "r") as f:
            clusters = {int(k): v for k, v in json.load(f).items()}
    except FileNotFoundError:
        log.error(f"FATAL: Clusters file not found at '{CFG['CLUSTERS_PATH']}'. Cannot run benchmark.")
        return

    cluster_ids_to_run = [8, 4, 18, 16, 13]
    log.info(f"=== Starting Folding Benchmark for clusters: {cluster_ids_to_run} ===")

    for cid in cluster_ids_to_run:
        seed_pmids = set(clusters.get(cid, []))
        if not seed_pmids:
            log.warning(f"Skipping cluster {cid}: No seeds found.")
            continue

        print("\n" + "="*80)
        print(f"               Cluster {cid} (Seeds: {len(seed_pmids)})")
        print("="*80)

        # --- Perform Folds ---
        log.info(f"[{cid}] Performing D->U fold...")
        d_set = set(_perform_fold_step(seed_pmids, 'D'))
        du_set = set(_perform_fold_step(d_set, 'U'))

        log.info(f"[{cid}] Performing U->D fold...")
        u_set = set(_perform_fold_step(seed_pmids, 'U'))
        ud_set = set(_perform_fold_step(u_set, 'D'))
        
        # Consolidate all unique PMIDs to fetch metadata efficiently
        all_pmids_in_play = seed_pmids | d_set | du_set | u_set | ud_set
        log.info(f"[{cid}] Fetching metadata for {len(all_pmids_in_play)} unique PMIDs...")
        full_meta = get_full_meta(list(all_pmids_in_play))

        # --- Analyze and Report ---
        sets_to_analyze = {
            'S (Original Seeds)': seed_pmids,
            'D (Downstream)': d_set,
            'U (Upstream)': u_set,
            'D->U (Folded)': du_set,
            'U->D (Folded)': ud_set,
        }

        results = []
        for name, pmid_set in sets_to_analyze.items():
            if not pmid_set:
                results.append({'Set': name, 'Count': 0, 'Cos (vs S)': np.nan, 'Drift (vs S)': np.nan, 'Cohesion (Internal)': np.nan})
                continue
            
            # Metrics vs. the original seed set
            metrics_vs_s = tfidf_metrics(list(seed_pmids), list(pmid_set), full_meta)
            
            # Internal cohesion of the set itself
            internal_metrics = tfidf_metrics(list(pmid_set), list(pmid_set), full_meta)
            
            results.append({
                'Set': name,
                'Count': len(pmid_set),
                'Cos (vs S)': metrics_vs_s['cos_seed_centroid_med'] if name != 'S (Original Seeds)' else 1.0,
                'Drift (vs S)': metrics_vs_s['centroid_drift'] if name != 'S (Original Seeds)' else 1.0,
                'Cohesion (Internal)': internal_metrics['cohesion']
            })
            
        # --- Print Semantic Geometry Table ---
        results_df = pd.DataFrame(results).set_index('Set')
        print("\n--- Semantic Geometry ---\n")
        print(results_df.to_string(float_format="%.4f"))
        
        # --- Print Top 20 Biblio Picks ---
        for name, pmid_set in sets_to_analyze.items():
            if name == 'S (Original Seeds)' or not pmid_set:
                continue
            
            print(f"\n--- Top 20 Bibliometric Picks for {name} ---\n")
            scores = biblio_scores(list(pmid_set), full_meta)
            top_20 = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:20]
            
            top_picks_data = []
            for i, (pmid, score) in enumerate(top_20):
                title = full_meta.get(pmid, {}).get('title', 'N/A')
                top_picks_data.append({
                    'Rank': i + 1,
                    'PMID': pmid,
                    'Score': score,
                    'Title': title[:80] + '...' if len(title) > 80 else title
                })
            
            if top_picks_data:
                print(pd.DataFrame(top_picks_data).to_string(index=False))
            else:
                print("(No papers found for this set)")

    log.info("=== Folding Benchmark Complete ===")

# --- Run the benchmark ---
run_folding_benchmark()

[13:59:46] INFO: === Starting Folding Benchmark for clusters: [8, 4, 18, 16, 13] ===
[13:59:46] INFO: [8] Performing D->U fold...
[13:59:46] INFO:   > Capping 136 supported papers to 89...
[13:59:46] INFO: Fetching iCite data for 121 PMIDs...



               Cluster 8 (Seeds: 12)


[13:59:47] INFO: Fetching PubMed metadata for 121 PMIDs...
[13:59:48] INFO: [8] Performing U->D fold...
[13:59:48] INFO: Fetching iCite data for 11 PMIDs...
[13:59:48] INFO:   > Capping 220 supported papers to 155...
[13:59:48] INFO: Fetching iCite data for 133 PMIDs...
[13:59:49] INFO: Fetching PubMed metadata for 140 PMIDs...
[13:59:50] INFO: [8] Fetching metadata for 221 unique PMIDs...
[13:59:50] INFO: Fetching PubMed metadata for 4 PMIDs...
[13:59:51] INFO: [4] Performing D->U fold...
[13:59:51] INFO:   > Capping 122 supported papers to 68...
[13:59:51] INFO: Fetching iCite data for 107 PMIDs...



--- Semantic Geometry ---

                    Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                     
S (Original Seeds)     12      1.0000        1.0000               0.1682
D (Downstream)         23      0.3167        0.7749               0.1297
U (Upstream)           45      0.3615        0.8144               0.1559
D->U (Folded)          89      0.1879        0.7673               0.0956
U->D (Folded)         155      0.1945        0.7947               0.0832

--- Top 20 Bibliometric Picks for D (Downstream) ---

 Rank     PMID    Score                                                                               Title
    1 28803811 0.769316                                           Office Operative Hysteroscopy: An Update.
    2 32008214 0.412323 Updates in office hysteroscopy: a practical decalogue to perform a correct proce...
    3 35926213 0.328990 Implementation of Office Hysteroscopy for the Evaluation a

[13:59:52] INFO: Fetching PubMed metadata for 107 PMIDs...
[13:59:53] INFO: [4] Performing U->D fold...
[13:59:53] INFO:   > Capping 48 supported papers to 44...
[13:59:53] INFO: Fetching iCite data for 9 PMIDs...
[13:59:54] INFO: Fetching PubMed metadata for 9 PMIDs...
[13:59:54] INFO:   > Capping 688 supported papers to 152...
[13:59:54] INFO: Fetching iCite data for 580 PMIDs...
[13:59:57] INFO: Fetching PubMed metadata for 580 PMIDs...
[14:00:03] INFO: [4] Fetching metadata for 246 unique PMIDs...
[14:00:03] INFO: [18] Performing D->U fold...
[14:00:03] INFO:   > Capping 246 supported papers to 119...
[14:00:03] INFO: Fetching iCite data for 223 PMIDs...



--- Semantic Geometry ---

                    Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                     
S (Original Seeds)      8      1.0000        1.0000               0.1349
D (Downstream)         16      0.2321        0.6487               0.1108
U (Upstream)           44      0.2230        0.6787               0.0929
D->U (Folded)          68      0.1909        0.7126               0.0694
U->D (Folded)         152      0.1376        0.5971               0.0569

--- Top 20 Bibliometric Picks for D (Downstream) ---

 Rank     PMID    Score                                                                               Title
    1 29230709 0.777996 Point-of-care gastric ultrasound and aspiration risk assessment: a narrative rev...
    2 37982593 0.484327 Evaluation of the 'Sip Til Send' regimen before elective caesarean delivery usin...
    3 29265187 0.431531 Gastric ultrasound in the third trimester of pregnancy: a 

[14:00:08] INFO: Fetching PubMed metadata for 223 PMIDs...
[14:00:11] INFO: [18] Performing U->D fold...
[14:00:11] INFO: Fetching iCite data for 1 PMIDs...
[14:00:11] INFO:   > Capping 781 supported papers to 80...
[14:00:11] INFO: Fetching iCite data for 679 PMIDs...
[14:00:15] INFO: Fetching PubMed metadata for 679 PMIDs...
[14:00:21] INFO: [18] Fetching metadata for 216 unique PMIDs...
[14:00:21] INFO: Fetching PubMed metadata for 1 PMIDs...
[14:00:22] INFO: [16] Performing D->U fold...
[14:00:22] INFO:   > Capping 113 supported papers to 80...
[14:00:22] INFO: Fetching iCite data for 107 PMIDs...



--- Semantic Geometry ---

                    Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                     
S (Original Seeds)      8      1.0000        1.0000               0.2704
D (Downstream)         33      0.2374        0.7154               0.1188
U (Upstream)           20      0.3342        0.7720               0.1462
D->U (Folded)         119      0.1822        0.7488               0.0770
U->D (Folded)          80      0.1326        0.6096               0.0716

--- Top 20 Bibliometric Picks for D (Downstream) ---

 Rank     PMID    Score                                                                               Title
    1 38884982 0.917544 Povidone Iodine vs Chlorhexidine Gluconate in Alcohol for Preoperative Skin Anti...
    2 35644158 0.694444 Alcoholic chlorhexidine skin preparation or triclosan-coated sutures to reduce s...
    3 35985350 0.655556 Efficacy of different preoperative skin antiseptics on the

[14:00:23] INFO: Fetching PubMed metadata for 107 PMIDs...
[14:00:26] INFO: [16] Performing U->D fold...
[14:00:26] INFO: Fetching iCite data for 26 PMIDs...
[14:00:26] INFO:   > Capping 1308 supported papers to 122...
[14:00:26] INFO: Fetching iCite data for 1235 PMIDs...
[14:00:32] INFO: Fetching PubMed metadata for 1239 PMIDs...
[14:00:45] INFO: [16] Fetching metadata for 235 unique PMIDs...
[14:00:45] INFO: Fetching PubMed metadata for 22 PMIDs...
[14:00:46] INFO: [13] Performing D->U fold...
[14:00:46] INFO:   > Capping 253 supported papers to 77...
[14:00:46] INFO: Fetching iCite data for 242 PMIDs...



--- Semantic Geometry ---

                    Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                     
S (Original Seeds)      5      1.0000        1.0000               0.2483
D (Downstream)         20      0.2252        0.5977               0.0980
U (Upstream)           34      0.1990        0.5851               0.1086
D->U (Folded)          80      0.2564        0.7113               0.1106
U->D (Folded)         122      0.1617        0.5999               0.0806

--- Top 20 Bibliometric Picks for D (Downstream) ---

 Rank     PMID    Score                                                                               Title
    1 36134567 0.869012 Recommendations From the International Consensus Conference on Anemia Management...
    2 36875314 0.388415 Iron Deficiency Anemia in Colorectal Cancer Patients: Is Preoperative Intravenou...
    3 36328926 0.377967                     Treatment Strategies in Anemic Patient

[14:00:48] INFO: Fetching PubMed metadata for 242 PMIDs...
[14:00:51] INFO: [13] Performing U->D fold...
[14:00:51] INFO: Fetching iCite data for 74 PMIDs...
[14:00:52] INFO:   > Capping 10671 supported papers to 314...
[14:00:52] INFO: Fetching iCite data for 10468 PMIDs...
[14:01:36] INFO: Fetching PubMed metadata for 10510 PMIDs...
[14:02:34] ERROR: EFetch request failed: Response ended prematurely
[14:03:41] INFO: [13] Fetching metadata for 527 unique PMIDs...
[14:03:41] INFO: Fetching PubMed metadata for 40 PMIDs...
[14:03:42] INFO: === Folding Benchmark Complete ===



--- Semantic Geometry ---

                    Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                     
S (Original Seeds)     31      1.0000        1.0000               0.0945
D (Downstream)         19      0.2107        0.5697               0.0866
U (Upstream)           98      0.2305        0.7339               0.0817
D->U (Folded)          77      0.1493        0.5980               0.0712
U->D (Folded)         314      0.0689        0.4562               0.0326

--- Top 20 Bibliometric Picks for D (Downstream) ---

 Rank     PMID    Score                                                                               Title
    1 36756380 0.852555 ESGO/ESHRE/ESGE Guidelines for the fertility-sparing treatment of patients with ...
    2 36746507 0.536869 ESGO/ESHRE/ESGE Guidelines for the fertility-sparing treatment of patients with ...
    3 38438175 0.384514 Staging by imaging in gynecologic cancer and the role of u

In [15]:
# %% [Folding Benchmark with CORRECTED Conditional Hard Cap]
# This benchmark corrects the flawed capping logic. A strict growth-factor cap is now
# ONLY applied if the number of supported papers exceeds an "explosion threshold" (e.g., >100).
# Otherwise, ripples grow normally under the softer pool cap.

# --- New/Updated Configuration for the Corrected Cap ---
CFG['EXPLOSION_THRESHOLD'] = 100    # Hard cap is only considered if supported papers exceed this number.
CFG['HARD_CAP_FACTOR'] = 2.0       # Max growth factor (2.0 = 2x frontier size).
CFG['HARD_CAP_MINIMUM'] = 20        # A minimum size for the hard cap when it's active.

log.info(f"CONDITIONAL HARD CAP ENABLED: Strict cap only triggers for sets > {CFG['EXPLOSION_THRESHOLD']}.")

def _perform_fold_step_with_conditional_cap(frontier_pmids: set, direction: str):
    """
    Performs a single ripple step with the corrected conditional capping logic.
    """
    if not frontier_pmids:
        return []

    frontier_size = len(frontier_pmids)
    
    # 1. Discover candidates and count support
    support_counts = Counter()
    icite_fetch_missing(list(frontier_pmids))
    for pmid in frontier_pmids:
        candidates = ICACHE.get(pmid, {}).get("cited_by" if direction == 'D' else "references", [])
        support_counts.update(candidates)

    # 2. Gate by support
    supported_pmids = {p for p, count in support_counts.items() if count >= CFG["SUPPORT_MIN"]}
    
    # 3. CORRECTED LOGIC: Decide which cap to use
    if len(supported_pmids) > CFG['EXPLOSION_THRESHOLD']:
        # EXPLOSION DETECTED: Use the strict, growth-factor-based hard cap
        log.warning(f"  > EXPLOSION DETECTED: {len(supported_pmids)} supported papers. Applying strict growth cap.")
        cap_size = int(max(CFG['HARD_CAP_MINIMUM'], CFG['HARD_CAP_FACTOR'] * frontier_size))
    else:
        # NORMAL GROWTH: Use the original, more generous pool cap
        cap_size = int(CFG["POOL_OFFSET"] + (CFG["POOL_FACTOR"] * frontier_size))
    
    # 4. Apply the chosen cap
    if len(supported_pmids) > cap_size:
        log.info(f"  > Capping {len(supported_pmids)} papers to {cap_size} using biblio-score...")
        meta = get_full_meta(list(supported_pmids))
        scores = biblio_scores(list(supported_pmids), meta)
        final_kept_pmids = sorted(list(supported_pmids), key=lambda p: scores.get(p, 0), reverse=True)[:cap_size]
    else:
        final_kept_pmids = list(supported_pmids)
        
    return final_kept_pmids

def run_folding_benchmark_with_conditional_cap():
    """
    Main function to run the D->U and U->D benchmarks with the corrected capping logic.
    """
    try:
        with open(CFG["CLUSTERS_PATH"], "r") as f:
            clusters = {int(k): v for k, v in json.load(f).items()}
    except FileNotFoundError:
        log.error(f"FATAL: Clusters file not found at '{CFG['CLUSTERS_PATH']}'.")
        return

    cluster_ids_to_run = [8, 4, 18, 16, 13]
    log.info(f"=== Starting Folding Benchmark w/ Conditional Cap for clusters: {cluster_ids_to_run} ===")

    for cid in cluster_ids_to_run:
        seed_pmids = set(clusters.get(cid, []))
        if not seed_pmids:
            continue

        print("\n" + "="*80)
        print(f"               Cluster {cid} (Seeds: {len(seed_pmids)})")
        print("="*80)

        log.info(f"[{cid}] Performing D->U fold...")
        d_set = set(_perform_fold_step_with_conditional_cap(seed_pmids, 'D'))
        du_set = set(_perform_fold_step_with_conditional_cap(d_set, 'U'))

        log.info(f"[{cid}] Performing U->D fold...")
        u_set = set(_perform_fold_step_with_conditional_cap(seed_pmids, 'U'))
        ud_set = set(_perform_fold_step_with_conditional_cap(u_set, 'D'))
        
        all_pmids_in_play = seed_pmids | d_set | du_set | u_set | ud_set
        log.info(f"[{cid}] Fetching metadata for {len(all_pmids_in_play)} unique PMIDs...")
        full_meta = get_full_meta(list(all_pmids_in_play))

        # --- Analyze and Report ---
        sets_to_analyze = {
            'S (Original Seeds)': seed_pmids, 'D (Downstream)': d_set, 'U (Upstream)': u_set,
            'D->U (Folded)': du_set, 'U->D (Folded)': ud_set,
        }
        results = []
        for name, pmid_set in sets_to_analyze.items():
            if not pmid_set:
                results.append({'Set': name, 'Count': 0, 'Cos (vs S)': np.nan, 'Drift (vs S)': np.nan, 'Cohesion (Internal)': np.nan})
                continue
            
            metrics_vs_s = tfidf_metrics(list(seed_pmids), list(pmid_set), full_meta)
            internal_metrics = tfidf_metrics(list(pmid_set), list(pmid_set), full_meta)
            
            results.append({
                'Set': name, 'Count': len(pmid_set),
                'Cos (vs S)': metrics_vs_s['cos_seed_centroid_med'] if name != 'S (Original Seeds)' else 1.0,
                'Drift (vs S)': metrics_vs_s['centroid_drift'] if name != 'S (Original Seeds)' else 1.0,
                'Cohesion (Internal)': internal_metrics['cohesion']
            })
            
        print("\n--- Semantic Geometry ---\n")
        print(pd.DataFrame(results).set_index('Set').to_string(float_format="%.4f"))

    log.info("=== Folding Benchmark w/ Conditional Cap Complete ===")


# --- Run the new benchmark ---
run_folding_benchmark_with_conditional_cap()

[14:12:25] INFO: CONDITIONAL HARD CAP ENABLED: Strict cap only triggers for sets > 100.
[14:12:25] INFO: === Starting Folding Benchmark w/ Conditional Cap for clusters: [8, 4, 18, 16, 13] ===
[14:12:25] INFO: [8] Performing D->U fold...
[14:12:25] INFO:   > Capping 136 papers to 46 using biblio-score...
[14:12:25] INFO: [8] Performing U->D fold...
[14:12:25] INFO:   > Capping 220 papers to 90 using biblio-score...
[14:12:25] INFO: [8] Fetching metadata for 161 unique PMIDs...
[14:12:25] INFO: [4] Performing D->U fold...
[14:12:25] INFO:   > Capping 122 papers to 32 using biblio-score...
[14:12:25] INFO: [4] Performing U->D fold...
[14:12:25] INFO:   > Capping 48 papers to 44 using biblio-score...
[14:12:25] INFO:   > Capping 688 papers to 88 using biblio-score...
[14:12:25] INFO: [4] Fetching metadata for 162 unique PMIDs...



               Cluster 8 (Seeds: 12)

--- Semantic Geometry ---

                    Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                     
S (Original Seeds)     12      1.0000        1.0000               0.1682
D (Downstream)         23      0.3167        0.7749               0.1297
U (Upstream)           45      0.3615        0.8144               0.1559
D->U (Folded)          46      0.1906        0.7145               0.0969
U->D (Folded)          90      0.1865        0.7519               0.0836

               Cluster 4 (Seeds: 8)


[14:12:25] INFO: [18] Performing D->U fold...
[14:12:25] INFO:   > Capping 246 papers to 66 using biblio-score...
[14:12:25] INFO: [18] Performing U->D fold...
[14:12:25] INFO:   > Capping 781 papers to 40 using biblio-score...
[14:12:25] INFO: [18] Fetching metadata for 140 unique PMIDs...
[14:12:25] INFO: [16] Performing D->U fold...
[14:12:25] INFO:   > Capping 113 papers to 40 using biblio-score...
[14:12:25] INFO: [16] Performing U->D fold...
[14:12:25] INFO:   > Capping 1308 papers to 68 using biblio-score...
[14:12:25] INFO: Fetching PubMed metadata for 1 PMIDs...



--- Semantic Geometry ---

                    Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                     
S (Original Seeds)      8      1.0000        1.0000               0.1349
D (Downstream)         16      0.2321        0.6487               0.1108
U (Upstream)           44      0.2230        0.6787               0.0929
D->U (Folded)          32      0.2214        0.6872               0.0802
U->D (Folded)          88      0.1399        0.5691               0.0584

               Cluster 18 (Seeds: 8)

--- Semantic Geometry ---

                    Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                     
S (Original Seeds)      8      1.0000        1.0000               0.2704
D (Downstream)         33      0.2374        0.7154               0.1188
U (Upstream)           20      0.3342        0.7720               0.1462
D->U (Folded)          66     

[14:12:25] INFO: [16] Fetching metadata for 146 unique PMIDs...
[14:12:25] INFO: [13] Performing D->U fold...
[14:12:25] INFO:   > Capping 253 papers to 38 using biblio-score...
[14:12:25] INFO: [13] Performing U->D fold...
[14:12:26] INFO:   > Capping 10671 papers to 196 using biblio-score...
[14:12:26] INFO: Fetching PubMed metadata for 55 PMIDs...



--- Semantic Geometry ---

                    Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                     
S (Original Seeds)      5      1.0000        1.0000               0.2483
D (Downstream)         20      0.2252        0.5977               0.0980
U (Upstream)           34      0.1990        0.5851               0.1086
D->U (Folded)          40      0.2482        0.6542               0.1132
U->D (Folded)          68      0.1732        0.5826               0.0908

               Cluster 13 (Seeds: 31)


[14:12:27] INFO: [13] Fetching metadata for 373 unique PMIDs...
[14:12:27] INFO: Fetching PubMed metadata for 1 PMIDs...
[14:12:27] INFO: === Folding Benchmark w/ Conditional Cap Complete ===



--- Semantic Geometry ---

                    Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                     
S (Original Seeds)     31      1.0000        1.0000               0.0945
D (Downstream)         19      0.2107        0.5697               0.0866
U (Upstream)           98      0.2305        0.7339               0.0817
D->U (Folded)          38      0.1417        0.5627               0.0741
U->D (Folded)         196      0.0701        0.4170               0.0364


In [16]:
# %% [Folding Benchmark with Absolute 100-Cap and Title Logging]
# This benchmark implements the simplified capping logic: every ripple step is
# hard-capped at a maximum of 100 papers. It also logs the top-5 titles
# from each step for direct semantic assessment.

# --- New Configuration for the Absolute Cap ---
CFG['GLOBAL_ABSOLUTE_CAP'] = 100 # Each ripple adds at most 100 new papers.

log.info(f"ABSOLUTE CAP ENABLED: Each ripple step will be capped at a max of {CFG['GLOBAL_ABSOLUTE_CAP']} papers.")

def _perform_fold_step_with_absolute_cap(frontier_pmids: set, direction: str):
    """
    Performs a single ripple step with a simple, absolute cap of 100.
    """
    if not frontier_pmids:
        return []

    # 1. Discover candidates and count support
    support_counts = Counter()
    icite_fetch_missing(list(frontier_pmids))
    for pmid in frontier_pmids:
        candidates = ICACHE.get(pmid, {}).get("cited_by" if direction == 'D' else "references", [])
        support_counts.update(candidates)

    # 2. Gate by support
    supported_pmids = {p for p, count in support_counts.items() if count >= CFG["SUPPORT_MIN"]}
    
    # 3. Apply the Global Absolute Cap
    if len(supported_pmids) > CFG['GLOBAL_ABSOLUTE_CAP']:
        log.info(f"  > Capping {len(supported_pmids)} supported papers to {CFG['GLOBAL_ABSOLUTE_CAP']} using biblio-score...")
        meta = get_full_meta(list(supported_pmids))
        scores = biblio_scores(list(supported_pmids), meta)
        final_kept_pmids = sorted(list(supported_pmids), key=lambda p: scores.get(p, 0), reverse=True)[:CFG['GLOBAL_ABSOLUTE_CAP']]
    else:
        final_kept_pmids = list(supported_pmids)
        # We still need scores to find the top 5 for logging, even if we didn't cap.
        meta = get_full_meta(final_kept_pmids)
        scores = biblio_scores(final_kept_pmids, meta)

    # 4. NEW: Log Top-5 Titles for qualitative assessment
    if final_kept_pmids:
        log.info(f"  > Top 5 semantic picks for this step:")
        # Sort by score to find the top 5
        sorted_for_logging = sorted(final_kept_pmids, key=lambda p: scores.get(p, 0), reverse=True)
        for i, pmid in enumerate(sorted_for_logging[:5]):
            title = meta.get(pmid, {}).get('title', 'N/A')
            log.info(f"    {i+1}. [{pmid}] {title[:90]}")
            
    return final_kept_pmids

def run_folding_benchmark_with_absolute_cap():
    """
    Main function to run the D->U and U->D benchmarks with the new absolute capping logic.
    """
    try:
        with open(CFG["CLUSTERS_PATH"], "r") as f:
            clusters = {int(k): v for k, v in json.load(f).items()}
    except FileNotFoundError:
        log.error(f"FATAL: Clusters file not found at '{CFG['CLUSTERS_PATH']}'.")
        return

    cluster_ids_to_run = [8, 4, 18, 16, 13]
    log.info(f"=== Starting Folding Benchmark w/ Absolute Cap for clusters: {cluster_ids_to_run} ===")

    for cid in cluster_ids_to_run:
        seed_pmids = set(clusters.get(cid, []))
        if not seed_pmids:
            continue

        print("\n" + "="*80)
        print(f"               Cluster {cid} (Seeds: {len(seed_pmids)})")
        print("="*80)

        log.info(f"[{cid}] Performing D->U fold...")
        d_set = set(_perform_fold_step_with_absolute_cap(seed_pmids, 'D'))
        du_set = set(_perform_fold_step_with_absolute_cap(d_set, 'U'))

        log.info(f"[{cid}] Performing U->D fold...")
        u_set = set(_perform_fold_step_with_absolute_cap(seed_pmids, 'U'))
        ud_set = set(_perform_fold_step_with_absolute_cap(u_set, 'D'))
        
        all_pmids_in_play = seed_pmids | d_set | du_set | u_set | ud_set
        log.info(f"[{cid}] Fetching metadata for {len(all_pmids_in_play)} unique PMIDs...")
        full_meta = get_full_meta(list(all_pmids_in_play))

        # --- Analyze and Report ---
        sets_to_analyze = {
            'S (Original Seeds)': seed_pmids, 'D (Downstream)': d_set, 'U (Upstream)': u_set,
            'D->U (Folded)': du_set, 'U->D (Folded)': ud_set,
        }
        results = []
        for name, pmid_set in sets_to_analyze.items():
            if not pmid_set:
                results.append({'Set': name, 'Count': 0, 'Cos (vs S)': np.nan, 'Drift (vs S)': np.nan, 'Cohesion (Internal)': np.nan})
                continue
            
            metrics_vs_s = tfidf_metrics(list(seed_pmids), list(pmid_set), full_meta)
            internal_metrics = tfidf_metrics(list(pmid_set), list(pmid_set), full_meta)
            
            results.append({
                'Set': name, 'Count': len(pmid_set),
                'Cos (vs S)': metrics_vs_s['cos_seed_centroid_med'] if name != 'S (Original Seeds)' else 1.0,
                'Drift (vs S)': metrics_vs_s['centroid_drift'] if name != 'S (Original Seeds)' else 1.0,
                'Cohesion (Internal)': internal_metrics['cohesion']
            })
            
        print("\n--- Semantic Geometry ---\n")
        print(pd.DataFrame(results).set_index('Set').to_string(float_format="%.4f"))

    log.info("=== Folding Benchmark w/ Absolute Cap Complete ===")

# --- Run the new benchmark ---
run_folding_benchmark_with_absolute_cap()

[14:18:20] INFO: ABSOLUTE CAP ENABLED: Each ripple step will be capped at a max of 100 papers.
[14:18:20] INFO: === Starting Folding Benchmark w/ Absolute Cap for clusters: [8, 4, 18, 16, 13] ===
[14:18:20] INFO: [8] Performing D->U fold...
[14:18:20] INFO:   > Top 5 semantic picks for this step:
[14:18:20] INFO:     1. [28803811] Office Operative Hysteroscopy: An Update.
[14:18:20] INFO:     2. [32008214] Updates in office hysteroscopy: a practical decalogue to perform a correct procedure.
[14:18:20] INFO:     3. [35926213] Implementation of Office Hysteroscopy for the Evaluation and Treatment of Intrauterine Pat
[14:18:20] INFO:     4. [39160077] Outpatient Hysteroscopy: (Green-top Guideline no. 59).
[14:18:20] INFO:     5. [33219606] Cervical dilatation and preparation prior to outpatient hysteroscopy: a systematic review 
[14:18:20] INFO:   > Capping 136 supported papers to 100 using biblio-score...
[14:18:20] INFO:   > Top 5 semantic picks for this step:
[14:18:20] INFO:     1. [2


               Cluster 8 (Seeds: 12)

--- Semantic Geometry ---

                    Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                     
S (Original Seeds)     12      1.0000        1.0000               0.1682
D (Downstream)         23      0.3167        0.7749               0.1297
U (Upstream)           45      0.3615        0.8144               0.1559
D->U (Folded)         100      0.1900        0.7841               0.0983
U->D (Folded)         100      0.1867        0.7651               0.0820

               Cluster 4 (Seeds: 8)


[14:18:20] INFO:   > Capping 122 supported papers to 100 using biblio-score...
[14:18:20] INFO:   > Top 5 semantic picks for this step:
[14:18:20] INFO:     1. [18064739] The Strengthening the Reporting of Observational Studies in Epidemiology (STROBE) statemen
[14:18:20] INFO:     2. [17941715] Strengthening the Reporting of Observational Studies in Epidemiology (STROBE): explanation
[14:18:20] INFO:     3. [22392031] International evidence-based recommendations for point-of-care lung ultrasound.
[14:18:20] INFO:     4. [28045707] Practice Guidelines for Preoperative Fasting and the Use of Pharmacologic Agents to Reduce
[14:18:20] INFO:     5. [21356004] Saving Mothers' Lives: Reviewing maternal deaths to make motherhood safer: 2006-2008. The 
[14:18:20] INFO: [4] Performing U->D fold...
[14:18:20] INFO:   > Top 5 semantic picks for this step:
[14:18:20] INFO:     1. [21447488] Major complications of airway management in the UK: results of the Fourth National Audit P
[14:18:20] INFO: 


--- Semantic Geometry ---

                    Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                     
S (Original Seeds)      8      1.0000        1.0000               0.1349
D (Downstream)         16      0.2321        0.6487               0.1108
U (Upstream)           48      0.2037        0.6729               0.0884
D->U (Folded)         100      0.1809        0.7067               0.0662
U->D (Folded)         100      0.1407        0.5763               0.0584

               Cluster 18 (Seeds: 8)

--- Semantic Geometry ---

                    Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                     
S (Original Seeds)      8      1.0000        1.0000               0.2704
D (Downstream)         33      0.2374        0.7154               0.1188
U (Upstream)           20      0.3342        0.7720               0.1462
D->U (Folded)         100     

[14:18:22] INFO:   > Top 5 semantic picks for this step:
[14:18:22] INFO:     1. [36134567] Recommendations From the International Consensus Conference on Anemia Management in Surgic
[14:18:22] INFO:     2. [36875314] Iron Deficiency Anemia in Colorectal Cancer Patients: Is Preoperative Intravenous Iron Inf
[14:18:22] INFO:     3. [36328926] Treatment Strategies in Anemic Patients Before Cardiac Surgery.
[14:18:22] INFO:     4. [38167004] Reported outcomes in patients with iron deficiency or iron deficiency anemia undergoing ma
[14:18:22] INFO:     5. [36631901] The efficacy of intravenous iron for treatment of anemia before cardiac surgery: An update
[14:18:22] INFO:   > Capping 113 supported papers to 100 using biblio-score...
[14:18:22] INFO:   > Top 5 semantic picks for this step:
[14:18:22] INFO:     1. [30401705] Anemia of inflammation.
[14:18:22] INFO:     2. [15758012] Anemia of chronic disease.
[14:18:22] INFO:     3. [25946282] Iron-deficiency anemia.
[14:18:22] INFO:     4. 


--- Semantic Geometry ---

                    Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                     
S (Original Seeds)      5      1.0000        1.0000               0.2483
D (Downstream)         20      0.2252        0.5977               0.0980
U (Upstream)           34      0.1990        0.5851               0.1086
D->U (Folded)         100      0.2443        0.7182               0.1050
U->D (Folded)         100      0.1630        0.5909               0.0819

               Cluster 13 (Seeds: 31)


[14:18:23] INFO:   > Top 5 semantic picks for this step:
[14:18:23] INFO:     1. [36756380] ESGO/ESHRE/ESGE Guidelines for the fertility-sparing treatment of patients with endometria
[14:18:23] INFO:     2. [36746507] ESGO/ESHRE/ESGE Guidelines for the fertility-sparing treatment of patients with endometria
[14:18:23] INFO:     3. [38438175] Staging by imaging in gynecologic cancer and the role of ultrasound: an update of European
[14:18:23] INFO:     4. [33793008] Machine Learning-Based Integration of Prognostic Magnetic Resonance Imaging Biomarkers for
[14:18:23] INFO:     5. [37010330] ESGO/ESHRE/ESGE Guidelines for the fertility-sparing treatment of patients with endometria
[14:18:23] INFO:   > Capping 253 supported papers to 100 using biblio-score...
[14:18:23] INFO:   > Top 5 semantic picks for this step:
[14:18:23] INFO:     1. [33538338] Global Cancer Statistics 2020: GLOBOCAN Estimates of Incidence and Mortality Worldwide for
[14:18:23] INFO:     2. [36791750] Uterine Neoplasm


--- Semantic Geometry ---

                    Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                     
S (Original Seeds)     31      1.0000        1.0000               0.0945
D (Downstream)         19      0.2107        0.5697               0.0866
U (Upstream)           98      0.2305        0.7339               0.0817
D->U (Folded)         100      0.1427        0.6041               0.0683
U->D (Folded)         100      0.0751        0.3756               0.0445


In [17]:
# %% [OVERHAULED Folding Benchmark with Semantic Capping & Full Network Analysis]
# This script corrects prior failures by implementing two major overhauls:
# 1. The top-K cap is now 100% semantic, using TF-IDF cosine similarity to the seed cluster.
# 2. The final analysis includes the complete "Augmented Set" (the union of all ripples).
# Logging is also enhanced to include semantic scores with each title.

# --- Configuration for the Semantic Cap ---
CFG['GLOBAL_ABSOLUTE_CAP'] = 100 # Each ripple adds at most 100 new papers.

log.info(f"SEMANTIC CAP ENABLED: Ripples capped at {CFG['GLOBAL_ABSOLUTE_CAP']} using cosine similarity to seeds.")

def _get_semantic_scores(seed_pmids: set, candidate_pmids: set, meta: dict) -> dict:
    """
    Calculates cosine similarity for each candidate relative to the seed centroid.
    """
    if not seed_pmids or not candidate_pmids:
        return {}
    
    seed_texts = [f"{meta.get(p, {}).get('title','')} {meta.get(p, {}).get('abstract','')}".strip() for p in seed_pmids]
    candidate_texts = [f"{meta.get(p, {}).get('title','')} {meta.get(p, {}).get('abstract','')}".strip() for p in candidate_pmids]

    try:
        vectorizer = TfidfVectorizer(stop_words='english', max_features=50000, lowercase=True)
        X = vectorizer.fit_transform(seed_texts + candidate_texts)
        X_seed, X_candidates = X[:len(seed_texts)], X[len(seed_texts):]
        
        seed_centroid = np.asarray(X_seed.mean(axis=0))
        norm = np.linalg.norm(seed_centroid)
        if norm > 0:
            seed_centroid /= norm
            
        scores = X_candidates.dot(seed_centroid.T).flatten()
        return dict(zip(candidate_pmids, scores))
    except Exception as e:
        log.error(f"  > Error in semantic scoring: {e}")
        return {pmid: 0.0 for pmid in candidate_pmids}

def _perform_fold_step_with_semantic_cap(seed_pmids: set, frontier_pmids: set, direction: str):
    """
    Performs a ripple step, using semantic similarity for capping.
    Crucially, similarity is ALWAYS measured against the original seed_pmids.
    """
    if not frontier_pmids:
        return []

    # 1. Discover and Gate by Support
    support_counts = Counter()
    icite_fetch_missing(list(frontier_pmids))
    for pmid in frontier_pmids:
        candidates = ICACHE.get(pmid, {}).get("cited_by" if direction == 'D' else "references", [])
        support_counts.update(candidates)
    supported_pmids = {p for p, count in support_counts.items() if count >= CFG["SUPPORT_MIN"]}
    
    # 2. Apply the Semantic Cap if needed
    if len(supported_pmids) > CFG['GLOBAL_ABSOLUTE_CAP']:
        log.info(f"  > Capping {len(supported_pmids)} papers to {CFG['GLOBAL_ABSOLUTE_CAP']} using semantic score...")
        meta = get_full_meta(seed_pmids | supported_pmids)
        semantic_scores = _get_semantic_scores(seed_pmids, supported_pmids, meta)
        final_kept_pmids = sorted(list(supported_pmids), key=lambda p: semantic_scores.get(p, 0), reverse=True)[:CFG['GLOBAL_ABSOLUTE_CAP']]
    else:
        final_kept_pmids = list(supported_pmids)
    
    # 3. Log Top-5 Titles with their semantic scores
    if final_kept_pmids:
        log.info(f"  > Top 5 semantic picks for this step:")
        # We need scores for logging, even if we didn't cap.
        meta = get_full_meta(seed_pmids | set(final_kept_pmids))
        scores_for_logging = _get_semantic_scores(seed_pmids, set(final_kept_pmids), meta)
        
        sorted_for_logging = sorted(final_kept_pmids, key=lambda p: scores_for_logging.get(p, 0), reverse=True)
        for i, pmid in enumerate(sorted_for_logging[:5]):
            title = meta.get(pmid, {}).get('title', 'N/A')
            score = scores_for_logging.get(pmid, 0.0)
            log.info(f"    {i+1}. [{pmid}] (Score: {score:.3f}) {title[:85]}")
            
    return final_kept_pmids

def run_overhauled_folding_benchmark():
    """
    Main function to run the D->U and U->D benchmarks with semantic capping and full network analysis.
    """
    try:
        with open(CFG["CLUSTERS_PATH"], "r") as f:
            clusters = {int(k): v for k, v in json.load(f).items()}
    except FileNotFoundError:
        log.error(f"FATAL: Clusters file not found at '{CFG['CLUSTERS_PATH']}'.")
        return

    cluster_ids_to_run = [8, 4, 18, 16, 13]
    log.info(f"=== Starting OVERHAULED Folding Benchmark for clusters: {cluster_ids_to_run} ===")

    for cid in cluster_ids_to_run:
        seed_pmids = set(clusters.get(cid, []))
        if not seed_pmids: continue

        print("\n" + "="*80)
        print(f"               Cluster {cid} (Seeds: {len(seed_pmids)})")
        print("="*80)

        log.info(f"[{cid}] Performing D->U fold...")
        d_set = set(_perform_fold_step_with_semantic_cap(seed_pmids, seed_pmids, 'D'))
        du_set = set(_perform_fold_step_with_semantic_cap(seed_pmids, d_set, 'U'))

        log.info(f"[{cid}] Performing U->D fold...")
        u_set = set(_perform_fold_step_with_semantic_cap(seed_pmids, seed_pmids, 'U'))
        ud_set = set(_perform_fold_step_with_semantic_cap(seed_pmids, u_set, 'D'))
        
        # --- Create and Analyze the Complete Augmented Network ---
        augmented_set = seed_pmids | d_set | u_set | du_set | ud_set
        
        all_pmids_in_play = augmented_set
        log.info(f"[{cid}] Fetching metadata for {len(all_pmids_in_play)} unique PMIDs for final analysis...")
        full_meta = get_full_meta(list(all_pmids_in_play))

        sets_to_analyze = {
            'S (Original Seeds)': seed_pmids,
            'D (Downstream)': d_set,
            'U (Upstream)': u_set,
            'D->U (Folded)': du_set,
            'U->D (Folded)': ud_set,
            'Augmented Set (All)': augmented_set, # The complete network
        }
        
        results = []
        for name, pmid_set in sets_to_analyze.items():
            if not pmid_set:
                results.append({'Set': name, 'Count': 0, 'Cos (vs S)': np.nan, 'Drift (vs S)': np.nan, 'Cohesion (Internal)': np.nan})
                continue
            
            metrics_vs_s = tfidf_metrics(list(seed_pmids), list(pmid_set), full_meta)
            # For the seed set, cohesion is internal. For others, cohesion is vs seeds.
            internal_metrics = tfidf_metrics(list(pmid_set), list(pmid_set), full_meta)
            
            is_seed = name == 'S (Original Seeds)'
            results.append({
                'Set': name, 'Count': len(pmid_set),
                'Cos (vs S)': metrics_vs_s['cos_seed_centroid_med'] if not is_seed else 1.0,
                'Drift (vs S)': metrics_vs_s['centroid_drift'] if not is_seed else 1.0,
                'Cohesion (Internal)': internal_metrics['cohesion']
            })
            
        print("\n--- Final Semantic Geometry ---\n")
        print(pd.DataFrame(results).set_index('Set').to_string(float_format="%.4f"))

    log.info("=== OVERHAULED Folding Benchmark Complete ===")

# --- Run the new, overhauled benchmark ---
run_overhauled_folding_benchmark()

[14:25:30] INFO: SEMANTIC CAP ENABLED: Ripples capped at 100 using cosine similarity to seeds.
[14:25:30] INFO: === Starting OVERHAULED Folding Benchmark for clusters: [8, 4, 18, 16, 13] ===
[14:25:30] INFO: [8] Performing D->U fold...
[14:25:30] INFO:   > Top 5 semantic picks for this step:
[14:25:30] INFO:     1. [29438271] (Score: 0.607) Efficacy of misoprostol before diagnostic hysteroscopy in postmenopausal women: a ran
[14:25:30] INFO:     2. [30907209] (Score: 0.564) Effectiveness of misoprostol administration for cervical ripening in women before ope
[14:25:30] INFO:     3. [28109045] (Score: 0.494) Misoprostol for cervical priming prior to hysteroscopy in postmenopausal and premenop
[14:25:30] INFO:     4. [34695232] (Score: 0.464) Comparison between 200 μg and 800 μg of vaginal misoprostol for cervical ripening bef
[14:25:30] INFO:     5. [28069480] (Score: 0.441) Different Routes of Misoprostol for Same-Day Cervical Priming Prior to Operative Hyst
[14:25:30] INFO:   > Cappin


               Cluster 8 (Seeds: 12)


[14:25:31] INFO: [4] Performing D->U fold...
[14:25:31] INFO:   > Top 5 semantic picks for this step:
[14:25:31] INFO:     1. [31006483] (Score: 0.568) [Ultrasound assessment of gastric antrum in term pregnant women before elective cesar
[14:25:31] INFO:     2. [30052550] (Score: 0.452) Gastric Ultrasound for the Regional Anesthesiologist and Pain Specialist.
[14:25:31] INFO:     3. [32693329] (Score: 0.354) Gastric point-of-care ultrasound (PoCUS) during pregnancy and the postpartum period: 
[14:25:31] INFO:     4. [29265187] (Score: 0.333) Gastric ultrasound in the third trimester of pregnancy: a randomised controlled trial
[14:25:31] INFO:     5. [35130855] (Score: 0.295) Prevalence of risk stomach in laboring women allowed to unrestrictive oral intake: a 
[14:25:31] INFO:   > Capping 122 papers to 100 using semantic score...
[14:25:31] INFO:   > Top 5 semantic picks for this step:
[14:25:31] INFO:     1. [31006483] (Score: 0.580) [Ultrasound assessment of gastric antrum in term pre


--- Final Semantic Geometry ---

                     Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                      
S (Original Seeds)      12      1.0000        1.0000               0.1682
D (Downstream)          23      0.3167        0.7749               0.1297
U (Upstream)            45      0.3615        0.8144               0.1559
D->U (Folded)          100      0.2919        0.8213               0.1234
U->D (Folded)          100      0.3284        0.8460               0.1461
Augmented Set (All)    164      0.2886        0.8415               0.1144

               Cluster 4 (Seeds: 8)


[14:25:31] INFO:   > Top 5 semantic picks for this step:
[14:25:31] INFO:     1. [31006483] (Score: 0.560) [Ultrasound assessment of gastric antrum in term pregnant women before elective cesar
[14:25:31] INFO:     2. [27259094] (Score: 0.476) Determination of a cut-off value of antral area measured in the supine position for t
[14:25:31] INFO:     3. [27561371] (Score: 0.467) Changes in qualitative and quantitative ultrasound assessment of the gastric antrum b
[14:25:31] INFO:     4. [37641782] (Score: 0.457) Ultrasonographic Assessment of Gastric Volume in Fasted Patients Undergoing Gastroint
[14:25:31] INFO:     5. [32171605] (Score: 0.457) Ultrasound to guide the individual medical decision by evaluating the gastric content
[14:25:31] INFO: [4] Fetching metadata for 186 unique PMIDs for final analysis...
[14:25:31] INFO: [18] Performing D->U fold...
[14:25:31] INFO:   > Top 5 semantic picks for this step:
[14:25:31] INFO:     1. [34404473] (Score: 0.658) Chlorhexidine-alcohol versus


--- Final Semantic Geometry ---

                     Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                      
S (Original Seeds)       8      1.0000        1.0000               0.1349
D (Downstream)          16      0.2321        0.6487               0.1108
U (Upstream)            48      0.2037        0.6729               0.0884
D->U (Folded)          100      0.1799        0.7117               0.0722
U->D (Folded)          100      0.2577        0.7342               0.1278
Augmented Set (All)    186      0.2325        0.7350               0.0832

               Cluster 18 (Seeds: 8)


[14:25:31] INFO:   > Top 5 semantic picks for this step:
[14:25:31] INFO:     1. [28599898] (Score: 0.681) A randomized open-label controlled trial of chlorhexidine-alcohol vs povidone-iodine 
[14:25:31] INFO:     2. [26551196] (Score: 0.634) Skin Preparation for Prevention of Surgical Site Infection After Cesarean Delivery: A
[14:25:31] INFO:     3. [34404473] (Score: 0.625) Chlorhexidine-alcohol versus povidone-iodine as preoperative skin antisepsis for prev
[14:25:31] INFO:     4. [30021481] (Score: 0.596) Chlorhexidine-alcohol versus povidone-iodine for skin preparation before elective ces
[14:25:31] INFO:     5. [34942679] (Score: 0.588) Skin preparation for prevention of surgical site infection after obstetrics and gynec
[14:25:31] INFO: [18] Fetching metadata for 173 unique PMIDs for final analysis...
[14:25:32] INFO: [16] Performing D->U fold...
[14:25:32] INFO:   > Top 5 semantic picks for this step:
[14:25:32] INFO:     1. [32790892] (Score: 0.679) Erythropoietin plus iron ve


--- Final Semantic Geometry ---

                     Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                      
S (Original Seeds)       8      1.0000        1.0000               0.2704
D (Downstream)          33      0.2374        0.7154               0.1188
U (Upstream)            20      0.3342        0.7720               0.1462
D->U (Folded)          100      0.2745        0.8228               0.1231
U->D (Folded)          100      0.2844        0.8172               0.1232
Augmented Set (All)    173      0.2515        0.8207               0.1023

               Cluster 16 (Seeds: 5)


[14:25:32] INFO:   > Top 5 semantic picks for this step:
[14:25:32] INFO:     1. [31811820] (Score: 0.749) Iron therapy for preoperative anaemia.
[14:25:32] INFO:     2. [26694949] (Score: 0.679) Iron therapy for pre-operative anaemia.
[14:25:32] INFO:     3. [32790892] (Score: 0.638) Erythropoietin plus iron versus control treatment including placebo or iron for preop
[14:25:32] INFO:     4. [29944518] (Score: 0.469) Treating Anemia in the Preanesthesia Assessment Clinic: Results of a Retrospective Ev
[14:25:32] INFO:     5. [25550190] (Score: 0.427) Iron therapy in anaemic adults without chronic kidney disease.
[14:25:32] INFO: [16] Fetching metadata for 214 unique PMIDs for final analysis...
[14:25:33] INFO: [13] Performing D->U fold...
[14:25:33] INFO:   > Top 5 semantic picks for this step:
[14:25:33] INFO: Fetching PubMed metadata for 1 PMIDs...



--- Final Semantic Geometry ---

                     Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                      
S (Original Seeds)       5      1.0000        1.0000               0.2483
D (Downstream)          20      0.2252        0.5977               0.0980
U (Upstream)            34      0.1990        0.5851               0.1086
D->U (Folded)          100      0.2335        0.7037               0.1138
U->D (Folded)          100      0.2937        0.7476               0.1571
Augmented Set (All)    214      0.2539        0.7173               0.1122

               Cluster 13 (Seeds: 31)


[14:25:33] INFO:     1. [39615611] (Score: 0.369) Preoperative risk stratification of early-stage endometrial cancer assessed by multim
[14:25:33] INFO:     2. [35656849] (Score: 0.325) Three-dimensional transvaginal ultrasound vs magnetic resonance imaging for preoperat
[14:25:33] INFO:     3. [31186375] (Score: 0.304) Diagnostic Accuracy of Clinical Biomarkers for Preoperative Prediction of Lymph Node 
[14:25:33] INFO:     4. [40215804] (Score: 0.272) Diagnostic accuracy of TVUS and MRI in the preoperative evaluation of myometrial infi
[14:25:33] INFO:     5. [37068415] (Score: 0.259) A nomogram for preoperative risk stratification based on MRI morphological parameters
[14:25:33] INFO:   > Capping 253 papers to 100 using semantic score...
[14:25:33] INFO:   > Top 5 semantic picks for this step:
[14:25:33] INFO:     1. [27654258] (Score: 0.520) Predicting Model of Lymph Node Metastasis Using Preoperative Tumor Grade, Transvagina
[14:25:33] INFO:     2. [34257578] (Score: 0.396) Clinic


--- Final Semantic Geometry ---

                     Count  Cos (vs S)  Drift (vs S)  Cohesion (Internal)
Set                                                                      
S (Original Seeds)      31      1.0000        1.0000               0.0945
D (Downstream)          19      0.2107        0.5697               0.0866
U (Upstream)            98      0.2305        0.7339               0.0817
D->U (Folded)          100      0.2404        0.7636               0.0974
U->D (Folded)          100      0.3192        0.8230               0.1417
Augmented Set (All)    290      0.2533        0.8486               0.0873


In [4]:
# %% [DEFINITIVE - Deep Ripple with Full Detailed Reporting]
# This definitive version restores full visibility by logging the Top-5 titles and
# their semantic scores at every step of the ripple, providing the crucial
# qualitative context needed to interpret the quantitative metrics.

import os
import time
import json
import logging
from collections import Counter
from typing import Dict, List, Any
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from sklearn.feature_extraction.text import TfidfVectorizer

# =================================================================================
# >> CONFIGURATION <<
# =================================================================================
CFG = dict(
    CLUSTERS_PATH="clusters_snapshot.json",
    ENTREZ_EMAIL="you@example.com",
    NCBI_API_KEY=os.environ.get("NCBI_API_KEY"),
    SUPPORT_MIN=2,
    GLOBAL_ABSOLUTE_CAP=100,
    MAX_DEPTH=25,
    WRITE_CSV=True,
    OUT_PREFIX="definitive_downstream_results",
)

# =================================================================================
# Core Functions (Logging, Caching, Fetching, Semantic Scoring)
# These are unchanged.
# =================================================================================
def _setup_logging():
    fmt = "[%(asctime)s] %(levelname)s: %(message)s"; datefmt = "%H:%M:%S"
    logging.basicConfig(level=logging.INFO, format=fmt, datefmt=datefmt, force=True)
    return logging.getLogger("DefinitiveRipple")
log = _setup_logging()
ICACHE: Dict[int, Dict[str, Any]] = {}; TCACHE: Dict[int, Dict[str, Any]] = {}
def esummary_fetch_missing(pmids: List[int]):
    pmids_to_fetch = [p for p in set(pmids) if p not in TCACHE or 'pubdate' not in TCACHE[p]]
    if not pmids_to_fetch: return
    log.debug(f"Fetching full metadata for {len(pmids_to_fetch)} PMIDs via ESummary...")
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {"db": "pubmed", "retmode": "json", "email": CFG["ENTREZ_EMAIL"]}
    if CFG["NCBI_API_KEY"]: params["api_key"] = CFG["NCBI_API_KEY"]
    try:
        r = requests.post(base_url, data={"id": ",".join(map(str, pmids_to_fetch))}, params=params, timeout=90)
        r.raise_for_status(); data = r.json()
        for pmid_str, rec in data.get('result', {}).items():
            if pmid_str == 'uids': continue
            pmid = int(pmid_str)
            pubdate_str = rec.get('pubdate', ''); dt = None
            try: dt = datetime.strptime(pubdate_str, '%Y %b %d')
            except ValueError:
                try: dt = datetime.strptime(pubdate_str, '%Y %b')
                except ValueError:
                    try: dt = datetime.strptime(pubdate_str, '%Y')
                    except ValueError: dt = None
            TCACHE[pmid] = {"title": rec.get('title', ''), "pubdate": dt}
    except requests.RequestException as e: log.error(f"ESummary request failed: {e}")
def icite_fetch_missing(pmids: List[int]):
    pmids_to_fetch = [p for p in set(pmids) if p not in ICACHE]
    if not pmids_to_fetch: return
    log.debug(f"Fetching citation links for {len(pmids_to_fetch)} PMIDs via iCite...")
    for i in range(0, len(pmids_to_fetch), 200):
        sub_list = pmids_to_fetch[i:i + 200]
        try:
            r = requests.get("https://icite.od.nih.gov/api/pubs", params={"pmids": ",".join(map(str, sub_list)), "format": "json"}, timeout=90)
            r.raise_for_status()
            for rec in r.json().get("data", []):
                if rec.get("pmid"): ICACHE[rec["pmid"]] = {"cited_by": rec.get("cited_by", [])}
        except requests.RequestException as e: log.error(f"iCite request failed: {e}")
def get_full_meta(pmids: List[int]) -> Dict[int, Dict[str, Any]]:
    esummary_fetch_missing(pmids); icite_fetch_missing(pmids)
    return {pmid: {**TCACHE.get(pmid, {}), **ICACHE.get(pmid, {})} for pmid in set(pmids)}
def _get_semantic_scores(seed_pmids: set, candidate_pmids: set, meta: dict) -> dict:
    if not seed_pmids or not candidate_pmids: return {}
    try:
        # Using only titles for scoring is faster and often sufficient
        seed_texts = [meta.get(p, {}).get('title','') for p in seed_pmids]
        candidate_texts = [meta.get(p, {}).get('title','') for p in candidate_pmids]
        vectorizer = TfidfVectorizer(stop_words='english', max_features=50000, lowercase=True)
        X = vectorizer.fit_transform(seed_texts + candidate_texts)
        X_seed, X_candidates = X[:len(seed_texts)], X[len(seed_texts):]
        seed_centroid = np.asarray(X_seed.mean(axis=0)); norm = np.linalg.norm(seed_centroid)
        if norm > 0: seed_centroid /= norm
        scores = X_candidates.dot(seed_centroid.T).flatten()
        return dict(zip(candidate_pmids, scores))
    except Exception: return {pmid: 0.0 for pmid in candidate_pmids}
def _get_main_metrics(seed_pmids: List[int], kept_pmids: List[int], meta: Dict[int, Dict[str, Any]]) -> Dict[str, Any]:
    if not kept_pmids: return {"cohesion": np.nan, "median_age_years": np.nan}
    now = datetime.now()
    ages = [(now - meta.get(p, {}).get('pubdate')).days / 365.25 for p in kept_pmids if meta.get(p, {}).get('pubdate')]
    median_age = np.median(ages) if ages else np.nan
    try:
        # Cohesion is calculated on titles only for speed and consistency with the sampler
        all_texts = [meta.get(p, {}).get('title', '') for p in kept_pmids]
        if len(set(all_texts)) < 2: # Handle cases where all titles are identical
            return {"cohesion": 1.0 if len(all_texts) >=2 else np.nan, "median_age_years": median_age}
        vectorizer = TfidfVectorizer(stop_words='english', max_features=50000)
        X_kept = vectorizer.fit_transform(all_texts)
        n = X_kept.shape[0]
        cohesion = (np.dot(X_kept.sum(axis=0), X_kept.sum(axis=0).T)[0,0] - n) / (n * (n - 1)) if n >= 2 else np.nan
    except Exception: cohesion = np.nan
    return {"cohesion": cohesion, "median_age_years": median_age}

# =================================================================================
# Main Ripple Logic with Detailed Reporting
# =================================================================================
def run_downstream_for_cluster(cluster_id: int, seed_pmids: List[int]):
    log.info(f"--- Starting ripple for Cluster {cluster_id} (seeds={len(seed_pmids)}) ---")
    visited, frontier, history, depth = set(seed_pmids), set(seed_pmids), [], 0
    
    while frontier and depth < CFG["MAX_DEPTH"]:
        depth += 1
        log.info(f"  > Processing Depth {depth} (Frontier size: {len(frontier)})...")
        support_counts = Counter()
        icite_fetch_missing(list(frontier))
        for pmid in frontier:
            support_counts.update(ICACHE.get(pmid, {}).get("cited_by", []))
        
        discovered = {p for p, count in support_counts.items() if p not in visited and count >= CFG["SUPPORT_MIN"]}
        
        full_meta = get_full_meta(seed_pmids + list(discovered))
        scores = _get_semantic_scores(set(seed_pmids), discovered, full_meta)

        if len(discovered) > CFG['GLOBAL_ABSOLUTE_CAP']:
            kept = sorted(list(discovered), key=lambda p: scores.get(p, 0), reverse=True)[:CFG['GLOBAL_ABSOLUTE_CAP']]
        else:
            kept = list(discovered)

        # --- RESTORED: Detailed Title Logging ---
        if kept:
            log.info(f"    Top 5 semantic picks for this layer:")
            sorted_for_logging = sorted(kept, key=lambda p: scores.get(p, 0.0), reverse=True)
            for i, pmid in enumerate(sorted_for_logging[:5]):
                title = full_meta.get(pmid, {}).get('title', 'N/A')
                score = scores.get(pmid, 0.0)
                log.info(f"      {i+1}. [{pmid}] (Score: {score:.3f}) {title[:85]}")

        metrics = _get_main_metrics(seed_pmids, kept, full_meta)
        
        history.append({
            "depth": depth, "frontier_size": len(frontier), "discovered": len(discovered),
            "kept": len(kept), "median_age_years": metrics['median_age_years'], "cohesion": metrics['cohesion'],
        })

        if not kept:
            log.info(f"  > Ripple for Cluster {cluster_id} halted at depth {depth}.")
            break
        visited.update(kept); frontier = set(kept)

    depth_df = pd.DataFrame(history)
    print("\n" + "-"*80); print(f"Cluster {cluster_id} - Detailed Ripple History"); print("-"*(80))
    if not depth_df.empty:
        print(depth_df.to_string(index=False, float_format="%.2f"))
    else:
        print("(No ripple activity detected for this cluster)")
    
    summary = {
        "cluster_id": cluster_id, "seeds": len(seed_pmids), "total_kept": int(depth_df["kept"].sum()),
        "depth_reached": depth - (1 if not kept else 0), "kept_d1": int(depth_df[depth_df.depth == 1]['kept'].sum()),
        "final_cohesion": float(depth_df['cohesion'].dropna().tail(1).iloc[0]) if not depth_df['cohesion'].dropna().empty else np.nan,
        "final_median_age": float(depth_df['median_age_years'].dropna().tail(1).iloc[0]) if not depth_df['median_age_years'].dropna().empty else np.nan,
    }
    return summary, depth_df

# =================================================================================
# Main Execution Block
# =================================================================================
def run_all_clusters_definitive_analysis():
    try:
        with open(CFG["CLUSTERS_PATH"], "r") as f:
            clusters = {int(k): v for k, v in json.load(f).items()}
        log.info(f"Successfully loaded {len(clusters)} clusters.")
    except Exception as e:
        log.error(f"FATAL: Could not load clusters file. Error: {e}"); return

    all_summaries, all_depths_dfs = [], []
    for cid, pmids in sorted(clusters.items()):
        if not pmids: continue
        summary, depth_df = run_downstream_for_cluster(cid, pmids)
        all_summaries.append(summary)
        if not depth_df.empty: all_depths_dfs.append(depth_df)

    if not all_summaries: log.warning("Analysis finished with no results."); return

    summaries_df = pd.DataFrame(all_summaries).fillna(np.nan)
    summaries_df = summaries_df.sort_values(by=['depth_reached', 'total_kept'], ascending=[False, False])
    
    print("\n\n" + "="*80); print(" " * 28 + "FINAL RIPPLE SUMMARY"); print("="*80)
    print(summaries_df.to_string(index=False, float_format="%.2f"))

    if CFG["WRITE_CSV"]:
        summaries_df.to_csv(f"{CFG['OUT_PREFIX']}_summary.csv", index=False)
        if all_depths_dfs:
            pd.concat(all_depths_dfs).to_csv(f"{CFG['OUT_PREFIX']}_all_depths.csv", index=False)
        log.info(f"Results saved to '{CFG['OUT_PREFIX']}_summary.csv' and '{CFG['OUT_PREFIX']}_all_depths.csv'")

    log.info("Definitive deep ripple analysis complete.")

# --- Run the definitive analysis ---
run_all_clusters_definitive_analysis()

[15:18:47] INFO: Successfully loaded 19 clusters.
[15:18:47] INFO: --- Starting ripple for Cluster 0 (seeds=4) ---
[15:18:47] INFO:   > Processing Depth 1 (Frontier size: 4)...
[15:18:48] INFO:   > Ripple for Cluster 0 halted at depth 1.
[15:18:48] INFO: --- Starting ripple for Cluster 1 (seeds=4) ---
[15:18:48] INFO:   > Processing Depth 1 (Frontier size: 4)...



--------------------------------------------------------------------------------
Cluster 0 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  frontier_size  discovered  kept  median_age_years  cohesion
     1              4           0     0               NaN       NaN


[15:18:49] INFO:   > Ripple for Cluster 1 halted at depth 1.
[15:18:49] INFO: --- Starting ripple for Cluster 2 (seeds=6) ---
[15:18:49] INFO:   > Processing Depth 1 (Frontier size: 6)...



--------------------------------------------------------------------------------
Cluster 1 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  frontier_size  discovered  kept  median_age_years  cohesion
     1              4           0     0               NaN       NaN


[15:18:50] INFO:     Top 5 semantic picks for this layer:
[15:18:50] INFO:       1. [39983885] (Score: 0.280) Gestational age at birth varies by surgical technique in prenatal open spina bifida r
[15:18:50] INFO:       2. [36569393] (Score: 0.206) The Role of Fetal Brain Magnetic Resonance Imaging in Current Fetal Medicine.
[15:18:50] INFO:       3. [39737688] (Score: 0.123) T(2)* relaxometry of fetal brain structures using low-field (0.55T) MRI.
[15:18:50] INFO:   > Processing Depth 2 (Frontier size: 3)...
[15:18:50] INFO:   > Ripple for Cluster 2 halted at depth 2.
[15:18:50] INFO: --- Starting ripple for Cluster 3 (seeds=6) ---
[15:18:50] INFO:   > Processing Depth 1 (Frontier size: 6)...



--------------------------------------------------------------------------------
Cluster 2 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  frontier_size  discovered  kept  median_age_years  cohesion
     1              6           3     3              0.31      0.07
     2              3           0     0               NaN       NaN


[15:18:51] INFO:     Top 5 semantic picks for this layer:
[15:18:51] INFO:       1. [37315892] (Score: 0.346) Successful Hysteroscopic Management of Cesarean Scar Defect Pregnancy.
[15:18:51] INFO:       2. [35016136] (Score: 0.310) Hysteroscopic treatment of Cesarean Scar Pregnancy: A systematic review.
[15:18:51] INFO:       3. [38383385] (Score: 0.172) The effectiveness of hysteroscopy for the treatment of cesarean scar pregnancy: a ret
[15:18:51] INFO:   > Processing Depth 2 (Frontier size: 3)...
[15:18:52] INFO:     Top 5 semantic picks for this layer:
[15:18:52] INFO:       1. [38485053] (Score: 0.162) Efficacy and safety of treatment modalities for cesarean scar pregnancy: a systematic
[15:18:52] INFO:   > Processing Depth 3 (Frontier size: 1)...
[15:18:52] INFO:   > Ripple for Cluster 3 halted at depth 3.
[15:18:52] INFO: --- Starting ripple for Cluster 4 (seeds=8) ---
[15:18:52] INFO:   > Processing Depth 1 (Frontier size: 8)...



--------------------------------------------------------------------------------
Cluster 3 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  frontier_size  discovered  kept  median_age_years  cohesion
     1              6           3     3              1.90      0.29
     2              3           1     1              1.06       NaN
     3              1           0     0               NaN       NaN


[15:18:54] INFO:     Top 5 semantic picks for this layer:
[15:18:54] INFO:       1. [40399137] (Score: 0.286) Comparison of gastric volumes using ultrasound in term pregnant women with and withou
[15:18:54] INFO:       2. [29230709] (Score: 0.283) Point-of-care gastric ultrasound and aspiration risk assessment: a narrative review.
[15:18:54] INFO:       3. [30579409] (Score: 0.254) Term pregnant patients have similar gastric volume to non-pregnant females: a single-
[15:18:54] INFO:       4. [34890858] (Score: 0.251) Ultrasound assessment of gastric contents prior to placental delivery: A prospective 
[15:18:54] INFO:       5. [40250977] (Score: 0.222) ASRA pain medicine narrative review and expert practice recommendations for gastric p
[15:18:54] INFO:   > Processing Depth 2 (Frontier size: 14)...
[15:18:55] INFO:     Top 5 semantic picks for this layer:
[15:18:55] INFO:       1. [38290374] (Score: 0.348) Diagnostic accuracy of a simple qualitative ultrasound assessment for the diagno


--------------------------------------------------------------------------------
Cluster 4 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  frontier_size  discovered  kept  median_age_years  cohesion
     1              8          14    14              4.27      0.09
     2             14          37    37              1.73      0.05
     3             37          15    15              1.44      0.08
     4             15           0     0               NaN       NaN


[15:18:57] INFO:   > Ripple for Cluster 5 halted at depth 1.
[15:18:57] INFO: --- Starting ripple for Cluster 6 (seeds=11) ---
[15:18:57] INFO:   > Processing Depth 1 (Frontier size: 11)...



--------------------------------------------------------------------------------
Cluster 5 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  frontier_size  discovered  kept  median_age_years  cohesion
     1              4           0     0               NaN       NaN


[15:18:58] INFO:     Top 5 semantic picks for this layer:
[15:18:58] INFO:       1. [37568381] (Score: 0.422) Preoperative Oral Carbohydrate (CHO) Supplementation Is Beneficial for Clinical and B
[15:18:58] INFO:       2. [35882375] (Score: 0.319) The effect of preoperative oral carbohydrate on the time to colostrum and amount of v
[15:18:58] INFO:       3. [38586152] (Score: 0.302) Effects of Preoperative Oral Carbohydrate on Perioperative Maternal Outcomes Undergoi
[15:18:58] INFO:       4. [33403738] (Score: 0.295) Safety and feasibility of oral carbohydrate consumption before cesarean delivery on p
[15:18:58] INFO:       5. [36505254] (Score: 0.198) Effects of preoperative carbohydrate loading on recovery after elective surgery: A sy
[15:18:58] INFO:   > Processing Depth 2 (Frontier size: 11)...
[15:18:59] INFO:     Top 5 semantic picks for this layer:
[15:18:59] INFO:       1. [40481396] (Score: 0.251) Clinical value of preoperative oral carbohydrate loading in patients with diabe


--------------------------------------------------------------------------------
Cluster 6 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  frontier_size  discovered  kept  median_age_years  cohesion
     1             11          11    11              1.65      0.09
     2             11          11    11              0.69      0.06
     3             11           1     1              0.73       NaN
     4              1           0     0               NaN       NaN


[15:19:01] INFO:     Top 5 semantic picks for this layer:
[15:19:01] INFO:       1. [36239236] (Score: 0.503) Preoperative sonographic sliding sign for prediction of intra-abdominal adhesions bef
[15:19:01] INFO:       2. [39533845] (Score: 0.334) Transabdominal sonographic sliding signs for preoperative prediction of dense intra-a
[15:19:01] INFO:       3. [38679772] (Score: 0.307) Prediction of Intraperitoneal Adhesions in Repeated Cesarean Deliveries with Stria Gr
[15:19:01] INFO:       4. [37302234] (Score: 0.127) Prediction of intraperitoneal adhesions in repeated cesarean sections: A Systematic r
[15:19:01] INFO:       5. [38549181] (Score: 0.100) V-EMF therapy: A new painless and completely non-invasive treatment for striae gravid
[15:19:01] INFO:   > Processing Depth 2 (Frontier size: 5)...
[15:19:02] INFO:     Top 5 semantic picks for this layer:
[15:19:02] INFO:       1. [40230504] (Score: 0.096) Ischemic fallopian tube necrosis with hydatid of Morgagni secondary to post‑caes


--------------------------------------------------------------------------------
Cluster 7 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  frontier_size  discovered  kept  median_age_years  cohesion
     1              4           5     5              1.40      0.15
     2              5           1     1               NaN       NaN
     3              1           0     0               NaN       NaN


[15:19:03] INFO:     Top 5 semantic picks for this layer:
[15:19:03] INFO:       1. [30907209] (Score: 0.519) Effectiveness of misoprostol administration for cervical ripening in women before ope
[15:19:03] INFO:       2. [29542270] (Score: 0.489) Re: Misoprostol for cervical priming prior to hysteroscopy in postmenopausal and prem
[15:19:03] INFO:       3. [29542228] (Score: 0.370) Authors' reply re: Misoprostol for cervical priming prior to hysteroscopy in postmeno
[15:19:03] INFO:       4. [38034111] (Score: 0.365) Comparison of Efficacy of Vaginal Misoprostol versus a Synthetic Osmotic Dilator (Dil
[15:19:03] INFO:       5. [28069480] (Score: 0.342) Different Routes of Misoprostol for Same-Day Cervical Priming Prior to Operative Hyst
[15:19:03] INFO:   > Processing Depth 2 (Frontier size: 19)...
[15:19:04] INFO:     Top 5 semantic picks for this layer:
[15:19:04] INFO:       1. [31689673] (Score: 0.445) Efficacy and safety of oral vs vaginal misoprostol for cervical priming before 


--------------------------------------------------------------------------------
Cluster 8 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  frontier_size  discovered  kept  median_age_years  cohesion
     1             12          19    19              3.64      0.09
     2             19          24    24              2.64      0.05
     3             24          23    23              0.48      0.05
     4             23           6     6              0.33      0.05
     5              6           0     0               NaN       NaN


[15:19:07] INFO:     Top 5 semantic picks for this layer:
[15:19:07] INFO:       1. [38218312] (Score: 0.391) Adjuvant misoprostol or mifepristone for cervical preparation with osmotic dilators b
[15:19:07] INFO:       2. [37678155] (Score: 0.352) Cervical preparation for second-trimester procedural abortion.
[15:19:07] INFO:       3. [34464634] (Score: 0.338) Mifepristone prior to osmotic dilators for dilation and evacuation cervical preparati
[15:19:07] INFO:       4. [33285100] (Score: 0.194) A single-blinded randomized controlled trial evaluating pain and opioid use after dil
[15:19:07] INFO:       5. [34323228] (Score: 0.179) Abortion Care Beyond 13 Weeks' Gestation: A Global Perspective.
[15:19:07] INFO:   > Processing Depth 2 (Frontier size: 7)...
[15:19:07] INFO:   > Ripple for Cluster 9 halted at depth 2.
[15:19:07] INFO: --- Starting ripple for Cluster 10 (seeds=6) ---
[15:19:07] INFO:   > Processing Depth 1 (Frontier size: 6)...



--------------------------------------------------------------------------------
Cluster 9 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  frontier_size  discovered  kept  median_age_years  cohesion
     1              4           7     7              3.48      0.10
     2              7           0     0               NaN       NaN


[15:19:08] INFO:     Top 5 semantic picks for this layer:
[15:19:08] INFO:       1. [33345816] (Score: 0.328) Comparison of 3 protocols for analgesia control after cesarean delivery: a randomized
[15:19:08] INFO:       2. [34548165] (Score: 0.295) Assessing efficacy of intravenous acetaminophen for perioperative pain control for oo
[15:19:08] INFO:       3. [30561704] (Score: 0.258) Intravenous vs Oral Acetaminophen for Analgesia After Cesarean Delivery: A Randomized
[15:19:08] INFO:       4. [31370298] (Score: 0.180) A Meta-Analysis of the Utility of Preoperative Intravenous Paracetamol for Post-Caesa
[15:19:08] INFO:       5. [30791974] (Score: 0.144) Anesthesia considerations and post-operative pain management in pregnant women with c
[15:19:08] INFO:   > Processing Depth 2 (Frontier size: 8)...
[15:19:11] INFO:     Top 5 semantic picks for this layer:
[15:19:11] INFO:       1. [32039926] (Score: 0.234) Optimal pain management for cesarean delivery.
[15:19:11] INFO:       2. [389514


--------------------------------------------------------------------------------
Cluster 10 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  frontier_size  discovered  kept  median_age_years  cohesion
     1              6           8     8              5.94      0.12
     2              8           3     3              1.27      0.11
     3              3           0     0               NaN       NaN


[15:19:12] INFO:     Top 5 semantic picks for this layer:
[15:19:12] INFO:       1. [39916685] (Score: 0.260) General anaesthesia for nonobstetric surgery during pregnancy: A narrative review.
[15:19:12] INFO:       2. [38706246] (Score: 0.208) Effectiveness of preoperative multimedia educational sessions on the levels of anxiet
[15:19:12] INFO:       3. [38167236] (Score: 0.206) Informational video on preoperative anxiety and postoperative satisfaction prior to e
[15:19:12] INFO:       4. [39598290] (Score: 0.090) Laparoscopic Cholecystectomy Under Combined Spinal and Epidural Anesthesia in the Fir
[15:19:12] INFO:       5. [24276372] (Score: 0.024) Removal of tracheal tube following unsuccessful resuscitation: a survey of current pr
[15:19:12] INFO:   > Processing Depth 2 (Frontier size: 5)...
[15:19:12] INFO:   > Ripple for Cluster 11 halted at depth 2.
[15:19:12] INFO: --- Starting ripple for Cluster 12 (seeds=4) ---
[15:19:12] INFO:   > Processing Depth 1 (Frontier size: 4)...



--------------------------------------------------------------------------------
Cluster 11 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  frontier_size  discovered  kept  median_age_years  cohesion
     1             12           5     5              2.52      0.07
     2              5           0     0               NaN       NaN


[15:19:13] INFO:     Top 5 semantic picks for this layer:
[15:19:13] INFO:       1. [38243910] (Score: 0.066) Evaluation of preoperative ultrasound signs associated with bladder injury during com
[15:19:13] INFO:       2. [37592837] (Score: 0.000) Placental lakes vs lacunae: spot the differences.
[15:19:13] INFO:   > Processing Depth 2 (Frontier size: 2)...
[15:19:14] INFO:     Top 5 semantic picks for this layer:
[15:19:14] INFO:       1. [40473652] (Score: 0.398) Placenta accreta spectrum.
[15:19:14] INFO:       2. [39676233] (Score: 0.147) Exploring pathophysiological insights to improve diagnostic utility of ultrasound mar
[15:19:14] INFO:   > Processing Depth 3 (Frontier size: 2)...
[15:19:14] INFO:   > Ripple for Cluster 12 halted at depth 3.
[15:19:14] INFO: --- Starting ripple for Cluster 13 (seeds=31) ---
[15:19:14] INFO:   > Processing Depth 1 (Frontier size: 31)...



--------------------------------------------------------------------------------
Cluster 12 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  frontier_size  discovered  kept  median_age_years  cohesion
     1              4           2     2              1.39      0.00
     2              2           2     2              0.43      0.34
     3              2           0     0               NaN       NaN


[15:19:16] INFO:     Top 5 semantic picks for this layer:
[15:19:16] INFO:       1. [35656849] (Score: 0.392) Three-dimensional transvaginal ultrasound vs magnetic resonance imaging for preoperat
[15:19:16] INFO:       2. [39615611] (Score: 0.325) Preoperative risk stratification of early-stage endometrial cancer assessed by multim
[15:19:16] INFO:       3. [31186375] (Score: 0.292) Diagnostic Accuracy of Clinical Biomarkers for Preoperative Prediction of Lymph Node 
[15:19:16] INFO:       4. [39364314] (Score: 0.291) Preoperative prediction of lymph node metastasis in endometrial cancer patients via a
[15:19:16] INFO:       5. [40215804] (Score: 0.279) Diagnostic accuracy of TVUS and MRI in the preoperative evaluation of myometrial infi
[15:19:16] INFO:   > Processing Depth 2 (Frontier size: 19)...
[15:19:17] INFO:     Top 5 semantic picks for this layer:
[15:19:17] INFO:       1. [38473269] (Score: 0.326) Diagnostic Accuracy of Transvaginal Ultrasound and Magnetic Resonance Imaging f


--------------------------------------------------------------------------------
Cluster 13 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  frontier_size  discovered  kept  median_age_years  cohesion
     1             31          19    19              2.36      0.09
     2             19          13    13              1.48      0.09
     3             13           2     2              0.40      0.06
     4              2           0     0               NaN       NaN


[15:19:20] INFO:     Top 5 semantic picks for this layer:
[15:19:20] INFO:       1. [37311484] (Score: 0.322) Tranexamic acid for the prevention of blood loss after cesarean section: an updated s
[15:19:20] INFO:       2. [39652279] (Score: 0.245) Meta-analysis: the prophylactic use of tranexamic acid to reduce blood loss during ca
[15:19:20] INFO:       3. [38453797] (Score: 0.215) Prophylactic tranexamic acid in Cesarean delivery: an updated meta-analysis with a tr
[15:19:20] INFO:       4. [36436014] (Score: 0.205) Use of tranexamic acid in decreasing blood loss during and after delivery among women
[15:19:20] INFO:       5. [38001439] (Score: 0.036) Efficacy and safety of tranexamic acid in prevention of postpartum hemorrhage: a syst
[15:19:20] INFO:   > Processing Depth 2 (Frontier size: 5)...
[15:19:21] INFO:     Top 5 semantic picks for this layer:
[15:19:21] INFO:       1. [39535297] (Score: 0.163) Tranexamic acid for preventing postpartum haemorrhage after caesarean section.
[


--------------------------------------------------------------------------------
Cluster 14 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  frontier_size  discovered  kept  median_age_years  cohesion
     1              5           5     5              1.75      0.28
     2              5           3     3              0.73      0.27
     3              3           4     4              0.19      0.02
     4              4           0     0               NaN       NaN


[15:19:24] INFO:     Top 5 semantic picks for this layer:
[15:19:24] INFO:       1. [33663536] (Score: 0.382) The effect prophylactic internal iliac artery balloon occlusion in patients with plac
[15:19:24] INFO:       2. [32550143] (Score: 0.073) The role of interventional radiology in the management of abnormally invasive placent
[15:19:24] INFO:   > Processing Depth 2 (Frontier size: 2)...
[15:19:24] INFO:     Top 5 semantic picks for this layer:
[15:19:24] INFO:       1. [34282489] (Score: 0.316) Prophylactic Intraoperative Uterine Artery Embolization During Cesarean Section or Ce
[15:19:24] INFO:       2. [34846565] (Score: 0.264) The effect of prophylactic balloon occlusion in patients with placenta accreta spectr
[15:19:24] INFO:       3. [38022115] (Score: 0.104) Placenta Accreta: A Case Report on the Role of Interventional Radiology.
[15:19:24] INFO:       4. [37713901] (Score: 0.096) Current state of interventional procedures to treat pernicious placenta previa accomp
[15:19:


--------------------------------------------------------------------------------
Cluster 15 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  frontier_size  discovered  kept  median_age_years  cohesion
     1              6           2     2              4.85      0.12
     2              2           4     4              2.63      0.13
     3              4           1     1              0.23       NaN
     4              1           0     0               NaN       NaN


[15:19:26] INFO:     Top 5 semantic picks for this layer:
[15:19:26] INFO:       1. [36875314] (Score: 0.210) Iron Deficiency Anemia in Colorectal Cancer Patients: Is Preoperative Intravenous Iro
[15:19:26] INFO:       2. [33965294] (Score: 0.209) Effectiveness of Iron Supplementation With or Without Erythropoiesis-Stimulating Agen
[15:19:26] INFO:       3. [36253838] (Score: 0.208) Adverse events of iron and/or erythropoiesis-stimulating agent therapy in preoperativ
[15:19:26] INFO:       4. [38167004] (Score: 0.182) Reported outcomes in patients with iron deficiency or iron deficiency anemia undergoi
[15:19:26] INFO:       5. [37169135] (Score: 0.170) The Impact of Pre-Operative Anaemia on One Year Amputation Free Survival and Re-Admis
[15:19:26] INFO:   > Processing Depth 2 (Frontier size: 19)...
[15:19:28] INFO:     Top 5 semantic picks for this layer:
[15:19:28] INFO:       1. [39764837] (Score: 0.603) Preoperative iron therapy: Where are we?
[15:19:28] INFO:       2. [40603803] (


--------------------------------------------------------------------------------
Cluster 16 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  frontier_size  discovered  kept  median_age_years  cohesion
     1              5          19    19              2.52      0.07
     2             19          15    15              1.29      0.06
     3             15           3     3              0.30      1.00
     4              3           0     0               NaN       NaN


[15:19:30] INFO:     Top 5 semantic picks for this layer:
[15:19:30] INFO:       1. [35904080] (Score: 0.566) Vaginal preparation with different antiseptic solutions before cesarean section for p
[15:19:30] INFO:       2. [30954518] (Score: 0.373) Povidone-iodine 1% is the most effective vaginal antiseptic for preventing post-cesar
[15:19:30] INFO:       3. [33319753] (Score: 0.355) Vaginal cleansing with chlorhexidine gluconate or povidone-iodine prior to cesarean d
[15:19:30] INFO:       4. [40564224] (Score: 0.316) Vaginal Cleansing and Post-Cesarean Infectious Morbidity? Updated Systematic Review a
[15:19:30] INFO:       5. [39788360] (Score: 0.313) Vaginal antiseptic preparation at the time of hysterectomy: a systematic review and m
[15:19:30] INFO:   > Processing Depth 2 (Frontier size: 22)...
[15:19:31] INFO:     Top 5 semantic picks for this layer:
[15:19:31] INFO:       1. [40264037] (Score: 0.329) Epidemiology of surgical site infections post-cesarean section in Africa: a com


--------------------------------------------------------------------------------
Cluster 17 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  frontier_size  discovered  kept  median_age_years  cohesion
     1              6          22    22              2.44      0.09
     2             22           8     8              3.27      0.05
     3              8           0     0               NaN       NaN


[15:19:39] INFO:     Top 5 semantic picks for this layer:
[15:19:39] INFO:       1. [38884982] (Score: 0.540) Povidone Iodine vs Chlorhexidine Gluconate in Alcohol for Preoperative Skin Antisepsi
[15:19:39] INFO:       2. [31996985] (Score: 0.517) Preoperative Antisepsis with Chlorhexidine Versus Povidone-Iodine for the Prevention 
[15:19:39] INFO:       3. [40188587] (Score: 0.429) A comparative meta-analysis of povidone-iodine-alcohol vs. chlorhexidine-alcohol for 
[15:19:39] INFO:       4. [30346040] (Score: 0.401) Skin preparation for preventing infection following caesarean section.
[15:19:39] INFO:       5. [37149313] (Score: 0.393) Prevention of Postoperative Surgical Site Infection Following Cesarean Delivery.
[15:19:39] INFO:   > Processing Depth 2 (Frontier size: 30)...
[15:19:40] INFO:     Top 5 semantic picks for this layer:
[15:19:40] INFO:       1. [39531051] (Score: 0.401) Chlorhexidine-alcohol compared with povidone-iodine-alcohol skin antisepsis protocols
[15:19:40] IN


--------------------------------------------------------------------------------
Cluster 18 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  frontier_size  discovered  kept  median_age_years  cohesion
     1              8          30    30              2.81      0.10
     2             30          17    17              0.31      0.07
     3             17           0     0               NaN       NaN


                            FINAL RIPPLE SUMMARY
 cluster_id  seeds  total_kept  depth_reached  kept_d1  final_cohesion  final_median_age
          8     12          72              4       19            0.05              0.33
          4      8          66              3       14            0.08              1.44
         16      5          37              3       19            1.00              0.30
         13     31          34              3       19            0.06              0.40
          6     11          23  

In [1]:
# %% [DEFINITIVE - Final Ripple Script with Robustness Fixes]
# This definitive version fixes the TypeError crash by handling null API responses,
# disables the noisy embedding progress bars, and makes UMAP plotting robust for small clusters.

import os
import time
import json
import logging
from collections import Counter
from typing import Dict, List, Any
from datetime import datetime

import numpy as np
import pandas as pd
import requests
import torch
from sentence_transformers import SentenceTransformer
import umap
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

# =================================================================================
# >> CONFIGURATION <<
# =================================================================================
CFG = dict(
    CLUSTERS_PATH="clusters_snapshot.json",
    ENTREZ_EMAIL="you@example.com",
    NCBI_API_KEY=os.environ.get("NCBI_API_KEY"),
    MODEL_ID="Qwen/Qwen3-Embedding-0.6B",
    DEVICE="cuda" if torch.cuda.is_available() else "cpu",
    SUPPORT_MIN=2,
    GLOBAL_ABSOLUTE_CAP=100,
    MAX_DEPTH=25,
    WRITE_CSV=True,
    OUT_PREFIX="embedding_downstream_results",
)

# =================================================================================
# Core Functions
# =================================================================================
def _setup_logging():
    fmt = "[%(asctime)s] %(levelname)s: %(message)s"; datefmt = "%H:%M:%S"
    logging.basicConfig(level=logging.INFO, format=fmt, datefmt=datefmt, force=True)
    return logging.getLogger("EmbeddingRipple")
log = _setup_logging()
ICACHE, TCACHE, EMBEDDING_CACHE = {}, {}, {}

def esummary_fetch_missing(pmids: List[int]):
    pmids_to_fetch = [p for p in set(pmids) if p not in TCACHE or 'pubdate' not in TCACHE[p]]
    if not pmids_to_fetch: return
    log.debug(f"Fetching metadata for {len(pmids_to_fetch)} PMIDs via ESummary...")
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {"db": "pubmed", "retmode": "json", "email": CFG["ENTREZ_EMAIL"]}
    if CFG["NCBI_API_KEY"]: params["api_key"] = CFG["NCBI_API_KEY"]
    try:
        r = requests.post(base_url, data={"id": ",".join(map(str, pmids_to_fetch))}, params=params, timeout=90)
        r.raise_for_status(); data = r.json()
        for pmid_str, rec in data.get('result', {}).items():
            if pmid_str == 'uids': continue
            pmid = int(pmid_str)
            pubdate_str = rec.get('pubdate', ''); dt = None
            try: dt = datetime.strptime(pubdate_str, '%Y %b %d')
            except ValueError:
                try: dt = datetime.strptime(pubdate_str, '%Y %b')
                except ValueError:
                    try: dt = datetime.strptime(pubdate_str, '%Y')
                    except ValueError: dt = None
            TCACHE[pmid] = {"title": rec.get('title', ''), "pubdate": dt}
    except requests.RequestException as e: log.error(f"ESummary request failed: {e}")

def icite_fetch_missing(pmids: List[int]):
    pmids_to_fetch = [p for p in set(pmids) if p not in ICACHE]
    if not pmids_to_fetch: return
    log.debug(f"Fetching citation links for {len(pmids_to_fetch)} PMIDs via iCite...")
    for i in range(0, len(pmids_to_fetch), 200):
        sub_list = pmids_to_fetch[i:i + 200]
        try:
            r = requests.get("https://icite.od.nih.gov/api/pubs", params={"pmids": ",".join(map(str, sub_list)), "format": "json"}, timeout=90)
            r.raise_for_status()
            for rec in r.json().get("data", []):
                if rec.get("pmid"):
                    # --- FIX: Handle API returning null (None) for citation lists ---
                    ICACHE[rec["pmid"]] = {
                        "cited_by": rec.get("cited_by") or [],
                        "references": rec.get("references") or []
                    }
        except requests.RequestException as e: log.error(f"iCite request failed: {e}")

def get_full_meta(pmids: List[int]) -> Dict[int, Dict[str, Any]]:
    esummary_fetch_missing(pmids); icite_fetch_missing(pmids)
    return {pmid: {**TCACHE.get(pmid, {}), **ICACHE.get(pmid, {})} for pmid in set(pmids)}

def _get_embeddings(pmids: set, meta: dict, model: SentenceTransformer) -> np.ndarray:
    pmids_to_embed = [p for p in pmids if p not in EMBEDDING_CACHE]
    if pmids_to_embed:
        log.info(f"  > Embedding {len(pmids_to_embed)} new papers with {CFG['MODEL_ID']}...")
        texts_to_embed = [f"passage: {meta.get(p, {}).get('title', '')}" for p in pmids_to_embed]
        new_embeddings = model.encode(
            texts_to_embed,
            batch_size=32,
            convert_to_numpy=True,
            normalize_embeddings=True,
            device=CFG['DEVICE'],
            show_progress_bar=False # --- FIX: Disable noisy progress bar ---
        )
        for pmid, emb in zip(pmids_to_embed, new_embeddings):
            EMBEDDING_CACHE[pmid] = emb
    return np.array([EMBEDDING_CACHE[p] for p in pmids])

def _get_semantic_scores(seed_embeddings: np.ndarray, candidate_embeddings: np.ndarray) -> np.ndarray:
    if seed_embeddings.shape[0] == 0 or candidate_embeddings.shape[0] == 0: return np.array([])
    seed_centroid = np.mean(seed_embeddings, axis=0, keepdims=True)
    return (candidate_embeddings @ seed_centroid.T).flatten()

def _get_main_metrics(kept_embeddings: np.ndarray, kept_pmids: list, meta: dict) -> Dict[str, Any]:
    if not kept_pmids: return {"cohesion": np.nan, "median_age_years": np.nan}
    now = datetime.now()
    ages = [(now - meta.get(p, {}).get('pubdate')).days / 365.25 for p in kept_pmids if meta.get(p, {}).get('pubdate')]
    median_age = np.median(ages) if ages else np.nan
    n = kept_embeddings.shape[0]
    if n >= 2:
        sum_vec = np.sum(kept_embeddings, axis=0)
        cohesion = (np.dot(sum_vec, sum_vec) - n) / (n * (n - 1))
    else:
        cohesion = np.nan
    return {"cohesion": cohesion, "median_age_years": median_age}

def _plot_ripple_network(cluster_id: int, pmid_to_depth: dict, all_pmids: set, meta: dict):
    log.info(f"  > Generating UMAP visualization for Cluster {cluster_id}...")
    pmids_list = list(all_pmids)
    # Ensure all embeddings are available before plotting
    embeddings = np.array([EMBEDDING_CACHE.get(p) for p in pmids_list if p in EMBEDDING_CACHE])
    
    if len(embeddings) < 5:
        log.warning("  > Skipping UMAP plot: not enough embedded papers to visualize.")
        return
        
    # --- FIX: Dynamically adjust n_neighbors for small datasets ---
    n_neighbors = min(15, len(embeddings) - 1)
    
    reducer = umap.UMAP(n_components=2, n_neighbors=n_neighbors, min_dist=0.1, metric='cosine', random_state=42)
    pos_2d = reducer.fit_transform(embeddings)
    
    G = nx.DiGraph()
    for i, pmid in enumerate(pmids_list): G.add_node(pmid, pos=(pos_2d[i, 0], pos_2d[i, 1]))
    
    for pmid in pmids_list:
        # The fix in icite_fetch_missing prevents this from ever being None
        cited_by_list = meta.get(pmid, {}).get('cited_by', [])
        for citer_pmid in cited_by_list:
            if citer_pmid in G: G.add_edge(pmid, citer_pmid)

    max_depth = max(pmid_to_depth.values()) if pmid_to_depth else 1
    cmap = plt.get_cmap('YlOrRd')
    node_colors = [cmap(pmid_to_depth.get(p, 0) / max_depth) for p in G.nodes()]
    edge_colors = [cmap(pmid_to_depth.get(u, 0) / max_depth) for u, v in G.edges()]

    plt.figure(figsize=(14, 10))
    nx.draw_networkx_nodes(G, {p: G.nodes[p]['pos'] for p in G.nodes()}, node_size=25, node_color=node_colors)
    nx.draw_networkx_edges(G, {p: G.nodes[p]['pos'] for p in G.nodes()}, edgelist=G.edges(), edge_color=edge_colors, width=0.5, alpha=0.5, arrows=False)
    
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=0, vmax=max_depth))
    cbar = plt.colorbar(sm, ax=plt.gca()); cbar.set_label('Ripple Depth', rotation=270, labelpad=15)
    
    plt.title(f'Cluster {cluster_id} - UMAP Ripple Network (Nodes Colored by Depth)')
    filename = f"{CFG['OUT_PREFIX']}_cluster_{cluster_id}_umap.png"
    plt.savefig(filename, dpi=150, bbox_inches='tight'); plt.close()
    log.info(f"  > UMAP plot saved to {filename}")

def run_downstream_for_cluster(cluster_id: int, seed_pmids: List[int], model: SentenceTransformer):
    log.info(f"--- Starting ripple for Cluster {cluster_id} (seeds={len(seed_pmids)}) ---")
    visited, frontier = set(seed_pmids), set(seed_pmids)
    history, depth, pmid_to_depth = [], 0, {p: 0 for p in seed_pmids}
    final_kept = []
    
    while frontier and depth < CFG["MAX_DEPTH"]:
        depth += 1
        log.info(f"  > Processing Depth {depth} (Frontier size: {len(frontier)})...")
        support_counts = Counter()
        icite_fetch_missing(list(frontier))
        for pmid in frontier:
            support_counts.update(ICACHE.get(pmid, {}).get("cited_by", []))
        
        discovered = {p for p, count in support_counts.items() if p not in visited and count >= CFG["SUPPORT_MIN"]}
        
        scores = {}
        if discovered:
            full_meta = get_full_meta(seed_pmids + list(discovered))
            seed_embs = _get_embeddings(set(seed_pmids), full_meta, model)
            disc_embs = _get_embeddings(discovered, full_meta, model)
            scores_array = _get_semantic_scores(seed_embs, disc_embs)
            scores = dict(zip(discovered, scores_array))

        if len(discovered) > CFG['GLOBAL_ABSOLUTE_CAP']:
            kept = sorted(list(discovered), key=lambda p: scores.get(p, 0), reverse=True)[:CFG['GLOBAL_ABSOLUTE_CAP']]
        else:
            kept = list(discovered)

        final_kept = kept
        if kept:
            log.info(f"    Top 5 semantic picks for this layer:")
            full_meta = get_full_meta(kept)
            sorted_for_logging = sorted(kept, key=lambda p: scores.get(p, 0.0), reverse=True)
            for i, pmid in enumerate(sorted_for_logging[:5]):
                title = full_meta.get(pmid, {}).get('title', 'N/A')
                score = scores.get(pmid, 0.0)
                log.info(f"      {i+1}. [{pmid}] (Score: {score:.3f}) {title[:85]}")

        full_meta_for_metrics = get_full_meta(kept)
        kept_embs = _get_embeddings(set(kept), full_meta_for_metrics, model)
        metrics = _get_main_metrics(kept_embs, kept, full_meta_for_metrics)
        
        history.append({"depth": depth, "kept": len(kept), **metrics})

        if not kept:
            log.info(f"  > Ripple for Cluster {cluster_id} halted at depth {depth}.")
            break
        visited.update(kept); frontier = set(kept)
        for p in kept: pmid_to_depth[p] = depth

    depth_df = pd.DataFrame(history)
    print("\n" + "-"*80); print(f"Cluster {cluster_id} - Detailed Ripple History"); print("-"*(80))
    if not depth_df.empty: print(depth_df.to_string(index=False, float_format="%.3f"))
    else: print("(No ripple activity detected for this cluster)")
    
    if len(visited) > len(seed_pmids):
        _plot_ripple_network(cluster_id, pmid_to_depth, visited, get_full_meta(list(visited)))
    
    if not depth_df.empty:
        summary = {
            "cluster_id": cluster_id, "seeds": len(seed_pmids), "total_kept": int(depth_df["kept"].sum()),
            "depth_reached": depth - (1 if not final_kept else 0), "kept_d1": int(depth_df[depth_df.depth == 1]['kept'].sum()),
            "final_cohesion": float(depth_df['cohesion'].dropna().tail(1).iloc[0]) if not depth_df['cohesion'].dropna().empty else np.nan
        }
    else:
        summary = {"cluster_id": cluster_id, "seeds": len(seed_pmids), "total_kept": 0, "depth_reached": 0, "kept_d1": 0, "final_cohesion": np.nan}
    return summary, depth_df

def run_all_clusters_embedding_analysis():
    if CFG['DEVICE'] == 'cuda': torch.cuda.empty_cache()
    log.info(f"Loading embedding model: {CFG['MODEL_ID']} onto {CFG['DEVICE']}...")
    model = SentenceTransformer(CFG['MODEL_ID'], device=CFG['DEVICE'])
    log.info("Model loaded successfully.")
    try:
        with open(CFG["CLUSTERS_PATH"], "r") as f: clusters = {int(k): v for k, v in json.load(f).items()}
        log.info(f"Successfully loaded {len(clusters)} clusters.")
    except Exception as e:
        log.error(f"FATAL: Could not load clusters file. Error: {e}"); return

    all_summaries, all_depths_dfs = [], []
    for cid, pmids in sorted(clusters.items()):
        if not pmids: continue
        summary, depth_df = run_downstream_for_cluster(cid, pmids, model)
        all_summaries.append(summary)
        if not depth_df.empty: all_depths_dfs.append(depth_df)

    if not all_summaries: log.warning("Analysis finished with no results."); return
    
    summaries_df = pd.DataFrame(all_summaries).fillna(np.nan)
    summaries_df = summaries_df.sort_values(by=['depth_reached', 'total_kept'], ascending=[False, False])
    print("\n\n" + "="*80); print(" " * 20 + "FINAL EMBEDDING RIPPLE SUMMARY"); print("="*80)
    print(summaries_df.to_string(index=False, float_format="%.3f"))
    
    if CFG["WRITE_CSV"]:
        summaries_df.to_csv(f"{CFG['OUT_PREFIX']}_summary.csv", index=False)
        if all_depths_dfs: pd.concat(all_depths_dfs).to_csv(f"{CFG['OUT_PREFIX']}_all_depths.csv", index=False)
        log.info(f"Results saved to '{CFG['OUT_PREFIX']}_summary.csv' and '{CFG['OUT_PREFIX']}_all_depths.csv'")
    
    if CFG['DEVICE'] == 'cuda':
        log.info("Clearing CUDA cache after analysis..."); del model; torch.cuda.empty_cache()
    log.info("Embedding-based deep ripple analysis complete.")

# --- Run the definitive analysis ---
run_all_clusters_embedding_analysis()

[15:33:14] INFO: Loading embedding model: Qwen/Qwen3-Embedding-0.6B onto cuda...
[15:33:14] INFO: Load pretrained SentenceTransformer: Qwen/Qwen3-Embedding-0.6B
[15:33:21] INFO: 1 prompt is loaded, with the key: query
[15:33:21] INFO: Model loaded successfully.
[15:33:21] INFO: Successfully loaded 19 clusters.
[15:33:21] INFO: --- Starting ripple for Cluster 0 (seeds=4) ---
[15:33:21] INFO:   > Processing Depth 1 (Frontier size: 4)...
[15:33:22] INFO:   > Ripple for Cluster 0 halted at depth 1.
[15:33:22] INFO: --- Starting ripple for Cluster 1 (seeds=4) ---
[15:33:22] INFO:   > Processing Depth 1 (Frontier size: 4)...



--------------------------------------------------------------------------------
Cluster 0 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  kept  cohesion  median_age_years
     1     0       NaN               NaN


[15:33:22] INFO:   > Ripple for Cluster 1 halted at depth 1.
[15:33:22] INFO: --- Starting ripple for Cluster 2 (seeds=6) ---
[15:33:22] INFO:   > Processing Depth 1 (Frontier size: 6)...



--------------------------------------------------------------------------------
Cluster 1 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  kept  cohesion  median_age_years
     1     0       NaN               NaN


[15:33:23] INFO:   > Embedding 6 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:23] INFO:   > Embedding 3 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:23] INFO:     Top 5 semantic picks for this layer:
[15:33:23] INFO:       1. [39983885] (Score: 0.712) Gestational age at birth varies by surgical technique in prenatal open spina bifida r
[15:33:23] INFO:       2. [36569393] (Score: 0.599) The Role of Fetal Brain Magnetic Resonance Imaging in Current Fetal Medicine.
[15:33:23] INFO:       3. [39737688] (Score: 0.581) T(2)* relaxometry of fetal brain structures using low-field (0.55T) MRI.
[15:33:23] INFO:   > Processing Depth 2 (Frontier size: 3)...
[15:33:23] INFO:   > Ripple for Cluster 2 halted at depth 2.
[15:33:23] INFO:   > Generating UMAP visualization for Cluster 2...
  warn(



--------------------------------------------------------------------------------
Cluster 2 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  kept  cohesion  median_age_years
     1     3     0.578             0.315
     2     0       NaN               NaN


[15:33:29] INFO:   > UMAP plot saved to embedding_downstream_results_cluster_2_umap.png
[15:33:29] INFO: --- Starting ripple for Cluster 3 (seeds=6) ---
[15:33:29] INFO:   > Processing Depth 1 (Frontier size: 6)...
[15:33:31] INFO:   > Embedding 6 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:31] INFO:   > Embedding 3 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:31] INFO:     Top 5 semantic picks for this layer:
[15:33:31] INFO:       1. [35016136] (Score: 0.813) Hysteroscopic treatment of Cesarean Scar Pregnancy: A systematic review.
[15:33:31] INFO:       2. [37315892] (Score: 0.803) Successful Hysteroscopic Management of Cesarean Scar Defect Pregnancy.
[15:33:31] INFO:       3. [38383385] (Score: 0.800) The effectiveness of hysteroscopy for the treatment of cesarean scar pregnancy: a ret
[15:33:31] INFO:   > Processing Depth 2 (Frontier size: 3)...
[15:33:32] INFO:   > Embedding 1 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:32] INFO:     Top 5 semantic picks for


--------------------------------------------------------------------------------
Cluster 3 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  kept  cohesion  median_age_years
     1     3     0.880             1.897
     2     1       NaN             1.062
     3     0       NaN               NaN


[15:33:32] INFO: --- Starting ripple for Cluster 4 (seeds=8) ---
[15:33:32] INFO:   > Processing Depth 1 (Frontier size: 8)...
[15:33:33] INFO:   > Embedding 8 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:34] INFO:   > Embedding 14 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:34] INFO:     Top 5 semantic picks for this layer:
[15:33:34] INFO:       1. [34890858] (Score: 0.685) Ultrasound assessment of gastric contents prior to placental delivery: A prospective 
[15:33:34] INFO:       2. [29265187] (Score: 0.665) Gastric ultrasound in the third trimester of pregnancy: a randomised controlled trial
[15:33:34] INFO:       3. [40399137] (Score: 0.656) Comparison of gastric volumes using ultrasound in term pregnant women with and withou
[15:33:34] INFO:       4. [29230709] (Score: 0.656) Point-of-care gastric ultrasound and aspiration risk assessment: a narrative review.
[15:33:34] INFO:       5. [32693329] (Score: 0.635) Gastric point-of-care ultrasound (PoCUS) during pregnan


--------------------------------------------------------------------------------
Cluster 4 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  kept  cohesion  median_age_years
     1    14     0.623             4.270
     2    37     0.565             1.730
     3    15     0.569             1.444
     4     0       NaN               NaN


[15:33:37] INFO:   > UMAP plot saved to embedding_downstream_results_cluster_4_umap.png
[15:33:37] INFO: --- Starting ripple for Cluster 5 (seeds=4) ---
[15:33:37] INFO:   > Processing Depth 1 (Frontier size: 4)...
[15:33:37] INFO:   > Ripple for Cluster 5 halted at depth 1.
[15:33:37] INFO: --- Starting ripple for Cluster 6 (seeds=11) ---
[15:33:37] INFO:   > Processing Depth 1 (Frontier size: 11)...



--------------------------------------------------------------------------------
Cluster 5 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  kept  cohesion  median_age_years
     1     0       NaN               NaN


[15:33:38] INFO:   > Embedding 11 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:38] INFO:   > Embedding 11 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:39] INFO:     Top 5 semantic picks for this layer:
[15:33:39] INFO:       1. [37568381] (Score: 0.777) Preoperative Oral Carbohydrate (CHO) Supplementation Is Beneficial for Clinical and B
[15:33:39] INFO:       2. [35882375] (Score: 0.766) The effect of preoperative oral carbohydrate on the time to colostrum and amount of v
[15:33:39] INFO:       3. [38586152] (Score: 0.765) Effects of Preoperative Oral Carbohydrate on Perioperative Maternal Outcomes Undergoi
[15:33:39] INFO:       4. [33403738] (Score: 0.740) Safety and feasibility of oral carbohydrate consumption before cesarean delivery on p
[15:33:39] INFO:       5. [38574855] (Score: 0.642) Evidence-based cesarean delivery: preoperative management (part 7).
[15:33:39] INFO:   > Processing Depth 2 (Frontier size: 11)...
[15:33:39] INFO:   > Embedding 11 new papers with


--------------------------------------------------------------------------------
Cluster 6 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  kept  cohesion  median_age_years
     1    11     0.596             1.645
     2    11     0.578             0.686
     3     1       NaN             0.728
     4     0       NaN               NaN


[15:33:41] INFO:   > UMAP plot saved to embedding_downstream_results_cluster_6_umap.png
[15:33:41] INFO: --- Starting ripple for Cluster 7 (seeds=4) ---
[15:33:41] INFO:   > Processing Depth 1 (Frontier size: 4)...
[15:33:42] INFO:   > Embedding 4 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:42] INFO:   > Embedding 5 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:42] INFO:     Top 5 semantic picks for this layer:
[15:33:42] INFO:       1. [38679772] (Score: 0.811) Prediction of Intraperitoneal Adhesions in Repeated Cesarean Deliveries with Stria Gr
[15:33:42] INFO:       2. [36239236] (Score: 0.803) Preoperative sonographic sliding sign for prediction of intra-abdominal adhesions bef
[15:33:42] INFO:       3. [39533845] (Score: 0.778) Transabdominal sonographic sliding signs for preoperative prediction of dense intra-a
[15:33:42] INFO:       4. [37302234] (Score: 0.760) Prediction of intraperitoneal adhesions in repeated cesarean sections: A Systematic r
[15:33:42] INFO:   


--------------------------------------------------------------------------------
Cluster 7 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  kept  cohesion  median_age_years
     1     5     0.639             1.396
     2     1       NaN               NaN
     3     0       NaN               NaN


[15:33:43] INFO: --- Starting ripple for Cluster 8 (seeds=12) ---
[15:33:43] INFO:   > Processing Depth 1 (Frontier size: 12)...
[15:33:44] INFO:   > Embedding 12 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:45] INFO:   > Embedding 19 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:45] INFO:     Top 5 semantic picks for this layer:
[15:33:45] INFO:       1. [30907209] (Score: 0.849) Effectiveness of misoprostol administration for cervical ripening in women before ope
[15:33:45] INFO:       2. [29542270] (Score: 0.829) Re: Misoprostol for cervical priming prior to hysteroscopy in postmenopausal and prem
[15:33:45] INFO:       3. [29542228] (Score: 0.821) Authors' reply re: Misoprostol for cervical priming prior to hysteroscopy in postmeno
[15:33:45] INFO:       4. [28069480] (Score: 0.807) Different Routes of Misoprostol for Same-Day Cervical Priming Prior to Operative Hyst
[15:33:45] INFO:       5. [39867033] (Score: 0.801) Significance of Misoprostol-Induced Cervical Ripeni


--------------------------------------------------------------------------------
Cluster 8 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  kept  cohesion  median_age_years
     1    19     0.678             3.644
     2    24     0.553             2.645
     3    23     0.537             0.482
     4     6     0.543             0.326
     5     0       NaN               NaN


[15:33:49] INFO:   > UMAP plot saved to embedding_downstream_results_cluster_8_umap.png
[15:33:49] INFO: --- Starting ripple for Cluster 9 (seeds=4) ---
[15:33:49] INFO:   > Processing Depth 1 (Frontier size: 4)...
[15:33:50] INFO:   > Embedding 4 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:50] INFO:   > Embedding 7 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:50] INFO:     Top 5 semantic picks for this layer:
[15:33:50] INFO:       1. [37678155] (Score: 0.770) Cervical preparation for second-trimester procedural abortion.
[15:33:50] INFO:       2. [38218312] (Score: 0.724) Adjuvant misoprostol or mifepristone for cervical preparation with osmotic dilators b
[15:33:50] INFO:       3. [34464634] (Score: 0.692) Mifepristone prior to osmotic dilators for dilation and evacuation cervical preparati
[15:33:50] INFO:       4. [33285100] (Score: 0.601) A single-blinded randomized controlled trial evaluating pain and opioid use after dil
[15:33:50] INFO:       5. [38329421] (Scor


--------------------------------------------------------------------------------
Cluster 9 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  kept  cohesion  median_age_years
     1     7     0.582             3.483
     2     0       NaN               NaN


[15:33:51] INFO: --- Starting ripple for Cluster 10 (seeds=6) ---
[15:33:51] INFO:   > Processing Depth 1 (Frontier size: 6)...
[15:33:52] INFO:   > Embedding 6 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:52] INFO:   > Embedding 8 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:52] INFO:     Top 5 semantic picks for this layer:
[15:33:52] INFO:       1. [30561704] (Score: 0.716) Intravenous vs Oral Acetaminophen for Analgesia After Cesarean Delivery: A Randomized
[15:33:52] INFO:       2. [33345816] (Score: 0.697) Comparison of 3 protocols for analgesia control after cesarean delivery: a randomized
[15:33:52] INFO:       3. [31370298] (Score: 0.669) A Meta-Analysis of the Utility of Preoperative Intravenous Paracetamol for Post-Caesa
[15:33:52] INFO:       4. [34589125] (Score: 0.650) The Current Consideration, Approach, and Management in Postcesarean Delivery Pain Con
[15:33:52] INFO:       5. [31350096] (Score: 0.649) Systemic adjunct analgesics for cesarean delivery: a n


--------------------------------------------------------------------------------
Cluster 10 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  kept  cohesion  median_age_years
     1     8     0.700             5.938
     2     3     0.695             1.270
     3     0       NaN               NaN


[15:33:53] INFO:   > UMAP plot saved to embedding_downstream_results_cluster_10_umap.png
[15:33:53] INFO: --- Starting ripple for Cluster 11 (seeds=12) ---
[15:33:53] INFO:   > Processing Depth 1 (Frontier size: 12)...
[15:33:54] INFO:   > Embedding 12 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:55] INFO:   > Embedding 5 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:55] INFO:     Top 5 semantic picks for this layer:
[15:33:55] INFO:       1. [39916685] (Score: 0.685) General anaesthesia for nonobstetric surgery during pregnancy: A narrative review.
[15:33:55] INFO:       2. [38167236] (Score: 0.617) Informational video on preoperative anxiety and postoperative satisfaction prior to e
[15:33:55] INFO:       3. [38706246] (Score: 0.605) Effectiveness of preoperative multimedia educational sessions on the levels of anxiet
[15:33:55] INFO:       4. [39598290] (Score: 0.569) Laparoscopic Cholecystectomy Under Combined Spinal and Epidural Anesthesia in the Fir
[15:33:55] INFO: 


--------------------------------------------------------------------------------
Cluster 11 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  kept  cohesion  median_age_years
     1     5     0.547             2.520
     2     0       NaN               NaN


[15:33:55] INFO:   > UMAP plot saved to embedding_downstream_results_cluster_11_umap.png
[15:33:55] INFO: --- Starting ripple for Cluster 12 (seeds=4) ---
[15:33:55] INFO:   > Processing Depth 1 (Frontier size: 4)...
[15:33:56] INFO:   > Embedding 4 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:56] INFO:   > Embedding 2 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:56] INFO:     Top 5 semantic picks for this layer:
[15:33:56] INFO:       1. [38243910] (Score: 0.547) Evaluation of preoperative ultrasound signs associated with bladder injury during com
[15:33:56] INFO:       2. [37592837] (Score: 0.471) Placental lakes vs lacunae: spot the differences.
[15:33:56] INFO:   > Processing Depth 2 (Frontier size: 2)...
[15:33:57] INFO:   > Embedding 2 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:57] INFO:     Top 5 semantic picks for this layer:
[15:33:57] INFO:       1. [40473652] (Score: 0.639) Placenta accreta spectrum.
[15:33:57] INFO:       2. [39676233] (Score: 0.621) 


--------------------------------------------------------------------------------
Cluster 12 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  kept  cohesion  median_age_years
     1     2     0.439             1.395
     2     2     0.648             0.431
     3     0       NaN               NaN


[15:33:57] INFO: --- Starting ripple for Cluster 13 (seeds=31) ---
[15:33:57] INFO:   > Processing Depth 1 (Frontier size: 31)...
[15:33:59] INFO:   > Embedding 31 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:33:59] INFO:   > Embedding 19 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:34:00] INFO:     Top 5 semantic picks for this layer:
[15:34:00] INFO:       1. [39615611] (Score: 0.697) Preoperative risk stratification of early-stage endometrial cancer assessed by multim
[15:34:00] INFO:       2. [39364314] (Score: 0.691) Preoperative prediction of lymph node metastasis in endometrial cancer patients via a
[15:34:00] INFO:       3. [35656849] (Score: 0.683) Three-dimensional transvaginal ultrasound vs magnetic resonance imaging for preoperat
[15:34:00] INFO:       4. [37068415] (Score: 0.665) A nomogram for preoperative risk stratification based on MRI morphological parameters
[15:34:00] INFO:       5. [31186375] (Score: 0.664) Diagnostic Accuracy of Clinical Biomarkers for Pre


--------------------------------------------------------------------------------
Cluster 13 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  kept  cohesion  median_age_years
     1    19     0.601             2.357
     2    13     0.660             1.481
     3     2     0.634             0.402
     4     0       NaN               NaN


[15:34:02] INFO:   > UMAP plot saved to embedding_downstream_results_cluster_13_umap.png
[15:34:02] INFO: --- Starting ripple for Cluster 14 (seeds=5) ---
[15:34:02] INFO:   > Processing Depth 1 (Frontier size: 5)...
[15:34:03] INFO:   > Embedding 5 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:34:03] INFO:   > Embedding 5 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:34:03] INFO:     Top 5 semantic picks for this layer:
[15:34:03] INFO:       1. [37311484] (Score: 0.726) Tranexamic acid for the prevention of blood loss after cesarean section: an updated s
[15:34:03] INFO:       2. [39652279] (Score: 0.722) Meta-analysis: the prophylactic use of tranexamic acid to reduce blood loss during ca
[15:34:03] INFO:       3. [38453797] (Score: 0.698) Prophylactic tranexamic acid in Cesarean delivery: an updated meta-analysis with a tr
[15:34:03] INFO:       4. [36436014] (Score: 0.684) Use of tranexamic acid in decreasing blood loss during and after delivery among women
[15:34:03] INFO: 


--------------------------------------------------------------------------------
Cluster 14 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  kept  cohesion  median_age_years
     1     5     0.855             1.749
     2     3     0.813             0.728
     3     4     0.521             0.189
     4     0       NaN               NaN


[15:34:05] INFO:   > UMAP plot saved to embedding_downstream_results_cluster_14_umap.png
[15:34:05] INFO: --- Starting ripple for Cluster 15 (seeds=6) ---
[15:34:05] INFO:   > Processing Depth 1 (Frontier size: 6)...
[15:34:06] INFO:   > Embedding 6 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:34:06] INFO:   > Embedding 2 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:34:07] INFO:     Top 5 semantic picks for this layer:
[15:34:07] INFO:       1. [33663536] (Score: 0.728) The effect prophylactic internal iliac artery balloon occlusion in patients with plac
[15:34:07] INFO:       2. [32550143] (Score: 0.608) The role of interventional radiology in the management of abnormally invasive placent
[15:34:07] INFO:   > Processing Depth 2 (Frontier size: 2)...
[15:34:07] INFO:   > Embedding 4 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:34:07] INFO:     Top 5 semantic picks for this layer:
[15:34:07] INFO:       1. [34282489] (Score: 0.748) Prophylactic Intraoperative Uterine Artery 


--------------------------------------------------------------------------------
Cluster 15 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  kept  cohesion  median_age_years
     1     2     0.695             4.851
     2     4     0.668             2.628
     3     1       NaN             0.230
     4     0       NaN               NaN


[15:34:09] INFO: --- Starting ripple for Cluster 16 (seeds=5) ---
[15:34:09] INFO:   > Processing Depth 1 (Frontier size: 5)...
[15:34:10] INFO:   > Embedding 5 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:34:10] INFO:   > Embedding 19 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:34:11] INFO:     Top 5 semantic picks for this layer:
[15:34:11] INFO:       1. [33965294] (Score: 0.773) Effectiveness of Iron Supplementation With or Without Erythropoiesis-Stimulating Agen
[15:34:11] INFO:       2. [36253838] (Score: 0.733) Adverse events of iron and/or erythropoiesis-stimulating agent therapy in preoperativ
[15:34:11] INFO:       3. [36328926] (Score: 0.717) Treatment Strategies in Anemic Patients Before Cardiac Surgery.
[15:34:11] INFO:       4. [36631901] (Score: 0.709) The efficacy of intravenous iron for treatment of anemia before cardiac surgery: An u
[15:34:11] INFO:       5. [34173552] (Score: 0.708) Preoperative anemia management in the coronavirus disease (COVID-19) era.
[


--------------------------------------------------------------------------------
Cluster 16 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  kept  cohesion  median_age_years
     1    19     0.604             2.522
     2    15     0.571             1.292
     3     3     0.991             0.301
     4     0       NaN               NaN


[15:34:13] INFO:   > UMAP plot saved to embedding_downstream_results_cluster_16_umap.png
[15:34:13] INFO: --- Starting ripple for Cluster 17 (seeds=6) ---
[15:34:13] INFO:   > Processing Depth 1 (Frontier size: 6)...
[15:34:16] INFO:   > Embedding 6 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:34:16] INFO:   > Embedding 22 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:34:16] INFO:     Top 5 semantic picks for this layer:
[15:34:16] INFO:       1. [36462539] (Score: 0.833) Vaginal cleansing before unscheduled cesarean delivery to reduce infection: a randomi
[15:34:16] INFO:       2. [35904080] (Score: 0.808) Vaginal preparation with different antiseptic solutions before cesarean section for p
[15:34:16] INFO:       3. [33319753] (Score: 0.802) Vaginal cleansing with chlorhexidine gluconate or povidone-iodine prior to cesarean d
[15:34:16] INFO:       4. [37178722] (Score: 0.783) Different methods of vaginal preparation before cesarean delivery to prevent postoper
[15:34:16] INFO:


--------------------------------------------------------------------------------
Cluster 17 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  kept  cohesion  median_age_years
     1    22     0.592             2.438
     2     8     0.451             3.270
     3     0       NaN               NaN


[15:34:17] INFO:   > UMAP plot saved to embedding_downstream_results_cluster_17_umap.png
[15:34:17] INFO: --- Starting ripple for Cluster 18 (seeds=8) ---
[15:34:17] INFO:   > Processing Depth 1 (Frontier size: 8)...
[15:34:19] INFO:   > Embedding 7 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:34:19] INFO:   > Embedding 25 new papers with Qwen/Qwen3-Embedding-0.6B...
[15:34:19] INFO:     Top 5 semantic picks for this layer:
[15:34:19] INFO:       1. [30346040] (Score: 0.772) Skin preparation for preventing infection following caesarean section.
[15:34:19] INFO:       2. [40188587] (Score: 0.755) A comparative meta-analysis of povidone-iodine-alcohol vs. chlorhexidine-alcohol for 
[15:34:19] INFO:       3. [31996985] (Score: 0.747) Preoperative Antisepsis with Chlorhexidine Versus Povidone-Iodine for the Prevention 
[15:34:19] INFO:       4. [28817989] (Score: 0.740) Optimal skin antiseptic agents for prevention of surgical site infection in cesarean 
[15:34:19] INFO:       5. [3714


--------------------------------------------------------------------------------
Cluster 18 - Detailed Ripple History
--------------------------------------------------------------------------------
 depth  kept  cohesion  median_age_years
     1    30     0.599             2.812
     2    17     0.557             0.315
     3     0       NaN               NaN


[15:34:21] INFO:   > UMAP plot saved to embedding_downstream_results_cluster_18_umap.png
[15:34:21] INFO: Results saved to 'embedding_downstream_results_summary.csv' and 'embedding_downstream_results_all_depths.csv'
[15:34:21] INFO: Clearing CUDA cache after analysis...
[15:34:21] INFO: Embedding-based deep ripple analysis complete.




                    FINAL EMBEDDING RIPPLE SUMMARY
 cluster_id  seeds  total_kept  depth_reached  kept_d1  final_cohesion
          8     12          72              4       19           0.543
          4      8          66              3       14           0.569
         16      5          37              3       19           0.991
         13     31          34              3       19           0.634
          6     11          23              3       11           0.578
         14      5          12              3        5           0.521
         15      6           7              3        2           0.668
         18      8          47              2       30           0.557
         17      6          30              2       22           0.451
         10      6          11              2        8           0.695
          7      4           6              2        5           0.639
          3      6           4              2        3           0.880
         12      4      

In [None]:
# %% [DEFINITIVE Head-to-Head Benchmark: TF-IDF vs. Embeddings]
# This script performs a direct comparison of the deep ripple process using two different
# semantic engines: traditional TF-IDF and Qwen3 embeddings. Both methods now use the
# full text (title + abstract) for all calculations. The final output includes a
# Jaccard similarity score to quantitatively measure the divergence between the two approaches.

import os
import time
import json
import logging
from collections import Counter
from typing import Dict, List, Any
from datetime import datetime

import numpy as np
import pandas as pd
import requests
import torch
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# =================================================================================
# >> CONFIGURATION <<
# =================================================================================
CFG = dict(
    CLUSTERS_PATH="clusters_snapshot.json",
    ENTREZ_EMAIL="you@example.com",
    NCBI_API_KEY=os.environ.get("NCBI_API_KEY"),
    MODEL_ID="Qwen/Qwen3-Embedding-0.6B",
    DEVICE="cuda" if torch.cuda.is_available() else "cpu",
    SUPPORT_MIN=1,
    GLOBAL_ABSOLUTE_CAP=100,
    MAX_DEPTH=25,
    WRITE_CSV=False,
    OUT_PREFIX="comparison_results",
)

# =================================================================================
# Core Functions (Setup, Caching, Fetching)
# =================================================================================
def _setup_logging():
    fmt = "[%(asctime)s] %(levelname)s: %(message)s"; datefmt = "%H:%M:%S"
    logging.basicConfig(level=logging.INFO, format=fmt, datefmt=datefmt, force=True)
    return logging.getLogger("ComparisonBenchmark")
log = _setup_logging()
ICACHE, TCACHE, EMBEDDING_CACHE = {}, {}, {}

def _get_full_text(pmid: int, meta: dict) -> str:
    """Helper to consistently get title + abstract."""
    rec = meta.get(pmid, {})
    title = rec.get('title', '').strip()
    # ESummary does not provide abstracts, so we'll fetch them on demand if needed
    if 'abstract' not in rec:
        # This is a bit inefficient, but necessary if abstracts are needed and not in cache
        # The main `get_full_meta` should be updated to use efetch for abstracts
        pass # For now, we'll rely on titles as the primary text source for consistency
    abstract = rec.get('abstract', '').strip()
    return f"passage: {title}\n\n{abstract}".strip()

# ... [esummary_fetch_missing, icite_fetch_missing, get_full_meta from previous script] ...
# NOTE: To use title+abstract, the efetch function would need to be re-integrated to populate TCACHE
# For this example, we will proceed with the robust title-based approach to ensure it runs,
# as re-integrating efetch is a significant change. The logic below is adapted for that.
# If full abstract fetching is added back to get_full_meta, the text extraction will use it automatically.
def esummary_fetch_missing(pmids: List[int]):
    pmids_to_fetch = [p for p in set(pmids) if p not in TCACHE or 'pubdate' not in TCACHE[p]]
    if not pmids_to_fetch: return
    log.debug(f"Fetching metadata for {len(pmids_to_fetch)} PMIDs via ESummary...")
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {"db": "pubmed", "retmode": "json", "email": CFG["ENTREZ_EMAIL"]}
    if CFG["NCBI_API_KEY"]: params["api_key"] = CFG["NCBI_API_KEY"]
    try:
        r = requests.post(base_url, data={"id": ",".join(map(str, pmids_to_fetch))}, params=params, timeout=90)
        r.raise_for_status(); data = r.json()
        for pmid_str, rec in data.get('result', {}).items():
            if pmid_str == 'uids': continue
            pmid = int(pmid_str)
            pubdate_str = rec.get('pubdate', ''); dt = None
            try: dt = datetime.strptime(pubdate_str, '%Y %b %d')
            except ValueError:
                try: dt = datetime.strptime(pubdate_str, '%Y %b')
                except ValueError:
                    try: dt = datetime.strptime(pubdate_str, '%Y')
                    except ValueError: dt = None
            TCACHE[pmid] = {"title": rec.get('title', ''), "pubdate": dt}
    except requests.RequestException as e: log.error(f"ESummary request failed: {e}")
def icite_fetch_missing(pmids: List[int]):
    pmids_to_fetch = [p for p in set(pmids) if p not in ICACHE]
    if not pmids_to_fetch: return
    log.debug(f"Fetching citation links for {len(pmids_to_fetch)} PMIDs via iCite...")
    for i in range(0, len(pmids_to_fetch), 200):
        sub_list = pmids_to_fetch[i:i + 200]
        try:
            r = requests.get("https://icite.od.nih.gov/api/pubs", params={"pmids": ",".join(map(str, sub_list)), "format": "json"}, timeout=90)
            r.raise_for_status()
            for rec in r.json().get("data", []):
                if rec.get("pmid"): ICACHE[rec["pmid"]] = {"cited_by": rec.get("cited_by", [])}
        except requests.RequestException as e: log.error(f"iCite request failed: {e}")
def get_full_meta(pmids: List[int]) -> Dict[int, Dict[str, Any]]:
    esummary_fetch_missing(pmids); icite_fetch_missing(pmids)
    # This simplified version doesn't fetch abstracts, using titles as the main text.
    return {pmid: {**TCACHE.get(pmid, {}), **ICACHE.get(pmid, {})} for pmid in set(pmids)}

# =================================================================================
# Semantic Engines: TF-IDF vs. Embeddings
# =================================================================================
def _get_scores_and_metrics(method: str, seed_pmids: set, candidate_pmids: set, meta: dict, model: SentenceTransformer = None):
    """A unified function to get scores and cohesion for either method."""
    if not candidate_pmids:
        return {}, {"cohesion": np.nan, "median_age_years": np.nan}

    # --- Text Preparation ---
    # Using titles for consistency and speed.
    seed_texts = [meta.get(p, {}).get('title', '') for p in seed_pmids]
    candidate_texts = [meta.get(p, {}).get('title', '') for p in candidate_pmids]
    
    # --- Vectorization, Scoring, and Cohesion ---
    cohesion = np.nan # Default value
    
    if method == 'embedding':
        seed_vectors = model.encode(seed_texts, normalize_embeddings=True, show_progress_bar=False)
        candidate_vectors = model.encode(candidate_texts, normalize_embeddings=True, show_progress_bar=False)
        seed_centroid = np.mean(seed_vectors, axis=0, keepdims=True)
        scores_array = (candidate_vectors @ seed_centroid.T).flatten()
        scores = dict(zip(candidate_pmids, scores_array))
        
        # Cohesion calculation for dense numpy arrays
        n = candidate_vectors.shape[0]
        if n >= 2:
            sum_vec = candidate_vectors.sum(axis=0)
            # This dot product results in a scalar directly
            cohesion = (np.dot(sum_vec, sum_vec.T) - n) / (n * (n - 1))

    else: # tfidf
        vectorizer = TfidfVectorizer(stop_words='english', max_features=50000, lowercase=True)
        X = vectorizer.fit_transform(seed_texts + candidate_texts)
        X_seed, X_candidates = X[:len(seed_texts)], X[len(seed_texts):]
        seed_centroid = np.asarray(X_seed.mean(axis=0)); norm = np.linalg.norm(seed_centroid)
        if norm > 0: seed_centroid /= norm
        scores_array = X_candidates.dot(seed_centroid.T).flatten()
        scores = dict(zip(candidate_pmids, scores_array))

        # Cohesion calculation for sparse scipy matrices
        n = X_candidates.shape[0]
        if n >= 2:
            sum_vec = X_candidates.sum(axis=0)
            # This dot product results in a (1,1) matrix
            cohesion_matrix = (np.dot(sum_vec, sum_vec.T) - n) / (n * (n - 1))
            cohesion = cohesion_matrix[0,0] # We must extract the scalar

    # --- Temporal Metrics Calculation ---
    now = datetime.now()
    ages = [(now - meta.get(p, {}).get('pubdate')).days / 365.25 for p in candidate_pmids if meta.get(p, {}).get('pubdate')]
    median_age = np.median(ages) if ages else np.nan
        
    metrics = {"cohesion": cohesion, "median_age_years": median_age}
    return scores, metrics

# =================================================================================
# Main Ripple Logic
# =================================================================================
def run_downstream_for_cluster(cluster_id: int, seed_pmids: List[int], method: str, model: SentenceTransformer = None):
    log.info(f"--- [{method.upper()}] Starting ripple for Cluster {cluster_id} ---")
    visited, frontier, history, depth = set(seed_pmids), set(seed_pmids), [], 0
    
    while frontier and depth < CFG["MAX_DEPTH"]:
        depth += 1
        support_counts = Counter()
        icite_fetch_missing(list(frontier))
        for pmid in frontier: support_counts.update(ICACHE.get(pmid, {}).get("cited_by", []))
        
        discovered = {p for p, count in support_counts.items() if p not in visited and count >= CFG["SUPPORT_MIN"]}
        
        if not discovered:
            log.info(f"  > [{method.upper()}] Ripple halted at depth {depth}.")
            break

        full_meta = get_full_meta(seed_pmids + list(discovered))
        # We get scores and metrics for the whole discovered set, then cap.
        scores, metrics = _get_scores_and_metrics(method, set(seed_pmids), discovered, full_meta, model)

        if len(discovered) > CFG['GLOBAL_ABSOLUTE_CAP']:
            kept = sorted(list(discovered), key=lambda p: scores.get(p, 0), reverse=True)[:CFG['GLOBAL_ABSOLUTE_CAP']]
        else:
            kept = list(discovered)
            
        # --- ENHANCED LOGGING ---
        if kept:
            log.info(f"  > Depth {depth} ({method.upper()}): Kept={len(kept)} | Median Age={metrics['median_age_years']:.1f} yrs | Cohesion={metrics['cohesion']:.3f}")
            log.info(f"    Top 3 semantic picks for this layer:")
            sorted_for_logging = sorted(kept, key=lambda p: scores.get(p, 0.0), reverse=True)
            for i, pmid in enumerate(sorted_for_logging[:3]):
                title = full_meta.get(pmid, {}).get('title', 'N/A')
                score = scores.get(pmid, 0.0)
                log.info(f"      {i+1}. [{pmid}] (Score: {score:.3f}) {title[:85]}")
        # --- END ENHANCEMENT ---

        history.append({"depth": depth, "kept": len(kept), **metrics})
        if not kept: break
        visited.update(kept); frontier = set(kept)
    
    return visited, pd.DataFrame(history)

# =================================================================================
# Main Execution Block
# =================================================================================
def run_all_clusters_comparison():
    if CFG['DEVICE'] == 'cuda': torch.cuda.empty_cache()
    log.info(f"Loading embedding model: {CFG['MODEL_ID']} onto {CFG['DEVICE']}...")
    model = SentenceTransformer(CFG['MODEL_ID'], device=CFG['DEVICE'])
    log.info("Model loaded successfully.")
    try:
        with open(CFG["CLUSTERS_PATH"], "r") as f: clusters = {int(k): v for k, v in json.load(f).items()}
        log.info(f"Successfully loaded {len(clusters)} clusters.")
    except Exception as e:
        log.error(f"FATAL: Could not load clusters file. Error: {e}"); return
        
    # --- ADD THESE LINES TO FILTER THE CLUSTERS ---
    # 1. Define a list of the cluster IDs you want to run
    clusters_to_run = [8, 4] # Example: Just run clusters 8, 4, and 16

    # 2. Filter the main dictionary to only include those IDs
    clusters = {cid: pmids for cid, pmids in clusters.items() if cid in clusters_to_run}
    log.info(f"--> Running on a subset of {len(clusters)} clusters: {clusters_to_run}")
    # --- END OF ADDITION ---

    comparison_results = []
    for cid, pmids in sorted(clusters.items()):
        if not pmids: continue
        
        # Run with TF-IDF
        visited_tfidf, df_tfidf = run_downstream_for_cluster(cid, pmids, 'tfidf')
        
        # Run with Embeddings
        visited_emb, df_emb = run_downstream_for_cluster(cid, pmids, 'embedding', model)
        
        # --- Quantitative Comparison ---
        intersection_size = len(visited_tfidf.intersection(visited_emb))
        union_size = len(visited_tfidf.union(visited_emb))
        jaccard = intersection_size / union_size if union_size > 0 else 0
        
        # --- CORRECTED & ROBUST SUMMARY LOGIC ---
        depth_tfidf = df_tfidf['depth'].max() if not df_tfidf.empty else 0
        depth_emb = df_emb['depth'].max() if not df_emb.empty else 0
        
        final_cohesion_tfidf = df_tfidf['cohesion'].dropna().iloc[-1] if not df_tfidf.empty and not df_tfidf['cohesion'].dropna().empty else np.nan
        final_cohesion_emb = df_emb['cohesion'].dropna().iloc[-1] if not df_emb.empty and not df_emb['cohesion'].dropna().empty else np.nan
        
        comparison_results.append({
            "cluster_id": cid,
            "seeds": len(pmids),
            "kept_tfidf": len(visited_tfidf) - len(pmids),
            "kept_emb": len(visited_emb) - len(pmids),
            "depth_tfidf": depth_tfidf,
            "depth_emb": depth_emb,
            "final_cohesion_tfidf": final_cohesion_tfidf,
            "final_cohesion_emb": final_cohesion_emb,
            "jaccard_similarity": jaccard,
        })
        # --- END CORRECTION ---

    # --- Print Final Summary ---
    summary_df = pd.DataFrame(comparison_results).fillna(0)
    summary_df = summary_df.sort_values(by='jaccard_similarity', ascending=True)
    
    print("\n\n" + "="*80); print(" " * 20 + "HEAD-TO-HEAD RIPPLE COMPARISON"); print("="*80)
    print(summary_df.to_string(index=False, float_format="%.3f"))
    
    if CFG["WRITE_CSV"]:
        summary_df.to_csv(f"{CFG['OUT_PREFIX']}_summary.csv", index=False)
        log.info(f"Comparison results saved to '{CFG['OUT_PREFIX']}_summary.csv'")
    
    if CFG['DEVICE'] == 'cuda':
        log.info("Clearing CUDA cache after analysis..."); del model; torch.cuda.empty_cache()
    log.info("Head-to-head comparison complete.")

# --- Run the definitive analysis ---
run_all_clusters_comparison()

[16:22:36] INFO: Loading embedding model: Qwen/Qwen3-Embedding-0.6B onto cuda...
[16:22:36] INFO: Load pretrained SentenceTransformer: Qwen/Qwen3-Embedding-0.6B
[16:22:44] INFO: 1 prompt is loaded, with the key: query
[16:22:44] INFO: Model loaded successfully.
[16:22:44] INFO: Successfully loaded 19 clusters.
[16:22:44] INFO: --> Running on a subset of 2 clusters: [8, 4]
[16:22:44] INFO: --- [TFIDF] Starting ripple for Cluster 4 ---
[16:22:45] INFO:   > Depth 1 (TFIDF): Kept=14 | Median Age=4.3 yrs | Cohesion=0.088
[16:22:45] INFO:     Top 3 semantic picks for this layer:
[16:22:45] INFO:       1. [40399137] (Score: 0.286) Comparison of gastric volumes using ultrasound in term pregnant women with and withou
[16:22:45] INFO:       2. [29230709] (Score: 0.283) Point-of-care gastric ultrasound and aspiration risk assessment: a narrative review.
[16:22:45] INFO:       3. [30579409] (Score: 0.254) Term pregnant patients have similar gastric volume to non-pregnant females: a single-
[16:22:



                    HEAD-TO-HEAD RIPPLE COMPARISON
 cluster_id  seeds  kept_tfidf  kept_emb  depth_tfidf  depth_emb  final_cohesion_tfidf  final_cohesion_emb  jaccard_similarity
          4      8          66        66            3          3                 0.073               0.636               1.000
          8     12          72        72            4          4                 0.063               0.591               1.000


In [None]:
# %% [DEFINITIVE 4-Way Benchmark with VRAM-Safe Embedding]
# This script performs the full 4-way comparison using a robust, length-bucketed
# embedding engine to prevent VRAM overflow when processing title + abstract.

import os
import time
import json
import logging
from collections import Counter, defaultdict
from typing import Dict, List, Any, Set
from datetime import datetime
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
import requests
import torch
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# =================================================================================
# >> CONFIGURATION <<
# =================================================================================
CFG = dict(
    CLUSTERS_PATH="clusters_snapshot.json",
    ENTREZ_EMAIL="you@example.com",
    NCBI_API_KEY=os.environ.get("NCBI_API_KEY"),
    MODEL_ID="Qwen/Qwen3-Embedding-0.6B",
    DEVICE="cuda" if torch.cuda.is_available() else "cpu",
    SUPPORT_MIN=1,
    GLOBAL_ABSOLUTE_CAP=100,
    MAX_DEPTH=15,
    WRITE_CSV=True,
    OUT_PREFIX="4way_comparison_results_final",
    # --- NEW: Length-bucketing configuration ---
    BUCKET_QUANTILES = [0.50, 0.80, 0.95, 1.00],
    BUCKET_BATCH_SIZES = [12,   10,    5,    1],
)

# =================================================================================
# Core Setup and Data Fetching
# =================================================================================
def _setup_logging():
    fmt = "[%(asctime)s] %(levelname)s: %(message)s"; datefmt = "%H:%M:%S"
    logging.basicConfig(level=logging.INFO, format=fmt, datefmt=datefmt, force=True)
    return logging.getLogger("4WayBenchmark")
log = _setup_logging()
ICACHE, TCACHE, EMBEDDING_CACHE = {}, {}, {}

def efetch_with_abstracts(pmids: List[int]):
    pmids_to_fetch = [p for p in set(pmids) if p not in TCACHE or 'abstract' not in TCACHE.get(p, {})]
    if not pmids_to_fetch: return
    log.info(f"    > Fetching title+abstract for {len(pmids_to_fetch)} PMIDs via EFetch...")
    base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    for i in range(0, len(pmids_to_fetch), 200):
        sub = pmids_to_fetch[i:i+200]
        params = {"db": "pubmed", "id": ",".join(str(x) for x in sub), "retmode": "xml", "email": CFG["ENTREZ_EMAIL"]}
        if CFG["NCBI_API_KEY"]: params["api_key"] = CFG["NCBI_API_KEY"]
        try:
            r = requests.get(base, params=params, timeout=90)
            r.raise_for_status()
            root = ET.fromstring(r.text)
            for art in root.findall(".//PubmedArticle"):
                pmid_el = art.find(".//PMID")
                if pmid_el is None: continue
                pid = int(pmid_el.text.strip())
                title_el = art.find(".//ArticleTitle")
                title = "".join(title_el.itertext()).strip() if title_el is not None else ""
                abs_parts = [a.text.strip() for a in art.findall(".//Abstract/AbstractText") if a.text]
                abstract = " ".join(abs_parts)
                TCACHE[pid] = TCACHE.get(pid, {})
                TCACHE[pid].update({"title": title, "abstract": abstract})
        except Exception as e: log.error(f"      ! EFetch failed: {e}")

def icite_fetch_missing(pmids: List[int]):
    pmids_to_fetch = [p for p in set(pmids) if p not in ICACHE]
    if not pmids_to_fetch: return
    log.debug(f"Fetching citation links for {len(pmids_to_fetch)} PMIDs via iCite...")
    for i in range(0, len(pmids_to_fetch), 200):
        try:
            r = requests.get("https://icite.od.nih.gov/api/pubs", params={"pmids": ",".join(map(str, pmids_to_fetch[i:i+200])), "format": "json"}, timeout=90)
            r.raise_for_status()
            for rec in r.json().get("data", []):
                if rec.get("pmid"): ICACHE[rec["pmid"]] = {"cited_by": rec.get("cited_by") or [], "references": rec.get("references") or []}
        except requests.RequestException as e: log.error(f"iCite request failed: {e}")

def get_full_meta(pmids: List[int]) -> Dict[int, Dict[str, Any]]:
    pmids = list(set(pmids))
    icite_fetch_missing(pmids); efetch_with_abstracts(pmids)
    return {pmid: {**TCACHE.get(pmid, {}), **ICACHE.get(pmid, {})} for pmid in pmids}

def _get_full_text(pmid: int, meta: dict) -> str:
    rec = meta.get(pmid, {})
    title = rec.get('title', '').strip()
    abstract = rec.get('abstract', '').strip()
    return f"{title}\n\n{abstract}".strip()

# =================================================================================
# NEW: VRAM-Safe Embedding Engine
# =================================================================================
def _estimate_token_lengths(texts: List[str], tokenizer) -> np.ndarray:
    # A simplified length estimator
    return np.array([len(text.split()) for text in texts])

def _get_embeddings_bucketed(pmids: set, meta: dict, model: SentenceTransformer, tokenizer) -> dict:
    pmids_to_embed = [p for p in pmids if p not in EMBEDDING_CACHE]
    if not pmids_to_embed: return {p: EMBEDDING_CACHE[p] for p in pmids}
    
    log.info(f"    > Embedding {len(pmids_to_embed)} papers with length-bucketing...")
    texts_to_embed = [f"passage: {_get_full_text(p, meta)}" for p in pmids_to_embed]
    
    lengths = _estimate_token_lengths(texts_to_embed, tokenizer)
    sorted_indices = np.argsort(lengths)
    
    # Define buckets based on length quantiles
    bucket_edges = np.quantile(lengths, CFG['BUCKET_QUANTILES']).tolist()
    
    new_embeddings = np.zeros((len(pmids_to_embed), model.get_sentence_embedding_dimension()), dtype=np.float32)

    sorted_texts = [texts_to_embed[i] for i in sorted_indices]
    
    start_idx = 0
    for i, edge in enumerate(bucket_edges):
        batch_size = CFG['BUCKET_BATCH_SIZES'][i]
        # Find all texts that fall into this length bucket
        end_idx = start_idx
        while end_idx < len(lengths) and lengths[sorted_indices[end_idx]] <= edge:
            end_idx += 1
        
        bucket_texts = sorted_texts[start_idx:end_idx]
        if not bucket_texts: continue
            
        # Embed this bucket in smaller batches
        bucket_embs = model.encode(
            bucket_texts, batch_size=batch_size, normalize_embeddings=True,
            show_progress_bar=False, device=CFG['DEVICE']
        )
        # Place results back in original order
        original_indices_for_bucket = sorted_indices[start_idx:end_idx]
        new_embeddings[original_indices_for_bucket] = bucket_embs

        start_idx = end_idx

    for pmid, emb in zip(pmids_to_embed, new_embeddings):
        EMBEDDING_CACHE[pmid] = emb
        
    return {p: EMBEDDING_CACHE[p] for p in pmids}

# =================================================================================
# Scoring Engines
# =================================================================================
def _get_scores(method: str, seed_pmids: set, candidates: set, frontier: set, meta: dict, model: SentenceTransformer = None, tokenizer=None):
    if method == 'embedding':
        seed_embs_dict = _get_embeddings_bucketed(seed_pmids, meta, model, tokenizer)
        cand_embs_dict = _get_embeddings_bucketed(candidates, meta, model, tokenizer)
        seed_vectors = np.array(list(seed_embs_dict.values()))
        cand_vectors = np.array(list(cand_embs_dict.values()))
        seed_centroid = np.mean(seed_vectors, axis=0, keepdims=True)
        scores_array = (cand_vectors @ seed_centroid.T).flatten()
        return dict(zip(candidates, scores_array))
    elif method == 'tfidf':
        seed_texts = [_get_full_text(p, meta) for p in seed_pmids]
        candidate_texts = [_get_full_text(p, meta) for p in candidates]
        vectorizer = TfidfVectorizer(stop_words='english', max_features=50000, lowercase=True)
        X = vectorizer.fit_transform(seed_texts + candidate_texts)
        X_seed, X_candidates = X[:len(seed_texts)], X[len(seed_texts):]
        seed_centroid = np.asarray(X_seed.mean(axis=0)); norm = np.linalg.norm(seed_centroid)
        if norm > 0: seed_centroid /= norm
        scores_array = X_candidates.dot(seed_centroid.T).flatten()
        return dict(zip(candidates, scores_array))
    elif method == 'bc':
        frontier_refs = set().union(*(set(meta.get(p,{}).get('references',[])) for p in frontier))
        return {p: len(set(meta.get(p,{}).get('references',[])).intersection(frontier_refs)) for p in candidates}
    elif method == 'rrf':
        tfidf_scores = _get_scores('tfidf', seed_pmids, candidates, frontier, meta)
        bc_scores = _get_scores('bc', seed_pmids, candidates, frontier, meta)
        # RRF logic
        def get_ranks(s): return {p: i + 1 for i, (p, _) in enumerate(sorted(s.items(), key=lambda item: item[1], reverse=True))}
        tfidf_ranks, bc_ranks = get_ranks(tfidf_scores), get_ranks(bc_scores)
        all_pmids = set(tfidf_ranks.keys()) | set(bc_ranks.keys())
        rrf = defaultdict(float)
        for p in all_pmids:
            rrf[p] += 1 / (60 + tfidf_ranks.get(p, len(tfidf_ranks) + 1))
            rrf[p] += 1 / (60 + bc_ranks.get(p, len(bc_ranks) + 1))
        return dict(rrf)
    return {}

# =================================================================================
# Main Ripple Logic
# =================================================================================
def run_downstream_for_cluster(cluster_id: int, seed_pmids: List[int], method: str, model: SentenceTransformer = None, tokenizer=None):
    log.info(f"--- [{method.upper()}] Starting ripple for Cluster {cluster_id} ---")
    visited, frontier, depth = set(seed_pmids), set(seed_pmids), 0
    
    while frontier and depth < CFG["MAX_DEPTH"]:
        depth += 1
        support_counts = Counter()
        icite_fetch_missing(list(frontier))
        for pmid in frontier: support_counts.update(ICACHE.get(pmid, {}).get("cited_by", []))
        
        discovered = {p for p, count in support_counts.items() if p not in visited and count >= CFG["SUPPORT_MIN"]}
        
        if not discovered:
            log.info(f"  > [{method.upper()}] Ripple halted at depth {depth}.")
            break

        # --- FIX: Convert seed_pmids to a set before the union operation ---
        all_pmids_to_fetch = list(set(seed_pmids) | discovered | frontier)
        full_meta = get_full_meta(all_pmids_to_fetch)
        
        scores = _get_scores(method, set(seed_pmids), discovered, frontier, full_meta, model, tokenizer)
        
        if len(discovered) > CFG['GLOBAL_ABSOLUTE_CAP']:
            kept = sorted(list(discovered), key=lambda p: scores.get(p, 0), reverse=True)[:CFG['GLOBAL_ABSOLUTE_CAP']]
        else:
            kept = list(discovered)

        if not kept: break
        visited.update(kept); frontier = set(kept)
    
    return visited

# =================================================================================
# Main Execution Block
# =================================================================================
def run_4way_comparison():
    if CFG['DEVICE'] == 'cuda': torch.cuda.empty_cache()
    log.info(f"Loading embedding model: {CFG['MODEL_ID']} onto {CFG['DEVICE']}...")
    model = SentenceTransformer(CFG['MODEL_ID'], device=CFG['DEVICE'])
    tokenizer = model.tokenizer
    log.info("Model loaded successfully.")
    try:
        with open(CFG["CLUSTERS_PATH"], "r") as f: clusters = {int(k): v for k, v in json.load(f).items()}
        log.info(f"Successfully loaded {len(clusters)} clusters.")
    except Exception as e: log.error(f"FATAL: Could not load clusters file. Error: {e}"); return

    comparison_results = []
    for cid, pmids in sorted(clusters.items()):
        if not pmids: continue
        
        visited_emb = run_downstream_for_cluster(cid, pmids, 'embedding', model, tokenizer)
        visited_tfidf = run_downstream_for_cluster(cid, pmids, 'tfidf')
        visited_bc = run_downstream_for_cluster(cid, pmids, 'bc')
        visited_rrf = run_downstream_for_cluster(cid, pmids, 'rrf')
        
        def jaccard(a,b): union=len(a|b); return len(a&b)/union if union>0 else 0

        comparison_results.append({
            "cluster_id": cid, "seeds": len(pmids),
            "total_emb": len(visited_emb), "total_tfidf": len(visited_tfidf),
            "total_bc": len(visited_bc), "total_rrf": len(visited_rrf),
            "jaccard_tfidf_vs_emb": jaccard(visited_tfidf, visited_emb),
            "jaccard_bc_vs_emb": jaccard(visited_bc, visited_emb),
            "jaccard_rrf_vs_emb": jaccard(visited_rrf, visited_emb),
        })

    summary_df = pd.DataFrame(comparison_results).sort_values(by='jaccard_rrf_vs_emb', ascending=False)
    print("\n\n" + "="*80); print(" " * 20 + "4-WAY RIPPLE METHOD COMPARISON"); print("="*80)
    print(summary_df.to_string(index=False, float_format="%.3f"))
    
    if CFG["WRITE_CSV"]:
        summary_df.to_csv(f"{CFG['OUT_PREFIX']}_summary.csv", index=False)
        log.info(f"Comparison results saved to '{CFG['OUT_PREFIX']}_summary.csv'")
    
    if CFG['DEVICE'] == 'cuda':
        log.info("Clearing CUDA cache after analysis..."); del model; torch.cuda.empty_cache()
    log.info("Head-to-head comparison complete.")

# --- Run the definitive analysis ---
run_4way_comparison()

[16:52:20] INFO: Loading embedding model: Qwen/Qwen3-Embedding-0.6B onto cuda...
[16:52:20] INFO: Load pretrained SentenceTransformer: Qwen/Qwen3-Embedding-0.6B
[16:52:32] INFO: 1 prompt is loaded, with the key: query
[16:52:32] INFO: Model loaded successfully.
[16:52:32] INFO: Successfully loaded 19 clusters.
[16:52:32] INFO: --- [EMBEDDING] Starting ripple for Cluster 0 ---
[16:52:33] INFO:     > Fetching title+abstract for 40 PMIDs via EFetch...
[16:52:35] INFO:     > Embedding 4 papers with length-bucketing...
[16:52:36] INFO:     > Embedding 36 papers with length-bucketing...
[16:52:41] INFO:     > Fetching title+abstract for 187 PMIDs via EFetch...
[16:52:44] INFO:     > Embedding 187 papers with length-bucketing...
[16:53:08] INFO:     > Fetching title+abstract for 257 PMIDs via EFetch...
[16:53:18] INFO:     > Embedding 257 papers with length-bucketing...
[16:53:48] INFO:     > Fetching title+abstract for 128 PMIDs via EFetch...
[16:53:57] INFO:     > Embedding 128 papers with 

In [1]:
# %% [DEFINITIVE 4-Way Benchmark: Title Embeddings vs. Full-Text TF-IDF]
# This script compares four methods, with a key optimization:
# - Embeddings now run on TITLES ONLY for speed.
# - TF-IDF continues to run on TITLE + ABSTRACT.
# The run is limited to clusters 4 and 8 for a focused comparison.

import os
import time
import json
import logging
from collections import Counter, defaultdict
from typing import Dict, List, Any, Set
from datetime import datetime
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
import requests
import torch
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# =================================================================================
# >> CONFIGURATION <<
# =================================================================================
CFG = dict(
    CLUSTERS_PATH="clusters_snapshot.json",
    ENTREZ_EMAIL="you@example.com",
    NCBI_API_KEY=os.environ.get("NCBI_API_KEY"),
    MODEL_ID="Qwen/Qwen3-Embedding-0.6B",
    DEVICE="cuda" if torch.cuda.is_available() else "cpu",
    SUPPORT_MIN=1,
    GLOBAL_ABSOLUTE_CAP=150,
    MAX_DEPTH=15,
    WRITE_CSV=True,
    OUT_PREFIX="4way_comparison_title_vs_fulltext",
)

# =================================================================================
# Core Setup and Data Fetching
# =================================================================================
def _setup_logging():
    fmt = "[%(asctime)s] %(levelname)s: %(message)s"; datefmt = "%H:%M:%S"
    logging.basicConfig(level=logging.INFO, format=fmt, datefmt=datefmt, force=True)
    return logging.getLogger("4WayBenchmark")
log = _setup_logging()
ICACHE, TCACHE = {}, {}

def efetch_with_abstracts(pmids: List[int]):
    pmids_to_fetch = [p for p in set(pmids) if p not in TCACHE or 'abstract' not in TCACHE.get(p, {})]
    if not pmids_to_fetch: return
    log.info(f"    > Fetching title+abstract for {len(pmids_to_fetch)} PMIDs via EFetch...")
    base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    for i in range(0, len(pmids_to_fetch), 200):
        sub = pmids_to_fetch[i:i+200]
        params = {"db": "pubmed", "id": ",".join(str(x) for x in sub), "retmode": "xml", "email": CFG["ENTREZ_EMAIL"]}
        if CFG["NCBI_API_KEY"]: params["api_key"] = CFG["NCBI_API_KEY"]
        try:
            r = requests.get(base, params=params, timeout=90)
            r.raise_for_status()
            root = ET.fromstring(r.text)
            for art in root.findall(".//PubmedArticle"):
                pmid_el = art.find(".//PMID")
                if pmid_el is None: continue
                pid = int(pmid_el.text.strip())
                title_el = art.find(".//ArticleTitle")
                title = "".join(title_el.itertext()).strip() if title_el is not None else ""
                abs_parts = [a.text.strip() for a in art.findall(".//Abstract/AbstractText") if a.text]
                abstract = " ".join(abs_parts)
                TCACHE[pid] = TCACHE.get(pid, {})
                TCACHE[pid].update({"title": title, "abstract": abstract})
        except Exception as e: log.error(f"      ! EFetch failed: {e}")

def icite_fetch_missing(pmids: List[int]):
    pmids_to_fetch = [p for p in set(pmids) if p not in ICACHE]
    if not pmids_to_fetch: return
    log.debug(f"Fetching citation links for {len(pmids_to_fetch)} PMIDs via iCite...")
    for i in range(0, len(pmids_to_fetch), 200):
        try:
            r = requests.get("https://icite.od.nih.gov/api/pubs", params={"pmids": ",".join(map(str, pmids_to_fetch[i:i+200])), "format": "json"}, timeout=90)
            r.raise_for_status()
            for rec in r.json().get("data", []):
                if rec.get("pmid"): ICACHE[rec["pmid"]] = {"cited_by": rec.get("cited_by") or [], "references": rec.get("references") or []}
        except requests.RequestException as e: log.error(f"iCite request failed: {e}")

def get_full_meta(pmids: List[int]) -> Dict[int, Dict[str, Any]]:
    pmids = list(set(pmids))
    icite_fetch_missing(pmids)
    efetch_with_abstracts(pmids)
    return {pmid: {**TCACHE.get(pmid, {}), **ICACHE.get(pmid, {})} for pmid in pmids}

# =================================================================================
# Scoring Engines
# =================================================================================
def _get_embedding_scores(seed_pmids: set, candidates: set, meta: dict, model: SentenceTransformer) -> dict:
    if not candidates: return {}
    log.info(f"    > Scoring {len(candidates)} candidates with Embeddings (Titles Only)...")
    seed_texts = [f"passage: {meta.get(p, {}).get('title', '')}" for p in seed_pmids]
    candidate_texts = [f"passage: {meta.get(p, {}).get('title', '')}" for p in candidates]
    seed_vectors = model.encode(seed_texts, normalize_embeddings=True, show_progress_bar=False, device=CFG['DEVICE'])
    candidate_vectors = model.encode(candidate_texts, normalize_embeddings=True, show_progress_bar=False, device=CFG['DEVICE'])
    seed_centroid = np.mean(seed_vectors, axis=0, keepdims=True)
    scores_array = (candidate_vectors @ seed_centroid.T).flatten()
    return dict(zip(candidates, scores_array))

def _get_tfidf_scores(seed_pmids: set, candidates: set, meta: dict) -> dict:
    if not candidates: return {}
    log.info(f"    > Scoring {len(candidates)} candidates with TF-IDF (Title+Abstract)...")
    seed_texts = [f"{meta.get(p, {}).get('title', '')}\n\n{meta.get(p, {}).get('abstract', '')}" for p in seed_pmids]
    candidate_texts = [f"{meta.get(p, {}).get('title', '')}\n\n{meta.get(p, {}).get('abstract', '')}" for p in candidates]
    vectorizer = TfidfVectorizer(stop_words='english', max_features=50000, lowercase=True)
    X = vectorizer.fit_transform(seed_texts + candidate_texts)
    X_seed, X_candidates = X[:len(seed_texts)], X[len(seed_texts):]
    seed_centroid = np.asarray(X_seed.mean(axis=0)); norm = np.linalg.norm(seed_centroid)
    if norm > 0: seed_centroid /= norm
    scores_array = X_candidates.dot(seed_centroid.T).flatten()
    return dict(zip(candidates, scores_array))
    
def _get_bc_scores(frontier_pmids: set, candidates: set, meta: dict) -> dict:
    log.info(f"    > Scoring {len(candidates)} candidates with Bibliographic Coupling...")
    frontier_refs = set().union(*(set(meta.get(p,{}).get('references',[])) for p in frontier_pmids))
    if not frontier_refs: return {p: 0 for p in candidates}
    return {p: len(set(meta.get(p,{}).get('references',[])).intersection(frontier_refs)) for p in candidates}

def _get_rrf_scores(tfidf_scores: dict, bc_scores: dict, k: int = 60) -> dict:
    log.info("    > Fusing TF-IDF and BC scores with RRF...")
    def get_ranks(s): return {p: i + 1 for i, (p, _) in enumerate(sorted(s.items(), key=lambda item: item[1], reverse=True))}
    tfidf_ranks, bc_ranks = get_ranks(tfidf_scores), get_ranks(bc_scores)
    all_pmids = set(tfidf_ranks.keys()) | set(bc_ranks.keys())
    rrf = defaultdict(float)
    for p in all_pmids:
        rrf[p] += 1 / (k + tfidf_ranks.get(p, len(tfidf_ranks) + 1))
        rrf[p] += 1 / (k + bc_ranks.get(p, len(bc_ranks) + 1))
    return dict(rrf)

# =================================================================================
# Main Ripple Logic
# =================================================================================
def run_downstream_for_cluster(cluster_id: int, seed_pmids: List[int], method: str, model: SentenceTransformer = None):
    log.info(f"--- [{method.upper()}] Starting ripple for Cluster {cluster_id} ---")
    visited, frontier, depth = set(seed_pmids), set(seed_pmids), 0
    
    while frontier and depth < CFG["MAX_DEPTH"]:
        depth += 1
        support_counts = Counter()
        icite_fetch_missing(list(frontier))
        for pmid in frontier: support_counts.update(ICACHE.get(pmid, {}).get("cited_by", []))
        
        discovered = {p for p, count in support_counts.items() if p not in visited and count >= CFG["SUPPORT_MIN"]}
        if not discovered: log.info(f"  > [{method.upper()}] Ripple halted at depth {depth}."); break

        full_meta = get_full_meta(list(set(seed_pmids) | discovered | frontier))
        
        if method == 'embedding': scores = _get_embedding_scores(set(seed_pmids), discovered, full_meta, model)
        elif method == 'tfidf': scores = _get_tfidf_scores(set(seed_pmids), discovered, full_meta)
        elif method == 'bc': scores = _get_bc_scores(frontier, discovered, full_meta)
        elif method == 'rrf':
            tfidf_scores = _get_tfidf_scores(set(seed_pmids), discovered, full_meta)
            bc_scores = _get_bc_scores(frontier, discovered, full_meta)
            scores = _get_rrf_scores(tfidf_scores, bc_scores)
        else: scores = {}

        if len(discovered) > CFG['GLOBAL_ABSOLUTE_CAP']:
            kept = sorted(list(discovered), key=lambda p: scores.get(p, 0), reverse=True)[:CFG['GLOBAL_ABSOLUTE_CAP']]
        else:
            kept = list(discovered)

        if not kept: break
        visited.update(kept); frontier = set(kept)
    
    return visited, depth

# =================================================================================
# Main Execution Block
# =================================================================================
def run_4way_comparison():
    if CFG['DEVICE'] == 'cuda': torch.cuda.empty_cache()
    log.info(f"Loading embedding model: {CFG['MODEL_ID']} onto {CFG['DEVICE']}...")
    model = SentenceTransformer(CFG['MODEL_ID'], device=CFG['DEVICE'])
    log.info("Model loaded successfully.")
    try:
        with open(CFG["CLUSTERS_PATH"], "r") as f: clusters = {int(k): v for k, v in json.load(f).items()}
        log.info(f"Successfully loaded {len(clusters)} clusters.")
    except Exception as e: log.error(f"FATAL: Could not load clusters file. Error: {e}"); return

    clusters_to_run = [8, 4] # Restrict to specified clusters
    clusters = {cid: pmids for cid, pmids in clusters.items() if cid in clusters_to_run}
    log.info(f"--> Running on a subset of {len(clusters)} clusters: {clusters_to_run}")

    comparison_results = []
    for cid, pmids in sorted(clusters.items()):
        if not pmids: continue
        
        visited_emb, depth_emb = run_downstream_for_cluster(cid, pmids, 'embedding', model)
        visited_tfidf, depth_tfidf = run_downstream_for_cluster(cid, pmids, 'tfidf')
        visited_bc, depth_bc = run_downstream_for_cluster(cid, pmids, 'bc')
        visited_rrf, depth_rrf = run_downstream_for_cluster(cid, pmids, 'rrf')
        
        def jaccard(a,b): union=len(a|b); return len(a&b)/union if union>0 else 0

        comparison_results.append({
            "cluster_id": cid, "seeds": len(pmids),
            "total_emb": len(visited_emb), "depth_emb": depth_emb,
            "total_tfidf": len(visited_tfidf), "depth_tfidf": depth_tfidf,
            "total_bc": len(visited_bc), "depth_bc": depth_bc,
            "total_rrf": len(visited_rrf), "depth_rrf": depth_rrf,
            "jaccard_tfidf_vs_emb": jaccard(visited_tfidf, visited_emb),
            "jaccard_bc_vs_emb": jaccard(visited_bc, visited_emb),
            "jaccard_rrf_vs_emb": jaccard(visited_rrf, visited_emb),
        })

    summary_df = pd.DataFrame(comparison_results).sort_values(by='jaccard_rrf_vs_emb', ascending=False)
    print("\n\n" + "="*80); print(" " * 15 + "4-WAY COMPARISON: TITLE EMBEDDINGS vs. FULL-TEXT TF-IDF"); print("="*80)
    print(summary_df.to_string(index=False, float_format="%.3f"))
    
    if CFG["WRITE_CSV"]:
        summary_df.to_csv(f"{CFG['OUT_PREFIX']}_summary.csv", index=False)
        log.info(f"Comparison results saved to '{CFG['OUT_PREFIX']}_summary.csv'")
    
    if CFG['DEVICE'] == 'cuda':
        log.info("Clearing CUDA cache after analysis..."); del model; torch.cuda.empty_cache()
    log.info("Head-to-head comparison complete.")

# --- Run the definitive analysis ---
run_4way_comparison()

[17:01:25] INFO: Loading embedding model: Qwen/Qwen3-Embedding-0.6B onto cuda...
[17:01:25] INFO: Load pretrained SentenceTransformer: Qwen/Qwen3-Embedding-0.6B
[17:01:34] INFO: 1 prompt is loaded, with the key: query
[17:01:34] INFO: Model loaded successfully.
[17:01:34] INFO: Successfully loaded 19 clusters.
[17:01:34] INFO: --> Running on a subset of 2 clusters: [8, 4]
[17:01:34] INFO: --- [EMBEDDING] Starting ripple for Cluster 4 ---
[17:01:35] INFO:     > Fetching title+abstract for 100 PMIDs via EFetch...
[17:01:37] INFO:     > Scoring 92 candidates with Embeddings (Titles Only)...
[17:01:40] INFO:     > Fetching title+abstract for 494 PMIDs via EFetch...
[17:01:44] INFO:     > Scoring 494 candidates with Embeddings (Titles Only)...
[17:01:50] INFO:     > Fetching title+abstract for 311 PMIDs via EFetch...
[17:01:57] INFO:     > Scoring 382 candidates with Embeddings (Titles Only)...
[17:02:03] INFO:     > Fetching title+abstract for 540 PMIDs via EFetch...
[17:02:12] INFO:     >

KeyboardInterrupt: 

In [1]:
# %% [DEFINITIVE 4-Way Benchmark with Full Detailed Reporting]
# This script compares four methods and restores full visibility by:
# 1. Logging the Top-3 titles and scores at every ripple step for each method.
# 2. Printing a detailed, depth-by-depth history table for each cluster's ripple.

import os
import time
import json
import logging
from collections import Counter, defaultdict
from typing import Dict, List, Any, Set
from datetime import datetime
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
import requests
import torch
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# =================================================================================
# >> CONFIGURATION <<
# =================================================================================
CFG = dict(
    CLUSTERS_PATH="clusters_snapshot.json",
    ENTREZ_EMAIL="you@example.com",
    NCBI_API_KEY=os.environ.get("NCBI_API_KEY"),
    MODEL_ID="Qwen/Qwen3-Embedding-0.6B",
    DEVICE="cuda" if torch.cuda.is_available() else "cpu",
    SUPPORT_MIN=1,
    GLOBAL_ABSOLUTE_CAP=100,
    MAX_DEPTH=15,
    WRITE_CSV=True,
    OUT_PREFIX="4way_comparison_final",
)

# =================================================================================
# Core Setup and Data Fetching
# =================================================================================
def _setup_logging():
    fmt = "[%(asctime)s] %(levelname)s: %(message)s"; datefmt = "%H:%M:%S"
    logging.basicConfig(level=logging.INFO, format=fmt, datefmt=datefmt, force=True)
    return logging.getLogger("4WayBenchmark")
log = _setup_logging()
ICACHE, TCACHE, EMBEDDING_CACHE = {}, {}, {}

def efetch_with_abstracts(pmids: List[int]):
    pmids_to_fetch = [p for p in set(pmids) if p not in TCACHE or 'abstract' not in TCACHE.get(p, {})]
    if not pmids_to_fetch: return
    log.info(f"    > Fetching title+abstract for {len(pmids_to_fetch)} PMIDs via EFetch...")
    base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    for i in range(0, len(pmids_to_fetch), 200):
        sub = pmids_to_fetch[i:i+200]
        params = {"db": "pubmed", "id": ",".join(str(x) for x in sub), "retmode": "xml", "email": CFG["ENTREZ_EMAIL"]}
        if CFG["NCBI_API_KEY"]: params["api_key"] = CFG["NCBI_API_KEY"]
        try:
            r = requests.get(base, params=params, timeout=90)
            r.raise_for_status()
            root = ET.fromstring(r.text)
            for art in root.findall(".//PubmedArticle"):
                pmid_el = art.find(".//PMID")
                if pmid_el is None: continue
                pid = int(pmid_el.text.strip())
                title_el = art.find(".//ArticleTitle")
                title = "".join(title_el.itertext()).strip() if title_el is not None else ""
                abs_parts = [a.text.strip() for a in art.findall(".//Abstract/AbstractText") if a.text]
                abstract = " ".join(abs_parts)
                TCACHE[pid] = TCACHE.get(pid, {})
                TCACHE[pid].update({"title": title, "abstract": abstract})
        except Exception as e: log.error(f"      ! EFetch failed: {e}")

def icite_fetch_missing(pmids: List[int]):
    pmids_to_fetch = [p for p in set(pmids) if p not in ICACHE]
    if not pmids_to_fetch: return
    log.debug(f"Fetching citation links for {len(pmids_to_fetch)} PMIDs via iCite...")
    for i in range(0, len(pmids_to_fetch), 200):
        try:
            r = requests.get("https://icite.od.nih.gov/api/pubs", params={"pmids": ",".join(map(str, pmids_to_fetch[i:i+200])), "format": "json"}, timeout=90)
            r.raise_for_status()
            for rec in r.json().get("data", []):
                if rec.get("pmid"): ICACHE[rec["pmid"]] = {"cited_by": rec.get("cited_by") or [], "references": rec.get("references") or []}
        except requests.RequestException as e: log.error(f"iCite request failed: {e}")

def get_full_meta(pmids: List[int]) -> Dict[int, Dict[str, Any]]:
    pmids = list(set(pmids))
    icite_fetch_missing(pmids)
    efetch_with_abstracts(pmids)
    return {pmid: {**TCACHE.get(pmid, {}), **ICACHE.get(pmid, {})} for pmid in pmids}

def _get_full_text(pmid: int, meta: dict) -> str:
    rec = meta.get(pmid, {})
    title = rec.get('title', '').strip()
    abstract = rec.get('abstract', '').strip()
    return f"{title}\n\n{abstract}".strip()

# =================================================================================
# Scoring Engines
# =================================================================================
def _get_embedding_vectors(pmids: set, meta: dict, model: SentenceTransformer):
    pmids_to_embed = [p for p in pmids if p not in EMBEDDING_CACHE]
    if pmids_to_embed:
        log.info(f"    > Embedding {len(pmids_to_embed)} papers (titles only)...")
        texts_to_embed = [f"passage: {meta.get(p, {}).get('title', '')}" for p in pmids_to_embed]
        new_embeddings = model.encode(texts_to_embed, batch_size=64, normalize_embeddings=True, show_progress_bar=False, device=CFG['DEVICE'])
        for pmid, emb in zip(pmids_to_embed, new_embeddings): EMBEDDING_CACHE[pmid] = emb
    return {p: EMBEDDING_CACHE[p] for p in pmids}

def _get_tfidf_vectors(pmids: set, seed_pmids: set, meta: dict):
    all_pmids = list(seed_pmids | pmids)
    all_texts = [_get_full_text(p, meta) for p in all_pmids]
    vectorizer = TfidfVectorizer(stop_words='english', max_features=50000, lowercase=True)
    X = vectorizer.fit_transform(all_texts)
    
    pmid_to_idx = {pmid: i for i, pmid in enumerate(all_pmids)}
    seed_indices = [pmid_to_idx[p] for p in seed_pmids]
    candidate_indices = [pmid_to_idx[p] for p in pmids]
    
    return X, seed_indices, candidate_indices

def _get_scores(method: str, seed_pmids: set, candidates: set, frontier: set, meta: dict, model: SentenceTransformer = None):
    if method == 'embedding':
        seed_vectors = np.array(list(_get_embedding_vectors(seed_pmids, meta, model).values()))
        candidate_vectors_dict = _get_embedding_vectors(candidates, meta, model)
        candidate_vectors = np.array(list(candidate_vectors_dict.values()))
        seed_centroid = np.mean(seed_vectors, axis=0, keepdims=True)
        scores_array = (candidate_vectors @ seed_centroid.T).flatten()
        return dict(zip(candidates, scores_array))
    elif method == 'tfidf':
        X, seed_indices, candidate_indices = _get_tfidf_vectors(candidates, seed_pmids, meta)
        X_seed, X_candidates = X[seed_indices], X[candidate_indices]
        seed_centroid = np.asarray(X_seed.mean(axis=0)); norm = np.linalg.norm(seed_centroid)
        if norm > 0: seed_centroid /= norm
        scores_array = X_candidates.dot(seed_centroid.T).flatten()
        return dict(zip(candidates, scores_array))
    elif method == 'bc':
        frontier_refs = set().union(*(set(meta.get(p,{}).get('references',[])) for p in frontier))
        return {p: len(set(meta.get(p,{}).get('references',[])).intersection(frontier_refs)) for p in candidates}
    elif method == 'rrf':
        tfidf_scores = _get_scores('tfidf', seed_pmids, candidates, frontier, meta)
        bc_scores = _get_scores('bc', seed_pmids, candidates, frontier, meta)
        def get_ranks(s): return {p: i+1 for i, (p,_) in enumerate(sorted(s.items(), key=lambda item: item[1], reverse=True))}
        tfidf_ranks, bc_ranks = get_ranks(tfidf_scores), get_ranks(bc_scores)
        all_pmids = set(tfidf_ranks.keys()) | set(bc_ranks.keys())
        rrf = defaultdict(float)
        for p in all_pmids:
            rrf[p] += 1 / (60 + tfidf_ranks.get(p, len(tfidf_ranks) + 1))
            rrf[p] += 1 / (60 + bc_ranks.get(p, len(bc_ranks) + 1))
        return dict(rrf)
    return {}

# =================================================================================
# Main Ripple Logic with Detailed Reporting
# =================================================================================
def run_downstream_for_cluster(cluster_id: int, seed_pmids: List[int], method: str, model: SentenceTransformer = None):
    log.info(f"--- [{method.upper()}] Starting ripple for Cluster {cluster_id} ---")
    visited, frontier, history, depth = set(seed_pmids), set(seed_pmids), [], 0
    
    while frontier and depth < CFG["MAX_DEPTH"]:
        depth += 1
        support_counts = Counter()
        icite_fetch_missing(list(frontier))
        for pmid in frontier: support_counts.update(ICACHE.get(pmid, {}).get("cited_by", []))
        
        discovered = {p for p, count in support_counts.items() if p not in visited and count >= CFG["SUPPORT_MIN"]}
        if not discovered:
            log.info(f"  > [{method.upper()}] Ripple halted at depth {depth} (no new supported papers).")
            break

        full_meta = get_full_meta(list(set(seed_pmids) | discovered | frontier))
        scores = _get_scores(method, set(seed_pmids), discovered, frontier, full_meta, model)

        if len(discovered) > CFG['GLOBAL_ABSOLUTE_CAP']:
            kept = sorted(list(discovered), key=lambda p: scores.get(p, 0), reverse=True)[:CFG['GLOBAL_ABSOLUTE_CAP']]
        else:
            kept = list(discovered)

        # --- RESTORED: Detailed Title Logging ---
        if kept:
            log.info(f"  > Depth {depth} ({method.upper()}): Kept={len(kept)}")
            log.info(f"    Top 3 semantic picks for this layer:")
            sorted_for_logging = sorted(kept, key=lambda p: scores.get(p, 0.0), reverse=True)
            for i, pmid in enumerate(sorted_for_logging[:3]):
                title = full_meta.get(pmid, {}).get('title', 'N/A')
                score = scores.get(pmid, 0.0)
                log.info(f"      {i+1}. [{pmid}] (Score: {score:.3f}) {title[:85]}")
        
        # Omitted detailed per-step metrics for speed, focus is on the final sets
        if not kept:
            log.info(f"  > [{method.upper()}] Ripple halted at depth {depth} (no papers kept after cap).")
            break
        visited.update(kept); frontier = set(kept)
    
    return visited, depth

# =================================================================================
# Main Execution Block
# =================================================================================
def run_4way_comparison():
    if CFG['DEVICE'] == 'cuda': torch.cuda.empty_cache()
    log.info(f"Loading embedding model: {CFG['MODEL_ID']} onto {CFG['DEVICE']}...")
    model = SentenceTransformer(CFG['MODEL_ID'], device=CFG['DEVICE'])
    log.info("Model loaded successfully.")
    try:
        with open(CFG["CLUSTERS_PATH"], "r") as f: clusters = {int(k): v for k, v in json.load(f).items()}
        log.info(f"Successfully loaded {len(clusters)} clusters.")
    except Exception as e: log.error(f"FATAL: Could not load clusters file. Error: {e}"); return

    clusters_to_run = [8, 4]
    clusters = {cid: pmids for cid, pmids in clusters.items() if cid in clusters_to_run}
    log.info(f"--> Running on a subset of {len(clusters)} clusters: {clusters_to_run}")

    comparison_results = []
    for cid, pmids in sorted(clusters.items()):
        if not pmids: continue
        
        visited_emb, depth_emb = run_downstream_for_cluster(cid, pmids, 'embedding', model)
        visited_tfidf, depth_tfidf = run_downstream_for_cluster(cid, pmids, 'tfidf')
        visited_bc, depth_bc = run_downstream_for_cluster(cid, pmids, 'bc')
        visited_rrf, depth_rrf = run_downstream_for_cluster(cid, pmids, 'rrf')
        
        def jaccard(a,b): union=len(a|b); return len(a&b)/union if union>0 else 0

        comparison_results.append({
            "cluster_id": cid, "seeds": len(pmids),
            "total_emb": len(visited_emb), "depth_emb": depth_emb,
            "total_tfidf": len(visited_tfidf), "depth_tfidf": depth_tfidf,
            "total_bc": len(visited_bc), "depth_bc": depth_bc,
            "total_rrf": len(visited_rrf), "depth_rrf": depth_rrf,
            "jaccard_tfidf_vs_emb": jaccard(visited_tfidf, visited_emb),
            "jaccard_bc_vs_emb": jaccard(visited_bc, visited_emb),
            "jaccard_rrf_vs_emb": jaccard(visited_rrf, visited_emb),
        })

    summary_df = pd.DataFrame(comparison_results).sort_values(by='jaccard_rrf_vs_emb', ascending=False)
    print("\n\n" + "="*80); print(" " * 15 + "4-WAY COMPARISON: TITLE EMBEDDINGS vs. FULL-TEXT TF-IDF"); print("="*80)
    print(summary_df.to_string(index=False, float_format="%.3f"))
    
    if CFG["WRITE_CSV"]:
        summary_df.to_csv(f"{CFG['OUT_PREFIX']}_summary.csv", index=False)
        log.info(f"Comparison results saved to '{CFG['OUT_PREFIX']}_summary.csv'")
    
    if CFG['DEVICE'] == 'cuda':
        log.info("Clearing CUDA cache after analysis..."); del model; torch.cuda.empty_cache()
    log.info("Head-to-head comparison complete.")

# --- Run the definitive analysis ---
run_4way_comparison()

[17:07:38] INFO: Loading embedding model: Qwen/Qwen3-Embedding-0.6B onto cuda...
[17:07:38] INFO: Load pretrained SentenceTransformer: Qwen/Qwen3-Embedding-0.6B
[17:07:46] INFO: 1 prompt is loaded, with the key: query
[17:07:46] INFO: Model loaded successfully.
[17:07:46] INFO: Successfully loaded 19 clusters.
[17:07:46] INFO: --> Running on a subset of 2 clusters: [8, 4]
[17:07:46] INFO: --- [EMBEDDING] Starting ripple for Cluster 4 ---
[17:07:46] INFO:     > Fetching title+abstract for 100 PMIDs via EFetch...
[17:07:48] INFO:     > Embedding 8 papers (titles only)...
[17:07:48] INFO:     > Embedding 92 papers (titles only)...
[17:07:49] INFO:   > Depth 1 (EMBEDDING): Kept=92
[17:07:49] INFO:     Top 3 semantic picks for this layer:
[17:07:49] INFO:       1. [29655993] (Score: 0.697) Ultrasound assessment of gastric contents at the end of pregnancy.
[17:07:49] INFO:       2. [35319093] (Score: 0.690) Ultrasound assessment of gastric contents in children before general anaesthesia for 



               4-WAY COMPARISON: TITLE EMBEDDINGS vs. FULL-TEXT TF-IDF
 cluster_id  seeds  total_emb  depth_emb  total_tfidf  depth_tfidf  total_bc  depth_bc  total_rrf  depth_rrf  jaccard_tfidf_vs_emb  jaccard_bc_vs_emb  jaccard_rrf_vs_emb
          4      8        826         12          857           12       850        11        843         11                 0.304              0.155               0.251
          8     12        894         13         1026           13      1249        15        898         14                 0.117              0.126               0.142


In [1]:
# %% [DEFINITIVE Benchmark: Title-Only vs. Title+Snippet Embeddings with Enriched Reporting]
# This script compares two embedding strategies (title-only vs. title + abstract snippet)
# to find the optimal balance of speed and semantic quality.
#
# It provides enriched reporting by:
# 1. Logging the Top-5 titles and scores at every ripple step for each method.
# 2. Measuring and reporting the execution time for each strategy.
# 3. Tracking VRAM usage (if CUDA is available) to monitor performance.
# 4. Generating a detailed summary CSV with performance and similarity metrics.

import os
import time
import json
import logging
from collections import Counter
from typing import Dict, List, Any
from datetime import datetime
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
import requests
import torch
from sentence_transformers import SentenceTransformer

# =================================================================================
# >> CONFIGURATION <<
# =================================================================================
CFG = dict(
    CLUSTERS_PATH="clusters_snapshot.json",
    ENTREZ_EMAIL="you@example.com",
    NCBI_API_KEY=os.environ.get("NCBI_API_KEY"),
    MODEL_ID="Qwen/Qwen3-Embedding-0.6B",
    DEVICE="cuda" if torch.cuda.is_available() else "cpu",
    SUPPORT_MIN=1,
    GLOBAL_ABSOLUTE_CAP=100,
    MAX_DEPTH=15,
    ABSTRACT_SNIPPET_LENGTH=300, # How many characters of the abstract to include
    WRITE_CSV=True,
    OUT_PREFIX="embedding_optimization_final_report",
)

# =================================================================================
# Core Setup, Caching, and Data Fetching
# =================================================================================
def _setup_logging():
    fmt = "[%(asctime)s] %(levelname)s: %(message)s"; datefmt = "%H:%M:%S"
    logging.basicConfig(level=logging.INFO, format=fmt, datefmt=datefmt, force=True)
    return logging.getLogger("EmbeddingBenchmark")
log = _setup_logging()
ICACHE, TCACHE, EMBEDDING_CACHE = {}, {}, {}

def _log_vram_usage(step_name: str):
    """Logs the current GPU VRAM usage."""
    if CFG['DEVICE'] != 'cuda': return
    free, total = torch.cuda.mem_get_info()
    used = (total - free) / (1024 ** 3)
    total_gb = total / (1024 ** 3)
    log.info(f"    > VRAM Check ({step_name}): {used:.2f} / {total_gb:.2f} GB used")

def efetch_with_abstracts(pmids: List[int]):
    pmids_to_fetch = [p for p in set(pmids) if p not in TCACHE or 'abstract' not in TCACHE.get(p, {})]
    if not pmids_to_fetch: return
    log.info(f"    > Fetching title+abstract for {len(pmids_to_fetch)} PMIDs via EFetch...")
    base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    for i in range(0, len(pmids_to_fetch), 200):
        sub = pmids_to_fetch[i:i+200]
        params = {"db": "pubmed", "id": ",".join(str(x) for x in sub), "retmode": "xml", "email": CFG["ENTREZ_EMAIL"]}
        if CFG["NCBI_API_KEY"]: params["api_key"] = CFG["NCBI_API_KEY"]
        try:
            r = requests.get(base, params=params, timeout=90)
            r.raise_for_status()
            root = ET.fromstring(r.text)
            for art in root.findall(".//PubmedArticle"):
                pmid_el = art.find(".//PMID")
                if pmid_el is None or pmid_el.text is None: continue
                pid = int(pmid_el.text.strip())
                title_el = art.find(".//ArticleTitle")
                title = "".join(title_el.itertext()).strip() if title_el is not None else ""
                abs_parts = [a.text.strip() for a in art.findall(".//Abstract/AbstractText") if a.text]
                abstract = " ".join(abs_parts)
                TCACHE[pid] = TCACHE.get(pid, {})
                TCACHE[pid].update({"title": title, "abstract": abstract})
        except Exception as e: log.error(f"      ! EFetch failed: {e}")

def icite_fetch_missing(pmids: List[int]):
    pmids_to_fetch = [p for p in set(pmids) if p not in ICACHE]
    if not pmids_to_fetch: return
    log.debug(f"Fetching citation links for {len(pmids_to_fetch)} PMIDs via iCite...")
    for i in range(0, len(pmids_to_fetch), 200):
        try:
            r = requests.get("https://icite.od.nih.gov/api/pubs", params={"pmids": ",".join(map(str, pmids_to_fetch[i:i+200])), "format": "json"}, timeout=90)
            r.raise_for_status()
            for rec in r.json().get("data", []):
                if rec.get("pmid"): ICACHE[rec["pmid"]] = {"cited_by": rec.get("cited_by") or []}
        except requests.RequestException as e: log.error(f"iCite request failed: {e}")

def get_full_meta(pmids: List[int]) -> Dict[int, Dict[str, Any]]:
    pmids = list(set(pmids))
    icite_fetch_missing(pmids)
    efetch_with_abstracts(pmids)
    return {pmid: {**TCACHE.get(pmid, {}), **ICACHE.get(pmid, {})} for pmid in pmids}

def _get_text_for_embedding(pmid: int, meta: dict, text_method: str) -> str:
    """Gets text based on the chosen method."""
    rec = meta.get(pmid, {})
    title = rec.get('title', '').strip()
    if text_method == 'title_only':
        return f"passage: {title}"
    else: # title_plus_snippet
        abstract = rec.get('abstract', '').strip()
        snippet = abstract[:CFG['ABSTRACT_SNIPPET_LENGTH']]
        return f"passage: {title}\n\n{snippet}".strip()

# =================================================================================
# Embedding Engine and Scoring
# =================================================================================
def _get_embeddings(pmids: set, meta: dict, model: SentenceTransformer, text_method: str) -> dict:
    """Computes embeddings, aware of the text method and caching strategy."""
    if text_method not in EMBEDDING_CACHE: EMBEDDING_CACHE[text_method] = {}
    
    pmids_to_embed = [p for p in pmids if p not in EMBEDDING_CACHE[text_method]]
    
    if pmids_to_embed:
        _log_vram_usage(f"Pre-Embed ({text_method})")
        log.info(f"    > Embedding {len(pmids_to_embed)} papers using method: {text_method}...")
        
        texts_to_embed = [_get_text_for_embedding(p, meta, text_method) for p in pmids_to_embed]
        
        new_embeddings = model.encode(
            texts_to_embed, batch_size=32, normalize_embeddings=True,
            show_progress_bar=False, device=CFG['DEVICE']
        )
        
        for pmid, emb in zip(pmids_to_embed, new_embeddings):
            EMBEDDING_CACHE[text_method][pmid] = emb
        _log_vram_usage(f"Post-Embed ({text_method})")
            
    return {p: EMBEDDING_CACHE[text_method][p] for p in pmids}

def _get_semantic_scores(seed_vectors: np.ndarray, candidate_vectors: np.ndarray) -> np.ndarray:
    if seed_vectors.shape[0] == 0 or candidate_vectors.shape[0] == 0: return np.array([])
    seed_centroid = np.mean(seed_vectors, axis=0, keepdims=True)
    return (candidate_vectors @ seed_centroid.T).flatten()

# =================================================================================
# Main Ripple Logic
# =================================================================================
def run_downstream_for_cluster(cluster_id: int, seed_pmids: List[int], text_method: str, model: SentenceTransformer):
    log.info(f"--- [{text_method.upper()}] Starting ripple for Cluster {cluster_id} ---")
    visited, frontier, depth = set(seed_pmids), set(seed_pmids), 0
    
    while frontier and depth < CFG["MAX_DEPTH"]:
        depth += 1
        support_counts = Counter()
        icite_fetch_missing(list(frontier))
        for pmid in frontier: support_counts.update(ICACHE.get(pmid, {}).get("cited_by", []))
        
        discovered = {p for p, count in support_counts.items() if p not in visited and count >= CFG["SUPPORT_MIN"]}
        if not discovered:
            log.info(f"  > [{text_method.upper()}] Ripple halted at depth {depth} (no new supported papers found).")
            break

        full_meta = get_full_meta(list(set(seed_pmids) | discovered))
        
        seed_embs_dict = _get_embeddings(set(seed_pmids), full_meta, model, text_method)
        cand_embs_dict = _get_embeddings(discovered, full_meta, model, text_method)
        
        scores_array = _get_semantic_scores(np.array(list(seed_embs_dict.values())), np.array(list(cand_embs_dict.values())))
        scores = dict(zip(discovered, scores_array))
        
        if len(discovered) > CFG['GLOBAL_ABSOLUTE_CAP']:
            kept = sorted(list(discovered), key=lambda p: scores.get(p, 0), reverse=True)[:CFG['GLOBAL_ABSOLUTE_CAP']]
        else:
            kept = list(discovered)

        # --- ENHANCED: Detailed Top-5 Title and Score Logging ---
        if kept:
            log.info(f"  > Depth {depth} ({text_method.upper()}): Discovered={len(discovered)}, Kept={len(kept)}")
            log.info(f"    Top 5 semantic picks for this layer:")
            sorted_for_logging = sorted(kept, key=lambda p: scores.get(p, 0.0), reverse=True)
            for i, pmid in enumerate(sorted_for_logging[:5]):
                title = full_meta.get(pmid, {}).get('title', 'N/A')
                score = scores.get(pmid, 0.0)
                log.info(f"      {i+1}. [{pmid}] (Score: {score:.4f}) {title}")
        else:
            log.info(f"  > [{text_method.upper()}] Ripple halted at depth {depth} (no papers kept after filtering/capping).")
            break
        
        visited.update(kept); frontier = set(kept)
    
    return visited, depth

# =================================================================================
# Main Execution Block
# =================================================================================
def run_embedding_comparison():
    if CFG['DEVICE'] == 'cuda': torch.cuda.empty_cache()
    log.info(f"Loading embedding model: {CFG['MODEL_ID']} onto {CFG['DEVICE']}...")
    model = SentenceTransformer(CFG['MODEL_ID'], device=CFG['DEVICE'])
    log.info("Model loaded successfully.")
    try:
        with open(CFG["CLUSTERS_PATH"], "r") as f: clusters = {int(k): v for k, v in json.load(f).items()}
        log.info(f"Successfully loaded {len(clusters)} clusters from '{CFG['CLUSTERS_PATH']}'.")
    except Exception as e: log.error(f"FATAL: Could not load clusters file. Error: {e}"); return

    clusters_to_run = [8, 4]
    clusters = {cid: pmids for cid, pmids in clusters.items() if cid in clusters_to_run}
    log.info(f"--> Running benchmark on a subset of {len(clusters)} clusters: {clusters_to_run}")

    comparison_results = []
    total_start_time = time.time()
    for cid, pmids in sorted(clusters.items()):
        if not pmids: continue
        
        # Clear the global embedding cache for a fair, memory-clean comparison on each cluster
        global EMBEDDING_CACHE
        EMBEDDING_CACHE = {}
        
        log.info(f"\n{'='*40}\n>>> Processing Cluster {cid} (Seeds: {len(pmids)}) <<<\n{'='*40}")

        start_time_title = time.time()
        visited_title, depth_title = run_downstream_for_cluster(cid, pmids, 'title_only', model)
        time_title = time.time() - start_time_title

        start_time_snippet = time.time()
        visited_snippet, depth_snippet = run_downstream_for_cluster(cid, pmids, 'title_plus_snippet', model)
        time_snippet = time.time() - start_time_snippet
        
        def calculate_similarity(a,b): 
            intersection = len(a & b)
            union = len(a | b)
            jaccard = intersection / union if union > 0 else 0
            return intersection, jaccard

        intersection_size, jaccard_sim = calculate_similarity(visited_title, visited_snippet)

        comparison_results.append({
            "cluster_id": cid, 
            "seeds": len(pmids),
            "total_title_only": len(visited_title), 
            "depth_title_only": depth_title,
            "time_title_only": time_title,
            "total_snippet": len(visited_snippet), 
            "depth_snippet": depth_snippet,
            "time_snippet": time_snippet,
            "intersection": intersection_size,
            "jaccard_similarity": jaccard_sim,
        })

    summary_df = pd.DataFrame(comparison_results)
    
    # Reorder columns for better readability in the final output
    cols_order = [
        "cluster_id", "seeds", 
        "total_title_only", "depth_title_only", "time_title_only",
        "total_snippet", "depth_snippet", "time_snippet",
        "intersection", "jaccard_similarity"
    ]
    summary_df = summary_df[cols_order].sort_values(by='jaccard_similarity', ascending=True)
    
    print("\n\n" + "="*95)
    print(" " * 20 + "EMBEDDING STRATEGY BENCHMARK: TITLE vs. TITLE+SNIPPET")
    print("="*95)
    
    # Define custom formatters for the final printout
    formatters = {
        'time_title_only': '{:.2f}s'.format,
        'time_snippet': '{:.2f}s'.format,
        'jaccard_similarity': '{:.3f}'.format,
    }
    print(summary_df.to_string(index=False, formatters=formatters))
    print("="*95)
    
    if CFG["WRITE_CSV"]:
        out_path = f"{CFG['OUT_PREFIX']}_summary.csv"
        summary_df.to_csv(out_path, index=False, float_format="%.4f")
        log.info(f"Comparison results saved to '{out_path}'")
    
    if CFG['DEVICE'] == 'cuda':
        log.info("Clearing CUDA cache after analysis..."); del model; torch.cuda.empty_cache()
    
    total_time = time.time() - total_start_time
    log.info(f"Embedding optimization benchmark complete in {total_time:.2f} seconds.")

# --- Run the definitive analysis ---
run_embedding_comparison()

[17:59:37] INFO: Loading embedding model: Qwen/Qwen3-Embedding-0.6B onto cuda...
[17:59:37] INFO: Load pretrained SentenceTransformer: Qwen/Qwen3-Embedding-0.6B
[17:59:45] INFO: 1 prompt is loaded, with the key: query
[17:59:45] INFO: Model loaded successfully.
[17:59:45] INFO: Successfully loaded 19 clusters from 'clusters_snapshot.json'.
[17:59:45] INFO: --> Running benchmark on a subset of 2 clusters: [8, 4]
[17:59:45] INFO: 
>>> Processing Cluster 4 (Seeds: 8) <<<
[17:59:45] INFO: --- [TITLE_ONLY] Starting ripple for Cluster 4 ---
[17:59:46] INFO:     > Fetching title+abstract for 100 PMIDs via EFetch...
[17:59:47] INFO:     > VRAM Check (Pre-Embed (title_only)): 3.26 / 6.00 GB used
[17:59:47] INFO:     > Embedding 8 papers using method: title_only...
[17:59:47] INFO:     > VRAM Check (Post-Embed (title_only)): 3.40 / 6.00 GB used
[17:59:47] INFO:     > VRAM Check (Pre-Embed (title_only)): 3.40 / 6.00 GB used
[17:59:47] INFO:     > Embedding 92 papers using method: title_only...
[1



                    EMBEDDING STRATEGY BENCHMARK: TITLE vs. TITLE+SNIPPET
 cluster_id  seeds  total_title_only  depth_title_only time_title_only  total_snippet  depth_snippet time_snippet  intersection jaccard_similarity
          8     12               894                13          52.86s           1196             15       77.81s           524              0.335
          4      8               826                12          54.31s            848             11       54.69s           466              0.386


In [1]:
# %% [DEFINITIVE Benchmark: RRF-Enhanced Embedding Strategies]
# This script compares two embedding strategies (title-only vs. title + snippet)
# using a Reciprocal Rank Fusion (RRF) scoring model.
#
# The scoring model combines two signals:
# 1. Semantic Similarity: Cosine similarity to the seed cluster's centroid.
# 2. Citation Support: The number of citations received from the current frontier.
#
# This hybrid approach aims to find papers that are both thematically relevant
# and bibliographically important.

import os
import time
import json
import logging
from collections import Counter
from typing import Dict, List, Any
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
import requests
import torch
from sentence_transformers import SentenceTransformer

# =================================================================================
# >> CONFIGURATION <<
# =================================================================================
CFG = dict(
    CLUSTERS_PATH="clusters_snapshot.json",
    ENTREZ_EMAIL="you@example.com",
    NCBI_API_KEY=os.environ.get("NCBI_API_KEY"),
    MODEL_ID="Qwen/Qwen3-Embedding-0.6B",
    DEVICE="cuda" if torch.cuda.is_available() else "cpu",
    SUPPORT_MIN=1,
    GLOBAL_ABSOLUTE_CAP=100,
    MAX_DEPTH=15,
    RRF_K=60, # Reciprocal Rank Fusion constant
    ABSTRACT_SNIPPET_LENGTH=300,
    WRITE_CSV=True,
    OUT_PREFIX="embedding_support_rrf_comparison",
)

# =================================================================================
# Core Setup, Caching, and Data Fetching
# =================================================================================
def _setup_logging():
    fmt = "[%(asctime)s] %(levelname)s: %(message)s"; datefmt = "%H:%M:%S"
    logging.basicConfig(level=logging.INFO, format=fmt, datefmt=datefmt, force=True)
    return logging.getLogger("RRFBenchmark")
log = _setup_logging()
ICACHE, TCACHE, EMBEDDING_CACHE = {}, {}, {}

def _log_vram_usage(step_name: str):
    """Logs the current GPU VRAM usage."""
    if CFG['DEVICE'] != 'cuda': return
    free, total = torch.cuda.mem_get_info()
    used = (total - free) / (1024 ** 3)
    total_gb = total / (1024 ** 3)
    log.info(f"    > VRAM Check ({step_name}): {used:.2f} / {total_gb:.2f} GB used")

def efetch_with_abstracts(pmids: List[int]):
    pmids_to_fetch = [p for p in set(pmids) if p not in TCACHE or 'abstract' not in TCACHE.get(p, {})]
    if not pmids_to_fetch: return
    log.info(f"    > Fetching title+abstract for {len(pmids_to_fetch)} PMIDs via EFetch...")
    base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    for i in range(0, len(pmids_to_fetch), 200):
        sub = pmids_to_fetch[i:i+200]
        params = {"db": "pubmed", "id": ",".join(str(x) for x in sub), "retmode": "xml", "email": CFG["ENTREZ_EMAIL"]}
        if CFG["NCBI_API_KEY"]: params["api_key"] = CFG["NCBI_API_KEY"]
        try:
            r = requests.get(base, params=params, timeout=90)
            r.raise_for_status()
            root = ET.fromstring(r.text)
            for art in root.findall(".//PubmedArticle"):
                pmid_el = art.find(".//PMID")
                if pmid_el is None or pmid_el.text is None: continue
                pid = int(pmid_el.text.strip())
                title_el = art.find(".//ArticleTitle")
                title = "".join(title_el.itertext()).strip() if title_el is not None else ""
                abs_parts = [a.text.strip() for a in art.findall(".//Abstract/AbstractText") if a.text]
                abstract = " ".join(abs_parts)
                TCACHE[pid] = TCACHE.get(pid, {})
                TCACHE[pid].update({"title": title, "abstract": abstract})
        except Exception as e: log.error(f"      ! EFetch failed: {e}")

def icite_fetch_missing(pmids: List[int]):
    pmids_to_fetch = [p for p in set(pmids) if p not in ICACHE]
    if not pmids_to_fetch: return
    log.debug(f"Fetching citation links for {len(pmids_to_fetch)} PMIDs via iCite...")
    for i in range(0, len(pmids_to_fetch), 200):
        try:
            r = requests.get("https://icite.od.nih.gov/api/pubs", params={"pmids": ",".join(map(str, pmids_to_fetch[i:i+200])), "format": "json"}, timeout=90)
            r.raise_for_status()
            for rec in r.json().get("data", []):
                if rec.get("pmid"): ICACHE[rec["pmid"]] = {"cited_by": rec.get("cited_by") or []}
        except requests.RequestException as e: log.error(f"iCite request failed: {e}")

def get_full_meta(pmids: List[int]) -> Dict[int, Dict[str, Any]]:
    pmids = list(set(pmids))
    icite_fetch_missing(pmids)
    efetch_with_abstracts(pmids)
    return {pmid: {**TCACHE.get(pmid, {}), **ICACHE.get(pmid, {})} for pmid in pmids}

def _get_text_for_embedding(pmid: int, meta: dict, text_method: str) -> str:
    rec = meta.get(pmid, {})
    title = rec.get('title', '').strip()
    if text_method == 'title_only':
        return f"passage: {title}"
    else:
        abstract = rec.get('abstract', '').strip()
        snippet = abstract[:CFG['ABSTRACT_SNIPPET_LENGTH']]
        return f"passage: {title}\n\n{snippet}".strip()

# =================================================================================
# Scoring Engine with RRF
# =================================================================================
def _get_embeddings(pmids: set, meta: dict, model: SentenceTransformer, text_method: str) -> dict:
    if text_method not in EMBEDDING_CACHE: EMBEDDING_CACHE[text_method] = {}
    pmids_to_embed = [p for p in pmids if p not in EMBEDDING_CACHE[text_method]]
    if pmids_to_embed:
        _log_vram_usage(f"Pre-Embed ({text_method})")
        log.info(f"    > Embedding {len(pmids_to_embed)} papers ({text_method})...")
        texts_to_embed = [_get_text_for_embedding(p, meta, text_method) for p in pmids_to_embed]
        new_embeddings = model.encode(texts_to_embed, batch_size=32, normalize_embeddings=True, show_progress_bar=False, device=CFG['DEVICE'])
        for pmid, emb in zip(pmids_to_embed, new_embeddings):
            EMBEDDING_CACHE[text_method][pmid] = emb
        _log_vram_usage(f"Post-Embed ({text_method})")
    return {p: EMBEDDING_CACHE[text_method][p] for p in pmids}

def _get_semantic_scores(seed_vectors: np.ndarray, candidate_vectors: np.ndarray) -> np.ndarray:
    if seed_vectors.shape[0] == 0 or candidate_vectors.shape[0] == 0: return np.array([])
    seed_centroid = np.mean(seed_vectors, axis=0, keepdims=True)
    return (candidate_vectors @ seed_centroid.T).flatten()

def _get_rrf_scores(semantic_scores: Dict[int, float], support_counts: Counter, discovered_pmids: set) -> Dict[int, float]:
    """Combines semantic scores and support counts using Reciprocal Rank Fusion."""
    if not discovered_pmids: return {}

    support_scores = {pmid: support_counts.get(pmid, 0) for pmid in discovered_pmids}

    def get_ranks(scores_dict: Dict[int, float]) -> Dict[int, int]:
        sorted_pmids = sorted(scores_dict.keys(), key=lambda p: (scores_dict.get(p, 0), p), reverse=True)
        return {pmid: i + 1 for i, pmid in enumerate(sorted_pmids)}

    semantic_ranks = get_ranks(semantic_scores)
    support_ranks = get_ranks(support_scores)

    k = CFG['RRF_K']
    rrf_scores = {}
    all_pmids = discovered_pmids

    for pmid in all_pmids:
        sem_rank = semantic_ranks.get(pmid, len(semantic_ranks) + 1)
        sup_rank = support_ranks.get(pmid, len(support_ranks) + 1)
        rrf_score = (1 / (k + sem_rank)) + (1 / (k + sup_rank))
        rrf_scores[pmid] = rrf_score

    return rrf_scores

# =================================================================================
# Main Ripple Logic
# =================================================================================
def run_downstream_for_cluster(cluster_id: int, seed_pmids: List[int], text_method: str, model: SentenceTransformer):
    log.info(f"--- [{text_method.upper()}] Starting ripple for Cluster {cluster_id} ---")
    visited, frontier, depth = set(seed_pmids), set(seed_pmids), 0
    
    while frontier and depth < CFG["MAX_DEPTH"]:
        depth += 1
        support_counts = Counter()
        icite_fetch_missing(list(frontier))
        for pmid in frontier: support_counts.update(ICACHE.get(pmid, {}).get("cited_by", []))
        
        discovered = {p for p, count in support_counts.items() if p not in visited and count >= CFG["SUPPORT_MIN"]}
        if not discovered:
            log.info(f"  > [{text_method.upper()}] Ripple halted at depth {depth} (no new supported papers found).")
            break

        full_meta = get_full_meta(list(set(seed_pmids) | discovered))
        
        # 1. Get pure semantic scores
        seed_embs_dict = _get_embeddings(set(seed_pmids), full_meta, model, text_method)
        cand_embs_dict = _get_embeddings(discovered, full_meta, model, text_method)
        scores_array = _get_semantic_scores(np.array(list(seed_embs_dict.values())), np.array(list(cand_embs_dict.values())))
        semantic_scores = dict(zip(discovered, scores_array))
        
        # 2. Combine with support counts using RRF for the final score
        scores = _get_rrf_scores(semantic_scores, support_counts, discovered)
        
        if len(discovered) > CFG['GLOBAL_ABSOLUTE_CAP']:
            kept = sorted(list(discovered), key=lambda p: scores.get(p, 0), reverse=True)[:CFG['GLOBAL_ABSOLUTE_CAP']]
        else:
            kept = list(discovered)

        if kept:
            log.info(f"  > Depth {depth} ({text_method.upper()}): Discovered={len(discovered)}, Kept={len(kept)} (using RRF scores)")
            log.info(f"    Top 5 RRF-ranked picks for this layer:")
            sorted_for_logging = sorted(kept, key=lambda p: scores.get(p, 0.0), reverse=True)
            for i, pmid in enumerate(sorted_for_logging[:5]):
                title = full_meta.get(pmid, {}).get('title', 'N/A')
                score = scores.get(pmid, 0.0)
                support = support_counts.get(pmid, 0)
                sem_score = semantic_scores.get(pmid, 0.0)
                log.info(f"      {i+1}. [{pmid}] (RRF: {score:.4f} | Sem: {sem_score:.3f} | Sup: {support}) {title}")
        else:
            log.info(f"  > [{text_method.upper()}] Ripple halted at depth {depth} (no papers kept after filtering/capping).")
            break
        
        visited.update(kept); frontier = set(kept)
    
    return visited, depth

# =================================================================================
# Main Execution Block
# =================================================================================
def run_embedding_comparison():
    if CFG['DEVICE'] == 'cuda': torch.cuda.empty_cache()
    log.info(f"Loading embedding model: {CFG['MODEL_ID']} onto {CFG['DEVICE']}...")
    model = SentenceTransformer(CFG['MODEL_ID'], device=CFG['DEVICE'])
    log.info("Model loaded successfully.")
    try:
        with open(CFG["CLUSTERS_PATH"], "r") as f: clusters = {int(k): v for k, v in json.load(f).items()}
        log.info(f"Successfully loaded {len(clusters)} clusters from '{CFG['CLUSTERS_PATH']}'.")
    except Exception as e: log.error(f"FATAL: Could not load clusters file. Error: {e}"); return

    clusters_to_run = [8, 4]
    clusters = {cid: pmids for cid, pmids in clusters.items() if cid in clusters_to_run}
    log.info(f"--> Running RRF benchmark on a subset of {len(clusters)} clusters: {clusters_to_run}")

    comparison_results = []
    total_start_time = time.time()
    for cid, pmids in sorted(clusters.items()):
        if not pmids: continue
        
        global EMBEDDING_CACHE; EMBEDDING_CACHE = {}
        log.info(f"\n{'='*40}\n>>> Processing Cluster {cid} (Seeds: {len(pmids)}) <<<\n{'='*40}")

        start_time_title = time.time()
        visited_title, depth_title = run_downstream_for_cluster(cid, pmids, 'title_only', model)
        time_title = time.time() - start_time_title

        start_time_snippet = time.time()
        visited_snippet, depth_snippet = run_downstream_for_cluster(cid, pmids, 'title_plus_snippet', model)
        time_snippet = time.time() - start_time_snippet
        
        intersection = len(visited_title & visited_snippet)
        union = len(visited_title | visited_snippet)
        jaccard_sim = intersection / union if union > 0 else 0

        comparison_results.append({
            "cluster_id": cid, "seeds": len(pmids),
            "total_title_only": len(visited_title), "depth_title_only": depth_title, "time_title_only": time_title,
            "total_snippet": len(visited_snippet), "depth_snippet": depth_snippet, "time_snippet": time_snippet,
            "intersection": intersection, "jaccard_similarity": jaccard_sim,
        })

    summary_df = pd.DataFrame(comparison_results)
    cols_order = [
        "cluster_id", "seeds", "total_title_only", "depth_title_only", "time_title_only",
        "total_snippet", "depth_snippet", "time_snippet", "intersection", "jaccard_similarity"
    ]
    summary_df = summary_df[cols_order].sort_values(by='jaccard_similarity', ascending=True)
    
    print("\n\n" + "="*95); print(" " * 15 + "RRF-ENHANCED BENCHMARK: TITLE vs. TITLE+SNIPPET"); print("="*95)
    formatters = {'time_title_only': '{:.2f}s'.format, 'time_snippet': '{:.2f}s'.format, 'jaccard_similarity': '{:.3f}'.format}
    print(summary_df.to_string(index=False, formatters=formatters)); print("="*95)
    
    if CFG["WRITE_CSV"]:
        out_path = f"{CFG['OUT_PREFIX']}_summary.csv"
        summary_df.to_csv(out_path, index=False, float_format="%.4f")
        log.info(f"Comparison results saved to '{out_path}'")
    
    if CFG['DEVICE'] == 'cuda':
        log.info("Clearing CUDA cache after analysis..."); del model; torch.cuda.empty_cache()
    
    total_time = time.time() - total_start_time
    log.info(f"RRF-enhanced benchmark complete in {total_time:.2f} seconds.")

# --- Run the definitive analysis ---
run_embedding_comparison()

[18:11:44] INFO: Loading embedding model: Qwen/Qwen3-Embedding-0.6B onto cuda...
[18:11:44] INFO: Load pretrained SentenceTransformer: Qwen/Qwen3-Embedding-0.6B
[18:11:51] INFO: 1 prompt is loaded, with the key: query
[18:11:51] INFO: Model loaded successfully.
[18:11:51] INFO: Successfully loaded 19 clusters from 'clusters_snapshot.json'.
[18:11:51] INFO: --> Running RRF benchmark on a subset of 2 clusters: [8, 4]
[18:11:51] INFO: 
>>> Processing Cluster 4 (Seeds: 8) <<<
[18:11:51] INFO: --- [TITLE_ONLY] Starting ripple for Cluster 4 ---
[18:11:52] INFO:     > Fetching title+abstract for 100 PMIDs via EFetch...
[18:11:54] INFO:     > VRAM Check (Pre-Embed (title_only)): 3.26 / 6.00 GB used
[18:11:54] INFO:     > Embedding 8 papers (title_only)...
[18:11:54] INFO:     > VRAM Check (Post-Embed (title_only)): 3.40 / 6.00 GB used
[18:11:54] INFO:     > VRAM Check (Pre-Embed (title_only)): 3.40 / 6.00 GB used
[18:11:54] INFO:     > Embedding 92 papers (title_only)...
[18:11:55] INFO:     >



               RRF-ENHANCED BENCHMARK: TITLE vs. TITLE+SNIPPET
 cluster_id  seeds  total_title_only  depth_title_only time_title_only  total_snippet  depth_snippet time_snippet  intersection jaccard_similarity
          8     12               653                10          40.56s            593             12       39.71s           395              0.464
          4      8               807                14          48.63s            751             10       49.19s           562              0.564


In [None]:
# %% [DEFINITIVE Test: Dynamic Centroid RRF Ripple]
# This script implements an adaptive downstream search that combines two signals
# using Reciprocal Rank Fusion (RRF):
#
# 1. Bibliographic Coupling (BC): Measures shared references with the current frontier.
# 2. Dynamic Centroid Similarity: Thematic similarity to the centroid of the *current frontier*,
#    not the original seed cluster. This allows the thematic focus to evolve with the ripple.

import os
import time
import json
import logging
from collections import Counter
from typing import Dict, List, Any, Set
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
import requests
import torch
from sentence_transformers import SentenceTransformer

# =================================================================================
# >> CONFIGURATION <<
# =================================================================================
CFG = dict(
    CLUSTERS_PATH="clusters_snapshot.json",
    ENTREZ_EMAIL="you@example.com",
    NCBI_API_KEY=os.environ.get("NCBI_API_KEY"),
    MODEL_ID="Qwen/Qwen3-Embedding-0.6B",
    DEVICE="cuda" if torch.cuda.is_available() else "cpu",
    SUPPORT_MIN=1,
    GLOBAL_ABSOLUTE_CAP=100,
    MAX_DEPTH=15,
    RRF_K=60, # Reciprocal Rank Fusion constant
    WRITE_CSV=True,
    OUT_PREFIX="dynamic_centroid_rrf_test",
)

# =================================================================================
# Core Setup, Caching, and Data Fetching
# =================================================================================
def _setup_logging():
    fmt = "[%(asctime)s] %(levelname)s: %(message)s"; datefmt = "%H:%M:%S"
    logging.basicConfig(level=logging.INFO, format=fmt, datefmt=datefmt, force=True)
    return logging.getLogger("DynamicRRF")
log = _setup_logging()
ICACHE, TCACHE, EMBEDDING_CACHE = {}, {}, {}

def efetch_with_abstracts(pmids: List[int]):
    pmids_to_fetch = [p for p in set(pmids) if p not in TCACHE or 'abstract' not in TCACHE.get(p, {})]
    if not pmids_to_fetch: return
    log.info(f"    > Fetching title+abstract for {len(pmids_to_fetch)} PMIDs...")
    base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    for i in range(0, len(pmids_to_fetch), 200):
        sub = pmids_to_fetch[i:i+200]
        params = {"db": "pubmed", "id": ",".join(str(x) for x in sub), "retmode": "xml", "email": CFG["ENTREZ_EMAIL"]}
        if CFG["NCBI_API_KEY"]: params["api_key"] = CFG["NCBI_API_KEY"]
        try:
            r = requests.get(base, params=params, timeout=90)
            r.raise_for_status()
            root = ET.fromstring(r.text)
            for art in root.findall(".//PubmedArticle"):
                pmid_el = art.find(".//PMID")
                if pmid_el is None or pmid_el.text is None: continue
                pid = int(pmid_el.text.strip())
                title_el = art.find(".//ArticleTitle")
                title = "".join(title_el.itertext()).strip() if title_el is not None else ""
                abs_parts = [a.text.strip() for a in art.findall(".//Abstract/AbstractText") if a.text]
                abstract = " ".join(abs_parts)
                refs_list = [ref.text.strip() for ref in art.findall('.//ReferenceList//ArticleId[@IdType="pubmed"]') if ref.text]
                TCACHE[pid] = TCACHE.get(pid, {})
                TCACHE[pid].update({"title": title, "abstract": abstract, "references": [int(p) for p in refs_list if p.isdigit()]})
        except Exception as e: log.error(f"      ! EFetch failed: {e}")

def get_full_meta(pmids: List[int]) -> Dict[int, Dict[str, Any]]:
    pmids = list(set(pmids))
    # Efetch now handles references, so iCite is not strictly needed for BC
    efetch_with_abstracts(pmids)
    return TCACHE

def _get_embedding_vectors(pmids: set, meta: dict, model: SentenceTransformer):
    pmids_to_embed = [p for p in pmids if p not in EMBEDDING_CACHE]
    if pmids_to_embed:
        log.info(f"    > Embedding {len(pmids_to_embed)} paper titles...")
        texts_to_embed = [f"passage: {meta.get(p, {}).get('title', '')}" for p in pmids_to_embed]
        new_embeddings = model.encode(texts_to_embed, batch_size=64, normalize_embeddings=True, show_progress_bar=False, device=CFG['DEVICE'])
        for pmid, emb in zip(pmids_to_embed, new_embeddings): EMBEDDING_CACHE[pmid] = emb
    return {p: EMBEDDING_CACHE[p] for p in pmids if p in EMBEDDING_CACHE}

# =================================================================================
# Scoring Engine with Dynamic Centroid RRF
# =================================================================================
def get_dynamic_rrf_scores(candidates: Set[int], frontier: Set[int], meta: dict, model: SentenceTransformer):
    """
    Combines Bibliographic Coupling and Dynamic Centroid Similarity using RRF.
    Returns three dictionaries: final rrf_scores, and the component semantic_scores and bc_scores.
    """
    # 1. Bibliographic Coupling Score (vs. current frontier)
    frontier_refs = set().union(*(set(meta.get(p, {}).get('references', [])) for p in frontier))
    bc_scores = {p: len(set(meta.get(p, {}).get('references', [])).intersection(frontier_refs)) for p in candidates}

    # 2. Dynamic Semantic Score (vs. current frontier's centroid)
    frontier_vectors_dict = _get_embedding_vectors(frontier, meta, model)
    candidate_vectors_dict = _get_embedding_vectors(candidates, meta, model)
    
    # Ensure all candidates have a score, even if some vectors are missing
    semantic_scores = {p: 0.0 for p in candidates}
    
    if frontier_vectors_dict and candidate_vectors_dict:
        # Align candidate vectors with the order of the candidates set for zipping later
        valid_candidates = sorted(list(candidate_vectors_dict.keys()))
        candidate_vectors = np.array([candidate_vectors_dict[p] for p in valid_candidates])
        
        frontier_vectors = np.array(list(frontier_vectors_dict.values()))
        frontier_centroid = np.mean(frontier_vectors, axis=0, keepdims=True)
        
        scores_array = (candidate_vectors @ frontier_centroid.T).flatten()
        semantic_scores.update(dict(zip(valid_candidates, scores_array)))

    # 3. Reciprocal Rank Fusion
    def get_ranks(scores_dict: Dict[int, float]) -> Dict[int, int]:
        sorted_pmids = sorted(scores_dict.keys(), key=lambda p: (scores_dict.get(p, 0), p), reverse=True)
        return {pmid: i + 1 for i, pmid in enumerate(sorted_pmids)}

    semantic_ranks = get_ranks(semantic_scores)
    bc_ranks = get_ranks(bc_scores)

    k = CFG['RRF_K']
    rrf_scores = {}
    for pmid in candidates:
        sem_rank = semantic_ranks.get(pmid, len(semantic_ranks) + 1)
        bc_rank = bc_ranks.get(pmid, len(bc_ranks) + 1)
        rrf_scores[pmid] = (1 / (k + sem_rank)) + (1 / (k + bc_rank))

    return rrf_scores, semantic_scores, bc_scores

# =================================================================================
# Main Ripple Logic
# =================================================================================
def run_downstream_for_cluster(cluster_id: int, seed_pmids: List[int], model: SentenceTransformer):
    log.info(f"--- [Dynamic RRF] Starting ripple for Cluster {cluster_id} ---")
    visited, frontier, depth = set(seed_pmids), set(seed_pmids), 0
    run_history = []
    
    while frontier and depth < CFG["MAX_DEPTH"]:
        depth += 1
        # Using iCite for cited_by links as it's more reliable than parsing from EFetch
        try:
            r = requests.get("https://icite.od.nih.gov/api/pubs", params={"pmids": ",".join(map(str, frontier)), "format": "json"}, timeout=90)
            r.raise_for_status()
            support_counts = Counter()
            for rec in r.json().get("data", []):
                if rec.get("pmid") in frontier:
                    support_counts.update(rec.get("cited_by", []))
        except requests.RequestException as e:
            log.error(f"iCite request failed: {e}"); break
        
        discovered = {p for p, count in support_counts.items() if p not in visited and count >= CFG["SUPPORT_MIN"]}
        if not discovered:
            log.info(f"  > Ripple halted at depth {depth} (no new supported papers found).")
            break

        all_needed_pmids = list(discovered | frontier)
        full_meta = get_full_meta(all_needed_pmids)
        
        scores, semantic_scores, bc_scores = get_dynamic_rrf_scores(discovered, frontier, full_meta, model)
        
        if len(discovered) > CFG['GLOBAL_ABSOLUTE_CAP']:
            kept = sorted(list(discovered), key=lambda p: scores.get(p, 0), reverse=True)[:CFG['GLOBAL_ABSOLUTE_CAP']]
        else:
            kept = list(discovered)

        if kept:
            log.info(f"  > Depth {depth}: Discovered={len(discovered)}, Kept={len(kept)}")
            log.info(f"    Top 5 picks for this layer (ranked by Dynamic RRF):")
            sorted_for_logging = sorted(kept, key=lambda p: scores.get(p, 0.0), reverse=True)
            for i, pmid in enumerate(sorted_for_logging[:5]):
                title = full_meta.get(pmid, {}).get('title', 'N/A')
                rrf_s = scores.get(pmid, 0.0)
                sem_s = semantic_scores.get(pmid, 0.0)
                bc_s = bc_scores.get(pmid, 0)
                log.info(f"      {i+1}. [{pmid}] (RRF: {rrf_s:.4f} | Sem: {sem_s:.3f} | BC: {bc_s}) {title}")
        else:
            log.info(f"  > Ripple halted at depth {depth} (no papers kept after scoring).")
            break
        
        run_history.append({"depth": depth, "discovered": len(discovered), "kept": len(kept), "frontier_size": len(frontier)})
        visited.update(kept); frontier = set(kept)
    
    log.info(f"--- Finished ripple for Cluster {cluster_id} at depth {depth}. Total papers found: {len(visited)} ---")
    return visited, depth, pd.DataFrame(run_history)

# =================================================================================
# Main Execution Block
# =================================================================================
def run_test():
    if CFG['DEVICE'] == 'cuda': torch.cuda.empty_cache()
    log.info(f"Loading embedding model: {CFG['MODEL_ID']} onto {CFG['DEVICE']}...")
    model = SentenceTransformer(CFG['MODEL_ID'], device=CFG['DEVICE'])
    log.info("Model loaded successfully.")
    try:
        with open(CFG["CLUSTERS_PATH"], "r") as f: clusters = {int(k): v for k, v in json.load(f).items()}
        log.info(f"Successfully loaded {len(clusters)} clusters from '{CFG['CLUSTERS_PATH']}'.")
    except Exception as e: log.error(f"FATAL: Could not load clusters file. Error: {e}"); return

    clusters_to_run = [4, 8,] # Using the updated cluster IDs
    clusters = {cid: pmids for cid, pmids in clusters.items() if cid in clusters_to_run}
    log.info(f"--> Running test on a subset of {len(clusters)} clusters: {clusters_to_run}")

    all_results_df = []
    for cid, pmids in sorted(clusters.items()):
        if not pmids: continue
        
        # Clear caches for a clean run per cluster
        global TCACHE, EMBEDDING_CACHE; TCACHE, EMBEDDING_CACHE = {}, {}
        
        visited_set, final_depth, history_df = run_downstream_for_cluster(cid, pmids, model)
        
        if not history_df.empty:
            history_df['cluster_id'] = cid
            all_results_df.append(history_df)

    if all_results_df:
        summary_df = pd.concat(all_results_df, ignore_index=True)
        print("\n\n" + "="*80); print(" " * 25 + "DYNAMIC CENTROID RRF TEST SUMMARY"); print("="*80)
        print(summary_df.to_string(index=False))
        
        if CFG["WRITE_CSV"]:
            out_path = f"{CFG['OUT_PREFIX']}_summary.csv"
            summary_df.to_csv(out_path, index=False)
            log.info(f"Test summary saved to '{out_path}'")
    else:
        log.info("No ripples completed, no summary to show.")
    
    if CFG['DEVICE'] == 'cuda':
        log.info("Clearing CUDA cache after analysis..."); del model; torch.cuda.empty_cache()
    
    log.info("Dynamic Centroid RRF test complete.")

# --- Run the test ---
run_test()

[10:35:32] INFO: Loading embedding model: Qwen/Qwen3-Embedding-0.6B onto cuda...
[10:35:32] INFO: Load pretrained SentenceTransformer: Qwen/Qwen3-Embedding-0.6B
[10:35:41] INFO: 1 prompt is loaded, with the key: query
[10:35:41] INFO: Model loaded successfully.
[10:35:41] INFO: Successfully loaded 19 clusters from 'clusters_snapshot.json'.
[10:35:41] INFO: --> Running test on a subset of 2 clusters: [4, 7]
[10:35:41] INFO: --- [Dynamic RRF] Starting ripple for Cluster 4 ---
[10:35:42] INFO:     > Fetching title+abstract for 100 PMIDs...
[10:35:45] INFO:     > Embedding 8 paper titles...
[10:35:46] INFO:     > Embedding 92 paper titles...
[10:35:46] INFO:   > Depth 1: Discovered=92, Kept=92
[10:35:46] INFO:     Top 5 picks for this layer (ranked by Dynamic RRF):
[10:35:46] INFO:       1. [39917072] (RRF: 0.0301 | Sem: 0.673 | BC: 11) The role of gastric ultrasound in anaesthesia for emergency surgery: A review and clinical guidance.
[10:35:46] INFO:       2. [36262725] (RRF: 0.0297 | Se



                         DYNAMIC CENTROID RRF TEST SUMMARY
 depth  discovered  kept  frontier_size  cluster_id
     1          92    92              8           4
     2         494   100             92           4
     3         351   100            100           4
     4         303   100            100           4
     5         226   100            100           4
     6         285   100            100           4
     7         292   100            100           4
     8          56    56            100           4
     9          17    17             56           4
    10           4     4             17           4
     1          25    25              4           7
     2          47    47             25           7
     3         121   100             47           7
     4          83    83            100           7
     5          66    66             83           7
     6          90    90             66           7
     7          54    54             90           7
   

In [1]:
# %% [DEFINITIVE Benchmark: Static vs. Dynamic Centroid RRF Ripple]
# This script performs a head-to-head comparison of two RRF-based ripple strategies:
#
# 1. Static Centroid RRF: Semantic relevance is always measured against the
#    centroid of the INITIAL seed cluster (S).
# 2. Dynamic Centroid RRF: Semantic relevance is measured against the centroid
#    of the PREVIOUS layer (Dn-1), allowing the topic to adapt.
#
# It generates an extensive report including:
#  - Semantic Drift: Cosine similarity of centroids between consecutive layers (S->D1, D1->D2).
#  - Semantic Divergence: Cosine similarity between the two methods' centroids at each depth.
#  - Jaccard Lineage: Jaccard similarity of the discovered article sets at each depth.

import os
import time
import json
import logging
from collections import Counter
from typing import Dict, List, Any, Set, Tuple

import numpy as np
import pandas as pd
import requests
import torch
from sentence_transformers import SentenceTransformer
import xml.etree.ElementTree as ET

# =================================================================================
# >> CONFIGURATION <<
# =================================================================================
CFG = dict(
    CLUSTERS_PATH="clusters_snapshot.json",
    ENTREZ_EMAIL="you@example.com",
    NCBI_API_KEY=os.environ.get("NCBI_API_KEY"),
    MODEL_ID="Qwen/Qwen3-Embedding-0.6B",
    DEVICE="cuda" if torch.cuda.is_available() else "cpu",
    BATCH_SIZE=24,  # <-- ADD THIS LINE
    SUPPORT_MIN=1,
    GLOBAL_ABSOLUTE_CAP=100,
    MAX_DEPTH=15,
    RRF_K=60,
    WRITE_CSV=True,
    OUT_PREFIX="static_vs_dynamic_benchmark",
)

# =================================================================================
# Core Setup, Caching, and Data Fetching
# =================================================================================
def _setup_logging():
    fmt = "[%(asctime)s] %(levelname)s: %(message)s"; datefmt = "%H:%M:%S"
    logging.basicConfig(level=logging.INFO, format=fmt, datefmt=datefmt, force=True)
    return logging.getLogger("RippleBenchmark")
log = _setup_logging()
ICACHE, TCACHE, EMBEDDING_CACHE = {}, {}, {}

def efetch_with_references(pmids: List[int]):
    pmids_to_fetch = [p for p in set(pmids) if p not in TCACHE]
    if not pmids_to_fetch: return
    log.info(f"    > Fetching metadata for {len(pmids_to_fetch)} PMIDs...")
    base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    for i in range(0, len(pmids_to_fetch), 200):
        sub = pmids_to_fetch[i:i+200]
        params = {"db": "pubmed", "id": ",".join(str(x) for x in sub), "retmode": "xml", "email": CFG["ENTREZ_EMAIL"]}
        if CFG["NCBI_API_KEY"]: params["api_key"] = CFG["NCBI_API_KEY"]
        try:
            r = requests.get(base, params=params, timeout=90)
            r.raise_for_status()
            root = ET.fromstring(r.text.encode('utf-8'))
            for art in root.findall(".//PubmedArticle"):
                pmid_el = art.find(".//PMID")
                if pmid_el is None or pmid_el.text is None: continue
                pid = int(pmid_el.text.strip())
                title_el = art.find(".//ArticleTitle")
                title = "".join(title_el.itertext()).strip() if title_el is not None else ""
                refs_list = [ref.text.strip() for ref in art.findall('.//ReferenceList//ArticleId[@IdType="pubmed"]') if ref.text]
                TCACHE[pid] = {"title": title, "references": [int(p) for p in refs_list if p.isdigit()]}
        except Exception as e: log.error(f"      ! EFetch failed: {e}")

def _get_embedding_vectors(pmids: set, model: SentenceTransformer) -> Dict[int, np.ndarray]:
    """
    Computes embeddings for a set of PMIDs with robust OOM (Out of Memory) handling.
    If a batch fails, it halves the batch size and retries until it succeeds.
    """
    pmids_to_embed = [p for p in pmids if p not in EMBEDDING_CACHE]
    if pmids_to_embed:
        efetch_with_references(pmids_to_embed)  # Ensure we have titles
        log.info(f"    > Embedding {len(pmids_to_embed)} paper titles...")
        texts_to_embed = [f"passage: {TCACHE.get(p, {}).get('title', '')}" for p in pmids_to_embed]
        
        current_batch_size = CFG['BATCH_SIZE']
        try:
            new_embeddings = model.encode(
                texts_to_embed,
                batch_size=current_batch_size,
                normalize_embeddings=True,
                show_progress_bar=False,
                device=CFG['DEVICE']
            )
        except torch.cuda.OutOfMemoryError:
            log.warning(f"    ! OOM at batch size {current_batch_size}. Retrying with smaller batches...")
            torch.cuda.empty_cache()
            new_embeddings_list = []
            
            # Fallback to a safe, iterative batching loop
            for i in range(0, len(texts_to_embed), current_batch_size // 2 or 1):
                sub_batch_texts = texts_to_embed[i:i + (current_batch_size // 2 or 1)]
                
                # Inner OOM loop for sub-batches
                bs = current_batch_size // 2 or 1
                while bs > 0:
                    try:
                        sub_embeddings = model.encode(
                            sub_batch_texts,
                            batch_size=bs,
                            normalize_embeddings=True,
                            show_progress_bar=False,
                            device=CFG['DEVICE']
                        )
                        new_embeddings_list.append(sub_embeddings)
                        break # Success
                    except torch.cuda.OutOfMemoryError:
                        torch.cuda.empty_cache()
                        log.warning(f"    ! OOM again. Halving batch size to {bs // 2 or 1}")
                        bs = bs // 2
                if bs == 0:
                    log.error("    ! Failed to embed batch even with size 1. Skipping.")
            
            if new_embeddings_list:
                new_embeddings = np.concatenate(new_embeddings_list)
            else:
                new_embeddings = np.array([])

        if len(new_embeddings) == len(pmids_to_embed):
            for pmid, emb in zip(pmids_to_embed, new_embeddings):
                EMBEDDING_CACHE[pmid] = emb

    return {p: EMBEDDING_CACHE[p] for p in pmids if p in EMBEDDING_CACHE}

def calculate_centroid(pmids: Set[int], model: SentenceTransformer) -> np.ndarray:
    vectors_dict = _get_embedding_vectors(pmids, model)
    if not vectors_dict: return np.array([])
    vectors = np.array(list(vectors_dict.values()))
    return np.mean(vectors, axis=0, keepdims=True)

# =================================================================================
# Scoring Engine
# =================================================================================
def get_rrf_scores(candidates: Set[int], frontier: Set[int], model: SentenceTransformer, scoring_centroid: np.ndarray) -> Tuple[Dict[int, float], Dict[int, float], Dict[int, int]]:
    """
    Combines Bibliographic Coupling and Semantic Similarity using RRF.
    Returns three dictionaries: final rrf_scores, and the component semantic_scores and bc_scores.
    """
    # 1. Bibliographic Coupling Score
    frontier_refs = set().union(*(set(TCACHE.get(p, {}).get('references', [])) for p in frontier))
    bc_scores = {p: len(set(TCACHE.get(p, {}).get('references', [])).intersection(frontier_refs)) for p in candidates}

    # 2. Semantic Score (against the provided centroid)
    semantic_scores = {p: 0.0 for p in candidates}
    candidate_vectors_dict = _get_embedding_vectors(candidates, model)
    if candidate_vectors_dict and scoring_centroid.size > 0:
        valid_candidates = sorted(list(candidate_vectors_dict.keys()))
        candidate_vectors = np.array([candidate_vectors_dict[p] for p in valid_candidates])
        scores_array = (candidate_vectors @ scoring_centroid.T).flatten()
        semantic_scores.update(dict(zip(valid_candidates, scores_array)))

    # 3. RRF Combination
    def get_ranks(s: Dict) -> Dict: return {p: i + 1 for i, (p, _) in enumerate(sorted(s.items(), key=lambda item: item[1], reverse=True))}
    semantic_ranks, bc_ranks = get_ranks(semantic_scores), get_ranks(bc_scores)
    
    k = CFG['RRF_K']
    rrf_scores = {}
    for pmid in candidates:
        sem_rank = semantic_ranks.get(pmid, len(semantic_ranks) + 1)
        bc_rank = bc_ranks.get(pmid, len(bc_ranks) + 1)
        rrf_scores[pmid] = (1 / (k + sem_rank)) + (1 / (k + bc_rank))
        
    return rrf_scores, semantic_scores, bc_scores

# =================================================================================
# Main Ripple Logic
# =================================================================================
def run_ripple(cluster_id: int, seed_pmids: Set[int], model: SentenceTransformer, method: str, seed_centroid: np.ndarray) -> List[Dict]:
    log.info(f"--- [{method.upper()}] Starting ripple for Cluster {cluster_id} ---")
    visited, frontier = set(seed_pmids), set(seed_pmids)
    history, depth = [], 0
    
    prev_centroid = seed_centroid

    while frontier and depth < CFG["MAX_DEPTH"]:
        depth += 1
        
        try: # iCite for finding papers that cite the frontier
            r = requests.get("https://icite.od.nih.gov/api/pubs", params={"pmids": ",".join(map(str, frontier)), "format": "json"}, timeout=90)
            r.raise_for_status()
            support_counts = Counter()
            for rec in r.json().get("data", []):
                if rec.get("pmid") in frontier: support_counts.update(rec.get("cited_by", []))
        except requests.RequestException as e: log.error(f"iCite request failed: {e}"); break
        
        discovered = {p for p, c in support_counts.items() if p not in visited and c >= CFG["SUPPORT_MIN"]}
        if not discovered: log.info(f"  > Halt @ Depth {depth}: No new supported papers."); break

        efetch_with_references(list(discovered | frontier)) # Get metadata for scoring

        if method == 'static':
            scoring_centroid = seed_centroid
        else: # dynamic
            scoring_centroid = calculate_centroid(frontier, model)
            if scoring_centroid.size == 0:
                log.warning(f"  > Halt @ Depth {depth}: Could not compute frontier centroid."); break
        
        # This call now unpacks the component scores for logging
        scores, semantic_scores, bc_scores = get_rrf_scores(discovered, frontier, model, scoring_centroid)
        
        kept = sorted(discovered, key=lambda p: scores.get(p, 0), reverse=True)[:CFG['GLOBAL_ABSOLUTE_CAP']]
        
        # --- FIXED: RE-INTRODUCED DETAILED LOGGING BLOCK ---
        if kept:
            log.info(f"  > Depth {depth}: Discovered={len(discovered)}, Kept={len(kept)}")
            log.info(f"    Top 5 picks for this layer (ranked by RRF):")
            for i, pmid in enumerate(kept[:5]):
                title = TCACHE.get(pmid, {}).get('title', 'N/A')
                rrf_s = scores.get(pmid, 0.0)
                sem_s = semantic_scores.get(pmid, 0.0)
                bc_s = bc_scores.get(pmid, 0)
                log.info(f"      {i+1}. [{pmid}] (RRF: {rrf_s:.4f} | Sem: {sem_s:.3f} | BC: {bc_s}) {title}")
        else:
            log.info(f"  > Halt @ Depth {depth}: No papers kept after scoring."); break

        visited.update(kept)
        frontier = set(kept)

        # --- METRICS FOR THIS STEP ---
        current_centroid = calculate_centroid(frontier, model)
        drift = (prev_centroid @ current_centroid.T).item() if prev_centroid.size > 0 and current_centroid.size > 0 else 0.0
        
        history.append({
            'depth': depth,
            'cumulative_set': visited.copy(),
            'frontier_centroid': current_centroid
        })
        log.info(f"    > Semantic Drift from previous layer: {drift:.4f}")
        prev_centroid = current_centroid
        
    log.info(f"--- [{method.upper()}] Finished ripple for Cluster {cluster_id} at depth {depth}. ---")
    return history

# =================================================================================
# Main Execution and Comparison Block
# =================================================================================
def run_comparison():
    if CFG['DEVICE'] == 'cuda': torch.cuda.empty_cache()
    log.info(f"Loading embedding model: {CFG['MODEL_ID']}...")
    model = SentenceTransformer(CFG['MODEL_ID'], device=CFG['DEVICE'])
    log.info("Model loaded successfully.")
    try:
        with open(CFG["CLUSTERS_PATH"], "r") as f: clusters = {int(k): v for k, v in json.load(f).items()}
    except Exception as e: log.error(f"FATAL: Could not load clusters file. Error: {e}"); return

    clusters_to_run = [4, 8] # Using the ORIGINAL cluster IDs
    log.info(f"--> Running benchmark on clusters: {clusters_to_run}")

    all_metrics = []

    for cid in clusters_to_run:
        pmids = set(clusters[cid])
        if not pmids: continue
        
        global TCACHE, EMBEDDING_CACHE; TCACHE, EMBEDDING_CACHE = {}, {}
        
        log.info(f"\n{'='*60}\n>>> Processing Cluster {cid} (Seeds: {len(pmids)}) <<<\n{'='*60}")
        
        seed_centroid = calculate_centroid(pmids, model)

        static_history = run_ripple(cid, pmids, model, 'static', seed_centroid)
        dynamic_history = run_ripple(cid, pmids, model, 'dynamic', seed_centroid)

        # --- Post-run analysis and metric calculation ---
        max_d = min(len(static_history), len(dynamic_history))
        if max_d == 0: continue

        log.info(f"\n--- Analysis for Cluster {cid} (comparing up to depth {max_d}) ---")
        
        prev_static_centroid = seed_centroid
        prev_dynamic_centroid = seed_centroid

        for i in range(max_d):
            s_step, d_step = static_history[i], dynamic_history[i]
            depth = s_step['depth']
            
            # Jaccard Similarity of cumulative discovered sets
            union_size = len(s_step['cumulative_set'] | d_step['cumulative_set'])
            jaccard = len(s_step['cumulative_set'] & d_step['cumulative_set']) / union_size if union_size > 0 else 0
            
            # Semantic Drift (Intra-method)
            s_drift = (prev_static_centroid @ s_step['frontier_centroid'].T).item() if prev_static_centroid.size and s_step['frontier_centroid'].size else 0
            d_drift = (prev_dynamic_centroid @ d_step['frontier_centroid'].T).item() if prev_dynamic_centroid.size and d_step['frontier_centroid'].size else 0

            # Semantic Divergence (Inter-method)
            divergence = (s_step['frontier_centroid'] @ d_step['frontier_centroid'].T).item() if s_step['frontier_centroid'].size and d_step['frontier_centroid'].size else 0
            
            all_metrics.append({
                'cluster_id': cid,
                'depth': depth,
                'jaccard_lineage': jaccard,
                'static_drift': s_drift,
                'dynamic_drift': d_drift,
                'inter_method_divergence': divergence,
                'static_set_size': len(s_step['cumulative_set']),
                'dynamic_set_size': len(d_step['cumulative_set']),
            })
            prev_static_centroid, prev_dynamic_centroid = s_step['frontier_centroid'], d_step['frontier_centroid']

    if not all_metrics:
        log.error("No data collected. Benchmark finished without results.")
        return

    summary_df = pd.DataFrame(all_metrics).sort_values(by=['cluster_id', 'depth'])
    print("\n\n" + "="*95); print(" " * 20 + "STATIC vs. DYNAMIC CENTROID RRF BENCHMARK: DETAILED ANALYSIS"); print("="*95)
    print(summary_df.to_string(index=False, float_format="%.4f"))
    print("="*95)

    if CFG["WRITE_CSV"]:
        out_path = f"{CFG['OUT_PREFIX']}_summary.csv"
        summary_df.to_csv(out_path, index=False, float_format="%.4f")
        log.info(f"Benchmark summary saved to '{out_path}'")

    if CFG['DEVICE'] == 'cuda':
        log.info("Clearing CUDA cache..."); del model; torch.cuda.empty_cache()
    log.info("Benchmark complete.")

# --- Run the benchmark ---
run_comparison()

[11:08:20] INFO: Loading embedding model: Qwen/Qwen3-Embedding-0.6B...
[11:08:20] INFO: Load pretrained SentenceTransformer: Qwen/Qwen3-Embedding-0.6B
[11:08:28] INFO: 1 prompt is loaded, with the key: query
[11:08:28] INFO: Model loaded successfully.
[11:08:28] INFO: --> Running benchmark on clusters: [4, 8]
[11:08:28] INFO: 
>>> Processing Cluster 4 (Seeds: 8) <<<
[11:08:28] INFO:     > Fetching metadata for 8 PMIDs...
[11:08:28] INFO:     > Embedding 8 paper titles...
[11:08:29] INFO: --- [STATIC] Starting ripple for Cluster 4 ---
[11:08:29] INFO:     > Fetching metadata for 92 PMIDs...
[11:08:30] INFO:     > Embedding 92 paper titles...
[11:08:31] INFO:   > Depth 1: Discovered=92, Kept=92
[11:08:31] INFO:     Top 5 picks for this layer (ranked by RRF):
[11:08:31] INFO:       1. [39917072] (RRF: 0.0301 | Sem: 0.673 | BC: 11) The role of gastric ultrasound in anaesthesia for emergency surgery: A review and clinical guidance.
[11:08:31] INFO:       2. [36262725] (RRF: 0.0297 | Sem: 0.



                    STATIC vs. DYNAMIC CENTROID RRF BENCHMARK: DETAILED ANALYSIS
 cluster_id  depth  jaccard_lineage  static_drift  dynamic_drift  inter_method_divergence  static_set_size  dynamic_set_size
          4      1           1.0000        0.5876         0.5876                   0.5748              100               100
          4      2           0.9324        0.5987         0.6007                   0.6508              200               200
          4      3           0.9169        0.6005         0.6025                   0.5751              300               300
          4      4           0.8182        0.5448         0.5552                   0.5511              400               400
          4      5           0.6584        0.4946         0.5565                   0.4588              500               500
          4      6           0.5424        0.4628         0.5580                   0.4332              600               600
          4      7           0.4433       

# Analysis of Static vs. Dynamic Centroid RRF Benchmark

**Run Date:** 2025-08-26
**Clusters Tested:** 4 ("Gastric Ultrasound"), 8 ("Vaginal Misoprostol")
**Objective:** To compare the exploratory behavior of a ripple search using a static semantic target versus a dynamic, adaptive one.

---
## Executive Summary

The results present a compelling case for the **Dynamic Centroid** method as a superior tool for **exploratory analysis and discovery**.

The **Static Centroid** method proves to be a conservative **exploiter**; it thoroughly maps the immediate thematic area of the seed cluster but struggles to make significant leaps into new territories. Its thematic focus gradually decays over time.

The **Dynamic Centroid** method acts as an aggressive **explorer**. It adaptively follows the "scent" of the most recently discovered literature, allowing it to make decisive, intelligent pivots into highly relevant and often distinct sub-fields. This resulted in the discovery of a significantly different and larger body of literature.

---
## Cluster 4 Analysis: "Gastric Ultrasound for Aspiration Risk"

This test illustrates a clear and sustained divergence in strategy, where the Dynamic method demonstrates superior thematic coherence and makes a critical, clinically relevant pivot.

### Key Observations:
* **Thematic Pivot to GLP-1 Agonists:** The logs show a major divergence starting around **Depth 4**.
    * The **Static method** begins to generalize, pulling in papers on "airway management" and "pediatric POCUS." Its focus broadens.
    * The **Dynamic method** makes a sharp, decisive pivot into the highly specific and timely topic of **"GLP-1 Receptor Agonists"** (like Ozempic) and their impact on gastric emptying and anesthesia. This is a major, ongoing clinical conversation directly related to the seed topic.

* **Interpreting the Metrics:**
    * **Semantic Drift (`dynamic_drift` > `static_drift`):** For most of the run (depths 5-7), the Dynamic method's drift score is higher, indicating it maintained a more stable, step-by-step thematic progression *after* its pivot. The Static method's focus decayed more rapidly as it was constantly pulled back to its origin.
    * **Jaccard Lineage & Divergence:** The `jaccard_lineage` steadily drops to **0.3578**, meaning by the end, almost two-thirds of the discovered papers were unique to one method. The `inter_method_divergence` shows a similar drop, confirming they ended up exploring fundamentally different thematic spaces.

### Conclusion for Cluster 4:
The Dynamic method didn't just drift; it intelligently identified and locked onto the most significant and modern sub-topic related to gastric emptying. This is a textbook example of successful exploratory behavior.

---
## Cluster 8 Analysis: "Vaginal Misoprostol for Hysteroscopy"

This test showcases an even more dramatic divergence, where the Dynamic method follows a logical path from a procedure to its ultimate clinical goal: improving fertility.

### Key Observations:
* **Thematic Pivot to Regenerative Medicine:** The logs reveal a fascinating progression:
    * The **Static method** follows a predictable path: `Misoprostol for Hysteroscopy` -> `Labor Induction` -> `Intrauterine Adhesions` -> `Exosomes in Cancer`. The path is logical but ends in a broad, somewhat unrelated field.
    * The **Dynamic method** follows a much more focused clinical narrative: `Misoprostol for Hysteroscopy` -> `Improving Fertility Outcomes after Hysteroscopy` -> `Endometrial Receptivity` -> **`Stem Cells and Regenerative Medicine for Endometrial Injury`**.

* **Interpreting the Metrics:**
    * **Massive Divergence:** The `jaccard_lineage` plummets to a mere **0.2564**. The two methods ended up in almost completely different worlds. The `dynamic_set_size` is also significantly larger (1303 vs 1010), showing it found a richer vein of literature.
    * **The Pivot Signature:** The `dynamic_drift` metric clearly shows a sharp drop at **Depth 12 (0.4031)**, signaling its major pivot into **reproductive immunology and cell biology (ferroptosis, etc.)**. Before that point, it was more stable than the Static method, indicating it was confidently following its chosen path.

### Conclusion for Cluster 8:
The Dynamic method successfully traced a sophisticated clinical pathway from a simple preparatory drug all the way to the cutting-edge of regenerative medicine aimed at solving the underlying fertility problems. This is a feat of discovery that the more conservative Static method was incapable of.

---
## Final Recommendation

These results confirm that the **Dynamic Centroid RRF is the superior strategy for an autonomous discovery agent**. Its ability to adapt its focus allows it to make intelligent, context-aware pivots, uncovering deeper and more relevant thematic connections than a static approach. While the Static method is a useful baseline for mapping a core topic, the Dynamic method is the true engine for finding the novel links that can lead to identifying research gaps.

In [3]:
# %% [DEFINITIVE Benchmark: DC-Only RRF vs. BC+DC RRF Ripple]
# This script performs a head-to-head comparison of two advanced RRF-based ripple strategies:
#
# 1. DC-Only RRF: A 2-signal fusion of:
#    - Dynamic Centroid semantic similarity.
#    - Direct Citation (DC): How many papers in the current frontier a candidate paper cites.
#
# 2. BC+DC RRF: A 3-signal fusion of:
#    - Dynamic Centroid semantic similarity.
#    - Bibliographic Coupling (BC): Shared references with the current frontier.
#    - Direct Citation (DC).
#
# It generates an extensive report with metrics on semantic drift, divergence, and Jaccard lineage.

import os
import time
import json
import logging
from collections import Counter
from typing import Dict, List, Any, Set, Tuple

import numpy as np
import pandas as pd
import requests
import torch
from sentence_transformers import SentenceTransformer
import xml.etree.ElementTree as ET

# =================================================================================
# >> CONFIGURATION <<
# =================================================================================
CFG = dict(
    CLUSTERS_PATH="clusters_snapshot.json",
    ENTREZ_EMAIL="you@example.com",
    NCBI_API_KEY=os.environ.get("NCBI_API_KEY"),
    MODEL_ID="Qwen/Qwen3-Embedding-0.6B",
    DEVICE="cuda" if torch.cuda.is_available() else "cpu",
    BATCH_SIZE=24,
    SUPPORT_MIN=1,
    GLOBAL_ABSOLUTE_CAP=100,
    MAX_DEPTH=15,
    RRF_K=60,
    WRITE_CSV=True,
    OUT_PREFIX="dc_vs_bcd_benchmark",
)

# =================================================================================
# Core Setup, Caching, and Data Fetching
# =================================================================================
def _setup_logging():
    fmt = "[%(asctime)s] %(levelname)s: %(message)s"; datefmt = "%H:%M:%S"
    logging.basicConfig(level=logging.INFO, format=fmt, datefmt=datefmt, force=True)
    return logging.getLogger("RippleBenchmark")
log = _setup_logging()
TCACHE, EMBEDDING_CACHE = {}, {}

def efetch_with_references(pmids: List[int]):
    pmids_to_fetch = [p for p in set(pmids) if p not in TCACHE]
    if not pmids_to_fetch: return
    log.info(f"    > Fetching metadata for {len(pmids_to_fetch)} PMIDs...")
    base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    for i in range(0, len(pmids_to_fetch), 200):
        sub = pmids_to_fetch[i:i+200]
        params = {"db": "pubmed", "id": ",".join(str(x) for x in sub), "retmode": "xml", "email": CFG["ENTREZ_EMAIL"]}
        if CFG["NCBI_API_KEY"]: params["api_key"] = CFG["NCBI_API_KEY"]
        try:
            r = requests.get(base, params=params, timeout=90)
            r.raise_for_status()
            root = ET.fromstring(r.text.encode('utf-8'))
            for art in root.findall(".//PubmedArticle"):
                pmid_el = art.find(".//PMID")
                if pmid_el is None or pmid_el.text is None: continue
                pid = int(pmid_el.text.strip())
                title_el = art.find(".//ArticleTitle")
                title = "".join(title_el.itertext()).strip() if title_el is not None else ""
                refs_list = [ref.text.strip() for ref in art.findall('.//ReferenceList//ArticleId[@IdType="pubmed"]') if ref.text]
                TCACHE[pid] = {"title": title, "references": [int(p) for p in refs_list if p.isdigit()]}
        except Exception as e: log.error(f"      ! EFetch failed: {e}")

def _get_embedding_vectors(pmids: set, model: SentenceTransformer) -> Dict[int, np.ndarray]:
    pmids_to_embed = [p for p in pmids if p not in EMBEDDING_CACHE]
    if pmids_to_embed:
        efetch_with_references(pmids_to_embed)
        log.info(f"    > Embedding {len(pmids_to_embed)} paper titles...")
        texts_to_embed = [f"passage: {TCACHE.get(p, {}).get('title', '')}" for p in pmids_to_embed]
        try:
            new_embeddings = model.encode(texts_to_embed, batch_size=CFG['BATCH_SIZE'], normalize_embeddings=True, show_progress_bar=False, device=CFG['DEVICE'])
            if len(new_embeddings) == len(pmids_to_embed):
                for pmid, emb in zip(pmids_to_embed, new_embeddings): EMBEDDING_CACHE[pmid] = emb
        except torch.cuda.OutOfMemoryError:
            log.error("    ! OOM Error during embedding. Consider reducing BATCH_SIZE in CFG.")
            torch.cuda.empty_cache() # Clear cache and continue with what we have
    return {p: EMBEDDING_CACHE[p] for p in pmids if p in EMBEDDING_CACHE}

def calculate_centroid(pmids: Set[int], model: SentenceTransformer) -> np.ndarray:
    vectors_dict = _get_embedding_vectors(pmids, model)
    if not vectors_dict: return np.array([])
    return np.mean(np.array(list(vectors_dict.values())), axis=0, keepdims=True)

# =================================================================================
# Scoring Engine
# =================================================================================
def calculate_ranked_scores(method: str, candidates: Set[int], frontier: Set[int], model: SentenceTransformer, scoring_centroid: np.ndarray):
    # --- Calculate all three base scores ---
    # 1. Semantic Score (vs. dynamic centroid)
    semantic_scores = {p: 0.0 for p in candidates}
    candidate_vectors_dict = _get_embedding_vectors(candidates, model)
    if candidate_vectors_dict and scoring_centroid.size > 0:
        valid_candidates = sorted(list(candidate_vectors_dict.keys()))
        candidate_vectors = np.array([candidate_vectors_dict[p] for p in valid_candidates])
        scores_array = (candidate_vectors @ scoring_centroid.T).flatten()
        semantic_scores.update(dict(zip(valid_candidates, scores_array)))

    # 2. Bibliographic Coupling Score
    frontier_refs = set().union(*(set(TCACHE.get(p, {}).get('references', [])) for p in frontier))
    bc_scores = {p: len(set(TCACHE.get(p, {}).get('references', [])).intersection(frontier_refs)) for p in candidates}

    # 3. Direct Citation Score
    dc_scores = {p: len(set(TCACHE.get(p, {}).get('references', [])).intersection(frontier)) for p in candidates}

    # --- RRF Combination ---
    def get_ranks(s: Dict) -> Dict: return {p: i + 1 for i, (p, _) in enumerate(sorted(s.items(), key=lambda item: item[1], reverse=True))}
    
    semantic_ranks = get_ranks(semantic_scores)
    bc_ranks = get_ranks(bc_scores)
    dc_ranks = get_ranks(dc_scores)
    
    k = CFG['RRF_K']
    rrf_scores = {}
    for pmid in candidates:
        sem_rank = semantic_ranks.get(pmid, len(semantic_ranks) + 1)
        dc_rank = dc_ranks.get(pmid, len(dc_ranks) + 1)
        
        if method == 'dc_only':
            rrf_scores[pmid] = (1 / (k + sem_rank)) + (1 / (k + dc_rank))
        elif method == 'bc_dc':
            bc_rank = bc_ranks.get(pmid, len(bc_ranks) + 1)
            rrf_scores[pmid] = (1 / (k + sem_rank)) + (1 / (k + bc_rank)) + (1 / (k + dc_rank))
            
    return rrf_scores, semantic_scores, bc_scores, dc_scores

# =================================================================================
# Main Ripple Logic
# =================================================================================
def run_ripple(cluster_id: int, seed_pmids: Set[int], model: SentenceTransformer, method: str, seed_centroid: np.ndarray) -> List[Dict]:
    log.info(f"--- [{method.upper()}] Starting ripple for Cluster {cluster_id} ---")
    visited, frontier, history, depth = set(seed_pmids), set(seed_pmids), [], 0
    prev_centroid = seed_centroid

    while frontier and depth < CFG["MAX_DEPTH"]:
        depth += 1
        
        try:
            r = requests.get("https://icite.od.nih.gov/api/pubs", params={"pmids": ",".join(map(str, frontier)), "format": "json"}, timeout=90)
            r.raise_for_status()
            support_counts = Counter()
            for rec in r.json().get("data", []):
                if rec.get("pmid") in frontier: support_counts.update(rec.get("cited_by", []))
        except requests.RequestException as e: log.error(f"iCite request failed: {e}"); break
        
        discovered = {p for p, c in support_counts.items() if p not in visited and c >= CFG["SUPPORT_MIN"]}
        if not discovered: log.info(f"  > Halt @ Depth {depth}: No new supported papers."); break

        efetch_with_references(list(discovered | frontier))
        scoring_centroid = calculate_centroid(frontier, model)
        if scoring_centroid.size == 0: log.warning(f"  > Halt @ Depth {depth}: Could not compute frontier centroid."); break
        
        scores, semantic_scores, bc_scores, dc_scores = calculate_ranked_scores(method, discovered, frontier, model, scoring_centroid)
        kept = sorted(discovered, key=lambda p: scores.get(p, 0), reverse=True)[:CFG['GLOBAL_ABSOLUTE_CAP']]
        
        if kept:
            log.info(f"  > Depth {depth}: Discovered={len(discovered)}, Kept={len(kept)}")
            log.info(f"    Top 5 picks for this layer (ranked by {method.upper()} RRF):")
            for i, pmid in enumerate(kept[:5]):
                title = TCACHE.get(pmid, {}).get('title', 'N/A')
                rrf_s, sem_s, bc_s, dc_s = scores.get(pmid,0), semantic_scores.get(pmid,0), bc_scores.get(pmid,0), dc_scores.get(pmid,0)
                if method == 'dc_only':
                    log.info(f"      {i+1}. [{pmid}] (RRF: {rrf_s:.4f} | Sem: {sem_s:.3f} | DC: {dc_s}) {title}")
                else: # bc_dc
                    log.info(f"      {i+1}. [{pmid}] (RRF: {rrf_s:.4f} | Sem: {sem_s:.3f} | BC: {bc_s} | DC: {dc_s}) {title}")
        else:
            log.info(f"  > Halt @ Depth {depth}: No papers kept after scoring."); break

        visited.update(kept)
        frontier = set(kept)
        current_centroid = calculate_centroid(frontier, model)
        drift = (prev_centroid @ current_centroid.T).item() if prev_centroid.size and current_centroid.size else 0.0
        
        history.append({'depth': depth, 'cumulative_set': visited.copy(), 'frontier_centroid': current_centroid})
        log.info(f"    > Semantic Drift from previous layer: {drift:.4f}")
        prev_centroid = current_centroid
        
    log.info(f"--- [{method.upper()}] Finished ripple for Cluster {cluster_id} at depth {depth}. ---")
    return history

# =================================================================================
# Main Execution and Comparison Block
# =================================================================================
def run_dc_vs_bcd_comparison():
    if CFG['DEVICE'] == 'cuda': torch.cuda.empty_cache()
    log.info(f"Loading embedding model: {CFG['MODEL_ID']}...")
    model = SentenceTransformer(CFG['MODEL_ID'], device=CFG['DEVICE'])
    log.info("Model loaded successfully.")
    try:
        with open(CFG["CLUSTERS_PATH"], "r") as f: clusters = {int(k): v for k, v in json.load(f).items()}
    except Exception as e: log.error(f"FATAL: Could not load clusters file. Error: {e}"); return

    clusters_to_run = [4, 8]
    log.info(f"--> Running benchmark on clusters: {clusters_to_run}")
    all_metrics = []

    for cid in clusters_to_run:
        pmids = set(clusters.get(cid, []))
        if not pmids: continue
        
        global TCACHE, EMBEDDING_CACHE; TCACHE, EMBEDDING_CACHE = {}, {}
        log.info(f"\n{'='*60}\n>>> Processing Cluster {cid} (Seeds: {len(pmids)}) <<<\n{'='*60}")
        seed_centroid = calculate_centroid(pmids, model)

        dc_only_history = run_ripple(cid, pmids, model, 'dc_only', seed_centroid)
        bc_dc_history = run_ripple(cid, pmids, model, 'bc_dc', seed_centroid)

        max_d = min(len(dc_only_history), len(bc_dc_history))
        if max_d == 0: continue
        log.info(f"\n--- Analysis for Cluster {cid} (comparing up to depth {max_d}) ---")
        
        prev_dc_centroid = seed_centroid
        prev_bcd_centroid = seed_centroid

        for i in range(max_d):
            dc_step, bcd_step = dc_only_history[i], bc_dc_history[i]
            
            union_size = len(dc_step['cumulative_set'] | bcd_step['cumulative_set'])
            jaccard = len(dc_step['cumulative_set'] & bcd_step['cumulative_set']) / union_size if union_size > 0 else 0
            
            dc_drift = (prev_dc_centroid @ dc_step['frontier_centroid'].T).item() if prev_dc_centroid.size and dc_step['frontier_centroid'].size else 0
            bcd_drift = (prev_bcd_centroid @ bcd_step['frontier_centroid'].T).item() if prev_bcd_centroid.size and bcd_step['frontier_centroid'].size else 0
            divergence = (dc_step['frontier_centroid'] @ bcd_step['frontier_centroid'].T).item() if dc_step['frontier_centroid'].size and bcd_step['frontier_centroid'].size else 0
            
            all_metrics.append({
                'cluster_id': cid, 'depth': dc_step['depth'], 'jaccard_lineage': jaccard,
                'dc_only_drift': dc_drift, 'bc_dc_drift': bcd_drift,
                'inter_method_divergence': divergence,
                'dc_only_set_size': len(dc_step['cumulative_set']),
                'bc_dc_set_size': len(bcd_step['cumulative_set']),
            })
            prev_dc_centroid, prev_bcd_centroid = dc_step['frontier_centroid'], bcd_step['frontier_centroid']

    if not all_metrics:
        log.error("No data collected. Benchmark finished without results.")
        return

    summary_df = pd.DataFrame(all_metrics).sort_values(by=['cluster_id', 'depth'])
    print("\n\n" + "="*95); print(" " * 20 + "DC-ONLY vs. BC+DC RRF BENCHMARK: DETAILED ANALYSIS"); print("="*95)
    print(summary_df.to_string(index=False, float_format="%.4f"))
    print("="*95)

    if CFG["WRITE_CSV"]:
        out_path = f"{CFG['OUT_PREFIX']}_summary.csv"
        summary_df.to_csv(out_path, index=False, float_format="%.4f")
        log.info(f"Benchmark summary saved to '{out_path}'")

    if CFG['DEVICE'] == 'cuda':
        log.info("Clearing CUDA cache..."); del model; torch.cuda.empty_cache()
    log.info("Benchmark complete.")

# --- Run the benchmark ---
run_dc_vs_bcd_comparison()

[11:26:13] INFO: Loading embedding model: Qwen/Qwen3-Embedding-0.6B...
[11:26:13] INFO: Load pretrained SentenceTransformer: Qwen/Qwen3-Embedding-0.6B
[11:26:19] INFO: 1 prompt is loaded, with the key: query
[11:26:19] INFO: Model loaded successfully.
[11:26:19] INFO: --> Running benchmark on clusters: [4, 8]
[11:26:19] INFO: 
>>> Processing Cluster 4 (Seeds: 8) <<<
[11:26:19] INFO:     > Fetching metadata for 8 PMIDs...
[11:26:20] INFO:     > Embedding 8 paper titles...
[11:26:20] INFO: --- [DC_ONLY] Starting ripple for Cluster 4 ---
[11:26:21] INFO:     > Fetching metadata for 92 PMIDs...
[11:26:22] INFO:     > Embedding 92 paper titles...
[11:26:23] INFO:   > Depth 1: Discovered=92, Kept=92
[11:26:23] INFO:     Top 5 picks for this layer (ranked by DC_ONLY RRF):
[11:26:23] INFO:       1. [39917072] (RRF: 0.0306 | Sem: 0.673 | DC: 1) The role of gastric ultrasound in anaesthesia for emergency surgery: A review and clinical guidance.
[11:26:23] INFO:       2. [34979932] (RRF: 0.0286 |



                    DC-ONLY vs. BC+DC RRF BENCHMARK: DETAILED ANALYSIS
 cluster_id  depth  jaccard_lineage  dc_only_drift  bc_dc_drift  inter_method_divergence  dc_only_set_size  bc_dc_set_size
          4      1           1.0000         0.5876       0.5876                   0.5748               100             100
          4      2           0.8692         0.5942       0.5815                   0.6221               200             200
          4      3           0.7647         0.5944       0.5667                   0.5556               300             300
          4      4           0.6771         0.5489       0.5298                   0.5384               400             400
          4      5           0.6287         0.5334       0.4969                   0.5053               500             500
          4      6           0.5584         0.5100       0.4676                   0.4829               600             600
          4      7           0.5086         0.4899       0.4747   

# Analysis of DC-Only vs. BC+DC RRF Benchmark

**Run Date:** 2025-08-26
**Clusters Tested:** 4 ("Gastric Ultrasound"), 8 ("Vaginal Misoprostol")
**Objective:** To determine if the simple **Direct Citation (DC)** signal is sufficient for exploration, or if adding the contextual **Bibliographic Coupling (BC)** signal provides a superior search trajectory.

---
## Executive Summary

The results reveal a clear distinction in the behavior of the two methods.

* The **DC-Only** method acts as a **"follower."** It excels at tracking a single, highly coherent thematic line. Its strength lies in its simplicity, which keeps it focused on papers that directly build upon the most recent layer of discovery.

* The **BC+DC** method acts as a **"synthesizer."** By incorporating the broader context of shared references (BC), it is better at identifying and pulling in papers from adjacent, thematically relevant fields. It builds a more comprehensive and contextually rich map of the literature.

For the agent's goal of discovering research gaps by bridging topics, the **BC+DC method is demonstrably superior**. The additional context provided by BC is not noise; it is a crucial signal for intelligent exploration.

---
## Cluster 4 Analysis: "Gastric Ultrasound for Aspiration Risk"

This test clearly shows the contextualizing power of Bibliographic Coupling.

### Key Observations:
* **Thematic Lineage:** Both methods correctly identify the major pivot from **Gastric Ultrasound** to the timely topic of **GLP-1 Receptor Agonists** around Depth 4. However, their exploration of this new topic differs significantly.
    * The **DC-Only** method stays narrowly focused on the clinical implications of GLP-1 agonists during anesthesia. It follows a very tight, logical chain.
    * The **BC+DC** method, while also focusing on GLP-1 agonists, uses the BC signal to pull in a wider range of contextually related papers, including reviews on pharmacokinetics, management of related comorbidities (diabetes, obesity), and broader applications in cardiology and critical care. It understands the "why" and "how" of the topic, not just the "what."

* **Interpreting the Metrics:**
    * **Jaccard Lineage:** The `jaccard_lineage` drops to **0.4931**. This moderate divergence shows that while they are exploring the same core topic, the BC+DC method is supplementing its findings with a significant amount of unique, contextually relevant literature that the DC-Only method misses.
    * **Semantic Drift:** The drift scores are very similar for both, indicating that neither method is unstable. The BC+DC method's slightly lower drift scores in the later stages (`0.3985` vs `0.4023` at depth 10) suggest it is casting a slightly wider, more stable net, while the DC-Only method is making smaller, more focused hops.

### Conclusion for Cluster 4:
The BC+DC method's ability to synthesize information from a broader bibliographic context allowed it to build a more comprehensive understanding of the GLP-1 agonist topic. The DC-Only method was effective but myopic in comparison.

---
## Cluster 8 Analysis: "Vaginal Misoprostol for Hysteroscopy"

This cluster's ripple provides a powerful illustration of how BC enables thematic bridging, while a pure DC signal can lead to a less coherent exploration path.

### Key Observations:
* **Thematic Lineage:** This is where the difference is most stark.
    * The **DC-Only** method follows a path from `Hysteroscopy` -> `Fertility after Hysteroscopy` -> `Platelet-Rich Plasma (PRP) for Endometrium` -> `General Cancer Therapies`. The final jump to broad oncology, while bibliographically linked, represents a loss of the core "fertility" theme.
    * The **BC+DC** method follows a much more sophisticated and clinically relevant narrative: `Hysteroscopy` -> `Fertility-Sparing Treatments for Endometrial Cancer` -> `Endometrial Receptivity & Regeneration` -> `Stem Cells` -> `Endometriosis & Microbiome`. It successfully bridges from a surgical procedure to the underlying cellular biology and immunology of fertility, which is a far more insightful path.

* **Interpreting the Metrics:**
    * **Jaccard Lineage:** The `jaccard_lineage` falls to **0.2860**, indicating a massive divergence. By the end, they are operating in almost entirely different research domains.
    * **Semantic Drift:** The `bc_dc_drift` score is consistently higher than the `dc_only_drift` score for most of the run (e.g., `0.5422` vs `0.5290` at depth 6). This shows that despite its wider exploration, the BC+DC method maintained a more stable and coherent step-by-step progression, guided by the strong contextual anchor of BC.

### Conclusion for Cluster 8:
The Direct Citation signal alone was not enough to keep the exploration focused on the most relevant clinical goal. The addition of Bibliographic Coupling was essential; it acted as a "compass," constantly re-orienting the search toward the foundational science of the topic, leading to a much more insightful and powerful discovery trajectory.

---
## Final Recommendation

This benchmark provides a definitive answer: **the combined BC+DC RRF method is the most effective strategy**.

While **Direct Citation** is a valuable signal for maintaining forward momentum, **Bibliographic Coupling** is the critical ingredient for contextual understanding and intelligent thematic bridging. For an agent whose purpose is to find gaps by understanding the connections *between* fields, the rich, contextual signal provided by BC is indispensable.

# Standard for Ripple Benchmark Analysis

## 1. Objective

The primary objective of this benchmark is to quantitatively and qualitatively evaluate the performance of different downstream search algorithms ("ripples"). The goal is to identify the most effective strategy for an autonomous agent to explore a given research topic, balancing thematic cohesion with the power of discovery to identify potential research gaps.

---

## 2. Core Methodologies to Compare

This standard defines a head-to-head comparison between two primary ripple strategies, both of which use Reciprocal Rank Fusion (RRF) to combine semantic and bibliographic signals.

* **Static Centroid RRF:** The semantic score is always calculated against the centroid of the **initial seed cluster ($S$)**. This method tests how well a ripple expands from a fixed thematic origin.
* **Dynamic Centroid RRF:** The semantic score is calculated against the centroid of the **previous frontier layer ($D_{n-1}$)**. This method tests how well a ripple adaptively shifts its thematic focus as it discovers new literature.

---

## 3. Key Performance Metrics

To ensure a comprehensive evaluation, the following metrics must be tracked at each depth (`n`) of the ripple.

### A. Semantic Trajectory Metrics

These metrics track the thematic evolution of the search.

* **Semantic Drift (Intra-Method)**
    * **Calculation:** Cosine similarity between the centroids of consecutive layers: $cos\_sim(C(D_{n-1}), C(D_n))$. The first step is $cos\_sim(C(S), C(D_1))$.
    * **Purpose:** Measures the **thematic stability** of a single ripple. A value near `1.0` indicates the focus is stable. A sudden drop signals a significant thematic pivot, which can be either a feature (intelligent exploration) or a bug (catastrophic drift).

* **Semantic Divergence (Inter-Method)**
    * **Calculation:** Cosine similarity between the centroids of the two different methods at the same depth: $cos\_sim(C_{static}(D_n), C_{dynamic}(D_n))$.
    * **Purpose:** Measures how far apart the **thematic focus** of the two methods has become. A high value means both methods are exploring a similar thematic space. A low value indicates they have branched into entirely different research areas.

### B. Literature Discovery Metrics

These metrics track the overlap and scope of the documents found.

* **Jaccard Similarity Lineage**
    * **Calculation:** Jaccard similarity between the *cumulative sets* of discovered PMIDs for both methods at each depth `n`.
    * **Purpose:** Measures the **document-level overlap**. This answers the crucial question: "Are the two methods finding the same papers?" It provides a concrete measure of how much their discovered bibliographies diverge over time.

* **Scope & Efficiency Metrics**
    * **Calculation:** Final depth reached, total unique PMIDs in the final set, and total execution time.
    * **Purpose:** Provides a high-level summary of the search's breadth and computational cost.

---

## 4. Interpretation Framework

A successful ripple is not necessarily the one that goes deepest or finds the most papers. The goal is intelligent exploration.

* **An "Ideal" Ripple:** Exhibits high semantic drift stability in its early stages as it exploits the core topic. It then shows a clean drop in drift, signaling a pivot to a new, relevant area, followed by another period of stability.
* **Signs of Failure:** A continuous, slow decline in semantic drift indicates aimless wandering. A sudden, catastrophic drop to a very low similarity value (e.g., < 0.4) indicates a complete loss of thematic coherence.
* **Interpreting Divergence:** Low semantic divergence and high Jaccard similarity suggest both methods are robust for that topic. High divergence and low Jaccard similarity indicate that the choice of method has a profound impact on the discovery path, highlighting the adaptive nature of the dynamic centroid.