In [6]:
# ===== sniff_v4_cell1_core.py =====
# Config, typed contracts, utilities (normalization, telemetry), and LM Studio JSON AVR wrappers.

import os, re, json, time, math, pathlib, random, unicodedata, textwrap
from dataclasses import dataclass, field, asdict
from typing import List, Dict, Optional, Tuple, Callable, Any
import requests
from collections import Counter, defaultdict

# ----------------------------
# Config / constants
# ----------------------------
LMSTUDIO_BASE   = os.getenv("LMSTUDIO_BASE", "http://127.0.0.1:1234")
QWEN_MODEL      = os.getenv("QWEN_MODEL", "unsloth/qwen3-4b")                 # protocol, plausibility, synonyms
SCREENER_MODEL  = os.getenv("SCREENER_MODEL", "gemma-3n-e4b-it@q4_k_s")      # fast checklist screener

ENTREZ_EMAIL    = os.getenv("ENTREZ_EMAIL", "you@example.com")
ENTREZ_API_KEY  = os.getenv("ENTREZ_API_KEY", "")

HTTP_TIMEOUT    = int(os.getenv("HTTP_TIMEOUT", "300"))

# Universe budgets/thresholds
UNIVERSE_TARGET_MIN  = 50         # desired lower bound after deterministic prefilter
UNIVERSE_HARD_MIN    = 10         # absolute floor after relax ladder
FETCH_BUDGET_IDS     = 1200       # total efetch budget across all query candidates
PAGE_SIZE_ES         = 200        # per-page ESearch fetch
EFETCH_BATCH_SIZE    = 200

# Rerank / screening
SCREEN_TOP_K         = 60
PLAUSIBILITY_MIN_INCLUDES = 3

# Weights for PICO terms in rerank
WEIGHTS = {"P": 1.5, "I": 1.75, "C": 1.0, "O": 1.0, "ANCHOR": 2.0, "AVOID": -2.5}

# Output dir
OUT_DIR = pathlib.Path("sniff_out_v4")
(OUT_DIR / "logs").mkdir(parents=True, exist_ok=True)

random.seed(42)

# ----------------------------
# Knowledge base (designs, languages, pubtype aliases)
# ----------------------------
KB = {
    "publication_types_allowable_primary": [
        "Randomized Controlled Trial",
        "Controlled Clinical Trial",
        "Clinical Trial",
    ],
    "publication_types_allowable_secondary": [
        "Comparative Study", "Cohort Studies", "Case-Control Studies",
        "Observational Study", "Multicenter Study", "Cross-Sectional Studies",
        "Clinical Trial Protocol", "Evaluation Study"
    ],
    # Extra hard excludes include review flavors (toggle via config below)
    "publication_types_hard_exclude_base": [
        "Editorial", "Letter", "Comment", "News", "Interview",
        "Practice Guideline", "Guideline", "Consensus Development Conference",
        "Case Reports"
    ],
    "review_flavors": ["Review", "Systematic Review", "Meta-Analysis"],
    "languages_human": ["english","spanish","portuguese","french","german","italian","chinese","japanese","korean"],
    "pubtype_aliases": {
        "Randomized Controlled Trial": ["Randomized Controlled Trial"],
        "Controlled Clinical Trial":  ["Controlled Clinical Trial"],
        "Clinical Trial":             ["Clinical Trial"],
        "Comparative Study":          ["Comparative Study"],
        "Cohort Studies":             ["Cohort Studies","Prospective Studies","Retrospective Studies"],
        "Case-Control Studies":       ["Case-Control Studies"],
        "Observational Study":        ["Observational Study"],
        "Multicenter Study":          ["Multicenter Study"],
        "Cross-Sectional Studies":    ["Cross-Sectional Studies"],
        "Clinical Trial Protocol":    ["Clinical Trial Protocol","Study Protocols"],
        "Evaluation Study":           ["Evaluation Study"]
    }
}

# Config toggles
INCLUDE_REVIEW_FLAVORS = False   # if True, do NOT hard-exclude reviews in prefilter

# Language normalization (PubMed often uses 3-letter codes)
LANG_MAP = {
    "eng":"english","en":"english",
    "spa":"spanish","es":"spanish",
    "por":"portuguese","pt":"portuguese",
    "fra":"french","fre":"french","fr":"french",
    "deu":"german","ger":"german","de":"german",
    "ita":"italian","it":"italian",
    "chi":"chinese","zho":"chinese","zh":"chinese",
    "jpn":"japanese","ja":"japanese",
    "kor":"korean","ko":"korean"
}

# ---- Token policy & optional library-based shortening ----
TOKEN_MAX_WORDS = 4                 # final target for P/I/C/O tokens
VALIDATE_MAX_WORDS = 12             # schema sanity cap before we post-process

# Optional: spaCy models if available (opportunistic, no hard dependency)
try:
    import spacy
    _SPACY_MODELS = {}
    def _get_spacy(lang_human: str):
        name_map = {
            "english": "en_core_web_sm",
            "spanish": "es_core_news_sm",
            "portuguese": "pt_core_news_sm"
        }
        name = name_map.get(lang_human)
        if not name:
            return None
        if name in _SPACY_MODELS:
            return _SPACY_MODELS[name]
        try:
            nlp = spacy.load(name, disable=["ner","parser","textcat"])
            _SPACY_MODELS[name] = nlp
            return nlp
        except Exception:
            return None
except Exception:
    spacy = None
    _SPACY_MODELS = {}

def _split_paren(t: str):
    acrs = re.findall(r"\(([^)]+)\)", t or "")
    base = re.sub(r"\([^)]*\)", "", t or "")
    return base.strip(), [a.strip() for a in acrs if a.strip()]

def _rule_shorten_one(t: str) -> list:
    base, acrs = _split_paren(t)
    words = [w for w in re.split(r"\W+", base) if w]
    # simple biomedical-ish pruning
    stopish = {"undergoing","used","for","during","repair","scores","consumption",
               "postoperative","intraoperatively","procedure","surgery",
               "the","of","and","or","with","in","at","to"}
    words = [w for w in words if w.lower() not in stopish]
    s = " ".join(words[:TOKEN_MAX_WORDS]).strip().lower()
    outs = []
    if s: outs.append(s)
    for a in acrs:
        if len(a.split()) <= TOKEN_MAX_WORDS:
            al = a.lower()
            if al and al not in outs:
                outs.append(al)
    return outs if outs else [norm_txt(t)[:30]]

def _dedup(seq: list) -> list:
    seen=set(); out=[]
    for x in seq:
        xl=(x or "").strip().lower()
        if xl and xl not in seen:
            seen.add(xl); out.append(x)
    return out

def shorten_tokens(tokens: list, languages: list) -> list:
    """Prefer spaCy noun-chunk compression if a compatible model is present; else fallback."""
    outs=[]
    lang_try = next((l for l in (languages or []) if l in ("english","spanish","portuguese")), None)
    nlp = _get_spacy(lang_try) if (lang_try and 'spacy' in globals()) else None
    for t in tokens or []:
        if not t or not t.strip(): 
            continue
        if len(t.split()) <= TOKEN_MAX_WORDS:
            outs.append(t.strip())
            continue
        if nlp:
            doc = nlp(t)
            # take the first noun chunk with ≤4 tokens, else head lemma(s), else fallback
            cand = None
            for nc in doc.noun_chunks:
                if len(nc.text.split()) <= TOKEN_MAX_WORDS:
                    cand = nc.text
                    break
            if not cand:
                heads = [tok.lemma_ for tok in doc if tok.head == tok]
                cand = " ".join(heads[:TOKEN_MAX_WORDS]) if heads else None
            if cand:
                outs.append(cand.strip().lower())
            else:
                outs.extend(_rule_shorten_one(t))
        else:
            outs.extend(_rule_shorten_one(t))
    return _dedup(outs)


def normalize_lang(s: Optional[str]) -> str:
    if not s: return ""
    k = s.strip().lower()
    return LANG_MAP.get(k, k)

# ----------------------------
# Contracts (typed records)
# ----------------------------
@dataclass
class Protocol:
    narrative_question: str
    inclusion_criteria: List[str]
    exclusion_criteria: List[str]
    screening_rules_note: Dict[str, str]
    pico_tokens: Dict[str, List[str]]
    anchors_must_have: List[str]
    avoid_terms: List[str]
    designs_preference: str
    deterministic_filters: Dict[str, Any]  # {"languages":[...], "year_min": int}
    # NEW — user-driven controls
    designs_allowed: List[str] = field(default_factory=list)            # optional, from KB primary+secondary
    publication_types_allowlist: List[str] = field(default_factory=list)
    publication_types_blocklist: List[str] = field(default_factory=list)


@dataclass
class QueryCandidate:
    id: str
    query: str
    origin: str
    terms_used: Dict[str, List[str]]
    expected_breadth: str

@dataclass
class PubMedRecord:
    pmid: str
    title: str
    abstract: str
    year: Optional[int]
    language: Optional[str]
    pubtypes: List[str]
    mesh: List[str]

@dataclass
class ScreenDecision:
    pmid: str
    decision: str               # INCLUDE|BORDERLINE|EXCLUDE
    why: str
    checklist: Dict[str, bool]
    mesh_roles: List[Dict[str,str]]

# ----------------------------
# Telemetry
# ----------------------------
def log_jsonl(event: str, payload: Dict[str, Any], fname="events.jsonl"):
    payload = dict(payload)
    payload["event"] = event
    payload["ts"] = time.time()
    with open(OUT_DIR / "logs" / fname, "a", encoding="utf-8") as f:
        f.write(json.dumps(payload, ensure_ascii=False) + "\n")

# ----------------------------
# Text normalization & helpers
# ----------------------------
def norm_txt(s: str) -> str:
    s = s or ""
    s = unicodedata.normalize("NFKD", s).encode("ascii","ignore").decode("ascii")
    s = re.sub(r"[-/]", " ", s)
    s = re.sub(r"\s+", " ", s).strip().lower()
    return s

def or_block(terms: List[str], field="tiab") -> str:
    toks=[]
    for t in terms:
        t=t.strip()
        if not t: continue
        toks.append(f"\"{t}\"[{field}]" if (" " in t or "-" in t) else f"{t}[{field}]")
    return "(" + " OR ".join(toks) + ")" if toks else ""

# ----------------------------
# LM Studio client + AVR JSON utilities
# ----------------------------
class LMClient:
    def __init__(self, base=LMSTUDIO_BASE, timeout=HTTP_TIMEOUT):
        self.base = base.rstrip("/")
        self.timeout = timeout
    def chat(self, model: str, system: str, user: str, temperature=0.0) -> str:
        url = f"{self.base}/v1/chat/completions"
        body = {"model": model, "messages": [
            {"role":"system","content": system},
            {"role":"user","content": user}
        ], "temperature": float(temperature), "stream": False}
        r = requests.post(url, json=body, timeout=self.timeout)
        r.raise_for_status()
        return r.json()["choices"][0]["message"]["content"]

LM = LMClient()

_BEGIN = re.compile(r"BEGIN_JSON\s*", re.I)
_END   = re.compile(r"\s*END_JSON", re.I)
_FENCE = re.compile(r"```(?:json)?\s*([\s\S]*?)```", re.I)

def _sanitize_json_str(s: str) -> str:
    s = s.replace("\u201c", '"').replace("\u201d", '"').replace("\u2018","'").replace("\u2019","'")
    s = re.sub(r",\s*(\}|\])", r"\1", s)
    return s.strip()

def extract_json_block_or_fence(txt: str) -> str:
    blocks = []
    pos=0
    while True:
        m1 = _BEGIN.search(txt, pos)
        if not m1: break
        m2 = _END.search(txt, m1.end())
        if not m2: break
        blocks.append(txt[m1.end():m2.start()])
        pos = m2.end()
    if blocks:
        return _sanitize_json_str(blocks[-1])
    fences = _FENCE.findall(txt)
    if fences:
        return _sanitize_json_str(fences[-1])
    # last balanced {...}
    s = txt
    last_obj=None; stack=0; start=None
    for i,ch in enumerate(s):
        if ch=='{':
            if stack==0: start=i
            stack+=1
        elif ch=='}':
            if stack>0:
                stack-=1
                if stack==0 and start is not None:
                    last_obj = s[start:i+1]
    if last_obj:
        return _sanitize_json_str(last_obj)
    raise ValueError("No JSON-like content found")

STRICT_JSON_RULES = (
  "Return ONLY one JSON object. No analysis, no preface, no notes. "
  "Wrap it EXACTLY with:\nBEGIN_JSON\n{...}\nEND_JSON"
)

def get_validated_json(model: str, system_prompt: str, user_prompt: str,
                       validator: Callable[[Dict[str,Any]], Tuple[bool,str]],
                       retries: int = 2, temperature: float = 0.0) -> Dict[str,Any]:
    def _make_fix_hint(err: str) -> str:
        e = (err or "").lower()
        hints = []
        if "tokens too long" in e:
            hints.append(
                "- FIX: Each P/I/C/O token must be ≤ 4 words. If a comparator token has commas, split into separate items. Use short canonical forms (e.g., 'INC', 'Nuss', 'pectus excavatum')."
            )
        if "languages" in e and "anchors" in e:
            hints.append("- Do NOT include languages or years in anchors_must_have.")
        if "designs_allowed" in e:
            hints.append("- designs_allowed must contain only items from the KB lists (primary/secondary).")
        return ("\n".join(hints)) if hints else ""
    last_err = ""
    for i in range(retries+1):
        fix = _make_fix_hint(last_err)
        up = user_prompt + ("\n\nPrevious error: " + last_err + ("\n" + fix if fix else "") if last_err else "")
        raw = LM.chat(model, system_prompt, up + "\n\n" + STRICT_JSON_RULES, temperature=temperature)
        log_jsonl("llm_json_attempt", {"model": model, "try": i, "chars": len(raw)})
        try:
            js = json.loads(extract_json_block_or_fence(raw))
        except Exception as e:
            last_err = f"Malformed JSON: {e}"
            print(f"[AVR] attempt {i} → malformed JSON ({e})")
            if i==retries: raise RuntimeError(f"LLM failed to produce JSON: {last_err}")
            continue
        ok, why = validator(js)
        if ok:
            return js
        last_err = f"Schema invalid: {why}"
        print(f"[AVR] attempt {i} → schema invalid: {why}")
        if i==retries:
            raise RuntimeError(f"LLM JSON schema invalid after retries: {why}")


# ----------------------------
# Protocol lockdown via LLM (typed)
# ----------------------------
PROTO_SYSTEM = """You are designing a structured, search-ready SR protocol from a natural-language question.

HARD CONSTRAINTS (must follow EXACTLY):
- Return ONLY one JSON object using the provided schema (no prose).
- P/I/C/O tokenization RULES:
  - Each token must be ≤ 4 words (short, atomic).
  - If a token > 4 words, SPLIT it into multiple tokens, each ≤ 4 words.
  - For C (comparators): list EACH comparator as its OWN token (do NOT pack commas into one string).
  - Prefer canonical short forms and acronyms (e.g., "INC", "Nuss", "pectus excavatum").
  - Do NOT put languages or years into anchors_must_have.
- 'designs_preference' must be ONE of: Randomized Controlled Trial, Controlled Clinical Trial, Clinical Trial.
- 'deterministic_filters' MUST include: languages (subset of given KB languages, lowercased) and year_min (int).
- If the user does not explicitly specify "anchors_must_have", set it to [].
- Only include up to TWO anchors, and only if they are essential core terms (e.g., "pectus excavatum", "cryoablation").

USER CHOICES:
- If the question specifies study designs or publication types to include/exclude, populate:
  - "designs_allowed": array of designs chosen from the KB primary+secondary lists.
  - "publication_types_allowlist": exact Publication Type names to force-include.
  - "publication_types_blocklist": exact Publication Type names to force-exclude.

If incoherent, set needs_clarification=true with a brief request. Return ONLY the requested JSON."""


def proto_user(nlq: str) -> str:
    kb_view = {
        "designs_primary": KB["publication_types_allowable_primary"],
        "designs_secondary": KB["publication_types_allowable_secondary"],
        "languages": KB["languages_human"]
    }
    schema = {
      "narrative_question": "<1 paragraph restatement>",
      "inclusion_criteria": ["..."],
      "exclusion_criteria": ["..."],
      "screening_rules_note": {"user_notes":"...", "llm_guidance":"..."},
      "pico_tokens": {"P":["..."], "I":["..."], "C":["..."], "O":["..."]},
      "anchors_must_have": ["..."],
      "avoid_terms": ["..."],
      "designs_preference": "Randomized Controlled Trial",
      "deterministic_filters": {"languages": ["english"], "year_min": 2015},
      "designs_allowed": [],                        # NEW (optional)
      "publication_types_allowlist": [],            # NEW (optional)
      "publication_types_blocklist": [],            # NEW (optional)
      "needs_clarification": False,
      "clarification_request": ""
    }
    return f"""Natural-Language Question:
<<<
{nlq.strip()}
>>>

Knowledge Base (valid choices):
{json.dumps(kb_view, indent=2)}

Output schema:
{json.dumps(schema, indent=2)}"""

def validate_protocol(js: Dict[str,Any]) -> Tuple[bool,str]:
    try:
        req = ["narrative_question","inclusion_criteria","exclusion_criteria",
               "screening_rules_note","pico_tokens","anchors_must_have","avoid_terms",
               "designs_preference","deterministic_filters","needs_clarification","clarification_request"]
        for k in req:
            if k not in js: return False, f"missing key {k}"
        if not isinstance(js["pico_tokens"], dict): return False, "pico_tokens must be object"
        for k in ["P","I","C","O"]:
            if k not in js["pico_tokens"]: return False, f"pico_tokens missing {k}"
            if not isinstance(js["pico_tokens"][k], list): return False, f"pico_tokens[{k}] must be list"
        df = js["deterministic_filters"]
        langs = df.get("languages", [])
        if not isinstance(langs, list) or not langs: return False, "languages must be non-empty list"
        # Ensure languages are subset of KB.languages_human
        langs_ok = [l for l in langs if l.lower() in KB["languages_human"]]
        if not langs_ok: return False, "no valid languages from KB"
        df["languages"] = [l.lower() for l in langs_ok]
        y = df.get("year_min", 0)
        if isinstance(y, str) and y.isdigit(): y = int(y); df["year_min"] = y
        if not isinstance(df["year_min"], int): return False, "year_min must be int"
        if js["designs_preference"] not in KB["publication_types_allowable_primary"]:
            return False, "designs_preference must be a primary design"
        # token length safeguard
        # Optional fields sanity
        for k in ["designs_allowed","publication_types_allowlist","publication_types_blocklist"]:
            if k in js and not isinstance(js[k], list):
                return False, f"{k} must be a list"
            if k in js:
                for v in js[k]:
                    if not isinstance(v, str):
                        return False, f"{k} must contain strings"
        # If designs_allowed provided, ensure items are within KB space
        if js.get("designs_allowed"):
            valid = set(KB["publication_types_allowable_primary"] + KB["publication_types_allowable_secondary"])
            bad = [d for d in js["designs_allowed"] if d not in valid]
            if bad:
                return False, f"designs_allowed contains invalid: {bad[:2]}"
        return True, ""
    except Exception as e:
        return False, f"exception: {e}"

def lock_protocol(nlq: str) -> Protocol:
    js = get_validated_json(QWEN_MODEL, PROTO_SYSTEM, proto_user(nlq), validate_protocol, retries=2)
    js = enforce_token_policy_js(js)  # library-first shorten + hygiene
    if js.get("needs_clarification"):
        raise RuntimeError("Protocol needs clarification: " + js.get("clarification_request",""))
    proto = Protocol(
        narrative_question=js["narrative_question"],
        inclusion_criteria=js["inclusion_criteria"],
        exclusion_criteria=js["exclusion_criteria"],
        screening_rules_note=js.get("screening_rules_note", {}),
        pico_tokens=js["pico_tokens"],
        anchors_must_have=js["anchors_must_have"],
        avoid_terms=js["avoid_terms"],
        designs_preference=js["designs_preference"],
        deterministic_filters=js["deterministic_filters"],
        designs_allowed=js.get("designs_allowed", []),
        publication_types_allowlist=js.get("publication_types_allowlist", []),
        publication_types_blocklist=js.get("publication_types_blocklist", []),
    )
    log_jsonl("protocol_locked", {"protocol": proto.__dict__})
    print("[S1] Protocol locked. P/I/C/O:", proto.pico_tokens)
    return proto


def enforce_token_policy_js(js: Dict[str, Any]) -> Dict[str, Any]:
    df = js.get("deterministic_filters", {})
    langs = [l.lower() for l in (df.get("languages") or [])]

    # 1) Shorten tokens (library-first), and split comparators
    for k in ["P","I","C","O"]:
        toks = js.get("pico_tokens", {}).get(k, []) or []
        if k == "C":
            split = []
            for t in toks:
                parts = [p.strip() for p in re.split(r",|;|/", t) if p.strip()]
                split.extend(parts if parts else [t])
            toks = split
        js["pico_tokens"][k] = shorten_tokens(toks, langs)

    # 2) Domain hygiene
    # 2a) Drop risky acronyms as standalone tokens (esp. "inc")
    def _drop_bad_acronyms(arr):
        bad = {"inc"}  # standalone "inc" is toxic in PubMed tiab
        return [t for t in arr if t.lower() not in bad]
    js["pico_tokens"]["I"] = _drop_bad_acronyms(js["pico_tokens"].get("I", []))
    js["pico_tokens"]["C"] = _drop_bad_acronyms(js["pico_tokens"].get("C", []))

    # 2b) Ensure key domain anchors exist in P and I
    nq = (js.get("narrative_question") or "").lower()
    P = js["pico_tokens"].get("P", [])
    I = js["pico_tokens"].get("I", [])

    # Population must carry the target surgery/disease tokens
    must_p = []
    if "pectus" in nq and not any("pectus" in t for t in P): must_p.append("pectus excavatum")
    if "nuss" in nq and not any("nuss" in t for t in P):   must_p.append("nuss")
    if "mirpe" in nq and not any("mirpe" in t for t in P): must_p.append("mirpe")
    P = _dedup(P + must_p)

    # Intervention must carry cryo terms and intercostal nerve
    must_i = []
    if not any("cryo" in t for t in I):
        must_i += ["cryoablation", "cryoanalgesia"]
    if "intercostal" in nq and not any("intercostal" in t for t in I):
        must_i.append("intercostal nerve")
    I = _dedup(I + must_i)

    # Outcomes: normalize common phrasing
    O = js["pico_tokens"].get("O", [])
    O = ["opioid consumption" if t.lower()=="opioid" else t for t in O]
    js["pico_tokens"]["P"] = P
    js["pico_tokens"]["I"] = I
    js["pico_tokens"]["O"] = _dedup(O)

    # 3) Anchors: derive from P/I essentials; DO NOT trust LLM-provided anchors
    anchors = []
    for t in P + I:
        if any(k in t for k in ["pectus","nuss","mirpe","cryo"]):
            anchors.append(t)
    js["anchors_must_have"] = _dedup(anchors)[:2]  # at most 2
    return js


In [7]:
# ===== sniff_v4_cell2_pubmed.py =====
# PubMed E-utilities: paginated ESearch, batched EFetch, robust XML parse.

from xml.etree import ElementTree as ET
from typing import List, Tuple

EUTILS = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

def esearch_all_ids(term: str, mindate: Optional[int], page_size: int = PAGE_SIZE_ES) -> Tuple[int, List[str], Dict[str, Any]]:
    # usehistory=y to page; pull up to 5000 for sniff
    p = {"db":"pubmed","retmode":"json","term":term,"retmax":0,"usehistory":"y","email":ENTREZ_EMAIL}
    if ENTREZ_API_KEY: p["api_key"]=ENTREZ_API_KEY
    if mindate: p["mindate"]=str(mindate)
    r = requests.get(EUTILS+"/esearch.fcgi", params=p, timeout=HTTP_TIMEOUT)
    r.raise_for_status()
    js = r.json().get("esearchresult", {})
    count = int(js.get("count","0"))
    webenv = js.get("webenv"); qk = js.get("querykey")
    ids=[]
    if count>0 and webenv and qk:
        max_pull = min(count, 5000)
        for start in range(0, max_pull, page_size):
            r2 = requests.get(EUTILS+"/esearch.fcgi", params={
                "db":"pubmed","retmode":"json","retmax":page_size,"retstart":start,
                "WebEnv":webenv,"query_key":qk,"email":ENTREZ_EMAIL,
                **({"api_key":ENTREZ_API_KEY} if ENTREZ_API_KEY else {})
            }, timeout=HTTP_TIMEOUT)
            r2.raise_for_status()
            ids.extend(r2.json().get("esearchresult",{}).get("idlist",[]))
            time.sleep(0.34)  # polite throttle
    return count, [str(x) for x in ids], {"count":count}

def efetch_xml_batched(pmids: List[str], batch_size: int = EFETCH_BATCH_SIZE) -> str:
    if not pmids: return ""
    xmls=[]
    for i in range(0, len(pmids), batch_size):
        chunk = pmids[i:i+batch_size]
        params = {"db":"pubmed","retmode":"xml","rettype":"abstract","id":",".join(chunk),"email":ENTREZ_EMAIL}
        if ENTREZ_API_KEY: params["api_key"]=ENTREZ_API_KEY
        r = requests.get(EUTILS+"/efetch.fcgi", params=params, timeout=HTTP_TIMEOUT)
        r.raise_for_status()
        xmls.append(r.text)
        time.sleep(0.34)
    return "\n".join(xmls)

def parse_pubmed_xml(xml_text: str) -> List[PubMedRecord]:
    out=[]
    if not xml_text.strip(): return out
    root = ET.fromstring(xml_text)

    def _join(node):
        if node is None: return ""
        try: return "".join(node.itertext())
        except Exception: return node.text or ""

    for art in root.findall(".//PubmedArticle"):
        pmid = art.findtext(".//PMID") or ""
        title = _join(art.find(".//ArticleTitle")).strip()
        abs_nodes = art.findall(".//Abstract/AbstractText")
        abstract = " ".join(_join(n).strip() for n in abs_nodes) if abs_nodes else ""
        # year: try multiple fields
        year = None
        for path in (".//ArticleDate/Year",".//PubDate/Year",".//DateCreated/Year",".//PubDate/MedlineDate"):
            s = art.findtext(path)
            if s:
                m = re.search(r"\d{4}", s)
                if m: year = int(m.group(0)); break
        lang = art.findtext(".//Language") or None
        lang = normalize_lang(lang)
        pubtypes = [pt.text for pt in art.findall(".//PublicationTypeList/PublicationType") if pt.text]
        mesh = [mh.findtext("./DescriptorName") for mh in art.findall(".//MeshHeadingList/MeshHeading") if mh.findtext("./DescriptorName")]
        out.append(PubMedRecord(pmid=pmid, title=title, abstract=abstract, year=year, language=lang, pubtypes=pubtypes, mesh=mesh))
    return out


In [8]:
# ===== sniff_v4_cell3_universe_prefilter.py =====
# Multi-query candidate generation (+ optional synonym LLM), budget allocation, stratified sampling,
# deterministic prefilter with language normalization, and a relax ladder.

import hashlib
from typing import Set

def _norm_terms(terms: List[str]) -> List[str]:
    return [t.strip() for t in terms if t and t.strip()]

def build_candidates(proto: Protocol, synonyms: Dict[str, List[str]], k_max: int = 8) -> List[QueryCandidate]:
    P = [t for t in proto.pico_tokens.get("P", []) if t]
    I = [t for t in proto.pico_tokens.get("I", []) if t]
    A = proto.anchors_must_have[:2] if proto.anchors_must_have else []

    # Never allow standalone "inc" in query terms
    def _sanitize(terms: List[str]) -> List[str]:
        return [t for t in terms if t.lower() not in {"inc"}]

    P = _sanitize(P); I = _sanitize(I); A = _sanitize(A)

    def q_from(parts):
        return " AND ".join(x for x in parts if x)

    cands = []

    # Q0: baseline P ∧ I (NO anchors)
    q0 = q_from([or_block(P,"tiab"), or_block(I,"tiab")])
    cands.append(("baseline", {"P":P, "I":I, "A":[]}, q0, "pi_only"))

    # Q1: PI + ≤2 anchors (only if anchors exist)
    if A:
        q1 = q_from([or_block(P,"tiab"), or_block(I,"tiab"), or_block(A,"tiab")])
        cands.append(("pi_anchors", {"P":P,"I":I,"A":A}, q1, "pi_plus_anchors"))

    # Q2: broad cryo/pectus emphasis (resilient to phrasing)
    cryo = [t for t in I if "cryo" in t] or ["cryoablation","cryoanalgesia"]
    pec  = [t for t in P if any(k in t for k in ["pectus","nuss","mirpe"])] or ["pectus excavatum","nuss","mirpe"]
    q2 = q_from([or_block(pec,"tiab"), or_block(cryo,"tiab")])
    cands.append(("broad_cryo_pectus", {"P":pec,"I":cryo,"A":[]}, q2, "broad_cryo_pectus"))

    # Q3–Q4: synonym mixes (safe subset)
    def mix(root, alts, cap=2):
        alts = [a for a in (alts or []) if a and a.lower()!=root.lower()]
        return [root] + alts[:cap]
    synP = {p: mix(p, synonyms.get(p, [])) for p in P}
    synI = {i: mix(i, synonyms.get(i, [])) for i in I}
    for _ in range(2):
        Ps = [random.choice(v) for v in synP.values()] if synP else P
        Is = [random.choice(v) for v in synI.values()] if synI else I
        q = q_from([or_block(Ps,"tiab"), or_block(Is,"tiab")])
        cands.append(("synmix", {"P":Ps,"I":Is,"A":[]}, q, "syn_mix"))

    # Materialize & prune by token Jaccard (less aggressive)
    def tokset(parts):
        toks=set()
        for arr in (parts["P"] + parts["I"] + parts.get("A", [])):
            toks.update(arr)
        return set(map(str.lower, toks))

    out=[]
    for origin, parts, query, tag in cands:
        cid = hashlib.md5((origin+query).encode()).hexdigest()[:8]
        out.append(QueryCandidate(id=cid, query=query, origin=origin, terms_used=parts, expected_breadth=tag))

    keep=[]
    for c in out:
        if not keep:
            keep.append(c); continue
        sims=[]
        for k in keep:
            a, b = tokset(c.terms_used), tokset(k.terms_used)
            j = len(a & b) / max(1, len(a | b))
            sims.append(j)
        # keep if not too similar to ALL previous
        if (max(sims) if sims else 0.0) < 0.8:  # looser prune (keep more)
            keep.append(c)

    keep = keep[:k_max]
    log_jsonl("query_candidates", {"count": len(keep), "candidates": [asdict(c) for c in keep]})
    print(f"[S2] Built {len(keep)} query candidates.")
    return keep

# Synonyms via LLM (one call). Keep conservative, ≤2 alternates per term.
SYN_SYS = """You propose brief, domain-appropriate synonyms/alternates for search tokens (≤ 3 words). Keep strictly on-topic.
Return JSON: {"P": {"token":[alts...]}, "I": {"token":[alts...]}}. No notes."""
def syn_user(proto: Protocol) -> str:
    return f"""Tokens to expand:
P: {proto.pico_tokens.get("P", [])}
I: {proto.pico_tokens.get("I", [])}

BEGIN_JSON
{{"P":{{}}, "I":{{}}}}
END_JSON"""

def validate_syn(js: Dict[str,Any]) -> Tuple[bool,str]:
    if not isinstance(js, dict): return False, "not object"
    for k in ["P","I"]:
        if k not in js or not isinstance(js[k], dict): return False, f"missing {k} object"
        for root, alts in js[k].items():
            if not isinstance(alts, list): return False, f"{k}.{root} must be list"
            for a in alts:
                if not isinstance(a, str): return False, f"{k}.{root} contains non-string"
                if len(a.split()) > 4: return False, f"alternate too long: {a}"
    return True, ""

def _syn_ok(orig: str, alt: str) -> bool:
    o = norm_txt(orig); a = norm_txt(alt)
    if not a or a == o: 
        return False
    # block dangerous "inc"/"incision"/company expansions
    if re.search(r"\binc(ision|orporat|orporated)?\b", a):
        return False
    # require topical overlap
    if any(s in o for s in ["cryo", "cryoablation", "cryoanalgesia", "intercostal"]) and \
       any(s in a for s in ["cryo", "cryoablation", "cryoanalgesia", "intercostal"]):
        return True
    if any(s in o for s in ["pectus", "nuss", "mirpe", "chest wall"]) and \
       any(s in a for s in ["pectus", "nuss", "mirpe", "chest wall"]):
        return True
    return False

def _filter_synonyms(syn: Dict[str, List[str]]) -> Dict[str, List[str]]:
    out={}
    for k, alts in (syn or {}).items():
        keep=[a for a in alts if _syn_ok(k, a)]
        if keep:
            out[k]=_dedup(keep)
    return out


def get_synonyms_via_llm_once(proto: Protocol) -> Dict[str, List[str]]:
    try:
        js = get_validated_json(QWEN_MODEL, SYN_SYS, syn_user(proto), validate_syn, retries=1)
        synP = _filter_synonyms(js.get("P",{}))
        synI = _filter_synonyms(js.get("I",{}))
        all_syn = {}
        for k,v in synP.items(): all_syn[k]=v
        for k,v in synI.items(): all_syn[k]=list(dict.fromkeys(all_syn.get(k,[])+v))
        log_jsonl("synonyms_mined", {"synonyms": all_syn})
        return all_syn
    except Exception as e:
        log_jsonl("synonyms_failed", {"error": str(e)})
        return {}


# Deterministic prefilter
def passes_prefilter(rec: PubMedRecord, proto: Protocol) -> Tuple[bool, str]:
    ymin = int(proto.deterministic_filters["year_min"])
    langs = [x.lower() for x in proto.deterministic_filters["languages"]]
    if rec.year and rec.year < ymin: 
        return False, f"year<{ymin}"
    if rec.language and rec.language.lower() not in langs: 
        return False, f"lang={rec.language}"

    rpts = set(rec.pubtypes or [])

    # Build hard excludes: KB base + (optionally) reviews + user blocklist
    hard = set(KB["publication_types_hard_exclude_base"])
    review_flavors = set(KB["review_flavors"])
    # If user explicitly allowlists any review flavor, DO NOT hard-exclude them
    user_allow = set(proto.publication_types_allowlist or [])
    if review_flavors & user_allow:
        pass
    else:
        if not INCLUDE_REVIEW_FLAVORS:
            hard |= review_flavors
    # Add user blocklist
    hard |= set(proto.publication_types_blocklist or [])
    if rpts & hard:
        return False, f"pubtype_hard_exclude={list(rpts & hard)}"

    # Build allow-space:
    # 1) designs_allowed (if provided) else KB primary+secondary
    allowed_designs = proto.designs_allowed or (KB["publication_types_allowable_primary"] + KB["publication_types_allowable_secondary"])
    alias_allow = set()
    for d in allowed_designs:
        alias_allow |= set(KB["pubtype_aliases"].get(d, [d]))
    # 2) user publication_types_allowlist (exact names)
    alias_allow |= set(user_allow)

    # If record matches any allowed alias/name, allow
    if rpts & alias_allow:
        return True, "allow_design_or_allowlist"

    # Neutral/unknown types → keep (sniff shouldn’t over-prune)
    return True, "neutral"


def apply_prefilter(records: List[PubMedRecord], proto: Protocol) -> Tuple[List[PubMedRecord], Dict[str,int]]:
    kept=[]; drops=Counter()
    for r in records:
        ok, why = passes_prefilter(r, proto)
        if ok: kept.append(r)
        else: drops[why]+=1
    stats=dict(drops)
    log_jsonl("prefilter_result", {"kept": len(kept), "total": len(records), "drops": stats})
    # quick distributions
    years=[r.year for r in kept if r.year]
    if years:
        print(f"   [Prefilter] Kept={len(kept)}/{len(records)} | year min/median/max = {min(years)}/{sorted(years)[len(years)//2]}/{max(years)}")
    else:
        print(f"   [Prefilter] Kept={len(kept)}/{len(records)}")
    return kept, stats

# Relax ladder (after prefilter)
from dataclasses import replace as _dc_replace

def relax_ladder_fetch(proto: Protocol, base_records: List[PubMedRecord]) -> List[PubMedRecord]:
    # Work on copies; never mutate the locked proto
    steps = []
    y0 = int(proto.deterministic_filters["year_min"])
    steps.append(("widen_year_5",  _dc_replace(proto, deterministic_filters={**proto.deterministic_filters, "year_min": max(1990, y0-5)})))
    steps.append(("widen_year_10", _dc_replace(proto, deterministic_filters={**proto.deterministic_filters, "year_min": max(1990, y0-10)})))
    steps.append(("all_langs",     _dc_replace(proto, deterministic_filters={**proto.deterministic_filters, "languages": KB["languages_human"]})))
    steps.append(("allow_reviews", None))  # toggle global flag locally

    kept = []
    for name, p2 in steps:
        print(f"   [Relax] Attempt: {name}")
        if name == "allow_reviews":
            global INCLUDE_REVIEW_FLAVORS
            old = INCLUDE_REVIEW_FLAVORS
            try:
                INCLUDE_REVIEW_FLAVORS = True
                k, _ = apply_prefilter(base_records, proto)  # use original proto but with reviews allowed
            finally:
                INCLUDE_REVIEW_FLAVORS = old
        else:
            k, _ = apply_prefilter(base_records, p2)
        kept = k
        if len(kept) >= UNIVERSE_TARGET_MIN:
            print(f"   [Relax] Success: kept={len(kept)}")
            return kept
    print("   [Relax] Ladder exhausted.")
    return kept


# Build universe: multi-query → allocate budget → stratified sample → efetch → prefilter (+ relax if needed)
def build_universe_multiquery(proto: Protocol) -> Tuple[str, List[PubMedRecord], Dict[str, Any]]:
    print("[S2] Universe definition with multi-query & budget…")
    synonyms = get_synonyms_via_llm_once(proto)
    candidates = build_candidates(proto, synonyms, k_max=8)

    per_query_counts=[]
    all_ids=[]
    for c in candidates:
        count, ids_full, meta = esearch_all_ids(c.query, proto.deterministic_filters["year_min"], page_size=PAGE_SIZE_ES)
        per_query_counts.append({"cid": c.id, "origin": c.origin, "count": count})
        # store only meta; sampling is deferred until after allocation
        all_ids.append({"cid": c.id, "ids_full": ids_full, "count": count})
        print(f"   [Candidate {c.id}:{c.origin}] hits={count}")
        log_jsonl("candidate_hits", {"cid": c.id, "origin": c.origin, "count": count})

    # Allocate FETCH_BUDGET_IDS proportionally to log(1+count); floor 100 per nonzero; cap by available.
    weights = []
    for pc in per_query_counts:
        w = math.log1p(pc["count"]) if pc["count"]>0 else 0.0
        weights.append(w)
    total_w = sum(weights) or 1.0
    alloc = {}
    for pc, w in zip(per_query_counts, weights):
        if pc["count"]<=0: 
            alloc[pc["cid"]] = 0
        else:
            alloc[pc["cid"]] = max(100, int(FETCH_BUDGET_IDS * (w/total_w)))

    # Stratified sampling across each id list (first/middle/last via stride)
    sampled_pmids=[]
    for pack in all_ids:
        cid = pack["cid"]; ids_full = pack["ids_full"]; n = alloc.get(cid, 0)
        if not ids_full or n<=0: continue
        step = max(1, len(ids_full)//n)
        sampled_pmids.extend(ids_full[::step][:n])

    # Dedup
    pmids = list(dict.fromkeys(sampled_pmids))
    print(f"   [Sampling] Sampled dedup PMIDs = {len(pmids)} (budget {FETCH_BUDGET_IDS})")
    log_jsonl("sampling_summary", {"sampled_pmids": len(pmids), "alloc": alloc, "per_query_counts": per_query_counts})

    # Fetch and parse
    xml = efetch_xml_batched(pmids, batch_size=EFETCH_BATCH_SIZE)
    recs = parse_pubmed_xml(xml)
    print(f"   [Fetch] Parsed records = {len(recs)}")

    # Prefilter
    kept, drop_stats = apply_prefilter(recs, proto)
    if len(kept) < UNIVERSE_TARGET_MIN:
        print(f"   [Prefilter] Post-filter kept={len(kept)} < target {UNIVERSE_TARGET_MIN}")
        kept = relax_ladder_fetch(proto, recs)

    # Baseline string (best “baseline” candidate’s query), for report reproducibility
    baseline = next((c.query for c in candidates if c.origin=="baseline"), candidates[0].query if candidates else "")
    return baseline, kept, {"per_query_counts": per_query_counts, "drop_stats": drop_stats}


In [9]:
# ===== sniff_v4_cell4_rerank_screen_plaus.py =====
# Phrase-aware reranker (1–3 grams + literal hits fallback),
# strict screener AVR, and senior plausibility PASS/FAIL.

from typing import Callable

def rerank_records(records: List[PubMedRecord], proto: Protocol, weights: Dict[str,float]=WEIGHTS, alpha=0.3) -> List[PubMedRecord]:
    texts=[norm_txt((r.title or "")+" "+(r.abstract or "")) for r in records]
    use_sklearn = True
    try:
        from sklearn.feature_extraction.text import TfidfVectorizer
        vec = TfidfVectorizer(ngram_range=(1,3), max_features=100_000, lowercase=False)
        X = vec.fit_transform(texts); vocab=vec.vocabulary_
        def tfidf(term, i):
            j = vocab.get(term.lower())
            return float(X[i,j]) if j is not None else 0.0
    except Exception:
        use_sklearn = False
        def tfidf(term, i):
            # crude ngram TF proxy: count literal occurrences
            return float(texts[i].count(term.lower()))

    def literal_hits(term, i):  # phrase support
        return texts[i].count(term.lower())

    weighted_terms=[]
    for k in ["P","I","C","O"]:
        for t in proto.pico_tokens.get(k, []):
            weighted_terms.append((norm_txt(t), weights[k]))
    for t in proto.anchors_must_have:
        weighted_terms.append((norm_txt(t), weights["ANCHOR"]))
    for t in proto.avoid_terms:
        weighted_terms.append((norm_txt(t), weights["AVOID"]))

    scored=[]
    for i, r in enumerate(records):
        s=0.0
        for term,w in weighted_terms:
            s += w*(tfidf(term,i) + alpha*literal_hits(term,i))
        scored.append((s, r))
    scored.sort(key=lambda x:x[0], reverse=True)
    ordered=[r for s,r in scored]
    log_jsonl("rerank_done", {"records": len(records), "sklearn": use_sklearn})
    print(f"[S2.5] Rerank complete. Sending top {min(SCREEN_TOP_K,len(ordered))} to screener.")
    return ordered

# Screener AVR
SCREEN_SYS = """You are a strict but realistic title+abstract screener. Decision rules:
INCLUDE requires: P (population/context) true AND I (intervention) true AND (O outcomes OR D design) true.
- P: matches the target clinical context (synonyms acceptable).
- I: intercostal nerve cryoablation / cryoanalgesia used intraoperatively for the target surgery.
- O: any acute postoperative analgesia outcomes acceptable (pain, opioid use, LOS, early complications). Do not nitpick day windows.
- D: randomized/comparative preferred, strong cohorts acceptable.
Return ONLY JSON with:
{"pmid":"...","decision":"INCLUDE|BORDERLINE|EXCLUDE","why":"...","checklist":{"P":bool,"I":bool,"O":bool,"D":bool},"mesh_roles":[{"mesh":"...","role":"P|I|C|O|G"}]}"""

def screen_user(proto: Protocol, rec: PubMedRecord) -> str:
    return f"""Protocol (narrative):
{proto.narrative_question}

P: {proto.pico_tokens.get("P", [])}
I: {proto.pico_tokens.get("I", [])}
C: {proto.pico_tokens.get("C", [])}
O: {proto.pico_tokens.get("O", [])}
Anchors: {proto.anchors_must_have}
Avoid: {proto.avoid_terms}
Design preference: {proto.designs_preference}
Inclusion criteria: {proto.inclusion_criteria}
Exclusion criteria: {proto.exclusion_criteria}

Record:
PMID: {rec.pmid}
Title: {rec.title}
PubTypes: {rec.pubtypes}
MeSH: {rec.mesh}
Abstract:
{rec.abstract}

BEGIN_JSON
{{"pmid":"{rec.pmid}","decision":"BORDERLINE","why":"","checklist":{{"P":false,"I":false,"O":false,"D":false}},"mesh_roles":[]}}
END_JSON"""

def validate_screen(js: Dict[str,Any]) -> Tuple[bool,str]:
    if js.get("decision") not in ["INCLUDE","BORDERLINE","EXCLUDE"]:
        return False, "bad decision"
    ch = js.get("checklist", {})
    for k in ["P","I","O","D"]:
        if not isinstance(ch.get(k), bool): return False, f"checklist.{k} must be bool"
    m = js.get("mesh_roles", [])
    if not isinstance(m, list): return False, "mesh_roles must be list"
    for it in m:
        if not isinstance(it, dict): return False, "mesh_roles items must be dict"
        if "mesh" not in it or "role" not in it: return False, "mesh_roles items need mesh & role"
    return True, ""

def screen_top_k(reranked: List[PubMedRecord], proto: Protocol, top_k: int = SCREEN_TOP_K) -> Tuple[List[PubMedRecord], List[PubMedRecord]]:
    cand = reranked[:top_k]
    includes=[]; borderlines=[]; n_exc=0
    for r in cand:
        js = get_validated_json(SCREENER_MODEL, SCREEN_SYS, screen_user(proto, r), validate_screen, retries=2)
        d = js.get("decision"); why = js.get("why",""); chk=js.get("checklist",{})
        r_meta = {"pmid": r.pmid, "decision": d, "why": why, "checklist": chk}
        if d in ["INCLUDE","BORDERLINE"]:
            # attach only roles that exist in record.mesh (guard hallucinations)
            valid_mesh = set(r.mesh or [])
            mesh_roles = [mr for mr in js.get("mesh_roles",[]) if mr.get("mesh") in valid_mesh]
            r._mesh_roles = mesh_roles
        log_jsonl("screen_decision", r_meta)
        if d=="INCLUDE": includes.append(r)
        elif d=="BORDERLINE": borderlines.append(r)
        else: n_exc+=1
        # brief console
        print(f"  [Screen] PMID {r.pmid} -> {d}  chk={chk}  why={why[:100]}")
    print(f"[S3] Tallies: INCLUDE={len(includes)} BORDERLINE={len(borderlines)} EXCLUDE={n_exc}")
    return includes, borderlines

# Plausibility (senior guard)
PLAUS_SYS = """You validate that an already-INCLUDED record matches the core P+I of the protocol.
Return JSON: {"pmid":"...","verdict":"PASS|FAIL","why":"..."}"""

def plaus_user(proto: Protocol, rec: PubMedRecord) -> str:
    core = f"P core: {proto.pico_tokens.get('P', [])}; I core: {proto.pico_tokens.get('I', [])}; Anchors: {proto.anchors_must_have}"
    return f"""Protocol core:
{core}

Record:
PMID: {rec.pmid}
Title: {rec.title}
Abstract:
{rec.abstract}

BEGIN_JSON
{{"pmid":"{rec.pmid}","verdict":"PASS","why":""}}
END_JSON"""

def validate_plaus(js: Dict[str,Any]) -> Tuple[bool,str]:
    v = js.get("verdict")
    if v not in ["PASS","FAIL"]: return False, "verdict must be PASS|FAIL"
    if "pmid" not in js: return False, "missing pmid"
    return True, ""

def plausibility_filter(includes: List[PubMedRecord], proto: Protocol) -> List[PubMedRecord]:
    confirmed=[]
    for r in includes:
        js = get_validated_json(QWEN_MODEL, PLAUS_SYS, plaus_user(proto, r), validate_plaus, retries=1)
        if js.get("verdict")=="PASS":
            confirmed.append(r)
        else:
            print(f"   [Plausibility] DROP {r.pmid} — {js.get('why','')}")
    print(f"[S3.5] Confirmed={len(confirmed)} / Includes={len(includes)}")
    log_jsonl("plausibility_done", {"confirmed": len(confirmed), "includes": len(includes)})
    return confirmed


In [10]:
# ===== sniff_v4_cell5_orchestrate.py =====
# MeSH vernacular mining, strategy validation with recall check, final artifacts,
# and a single orchestrator to run the full Sniff v4 pipeline.

def vernacular_from_includes(includes: List[PubMedRecord]) -> Dict[str, List[str]]:
    roles = {"P":set(),"I":set(),"C":set(),"O":set(),"G":set()}
    for r in includes:
        for mr in getattr(r, "_mesh_roles", []):
            m = mr.get("mesh"); role = mr.get("role","G")
            if m and role in roles: roles[role].add(m)
    return {k: sorted(v) for k,v in roles.items()}

def validate_strategy(universe_baseline_query: str, confirmed: List[PubMedRecord], proto: Protocol, vernac: Dict[str,List[str]]) -> Dict[str,str]:
    topic_tokens = list(dict.fromkeys((vernac.get("P",[]) + vernac.get("I",[]) + vernac.get("O",[]))))
    topic_filter = or_block(topic_tokens, "tiab") if topic_tokens else ""
    # design filter from preference
    pref = proto.designs_preference
    aliases = KB["pubtype_aliases"].get(pref, [pref])
    design_filter = " OR ".join(f'"{a}"[Publication Type]' for a in aliases)

    # Recall proxy: ensure each confirmed appears to match topic_filter lexically
    recall_ok = True
    if topic_filter:
        toks = re.findall(r'"([^"]+)"\[tiab\]|(\w+)\[tiab\]', topic_filter)
        flat = [a or b for a,b in toks if (a or b)]
        for r in confirmed:
            low = norm_txt((r.title or "")+" "+(r.abstract or ""))
            if not any(norm_txt(t) in low for t in flat):
                recall_ok = False
                print(f"   [S4] Recall risk: topic_filter might drop PMID {r.pmid}")
    if not recall_ok:
        topic_filter = ""  # relax to baseline

    print("[S4] Recommended filters:")
    print("   topic_filter:", topic_filter or "<none>")
    print("   design_filter:", design_filter)
    return {"topic_filter": topic_filter, "design_filter": design_filter}

def write_artifacts(proto: Protocol, universe_query: str, recs: List[PubMedRecord],
                    includes: List[PubMedRecord], confirmed: List[PubMedRecord],
                    vernac: Dict[str,List[str]], recommended: Dict[str,str], warn: List[str]) -> None:
    artifacts = {
        "locked_protocol": asdict(proto),
        "universe_baseline_query": universe_query,
        "recommended_filters": recommended,
        "ground_truth_pmids": [r.pmid for r in confirmed],
        "includes_pmids": [r.pmid for r in includes],
        "mesh_vernacular": vernac,
        "warnings": warn
    }
    (OUT_DIR/"sniff_artifacts.json").write_text(json.dumps(artifacts, indent=2, ensure_ascii=False), encoding="utf-8")

    # Text report
    lines=[]
    lines.append("# SNIFF VALIDATION ENGINE REPORT (v4)\n")
    lines.append("## Protocol (narrative)\n" + textwrap.fill(proto.narrative_question, 100) + "\n")
    lines.append("**Deterministic filters**: languages=" + ", ".join(proto.deterministic_filters["languages"]) +
                 f" ; year_min={proto.deterministic_filters['year_min']}\n")
    lines.append("**Universe baseline query**:\n\n```\n" + universe_query + "\n```\n")
    lines.append("**Recommended filters**:\n\n- topic_filter: " + (recommended["topic_filter"] or "<none>") +
                 "\n- design_filter: " + recommended["design_filter"] + "\n")
    lines.append(f"**Ground truth (confirmed) includes** (n={len(artifacts['ground_truth_pmids'])}): " + ", ".join(artifacts["ground_truth_pmids"]) + "\n")
    lines.append("**MeSH vernacular (from confirmed)**:\n\n```\n" + json.dumps(vernac, indent=2, ensure_ascii=False) + "\n```\n")
    if warn:
        lines.append("**WARNINGS**\n- " + "\n- ".join(warn) + "\n")
    (OUT_DIR/"sniff_report.md").write_text("\n".join(lines), encoding="utf-8")
    print("  wrote:", OUT_DIR/"sniff_artifacts.json", "and", OUT_DIR/"sniff_report.md")

# Orchestrator
def sniff_v4_run(USER_NLQ: str):
    warnings=[]
    # S1 protocol
    proto = lock_protocol(USER_NLQ)
    # S2 universe
    baseline_query, kept_records, u_meta = build_universe_multiquery(proto)
    if not kept_records:
        raise RuntimeError("Fatal: no records after prefilter & relax ladder.")
    # S2.5 rerank
    ordered = rerank_records(kept_records, proto, WEIGHTS)
    # S3 screen (top-K)
    includes, borderlines = screen_top_k(ordered, proto, SCREEN_TOP_K)
    if not includes and borderlines:
        # mine some MeSH from borderlines to keep vernac alive; tag as weak (not stored separately here)
        print("   [S3] No includes; using borderlines for vernacular seeding.")
        includes = borderlines[:3]  # small seed, clearly tagged by context (you can split if you prefer)
    if not includes:
        raise RuntimeError("Fatal: no includes after screening.")
    # S3.5 plausibility
    confirmed = plausibility_filter(includes, proto)
    if len(confirmed) < PLAUSIBILITY_MIN_INCLUDES:
        warnings.append(f"Insufficient confirmed includes after plausibility ({len(confirmed)}<{PLAUSIBILITY_MIN_INCLUDES}).")
    # S3.9 vernacular
    vernac = vernacular_from_includes(confirmed if confirmed else includes)
    # S4 strategy validation
    recommended = validate_strategy(baseline_query, confirmed if confirmed else includes, proto, vernac)
    # S5 finalize
    write_artifacts(proto, baseline_query, kept_records, includes, confirmed if confirmed else includes, vernac, recommended, warnings)
    print("[DONE] Sniff v4 complete.")

# ----------------------------
# Example run (your pediatric INC/MIRPE NLQ)
# ----------------------------
EXAMPLE_NLQ = """
Population = children/adolescents undergoing minimally invasive repair of pectus excavatum (Nuss/MIRPE).
Intervention = intercostal nerve cryoablation (INC) used intraoperatively for analgesia during Nuss/MIRPE.
Comparators = thoracic epidural, paravertebral block, intercostal nerve block, erector spinae plane block, or systemic multimodal analgesia.
Outcomes = postoperative opioid consumption (in-hospital and at discharge) and pain scores (abstract-level timing not strictly required).
Study designs = RCTs preferred; if absent, include comparative cohorts/case-control/observational.
Year_min = 2010.
Languages = English, Portuguese, Spanish.
Screening notes: Be conservative; INCLUDE if P & I present and (O or D) is present; do not exclude for lack of exact day window if acute postop outcomes are clearly reported.
""".strip()

# To execute:
sniff_v4_run(EXAMPLE_NLQ)


[S1] Protocol locked. P/I/C/O: {'P': ['children adolescents minimally invasive', 'of pectus excavatum (Nuss/MIRPE)', 'nuss', 'mirpe'], 'I': ['intercostal nerve cryoablation analgesia', 'during Nuss/MIRPE'], 'C': ['thoracic epidural', 'paravertebral block', 'intercostal nerve block', 'erector spinae plane block', 'systemic multimodal analgesia'], 'O': ['opioid consumption', 'in-hospital and at discharge', 'pain scores']}
[S2] Universe definition with multi-query & budget…
[S2] Built 1 query candidates.


ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'Foi forçado o cancelamento de uma conexão existente pelo host remoto', None, 10054, None))