In [1]:
# Load configs and utils
from src.config import CSV_PATH, FIND_DIR, COMP_DIR, ASSR_DIR
from pathlib import Path
import pandas as pd
import json
from IPython.display import display, Markdown

# Load abstract
df_abs = pd.read_csv(CSV_PATH, usecols=[0, 1], names=["pmid", "abstract"], header=0)
ABS_DICT = df_abs.set_index("pmid")["abstract"].to_dict()

# Load JSONL helper
def load_jsonl(path):
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f]

# Load data into dicts by pmid
def load_by_pmid(directory, model):
    path = Path(directory) / f"{model}.jsonl"
    data = load_jsonl(path)
    return {doc["pmid"]: doc for doc in data}

In [2]:
# Choose your model here:
model_tag = "gpt4o"  # or "claude", "llama"

# Load all 3 stage outputs
FINDINGS   = load_by_pmid(FIND_DIR, model_tag)
COMPLETES  = load_by_pmid(COMP_DIR, model_tag)
ASSERTIONS = load_by_pmid(ASSR_DIR, model_tag)

In [7]:
def show_pipeline_outputs(pmid: str):
    if int(pmid) not in ABS_DICT:
        display(Markdown(f"**PMID {pmid} not found in abstract file.**"))
        return
    
    display(Markdown(f"## Abstract for PMID `{pmid}`"))
    print(ABS_DICT[int(pmid)])

    if pmid not in FINDINGS:
        display(Markdown("**No finding sentences found.**"))
        return
    
    f_ids = FINDINGS[pmid]["finding_ids"]
    comp_doc = COMPLETES.get(pmid, {}).get("sentences", [])
    assr_doc = ASSERTIONS.get(pmid, {}).get("assertion", [])

    display(Markdown(f"### Findings ({len(f_ids)}):"))
    for fid in f_ids:
        try:
            sent = comp_doc[fid]["original"]
        except:
            sent = f"[Sentence {fid}] (Not found in completion)"
        print(f"[{fid}] {sent}")

    display(Markdown("### Resolved Sentences"))
    for s in comp_doc:
        print(f"[{s['id']}] -- {s['resolved']}")

    display(Markdown("### Extracted Assertions"))
    for a in assr_doc:
        condition = f" | Condition: {a.get('condition')}" if a.get("condition") else ""
        print(f"[{a['id']}] {a['subject']} -- {a['predicate']} -- {a['object']} -- {condition}")

    display(Markdown("### Stats Summary"))
    print(f"- # Finding Sentences: {len(f_ids)}")
    print(f"- # Resolved Sentences: {len(comp_doc)}")
    print(f"- # Extracted Assertions: {len(assr_doc)}")

In [9]:
def get_processed_pmids(model_tag="gpt4o", return_type="intersection"):
    """
    return_type: "all", "intersection", "missing"
    """
    find_pmids = set(load_by_pmid(FIND_DIR, model_tag).keys())
    comp_pmids = set(load_by_pmid(COMP_DIR, model_tag).keys())
    assr_pmids = set(load_by_pmid(ASSR_DIR, model_tag).keys())

    if return_type == "all":
        return {
            "finding": sorted(find_pmids),
            "completion": sorted(comp_pmids),
            "assertion": sorted(assr_pmids)
        }
    elif return_type == "intersection":
        return sorted(find_pmids & comp_pmids & assr_pmids)
    elif return_type == "missing":
        union = find_pmids | comp_pmids | assr_pmids
        missing = {
            "in_finding_not_in_completion": sorted(find_pmids - comp_pmids),
            "in_completion_not_in_assertion": sorted(comp_pmids - assr_pmids),
            "processed_total": sorted(union)
        }
        return missing
    else:
        raise ValueError("return_type must be one of: all, intersection, missing")

In [19]:
all_pmids = get_processed_pmids("gpt4o", return_type="all")
print(list(all_pmids)[:5])

common_pmids = get_processed_pmids("gpt4o", return_type="intersection")
print("Common PMIDs:", list(common_pmids)[:20])

missing_info = get_processed_pmids("gpt4o", return_type="missing")
print("Missing info:", list(missing_info)[:5])

['finding', 'completion', 'assertion']
Common PMIDs: ['11131883', '11798752', '15043929', '15664175', '16133256', '16641947', '17051598', '17722706', '17805045', '18313998', '18690111', '18852102', '18970220', '19041664', '1910436', '19234916', '19422607', '19616778', '20203436', '20410185']
Missing info: ['in_finding_not_in_completion', 'in_completion_not_in_assertion', 'processed_total']


In [23]:
show_pipeline_outputs("20410185") 

## Abstract for PMID `20410185`

A large series of plasma cell dyscrasias (n=2207) was examined for translocations which deregulate the MAF genes, t(14;20)(q32;q12) and t(14;16)(q32;q23), and their disease behavior was compared to a group characterized by the t(4;14)(p16;q32) where CCND2 is also up-regulated. The t(14;20) showed low prevalence in myeloma (27/1830, 1.5%) and smoldering myeloma (1/148, <1%) with a higher incidence in MGUS (9/193, 5% P=0.005). Strong associations with del(13) (76%), non-hyperdiploidy (83%) and gain of 1q (58%) were seen but no association with an IgA M-protein or absence of bone disease was noted. All three translocations were associated with poor outcome in myeloma, but strikingly all t(14;20) MGUS/smoldering myeloma cases (n=10) had stable, low level disease. In contrast, the 10 t(14;16) and 25 t(4;14) MGUS/smoldering myeloma cases were associated with both evolving and non-evolving disease. None of the associated genetic abnormalities helped to predict for progression from MGUS or smo

### Findings (5):

[1] Strong associations with del(13) (76%), non-hyperdiploidy (83%) and gain of 1q (58%) were seen but no association with an IgA M-protein or absence of bone disease was noted.
[2] All three translocations were associated with poor outcome in myeloma, but strikingly all t(14;20) MGUS/smoldering myeloma cases (n=10) had stable, low level disease.
[3] In contrast, the 10 t(14;16) and 25 t(4;14) MGUS/smoldering myeloma cases were associated with both evolving and non-evolving disease.
[4] None of the associated genetic abnormalities helped to predict for progression from MGUS or smoldering myeloma.
[5] [Sentence 5] (Not found in completion)


### Resolved Sentences

[1] -- The t(14;20) showed low prevalence in myeloma with a rate of 27 out of 1830, which is 1.5%.
[2] -- Strong associations with del(13) were seen in 76% of cases.
[3] -- All three translocations were associated with poor outcome in myeloma.
[4] -- The 10 t(14;16) MGUS/smoldering myeloma cases were associated with evolving disease.
[5] -- None of the associated genetic abnormalities helped to predict progression from monoclonal gammopathy of undetermined significance or smoldering myeloma.


### Extracted Assertions

[1] t(14;20) -- has prevalence -- 1.5% --  | Condition: in myeloma
[2] del(13) -- is associated with -- cases --  | Condition: in 76% of cases
[3] translocations -- is associated with -- poor outcome --  | Condition: in myeloma
[4] 10 t(14;16) MGUS/smoldering myeloma cases -- are associated with -- evolving disease -- 
[5] associated genetic abnormalities -- does not help to predict -- progression from monoclonal gammopathy of undetermined significance or smoldering myeloma -- 


### Stats Summary

- # Finding Sentences: 5
- # Resolved Sentences: 5
- # Extracted Assertions: 5
