# PMC Harvester (JATS / PMC XML) → internal schema

Purpose:
- Fetch PMC full-text XML for given PMCIDs.
- Parse JATS/PMC XML into structured fields.
- Normalize into the `health_document` JSON schema you provided.

Notes:
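- The configuration cell defines the NCBI PMC article XML endpoint (works for OA & many PMC records), but `fetch_pmc_xml` currently requests the Europe PMC `fullTextXML` REST endpoint.
- If an article is not available via the endpoint in use, a fallback to the other API or to OAI-PMH would be needed; no automatic fallback is implemented in `fetch_pmc_xml`.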

# Dependencies

In [2]:
import hashlib
import json
from datetime import datetime,timezone

import requests
from lxml import etree


In [None]:
# Prefix -> namespace URI map handed to ElementTree-style lookups
# (``find(..., namespaces=PMC_NS)``). Only the MathML and XLink
# prefixes are declared here.
PMC_NS = dict(
    mml="http://www.w3.org/1998/Math/MathML",
    xlink="http://www.w3.org/1999/xlink",
)

# Helpers

In [None]:
# Small helpers used across cells
def now_iso():
    """Return the current UTC time as an ISO-8601 string ending in ``Z``.

    An aware UTC datetime already serializes with a ``+00:00`` offset, so
    appending ``"Z"`` on top of it would produce an invalid
    ``...+00:00Z`` suffix. The offset is replaced by ``Z`` instead.
    """
    return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

def sha256_bytes(b: bytes) -> str:
    """Return the SHA-256 digest of ``b`` as a lowercase hex string."""
    hasher = hashlib.sha256()
    hasher.update(b)
    return hasher.hexdigest()

def make_document_id(source: str, source_id: str) -> str:
    """Derive a stable document ID from a source name and its native ID.

    The two parts are joined as ``"<source>:<source_id>"`` and the
    SHA-256 hex digest of that string (UTF-8 encoded) is returned.
    """
    key = "{}:{}".format(source, source_id)
    digest = hashlib.sha256(key.encode())
    return digest.hexdigest()


In [None]:
def safe_text(el):
    """Return the whitespace-normalized text content of ``el``.

    All text fragments below ``el`` (including text inside inline child
    elements) are concatenated, then every run of whitespace is collapsed
    to a single space. This keeps indentation and line breaks from
    pretty-printed XML, and the extra spaces around inline markup, out of
    the extracted value.

    Args:
        el: An element exposing ``itertext()``, or ``None``.

    Returns:
        The normalized text (possibly ``""``), or ``None`` when ``el`` is
        ``None`` so that missing elements stay distinguishable from empty
        ones.
    """
    if el is None:
        return None
    # split() with no argument drops leading/trailing whitespace and
    # splits on any whitespace run, so re-joining normalizes the spacing.
    return " ".join(" ".join(el.itertext()).split())

In [None]:
def pmc_to_internal_schema(root):
    """Map a parsed PMC/JATS tree to a minimal flat record.

    Tags are looked up without a namespace prefix. The previous lookups
    used a ``j:`` prefix that is not declared in ``PMC_NS``, which makes
    ``find()`` fail with an undeclared-prefix error before any field is
    read.

    Args:
        root: Root element of a parsed article.

    Returns:
        A dict with the keys ``source``, ``title``, ``abstract``,
        ``journal``, ``doi``, ``pmcid`` and ``pub_date``. Every value
        except ``source`` is the text of the first matching element, or
        ``None`` when no such element exists.
    """

    def first_text(path):
        # Text of the first element matching ``path`` (None if absent).
        return safe_text(root.find(path))

    # The PMC identifier is looked up under pub-id-type 'pmc' first and
    # 'pmcid' second, since both spellings are used for this lookup in
    # this file.
    pmcid_el = root.find(".//article-id[@pub-id-type='pmc']")
    if pmcid_el is None:
        pmcid_el = root.find(".//article-id[@pub-id-type='pmcid']")

    return {
        "source": "PMC",
        "title": first_text(".//article-title"),
        "abstract": first_text(".//abstract"),
        "journal": first_text(".//journal-title"),
        "doi": first_text(".//article-id[@pub-id-type='doi']"),
        "pmcid": safe_text(pmcid_el),
        "pub_date": first_text(".//pub-date"),
    }

# Configuration

In [None]:
# Seconds passed as the HTTP request timeout.
REQUEST_TIMEOUT = 30

# Headers for outgoing requests: identify the harvester and ask for XML.
HEADERS = {
    "User-Agent": "health-harvester/1.0 (abamsheikh@gmail.com)",
    "Accept": "application/xml, text/xml, */*",
}

# NCBI PMC article XML URL; ``{pmc}`` is the placeholder to fill via
# ``str.format``.
PMC_XML_URL_TEMPLATE = "https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/?report=xml&format=raw"

# PMC Fetcher

In [None]:
def fetch_pmc_xml(pmcid: str) -> str:
    """Download the full-text XML of one article from Europe PMC.

    Args:
        pmcid: Article identifier inserted into the request URL
            (e.g. ``"PMC212403"``).

    Returns:
        The response body decoded as text.

    Raises:
        ValueError: If the server answers with any status other than 200.
        requests.RequestException: Connection failures and timeouts from
            ``requests`` propagate unchanged.
    """
    url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"
    # Without an explicit timeout ``requests`` waits indefinitely on a
    # stalled connection; the shared HEADERS identify the harvester and
    # ask for an XML response.
    resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    if resp.status_code != 200:
        raise ValueError(f"Failed to fetch PMC XML for {pmcid}: {resp.status_code}")
    return resp.text


# Parse XML

In [None]:
def parse_pmc_xml(xml_str: str):
    """Parse an XML document given as text and return its root element.

    The text is encoded to UTF-8 bytes before parsing. The parser is
    created with ``recover=True``, ``huge_tree=True`` and
    ``remove_comments=True``.
    """
    lenient_parser = etree.XMLParser(
        remove_comments=True,
        huge_tree=True,
        recover=True,
    )
    payload = xml_str.encode("utf-8")
    return etree.fromstring(payload, parser=lenient_parser)

In [None]:
def safe_text(el):
    """Return the whitespace-normalized text content of ``el``.

    All text fragments below ``el`` (including text inside inline child
    elements) are concatenated, then every run of whitespace is collapsed
    to a single space. This keeps indentation and line breaks from
    pretty-printed XML, and the extra spaces around inline markup, out of
    the extracted value.

    Args:
        el: An element exposing ``itertext()``, or ``None``.

    Returns:
        The normalized text (possibly ``""``), or ``None`` when ``el`` is
        ``None`` so that missing elements stay distinguishable from empty
        ones.
    """
    if el is None:
        return None
    # split() with no argument drops leading/trailing whitespace and
    # splits on any whitespace run, so re-joining normalizes the spacing.
    return " ".join(" ".join(el.itertext()).split())

# Normalize

In [None]:
def _pmc_pub_date(root):
    """Return the first ``pub-date`` as ``YYYY[-MM[-DD]]``, or None if absent."""
    pub_date_el = root.find(".//pub-date")
    if pub_date_el is None:
        return None
    parts = []
    for tag in ("year", "month", "day"):
        value = safe_text(pub_date_el.find(tag))
        if not value:
            continue
        # Zero-pad numeric month/day so "2003-9-5" becomes "2003-09-05";
        # non-numeric values are passed through untouched.
        if tag != "year" and value.isdigit():
            value = value.zfill(2)
        parts.append(value)
    return "-".join(parts)


def _pmc_sections(body_el):
    """Return ``(text_sections, plain_text, methods_summary)`` for a body."""
    separator = "\n\n"
    sections = []
    methods_summary = None
    offset = 0
    # NOTE(review): ".//sec" also matches nested sections, and a parent's
    # text includes its children's text, so nested content appears more
    # than once in plain_text. Kept as-is; confirm whether consumers
    # expect only top-level sections.
    for sec in body_el.findall(".//sec"):
        section_text = " ".join(t.strip() for t in sec.itertext() if t.strip())
        end_offset = offset + len(section_text)
        sections.append({
            "label": safe_text(sec.find("title")) or "Untitled",
            "text": section_text,
            "start_offset": offset,
            "end_offset": end_offset
        })
        # Advance by the real separator length so that
        # plain_text[start_offset:end_offset] == text for every section.
        offset = end_offset + len(separator)

        # NOTE(review): treating ids that start with "s1" as the methods
        # section is an assumption about the source document's id scheme
        # (it also matches "s10", "s11", ...) -- confirm.
        if sec.get("id", "").startswith("s1"):
            methods_summary = (methods_summary + "\n" + section_text) if methods_summary else section_text

    plain_text = separator.join(s["text"] for s in sections) if sections else None
    return sections, plain_text, methods_summary


def _pmc_figures(body_el):
    """Return one dict (id, label, title, graphic href) per ``fig`` element."""
    figures = []
    for fig in body_el.findall(".//fig"):
        graphic = fig.find(".//graphic")  # looked up once, reused below
        figures.append({
            "id": fig.get("id"),
            "label": safe_text(fig.find("label")),
            "title": safe_text(fig.find("caption/title")),
            "graphic": graphic.get("{http://www.w3.org/1999/xlink}href") if graphic is not None else None
        })
    return figures


def _pmc_tables(body_el):
    """Return one dict (id, label, title, flattened text) per ``table-wrap``."""
    return [
        {
            "id": table.get("id"),
            "label": safe_text(table.find("label")),
            "title": safe_text(table.find("caption/title")),
            "table_text": " ".join(t.strip() for t in table.itertext() if t.strip())
        }
        for table in body_el.findall(".//table-wrap")
    ]


def pmc_to_internal_schema(root):
    """Normalize a parsed PMC/JATS article into the internal document dict.

    Args:
        root: Root element of the parsed article XML.

    Returns:
        A dict with identifiers, title, abstract, journal info, publication
        date, body sections (whose offsets index into ``plain_text``),
        figures, tables and ingestion metadata. Fields this adapter does
        not extract are present as empty containers or ``None``.
    """
    from datetime import datetime, timezone

    # IDs. The PMCID is read from pub-id-type 'pmcid' first; 'pmc' is the
    # other spelling used for this lookup in this file.
    pmcid = safe_text(root.find(".//article-id[@pub-id-type='pmcid']"))
    if not pmcid:
        legacy_pmcid = safe_text(root.find(".//article-id[@pub-id-type='pmc']"))
        if legacy_pmcid:
            # presumably the 'pmc' form may hold the bare number; add the
            # "PMC" prefix so both forms yield the same ID shape -- confirm.
            if legacy_pmcid.upper().startswith("PMC"):
                pmcid = legacy_pmcid
            else:
                pmcid = f"PMC{legacy_pmcid}"
    pmid = safe_text(root.find(".//article-id[@pub-id-type='pmid']"))
    doi = safe_text(root.find(".//article-id[@pub-id-type='doi']"))

    # Never emit the literal string "PMID:None" when no ID is available.
    if pmcid:
        source_id = f"PMC:{pmcid}"
    elif pmid:
        source_id = f"PMID:{pmid}"
    elif doi:
        source_id = f"DOI:{doi}"
    else:
        source_id = None

    # Title / abstract
    title = safe_text(root.find(".//article-title"))
    abstract = safe_text(root.find(".//abstract"))

    # Journal info: prefer the electronic ISSN, otherwise take any ISSN.
    journal_name = safe_text(root.find(".//journal-title"))
    issn_el = root.find(".//issn[@pub-type='epub']")
    if issn_el is None:
        issn_el = root.find(".//issn")
    issn = safe_text(issn_el)

    pub_date = _pmc_pub_date(root)

    # Aware UTC timestamp rendered with a "Z" suffix (same textual format
    # as before, without the deprecated naive utcnow()).
    now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    # --- BODY EXTRACTION ---
    body_el = root.find(".//body")
    text_sections = []
    plain_text = None
    methods_summary = None
    figures = []
    tables = []
    if body_el is not None:
        text_sections, plain_text, methods_summary = _pmc_sections(body_el)
        figures = _pmc_figures(body_el)
        tables = _pmc_tables(body_el)

    # Build internal schema
    document = {
        "schema_version": "1.0",
        "document_id": source_id,
        "source": "PMC",
        "source_id": source_id,
        "identifiers": [
            {"type": "pmcid", "value": pmcid},
            {"type": "pmid", "value": pmid},
            {"type": "doi", "value": doi}
        ],
        "title": title,
        "subtitle": None,
        "authors": [],
        "affiliations": [],
        "abstract": abstract,
        "plain_text": plain_text,
        "text_sections": text_sections,
        "content_type": "journal_article",
        "language": "en",
        "keywords": [],
        "mesh_terms": [],
        "topics": [],
        "published_date": pub_date,
        "journal": {
            "name": journal_name,
            "issn": issn,
            "publisher": safe_text(root.find(".//publisher-name")),
            "volume": None,
            "issue": None,
            "pages": None
        },
        "license": {},
        "access": {},
        "figures": figures,
        "tables": tables,
        "supplementary_files": [],
        "methods_summary": methods_summary,
        "data_availability": {},
        "funding": [],
        "conflicts_of_interest": None,
        "ethics": {},
        "clinical_trial": {},
        "references": [],
        "metrics": {},
        "ingestion": {
            "ingested_at": now,
            "harvester_id": "health-harvester",
            "adapter": {
                "name": "pmc_adapter",
                "version": "1.0",
                "fetched_at": now,
                "source_fetch_url": None
            },
            "raw_payload_ref": None,
            "raw_format": "xml"
        },
        "processing": {},
        "tags": [],
        "security": {},
        "last_updated": now
    }

    return document

# Example usage (fetch → parse → normalize → save)

In [None]:
# The article to harvest; used for both the fetch and the output filename
# so the two cannot drift apart.
example_pmcid = "PMC212403"

xml_text = fetch_pmc_xml(example_pmcid)  # fetch the article's full-text XML
root = parse_pmc_xml(xml_text)

record = pmc_to_internal_schema(root)
record

# Save the normalized record locally (or push to your object store / DB)
with open(f"pmc_{example_pmcid}_normalized.json", "w", encoding="utf-8") as f:
    json.dump(record, f, indent=2)