# PubMed Adapter (Entrez API)

Purpose:
- Harvest biomedical literature from PubMed
- Normalize records into internal health document schema
- Preserve provenance and raw payloads

Source: NCBI Entrez (PubMed)

## Dependencies

In [6]:
import hashlib
import json
import os
import uuid
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests


## Storage directory configuration

In [7]:
# Harvest output (raw payloads and normalized JSON) is written to a "storage"
# folder directly under the current working directory.
STORAGE_DIR = Path.cwd().joinpath("storage")
# exist_ok=True makes re-running this cell harmless when the folder is already there.
STORAGE_DIR.mkdir(exist_ok=True)
# Bare expression: echoes the resolved path as the notebook cell output.
STORAGE_DIR

WindowsPath('C:/Users/Aman Sheikh/Desktop/Projects/VeriFact/Model/harvester/storage')

## Helper to generate deterministic document IDs:

In [8]:
def make_document_id(source: str, source_id: str) -> str:
    """Return a deterministic SHA-256 hex digest for a (source, source_id) pair.

    The digest is taken over the encoded text ``"<source>:<source_id>"``, so
    the same pair always yields the same 64-character identifier.
    """
    hasher = hashlib.sha256()
    hasher.update("{}:{}".format(source, source_id).encode())
    return hasher.hexdigest()

## Adapter Configuration

In [None]:
# Root of the NCBI Entrez E-utilities endpoints.
ENTREZ_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
# Contact address sent with every request (REQUIRED by NCBI).
# Set the NCBI_EMAIL environment variable to override the default.
EMAIL = os.environ.get("NCBI_EMAIL", "abamsheikh@gmail.com")
TOOL = "health-harvester"
# SECURITY: the API key is a credential and must not live in source control.
# It is read from the NCBI_API_KEY environment variable; when unset (or empty)
# API_KEY is None and requests are sent without a key. Any key that was ever
# committed to this file should be treated as leaked and regenerated.
API_KEY = os.environ.get("NCBI_API_KEY") or None  # optional but recommended

REQUEST_DELAY = 0.34  # seconds (NCBI rate limit safety)

## ESearch – find PubMed IDs (PMIDs)

In [10]:
def pubmed_search(query: str, retstart=0, retmax=20, timeout: float = 30.0):
    """Run an Entrez ESearch query against PubMed and return the decoded JSON.

    Parameters:
    - query: search term sent as the ESearch ``term`` parameter
    - retstart: sequential index of the first record to retrieve (default 0)
    - retmax: number of records to retrieve from the result set
    - timeout: seconds to wait for the HTTP request before giving up

    Returns:
    - the parsed JSON body of the response (``r.json()``)

    Raises:
    - requests.HTTPError on a non-2xx response (via ``raise_for_status``)
    - requests.Timeout when the server does not answer within ``timeout``
    """
    params = {
        # using Pubmed Database
        "db": "pubmed",
        # The term used to search for the record
        "term": query,
        "retmode": "json",
        # Sequential index of the first record to be retrieved (default is 0)
        "retstart": retstart,
        # Total number of records from the input set to be retrieved, up to a maximum of 10,000.
        "retmax": retmax,
        "email": EMAIL,
        "tool": TOOL,
    }
    if API_KEY:
        params["api_key"] = API_KEY

    # ENTREZ_BASE already ends with "/"; strip it so the URL does not contain "//".
    url = f"{ENTREZ_BASE.rstrip('/')}/esearch.fcgi"
    # Without an explicit timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()

### Test it:

In [11]:
# search_resp = pubmed_search("risk of coughing", retmax=5)
# search_resp

{'header': {'type': 'esearch', 'version': '0.3'},
 'esearchresult': {'count': '13991',
  'retmax': '5',
  'retstart': '0',
  'idlist': ['41519567', '41519373', '41518894', '41513728', '41508572'],
  'translationset': [{'from': 'risk of',
    'to': '"risk"[MeSH Terms] OR "risk"[All Fields] OR "risk of"[All Fields]'},
   {'from': 'coughing',
    'to': '"cough"[MeSH Terms] OR "cough"[All Fields] OR "coughing"[All Fields] OR "coughs"[All Fields] OR "coughed"[All Fields]'}],
  'querytranslation': '("risk"[MeSH Terms] OR "risk"[All Fields] OR "risk of"[All Fields]) AND ("cough"[MeSH Terms] OR "cough"[All Fields] OR "coughing"[All Fields] OR "coughs"[All Fields] OR "coughed"[All Fields])'}}

### Expected Output:
```json
"idlist": ["38712345", "38699811", ...]
```

## EFetch – fetch full article records

In [12]:
def pubmed_fetch(pmids: List[str], timeout: float = 30.0) -> str:
    """Fetch full PubMed records for the given PMIDs via Entrez EFetch.

    Parameters:
    - pmids: PubMed IDs to fetch; they are joined into one comma-separated ``id`` value
    - timeout: seconds to wait for the HTTP request before giving up

    Returns:
    - the response body as text (XML, because ``retmode`` is "xml")

    Raises:
    - requests.HTTPError on a non-2xx response (via ``raise_for_status``)
    - requests.Timeout when the server does not answer within ``timeout``
    """
    params = {
        "db": "pubmed",
        # str() so that integer PMIDs are accepted as well as strings
        "id": ",".join(str(pmid) for pmid in pmids),
        "retmode": "xml",
        # rettype is intentionally not sent; only retmode is specified.
        "email": EMAIL,
        "tool": TOOL,
    }
    if API_KEY:
        params["api_key"] = API_KEY

    # ENTREZ_BASE already ends with "/"; strip it so the URL does not contain "//".
    url = f"{ENTREZ_BASE.rstrip('/')}/efetch.fcgi"
    # Without an explicit timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    return response.text

### Usage:

In [13]:
# pmids = search_resp["esearchresult"]["idlist"]
# xml_data = pubmed_fetch(pmids)
# xml_data

'<?xml version="1.0" ?>\n<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2025//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_250101.dtd">\n<PubmedArticleSet>\n<PubmedArticle><MedlineCitation Status="MEDLINE" Owner="NLM" IndexingMethod="Automated"><PMID Version="1">41519567</PMID><DateCompleted><Year>2026</Year><Month>01</Month><Day>10</Day></DateCompleted><DateRevised><Year>2026</Year><Month>01</Month><Day>10</Day></DateRevised><Article PubModel="Print"><Journal><ISSN IssnType="Electronic">1931-3543</ISSN><JournalIssue CitedMedium="Internet"><Volume>169</Volume><Issue>1</Issue><PubDate><Year>2026</Year><Month>Jan</Month></PubDate></JournalIssue><Title>Chest</Title><ISOAbbreviation>Chest</ISOAbbreviation></Journal><ArticleTitle>A Case of Mediastinal Adenopathy and Clinically Suspected Myocarditis.</ArticleTitle><Pagination><StartPage>e25</StartPage><EndPage>e29</EndPage><MedlinePgn>e25-e29</MedlinePgn></Pagination><ELocationID EIdType="doi" ValidYN="Y">10

## Parse PubMed XML (core logic)

In [14]:
def _text_or_none(elem: Optional[ET.Element]) -> Optional[str]:
    if elem is None or elem.text is None:
        return None
    return elem.text.strip()

In [15]:
# English month abbreviations, keyed by their lower-cased first three letters.
# A fixed table is used instead of datetime.strptime("%b") because "%b" follows
# the process locale and would fail to recognise "Jun" under a non-English locale.
_MONTH_ABBREVIATIONS = {
    "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
    "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12,
}


def _parse_pubdate(pubdate_elem: Optional[ET.Element]) -> Optional[str]:
    """
    Try to build an ISO-ish date string from PubDate elements.
    Handles:
      <PubDate><Year>YYYY</Year><Month>Mon</Month><Day>DD</Day></PubDate>
      <PubDate><MedlineDate>YYYY MMM</MedlineDate></PubDate>

    Returns:
    - YYYY-MM-DD when year, month and a numeric day are present
    - YYYY-MM when only year and month are usable
    - YYYY when the month is missing, unrecognised or outside 1..12
    - the raw Year text when it is not numeric
    - the raw MedlineDate text when there is no Year
    - None when *pubdate_elem* is None or holds none of the above
    """
    if pubdate_elem is None:
        return None

    year = _text_or_none(pubdate_elem.find("Year"))
    if not year:
        # medline often contains "2001 Jun" or ranges; return raw medline as fallback
        return _text_or_none(pubdate_elem.find("MedlineDate")) or None

    try:
        year_part = f"{int(year):04d}"
    except ValueError:
        # Unexpected non-numeric year: hand back the raw text rather than
        # aborting the whole parse.
        return year

    month = _text_or_none(pubdate_elem.find("Month"))
    day = _text_or_none(pubdate_elem.find("Day"))

    month_num = None
    if month:
        # Accept month names/abbreviations like "Jun" or "June" (case-insensitive) ...
        month_num = _MONTH_ABBREVIATIONS.get(month[:3].lower())
        if month_num is None:
            # ... or a numeric month such as "06".
            try:
                month_num = int(month)
            except ValueError:
                month_num = None
        if month_num is not None and not 1 <= month_num <= 12:
            month_num = None

    if month_num is None:
        return year_part

    if day:
        try:
            return f"{year_part}-{month_num:02d}-{int(day):02d}"
        except ValueError:
            # Non-numeric day: fall through to year-month precision.
            pass
    return f"{year_part}-{month_num:02d}"

In [16]:
def _text_at(node: ET.Element, path: str) -> Optional[str]:
    """Return the text of the first element matching *path* under *node*, or None."""
    return _text_or_none(node.find(path))


def _digits_to_int(text: Optional[str]) -> Optional[int]:
    """Return int(text) when *text* is an all-digit string, otherwise None."""
    if not text or not text.isdigit():
        return None
    try:
        return int(text)
    except ValueError:
        # str.isdigit() accepts a few characters (e.g. superscripts) that int() rejects.
        return None


def _format_history_date(date_elem: ET.Element) -> Optional[str]:
    """Format numeric Year/Month/Day children as YYYY[-MM[-DD]]; None without a numeric Year."""
    year = _digits_to_int(_text_at(date_elem, "Year"))
    if year is None:
        return None
    month = _digits_to_int(_text_at(date_elem, "Month"))
    day = _digits_to_int(_text_at(date_elem, "Day"))
    if not month:
        return f"{year:04d}"
    if day is None:
        return f"{year:04d}-{month:02d}"
    return f"{year:04d}-{month:02d}-{day:02d}"


def _collect_abstract(article: ET.Element) -> Optional[str]:
    """Join all AbstractText parts into one string, prefixing each with its label if any."""
    parts = []
    for node in article.findall(".//Article/Abstract/AbstractText"):
        text = _text_or_none(node)
        if not text:
            continue
        label = node.get("Label") or node.get("NlmCategory")
        parts.append(f"{label}: {text}" if label else text)
    return " ".join(parts).strip() if parts else None


def _collect_authors(article: ET.Element) -> List[Dict]:
    """Build one dict per Author element, including every affiliation listed for it."""
    authors = []
    for node in article.findall(".//Article/AuthorList/Author"):
        last = _text_at(node, "LastName")
        fore = _text_at(node, "ForeName")
        # AffiliationInfo may be present multiple times
        affiliations = []
        for info in node.findall("AffiliationInfo"):
            aff_text = _text_at(info, "Affiliation")
            if aff_text:
                affiliations.append(aff_text)
        if last and fore:
            name = f"{fore} {last}"
        else:
            # some authors have no LastName/ForeName (corporate authors): use CollectiveName
            name = last or fore or _text_at(node, "CollectiveName")
        authors.append({
            "name": name,
            "given_names": fore,
            "family_name": last,
            "initials": _text_at(node, "Initials"),
            "affiliations": affiliations,
            "orcid": None,
            "email": None,
            "contribution_role": [],
            "author_id": None
        })
    return authors


def _collect_mesh_headings(article: ET.Element) -> List[Dict]:
    """Return {"descriptor", "qualifiers"} dicts for each MeshHeading that has a descriptor."""
    headings = []
    for node in article.findall(".//MeshHeadingList/MeshHeading"):
        descriptor = _text_at(node, "DescriptorName")
        if not descriptor:
            continue
        qualifiers = [_text_or_none(q) for q in node.findall("QualifierName")]
        headings.append({
            "descriptor": descriptor,
            "qualifiers": [q for q in qualifiers if q],
        })
    return headings


def _collect_article_ids(article: ET.Element) -> List[Dict]:
    """Return {"type", "value"} dicts from PubmedData/ArticleIdList (pubmed, doi, pii, pmc etc.)."""
    ids = []
    for node in article.findall(".//PubmedData/ArticleIdList/ArticleId"):
        id_type = node.get("IdType")
        value = _text_or_none(node)
        if id_type and value:
            ids.append({"type": id_type.lower(), "value": value})
    return ids


def parse_pubmed_xml(xml_str: str) -> List[Dict]:
    """
    Parse PubMed EFetch XML and return a list of record dicts with expanded fields.
    Safe if some nodes are missing: absent elements yield None or an empty list.

    Parameters:
    - xml_str: the XML document text (as returned by EFetch with retmode=xml)

    Returns:
    - one dict per <PubmedArticle> with the keys: pmid, title, abstract, copyright,
      journal, journal_isoabbrev, journal_issn, start_page, end_page, medline_pgn,
      volume, issue, pub_date, date_completed, date_revised, pubmed_history,
      language, publication_types, authors, mesh_headings, article_ids, doi,
      mesh_terms_simple, nlm_unique_id, medline_ta, country

    Raises:
    - xml.etree.ElementTree.ParseError when *xml_str* is not well-formed XML
    """
    root = ET.fromstring(xml_str)
    records: List[Dict] = []

    for article in root.findall(".//PubmedArticle"):
        rec: Dict = {}

        rec["pmid"] = _text_at(article, ".//MedlineCitation/PMID")
        rec["title"] = _text_at(article, ".//Article/ArticleTitle")
        rec["abstract"] = _collect_abstract(article)
        # Copyright info (if present under Abstract)
        rec["copyright"] = _text_at(article, ".//Article/Abstract/CopyrightInformation")

        # Journal fields; the linking ISSN in MedlineJournalInfo is the fallback
        rec["journal"] = _text_at(article, ".//Article/Journal/Title")
        rec["journal_isoabbrev"] = _text_at(article, ".//Article/Journal/ISOAbbreviation")
        rec["journal_issn"] = (
            _text_at(article, ".//Article/Journal/ISSN")
            or _text_at(article, ".//MedlineJournalInfo/ISSNLinking")
        )

        # Pagination and pages
        rec["start_page"] = _text_at(article, ".//Pagination/StartPage")
        rec["end_page"] = _text_at(article, ".//Pagination/EndPage")
        rec["medline_pgn"] = _text_at(article, ".//Pagination/MedlinePgn")

        # Volume / Issue
        rec["volume"] = _text_at(article, ".//Article/Journal/JournalIssue/Volume")
        rec["issue"] = _text_at(article, ".//Article/Journal/JournalIssue/Issue")

        # Publication date plus DateCompleted / DateRevised under MedlineCitation
        rec["pub_date"] = _parse_pubdate(article.find(".//Article/Journal/JournalIssue/PubDate"))
        rec["date_completed"] = _parse_pubdate(article.find(".//MedlineCitation/DateCompleted"))
        rec["date_revised"] = _parse_pubdate(article.find(".//MedlineCitation/DateRevised"))

        # PubMed history (multiple PubMedPubDate entries, one per PubStatus)
        rec["pubmed_history"] = [
            {"status": entry.get("PubStatus"), "value": _format_history_date(entry)}
            for entry in article.findall(".//PubmedData/History/PubMedPubDate")
        ]

        # Language (first Language element found)
        rec["language"] = (
            _text_at(article, ".//Article/Language")
            or _text_at(article, ".//MedlineCitation/Language")
        )

        # Publication types
        pub_types = [
            _text_or_none(pt)
            for pt in article.findall(".//Article/PublicationTypeList/PublicationType")
        ]
        rec["publication_types"] = [p for p in pub_types if p]

        rec["authors"] = _collect_authors(article)

        mesh_headings = _collect_mesh_headings(article)
        rec["mesh_headings"] = mesh_headings

        article_ids = _collect_article_ids(article)
        rec["article_ids"] = article_ids

        # If DOI present also put it at top-level 'doi' convenience key
        rec["doi"] = next(
            (entry["value"] for entry in article_ids if entry["type"] == "doi"),
            None,
        )

        # Mesh term strings as convenience list (descriptor only)
        rec["mesh_terms_simple"] = [m["descriptor"] for m in mesh_headings]

        # Anything else useful: NLM unique id, MedlineTA (journal short), country
        rec["nlm_unique_id"] = _text_at(article, ".//MedlineJournalInfo/NlmUniqueID")
        rec["medline_ta"] = _text_at(article, ".//MedlineJournalInfo/MedlineTA")
        rec["country"] = _text_at(article, ".//MedlineJournalInfo/Country")

        records.append(rec)

    return records

### Usage:

In [17]:
# parsed = parse_pubmed_xml(xml_data)
# parsed[:1]

[{'pmid': '41519567',
  'title': 'A Case of Mediastinal Adenopathy and Clinically Suspected Myocarditis.',
  'abstract': 'A 29-year-old man originally from an Eastern European country with a high TB incidence had resided in Western Europe for 6 years. He worked in air conditioning repair and currently smoked. He presented to the emergency department with a 10-day history of chest pain, dry cough, and fever. He had no significant medical history, recent travel, animal contact, or insect bites. He denied IV drug use and risk factors for sexually transmitted infections. On admission, his vital signs were stable except for a low-grade fever of 38.5 °C, with no respiratory distress or audible wheezing noted. He was discharged with a diagnosis of a common cold.',
  'copyright': 'Copyright © 2025 American College of Chest Physicians. Published by Elsevier Inc. All rights reserved.',
  'journal': 'Chest',
  'journal_isoabbrev': 'Chest',
  'journal_issn': '1931-3543',
  'start_page': 'e25',
  '

## Normalize to internal schema

In [18]:
def _sha256_of_bytes(b: bytes) -> str:
    import hashlib
    return hashlib.sha256(b).hexdigest()

In [19]:
def normalize_pubmed_record(
    rec: Dict[str, Any],
    raw_ref: Optional[str] = None,
    raw_bytes: Optional[bytes] = None,
    fetched_url: Optional[str] = None,
    harvester_id: str = "health-harvester",
    adapter_name: str = "pubmed_adapter",
    adapter_version: str = "1.0"
) -> Dict[str, Any]:
    """
    Normalize a parsed PubMed record (output of parse_pubmed_xml) to the user's JSON schema.

    Parameters:
    - rec: dict with keys like pmid, doi, title, abstract, authors (list), article_ids (list of {type,value}), pub_date, volume, issue, medline_pgn, journal_issn, mesh_terms_simple, etc. Missing keys are tolerated.
    - raw_ref: storage reference for raw payload (eg s3://...)
    - raw_bytes: optional raw XML bytes (to compute sha256 and size)
    - fetched_url: optional URL used to fetch (useful if adapter fetched via efetch url)
    - harvester_id / adapter_name / adapter_version: copied verbatim into the "ingestion" section

    Returns:
    - the normalized document dict. All timestamps in it share one UTC value
      taken at call time, formatted as ISO 8601 with a "Z" suffix.
    """
    # isoformat() on an aware UTC datetime already ends in "+00:00"; appending "Z"
    # to that would produce an invalid "...+00:00Z", so the offset is replaced.
    now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    source = "pubmed"
    pmid = rec.get("pmid")
    source_id = f"PMID:{pmid}" if pmid else None

    # Build identifiers list (include pmid and any ArticleIdList entries)
    identifiers: List[Dict[str, str]] = []
    if pmid:
        identifiers.append({"type": "pmid", "value": str(pmid)})

    # Add article_ids from parse (rec['article_ids'] expected as list of {"type","value"})
    for aid in rec.get("article_ids") or []:
        id_type = aid.get("type")
        id_value = aid.get("value")
        if id_type and id_value:
            # normalize type names
            identifiers.append({"type": id_type.lower(), "value": id_value})

    # convenience DOI: only added when ArticleIdList did not already supply one
    doi = rec.get("doi")
    if doi and not any(i["type"] == "doi" for i in identifiers):
        identifiers.append({"type": "doi", "value": doi})

    # Authors mapping (rec['authors'] expected as list with name, given_names, family_name, affiliations)
    authors_out = []
    affiliations_agg = []
    for a in rec.get("authors") or []:
        affs = a.get("affiliations") or []
        # aggregate unique affiliations, keeping first-seen order
        for af in affs:
            if af and af not in affiliations_agg:
                affiliations_agg.append(af)

        authors_out.append({
            "name": a.get("name"),
            "given_names": a.get("given_names") or a.get("given") or a.get("fore"),
            "family_name": a.get("family_name") or a.get("last"),
            "affiliations": affs,
            "orcid": a.get("orcid"),
            "email": a.get("email"),
            "contribution_role": a.get("contribution_role") or [],
            "author_id": a.get("author_id") or None
        })

    # Text sections: the abstract is the only section, so its offsets span the
    # whole text: start 0, end len(text) (exclusive end).
    text_sections = []
    abstract_text = rec.get("abstract")
    if abstract_text:
        text_sections.append({
            "label": "abstract",
            "text": abstract_text,
            "start_offset": 0,
            "end_offset": len(abstract_text)
        })

    # Pages: prefer MedlinePgn, else assemble from start/end page
    pages = None
    if rec.get("medline_pgn"):
        pages = rec.get("medline_pgn")
    else:
        s = rec.get("start_page")
        e = rec.get("end_page")
        if s and e:
            pages = f"{s}-{e}"
        elif s:
            pages = s

    # Access: detect PMC ID -> build PMC url, set has_fulltext True
    fulltext_urls = []
    has_fulltext = False
    for ident in identifiers:
        if ident["type"] in ("pmc", "pmcid"):
            pmcval = ident["value"]
            # normalize pmc id to start with PMC
            if not pmcval.upper().startswith("PMC"):
                pmcval = "PMC" + pmcval
            fulltext_urls.append({"url": f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcval}/", "format": "html", "source": "pmc"})
            has_fulltext = True

    # Without a PMC link the access type stays "unknown" (to be enriched later via Unpaywall)
    access_type = "open" if has_fulltext else "unknown"

    # ingestion raw metadata
    raw_size = len(raw_bytes) if raw_bytes is not None else None
    raw_sha = _sha256_of_bytes(raw_bytes) if raw_bytes is not None else None

    # Determine published_date: prefer rec['pub_date'] else date_completed else pubmed_history.
    # In the history, a 'pubmed' entry wins (stop at the first one); 'medline'
    # entries are only a provisional value while the scan continues.
    published_date = rec.get("pub_date") or rec.get("date_completed")
    if not published_date:
        for d in rec.get("pubmed_history") or []:
            if d.get("status") == "pubmed" and d.get("value"):
                published_date = d.get("value")
                break
            if d.get("status") == "medline" and d.get("value"):
                published_date = d.get("value")

    # Mesh terms: prefer rec['mesh_terms_simple'] if available, else descriptors
    # from structured mesh_headings (entries without a descriptor are skipped).
    mesh_terms = rec.get("mesh_terms_simple") or [
        m.get("descriptor")
        for m in rec.get("mesh_headings") or []
        if isinstance(m, dict) and m.get("descriptor")
    ]

    # Build journal object
    journal_obj = {
        "name": rec.get("journal"),
        "issn": rec.get("journal_issn"),
        "publisher": None,
        "volume": rec.get("volume"),
        "issue": rec.get("issue"),
        "pages": pages
    }

    # processing dedup key: use DOI if present, else PMID
    dedup_key = None
    if doi:
        dedup_key = doi.lower()
    elif pmid:
        dedup_key = f"pmid:{pmid}"

    # put publication types into tags
    tags = rec.get("publication_types") or []

    # document_id is deterministic when a PMID or DOI exists; with neither, a
    # random UUID is hashed, so the ID differs on every call.
    if source_id:
        document_id = make_document_id(source, source_id)
    else:
        document_id = make_document_id(source, str(doi or uuid.uuid4()))

    document = {
        "schema_version": "1.0",
        "document_id": document_id,
        "source": source,
        "source_id": source_id,
        "identifiers": identifiers,
        "title": rec.get("title"),
        "subtitle": None,
        "authors": authors_out,
        "affiliations": [{"affiliation": a} for a in affiliations_agg] if affiliations_agg else [],
        "abstract": abstract_text,
        "plain_text": abstract_text,
        "text_sections": text_sections,
        "content_type": "journal_article",
        "language": rec.get("language") or "en",
        "keywords": rec.get("keywords") or [],
        "mesh_terms": mesh_terms,
        "topics": rec.get("topics") or [],
        "published_date": published_date,
        "journal": journal_obj,
        "license": {
            "type": "unknown",
            "url": None,
            "notes": None
        },
        "access": {
            "has_fulltext": bool(has_fulltext),
            "access_type": access_type,
            "fulltext_urls": fulltext_urls
        },
        "figures": [],
        "tables": [],
        "supplementary_files": [],
        "methods_summary": None,
        "data_availability": {},
        "funding": rec.get("funding") or [],
        "conflicts_of_interest": rec.get("conflicts_of_interest"),
        "ethics": {},
        "clinical_trial": rec.get("clinical_trial") or {},
        "references": rec.get("references") or [],
        "metrics": {},
        "ingestion": {
            "ingested_at": now,
            "harvester_id": harvester_id,
            "adapter": {
                "name": adapter_name,
                "version": adapter_version,
                "fetched_at": now,
                "source_fetch_url": fetched_url
            },
            "raw_payload_ref": raw_ref,
            "raw_format": "xml",
            "raw_size_bytes": raw_size,
            "raw_sha256": raw_sha,
            "source_last_modified": None
        },
        "processing": {
            "normalized_at": now,
            "language_detected": rec.get("language") or "en",
            "dedup_key": dedup_key,
            "canonical_id": doi.lower() if doi else None,
            "quality_flags": [],
            "processing_notes": None
        },
        "tags": tags,
        "security": {},
        "last_updated": now
    }

    return document

### Usage:

In [20]:
# normalized_docs = [
#     normalize_pubmed_record(rec, raw_ref="pubmed_raw.xml")
#     for rec in parsed
# ]
# normalized_docs[0]

{'schema_version': '1.0',
 'document_id': '9ab9863e0578bb7b959e25817bf2cb5e4d3a741758f95845859470313c11ea39',
 'source': 'pubmed',
 'source_id': 'PMID:41519567',
 'identifiers': [{'type': 'pmid', 'value': '41519567'},
  {'type': 'pubmed', 'value': '41519567'},
  {'type': 'doi', 'value': '10.1016/j.chest.2025.07.4094'},
  {'type': 'pii', 'value': 'S0012-3692(25)05144-X'}],
 'title': 'A Case of Mediastinal Adenopathy and Clinically Suspected Myocarditis.',
 'subtitle': None,
 'authors': [{'name': 'Alexandre Terré',
   'given_names': 'Alexandre',
   'family_name': 'Terré',
   'affiliations': ['Service de Médecine Interne, APHP, Hôpital Ambroise Paré; Université de Versailles Saint-Quentin-en-Yvelines, Boulogne Billancourt, France. Electronic address: alexandre.terre@aphp.fr.'],
   'orcid': None,
   'email': None,
   'contribution_role': [],
   'author_id': None},
  {'name': 'Catherine Julié',
   'given_names': 'Catherine',
   'family_name': 'Julié',
   'affiliations': ["Service d'Anatomie

## Save raw + normalized output

In [23]:
def save_normalized_pubmed(normalized_docs, xml_data=None, storage_dir: Path = STORAGE_DIR) -> dict:
    """
    Saves the raw XML and normalized JSON of PubMed data.

    Existing files with the same names in *storage_dir* are overwritten.

    Parameters:
    - normalized_docs: list/dict of normalized documents (must be JSON-serializable)
    - xml_data: optional raw XML string to save as "pubmed_raw.xml"
    - storage_dir: Path object for where to save files (created if missing)

    Returns:
    - dict of Path objects: always 'normalized_json_path', plus 'raw_xml_path'
      when xml_data was given
    """
    storage_dir.mkdir(parents=True, exist_ok=True)

    paths = {}

    if xml_data is not None:
        raw_path = storage_dir / "pubmed_raw.xml"
        # newline="" turns off newline translation. In default text mode every
        # "\n" is rewritten to os.linesep ("\r\n" on Windows), so the stored
        # "raw" payload would no longer match what was fetched.
        with open(raw_path, "w", encoding="utf-8", newline="") as f:
            f.write(xml_data)
        paths["raw_xml_path"] = raw_path

    normalized_path = storage_dir / "pubmed_normalized.json"
    with open(normalized_path, "w", encoding="utf-8") as f:
        json.dump(normalized_docs, f, indent=2)
    paths["normalized_json_path"] = normalized_path

    return paths
