# Main Harvester Pipeline (PubMed → Unpaywall)

This notebook orchestrates:
1. PubMed harvesting
2. PubMed XML parsing
3. Normalization into internal `health_document` schema
4. DOI extraction
5. Unpaywall OA enrichment using DOI

Adapters used:
- pubmed_adapter
- unpaywall_adapter

## Dependencies

In [4]:
import json
from pathlib import Path
import time
import random

## Storage directory configuration

In [5]:
PROJECT_ROOT = Path.cwd()          # main.ipynb directory
STORAGE_DIR = PROJECT_ROOT / "storage"

STORAGE_DIR.mkdir(parents=True, exist_ok=True)

print("Storage dir:", STORAGE_DIR.resolve())

Storage dir: C:\Users\Aman Sheikh\Desktop\Projects\VeriFact\Model\harvester\storage


## Importing python notebooks

In [3]:
%run adapters/adapter_pubmed.ipynb
%run adapters/adapter_unpaywall.ipynb

## pubmed Keyword Search: (Add dynamic keyword mechanism later)

In [6]:
HARVEST_DIR = Path("./storage/outputs/harvest_outputs")   # change if different
RETRIEVE_MAX = 5                        # or larger if you want more hits per query
MAX_RETRIES = 3
BACKOFF_BASE = 1.0                      # seconds, exponential backoff

In [7]:
def safe_pubmed_search(query, retmax=RETRIEVE_MAX):
    """Call pubmed_search with a simple retry/backoff wrapper."""
    last_exc = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            # pubmed_search is your existing function
            return pubmed_search(query, retmax=retmax)
        except Exception as e:
            last_exc = e
            if attempt == MAX_RETRIES:
                raise
            sleep_for = BACKOFF_BASE * (2 ** (attempt - 1)) + random.uniform(0, 0.3)
            print(f"[retry] query failed (attempt {attempt}) -> sleeping {sleep_for:.2f}s")
            time.sleep(sleep_for)
    raise last_exc

In [8]:
unique_pmids = set()
pmid_to_source = {}   # first-seen mapping pmid -> {"seed":..., "query":...}

harvest_files = sorted(HARVEST_DIR.glob("*.json"))
if not harvest_files:
    raise RuntimeError(f"No harvest files found in {HARVEST_DIR}")

RuntimeError: No harvest files found in storage\outputs\harvest_outputs

In [1]:
total_queries = 0
for hf in harvest_files:
    with open(hf, "r", encoding="utf-8") as f:
        data = json.load(f)
    seed = data.get("label", hf.stem)
    harvest_list = data.get("result", {}).get("harvest_list", []) or []
    total_queries += len(harvest_list)

print(f"Found {len(harvest_files)} seed files, {total_queries} total queries to run.")

NameError: name 'harvest_files' is not defined

In [None]:
processed_count = 0
for hf in harvest_files:
    with open(hf, "r", encoding="utf-8") as f:
        data = json.load(f)
    seed = data.get("label", hf.stem)
    harvest_list = data.get("result", {}).get("harvest_list", []) or []

    for query in harvest_list:
        processed_count += 1
        try:
            search_resp = safe_pubmed_search(query, retmax=RETRIEVE_MAX)
        except Exception as e:
            print(f"[ERROR] search failed for seed={seed!r}, query={query!r}: {e}")
            continue

        # Grab pmids safely from the result structure
        pmids = search_resp.get("esearchresult", {}).get("idlist", [])
        new_found = 0
        for pmid in pmids:
            if pmid not in unique_pmids:
                unique_pmids.add(pmid)
                pmid_to_source[pmid] = {"seed": seed, "query": query}
                new_found += 1

        print(f"[{processed_count}/{total_queries}] {seed!r} → {query!r} -> found {len(pmids)} pmids, {new_found} new, {len(unique_pmids)} unique so far")

        # gentle jitter/rate-limit so you don't hammer NCBI
        time.sleep(random.uniform(0.34, 0.6))

In [None]:
# Save results
unique_list = list(unique_pmids)
with open("unique_pmids.json", "w", encoding="utf-8") as f:
    json.dump(unique_list, f, indent=2)

with open("pmid_to_source.json", "w", encoding="utf-8") as f:
    json.dump(pmid_to_source, f, indent=2)

print(f"Done. Unique PMIDs collected: {len(unique_list)}. Saved to unique_pmids.json")

In [4]:
# # Example: Start by using keyword search:
# SEARCH_KEYWORD='Cow breeding'
# RETRIEVE_MAX=5
# search_resp = pubmed_search(SEARCH_KEYWORD, retmax=RETRIEVE_MAX)
# # Get the pmids:
# pmids = search_resp["esearchresult"]["idlist"]

## pubmed Fetch

In [5]:
print("ℹ️ Fetching PubMed XML...")
pubmed_xml = pubmed_fetch(unique_list)
print("✅ PubMed XML fetched")

ℹ️ Fetching PubMed XML...
✅ PubMed XML fetched


## Parse fetched XML records

In [6]:
print("ℹ️ Parsing PubMed XML...")
parsed_pubmed_records = parse_pubmed_xml(pubmed_xml)

print(f"✅ Parsed {len(parsed_pubmed_records)} PubMed records")
print(f"ℹ️ Showing the first PubMed record:")
parsed_pubmed_records[0]

ℹ️ Parsing PubMed XML...
✅ Parsed 5 PubMed records
ℹ️ Showing the first PubMed record:


{'pmid': '41518224',
 'title': 'Effect of Insemination Timing Within a TAI Program on Fertility Using Sex-Sorted Semen in Lactating Dairy Cows.',
 'abstract': 'The objective of this study was to evaluate the effect of insemination timing with sex-sorted semen on fertility in dairy cows subjected to a timed artificial insemination (TAI) protocol. A total of 611 Holstein cows (46\u2009±\u20093 DIM) were enrolled and subjected to a presynchronized Ovsynch protocol (G7G; PGF₂α-2d-GnRH-7d-GnRH-7d-PGF₂α-56\u2009h-GnRH), and randomly allocated to four treatment groups. The control group (CONV-14, n = 154) was inseminated with conventional semen 14\u2009h after the final GnRH, while cows in the sex-sorted semen groups were inseminated at 14 (SS-14, n\u2009=\u2009152), 18 (SS-18, n\u2009=\u2009153), or 22\u2009h (SS-22, n\u2009=\u2009152) after the same treatment. The same bull was used for all inseminations. All cows were examined by ultrasonography to individually evaluate ovarian responses t

## Normalize the parsed records (in list of Maps)

In [7]:
normalized_documents = []

for rec in parsed_pubmed_records:
    doc = normalize_pubmed_record(
        rec,
        raw_ref="pubmed_raw.xml"
    )
    normalized_documents.append(doc)

print(f"✅ Normalized {len(normalized_documents)} documents")

print(f"ℹ️ Showing the first normalized PubMed record:")
normalized_documents[0]

✅ Normalized 5 documents
ℹ️ Showing the first normalized PubMed record:


{'schema_version': '1.0',
 'document_id': '8b30975f7f70338e35bed64be6fd37b62d033d83fc44275c27c19da8d167d5ad',
 'source': 'pubmed',
 'source_id': 'PMID:41518224',
 'identifiers': [{'type': 'pmid', 'value': '41518224'},
  {'type': 'pubmed', 'value': '41518224'},
  {'type': 'doi', 'value': '10.1111/rda.70174'}],
 'title': 'Effect of Insemination Timing Within a TAI Program on Fertility Using Sex-Sorted Semen in Lactating Dairy Cows.',
 'subtitle': None,
 'authors': [{'name': 'Enes Serim',
   'given_names': 'Enes',
   'family_name': 'Serim',
   'affiliations': ['Department of Veterinary Obstetrics and Gynecology, Graduate School of Health Sciences, Bursa Uludag University, Bursa, Turkiye.'],
   'orcid': None,
   'email': None,
   'contribution_role': [],
   'author_id': None},
  {'name': 'Ebru Karakaya-Bilen',
   'given_names': 'Ebru',
   'family_name': 'Karakaya-Bilen',
   'affiliations': ['Department of Obstetrics and Gynecology, Faculty of Ceyhan Veterinary Medicine, Cukurova University

## Save the Normalized pubmed Record (JSON & XML)

In [8]:
paths = save_normalized_pubmed(normalized_documents, pubmed_xml)
print(f"Saved raw XML: {paths['raw_xml_path']}")
print(f"Saved normalized JSON: {paths['normalized_json_path']}")

Saved raw XML: C:\Users\Aman Sheikh\Desktop\Projects\VeriFact\Model\harvester\storage\pubmed_raw.xml
Saved normalized JSON: C:\Users\Aman Sheikh\Desktop\Projects\VeriFact\Model\harvester\storage\pubmed_normalized.json


## Enrich documents with Unpaywall (multithreading)

In [9]:
# Enrich documents with Unpaywall (clean notebook output)

import os
import time
import random
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

MAX_WORKERS = 10
MAX_RETRIES = 5
BACKOFF_BASE = 1  # seconds

# Thread-safe print
print_lock = threading.Lock()


def safe_print(*args, **kwargs):
    with print_lock:
        print(*args, **kwargs)


os.makedirs("output", exist_ok=True)


def process_doc(doc):
    identifiers = doc.get("identifiers", [])
    doi = extract_doi(identifiers)

    if not doi:
        return ("skipped", doc)

    try:
        unpay_json = None
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                unpay_json = fetch_unpaywall(doi)
                break
            except Exception as e:
                if attempt == MAX_RETRIES:
                    raise
                backoff = BACKOFF_BASE * (2 ** (attempt - 1)) + random.uniform(0, 0.2)
                time.sleep(backoff)

        # tiny jitter to avoid hammering API
        time.sleep(random.uniform(0.05, 0.2))

        raw_unpay_path = save_unpaywall_raw(doi, unpay_json)
        with open(raw_unpay_path, "rb") as f:
            raw_bytes = f.read()

        enriched_doc = enrich_document_with_unpaywall(
            document=doc,
            unpay_json=unpay_json,
            raw_ref=raw_unpay_path,
            raw_bytes=raw_bytes
        )

        return ("ok", enriched_doc, raw_unpay_path)

    except Exception as e:
        return ("error", doc, str(e))


# Run threaded processing
enriched_documents = []
errors = []
total = len(normalized_documents)

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    future_to_doc = {executor.submit(process_doc, doc): doc for doc in normalized_documents}

    for i, future in enumerate(as_completed(future_to_doc), 1):
        result = future.result()
        status = result[0]

        if status == "ok":
            enriched_doc, raw_path = result[1], result[2]
            enriched_documents.append(enriched_doc)
            safe_print(f"✅ [{i}/{total}] {enriched_doc['document_id']} saved")
        elif status == "skipped":
            enriched_documents.append(result[1])
            safe_print(f"⚪ [{i}/{total}] {result[1]['document_id']} skipped (no DOI)")
        else:
            doc, err_msg = result[1], result[2]
            errors.append((doc.get("document_id"), err_msg))
            enriched_documents.append(doc)
            safe_print(f"❌ [{i}/{total}] {doc['document_id']} error: {err_msg}")

safe_print(f"✅ Unpaywall enrichment complete — {total} docs, {len(errors)} errors")
safe_print(f"✅ Enriched {len(enriched_documents)} docs")

✅ [1/5] 10ac5f27529e73a6885c25433b767a42e13b54f26b8b5b68c1dd19c4ffedd251 saved
✅ [2/5] 9027c2413e299c7dfb24977743b56368409a56b7c58ea47eddb724b44b12b44a saved
✅ [3/5] 8b30975f7f70338e35bed64be6fd37b62d033d83fc44275c27c19da8d167d5ad saved
✅ [4/5] 065269ab1acbba8c079afb68bcc32b9236d59cba3e488415624a99a01d068c74 saved
✅ [5/5] 667501dff77dd83b730729e3afcf37e81d5931ffe7b97e3c3d092de56c6c6188 saved
✅ Unpaywall enrichment complete — 5 docs, 0 errors
✅ Enriched 5 docs


## Display enriched-only data for first record

In [10]:
final_doc = enriched_documents[0]
print(f"Showing the first PubMed enrichment-only data:")
print(json.dumps({
    "document_id": final_doc["document_id"],
    "source": final_doc["source"],
    "identifiers": final_doc["identifiers"],
    "access": final_doc["access"],
    "license": final_doc["license"],
    "tags": final_doc["tags"]
}, indent=2))


Showing the first PubMed enrichment-only data:
{
  "document_id": "10ac5f27529e73a6885c25433b767a42e13b54f26b8b5b68c1dd19c4ffedd251",
  "source": "pubmed",
  "identifiers": [
    {
      "type": "pmid",
      "value": "41514840"
    },
    {
      "type": "pubmed",
      "value": "41514840"
    },
    {
      "type": "doi",
      "value": "10.3390/ani16010153"
    },
    {
      "type": "pii",
      "value": "ani16010153"
    }
  ],
  "access": {
    "has_fulltext": true,
    "access_type": "open",
    "fulltext_urls": [
      {
        "url": "https://doi.org/10.3390/ani16010153",
        "format": "html",
        "source": "unpaywall:publisher"
      }
    ]
  },
  "license": {
    "type": "cc-by",
    "url": null,
    "notes": null
  },
  "tags": [
    "Journal Article",
    "oa"
  ]
}


## Output each records in separate files

In [11]:
os.makedirs("output", exist_ok=True)


for doc in enriched_documents:
    # Create an output path name. (using the document id now) change later lol.
    output_path = f"output/{doc['document_id']}.json"
    # Open the file for writing
    with open(output_path, "w", encoding="utf-8") as f:
        # Dump the JSON value in the file (fall back to string if we couldn't serialize)
        json.dump(doc, f, indent=2, default=str)

    print("Saved:", output_path)

Saved: output/10ac5f27529e73a6885c25433b767a42e13b54f26b8b5b68c1dd19c4ffedd251.json
Saved: output/9027c2413e299c7dfb24977743b56368409a56b7c58ea47eddb724b44b12b44a.json
Saved: output/8b30975f7f70338e35bed64be6fd37b62d033d83fc44275c27c19da8d167d5ad.json
Saved: output/065269ab1acbba8c079afb68bcc32b9236d59cba3e488415624a99a01d068c74.json
Saved: output/667501dff77dd83b730729e3afcf37e81d5931ffe7b97e3c3d092de56c6c6188.json
