In [1]:
import os
import time
import re
import pandas as pd
import xml.etree.ElementTree as ET
from tqdm import tqdm
from Bio import Entrez

Entrez.email = "nd23942@bristol.ac.uk"
INPUT_FILE = "../02_extract_results_CITATIONS/raw_abstracts.csv"
OUTPUT_FILE = "pmc_extracted_sections_sample.csv"
SECTION_KEYWORDS = ["discussion", "conclusion", "summary"]

In [2]:
df = pd.read_csv(INPUT_FILE)
pmids = df["PMID"].dropna().astype(str).unique().tolist()
print(f"Loaded {len(pmids)} PMIDs")

Loaded 696 PMIDs


In [3]:
def pmid_to_pmcid(pmids):
    pmcid_map = {}
    batch_size = 200
    for i in tqdm(range(0, len(pmids), batch_size), desc="Converting PMIDs"):
        batch = pmids[i:i+batch_size]
        try:
            handle = Entrez.elink(dbfrom="pubmed", db="pmc", id=batch, linkname="pubmed_pmc")
            records = Entrez.read(handle)
            handle.close()
            for record in records:
                pmid = record['IdList'][0]
                pmcid = None
                if record.get("LinkSetDb"):
                    for linkset in record["LinkSetDb"]:
                        if linkset["LinkName"] == "pubmed_pmc":
                            pmcid = linkset["Link"][0]["Id"]
                            break
                pmcid_map[pmid] = pmcid
        except Exception as e:
            print(f"Batch error at {i}: {e}")
        time.sleep(0.34)
    return pmcid_map

pmcid_map = pmid_to_pmcid(pmids)
df["PMCID"] = df["PMID"].astype(str).map(pmcid_map)
print(f"Matched PMCID for {df['PMCID'].notna().sum()}/{len(df)} PMIDs")
df.to_csv("pmid_pmcid_mapping.csv", index=False)

Converting PMIDs: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:14<00:00, 18.61s/it]

Matched PMCID for 221/696 PMIDs





In [13]:
def fetch_pmc_fulltext(pmcid, verbose=False):
    try:
        handle = Entrez.efetch(db="pmc", id=pmcid, rettype="full", retmode="xml")
        xml_bytes = handle.read()
        handle.close()

        # Decode bytes to string
        xml_data = xml_bytes.decode("utf-8", errors="replace")

        if verbose:
            print(f"\n🔍 Preview of PMC {pmcid} XML:\n{xml_data[:300]}\n")

        # Check if it's a valid PMC XML document
        if not xml_data.strip().startswith("<?xml") or "<article" not in xml_data:
            print(f"⚠️ Invalid or malformed XML for PMCID {pmcid}")
            return None

        return xml_data

    except Exception as e:
        print(f"Fetch error for PMCID {pmcid}: {e}")
        return None

In [17]:
def extract_target_section(xml_data, target_keywords=['discussion', 'conclusion', 'summary']):
    try:
        root = ET.fromstring(xml_data)
        sections = root.findall(".//sec")

        for sec in sections:
            title_elem = sec.find("title") 
            if title_elem is not None and title_elem.text:
                title_text = title_elem.text.strip().lower()
                for keyword in target_keywords:
                    if keyword in title_text:
                        paras = [p.text.strip() for p in sec.findall("p") if p.text]
                        full_text = " ".join(paras)
                        if full_text:
                            return full_text, title_text
        return None, None
    except Exception as e:
        print(f"XML parsing error: {e}")
        return None, None

In [18]:
sample_df = df[df['PMCID'].notna()].head(30)
results = []

for _, row in tqdm(sample_df.iterrows(), total=len(sample_df), desc="Extracting sections"):
    pmid, pmcid = row["PMID"], row["PMCID"]
    xml_data = fetch_pmc_fulltext(pmcid)
    if xml_data:
        text, section_name = extract_target_section(xml_data)
        status = "success" if text else "section_not_found"
    else:
        text, section_name = None, None
        status = "fetch_error"

    results.append({
        "PMID": pmid,
        "PMCID": pmcid,
        "Matched_Section": section_name,
        "Extracted_Section": text,
        "Status": status
    })
    time.sleep(0.34)

Extracting sections: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:26<00:00,  1.15it/s]


In [19]:
results_df = pd.DataFrame(results)
results_df.to_csv(OUTPUT_FILE, index=False)

print("\nExtraction Summary:")
print(results_df["Status"].value_counts())
print("\nSection Type Frequency:")
print(results_df["Matched_Section"].value_counts(dropna=False))


Extraction Summary:
Status
success              19
section_not_found    11
Name: count, dtype: int64

Section Type Frequency:
Matched_Section
None                            11
discussion                       4
conclusion                       3
authors' conclusions             2
conclusions:                     2
conclusions                      2
3. discussion                    2
summary of the evidence          1
conclusions and perspectives     1
3.3. result summary              1
4. discussion                    1
Name: count, dtype: int64
