In [None]:
"""
Function: download_from_pmcid

Description:
    This script fetches article pages from PubMed Central (PMC).
    For each PMCID in the input file, it checks the PMC webpage to determine:
        - Whether a "PDF" download button is available.
        - Whether there are one or more "Supplementary" file links.
    If found, the script downloads the PDF and all supplementary files.

Outputs:
    - missing_row_pmcid_results_temp.csv : temporary log file (updated every 10 articles),
      recording whether each article has a downloadable PDF, whether supplementary files exist,
      and how many were found.
    - missing_row_pmcid_results.csv : final summary file after all downloads are complete.
    - failed.txt : records supplementary files that failed to download.

Parameters:
    csv_path : str
        Path to the input CSV file containing the PMCID list.
    out_dir : str
        Path to the output directory where downloaded files will be saved.
        Each article will be stored as:
            {PMID}.pdf  → main article
            {PMID}_supp_{i}_{original file name}  → supplementary files

Notes:
    Some failed or error cases may require manual fixing.
"""


In [None]:
import os
import re
import time
from pathlib import Path
from wsgiref import headers

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
def normalize_col(col: str) -> str:
    """Return a canonical form of a column header for loose matching.

    Surrounding whitespace is removed, the text is lower-cased, and every run
    of characters other than ``a-z`` / ``0-9`` is collapsed into a single
    underscore.
    """
    lowered = col.strip().lower()
    return re.sub(r"[^a-z0-9]+", "_", lowered)

def find_col(cols, candidates):
    """Find the actual column name that matches one of ``candidates``.

    Both the existing column names and the candidates are compared in their
    ``normalize_col`` form. Candidates are tried in order and the original
    (un-normalized) name of the first match is returned; ``None`` is returned
    when nothing matches.
    """
    # Normalized name -> original name (a later duplicate overrides an earlier one).
    lookup = {}
    for original in cols:
        lookup[normalize_col(original)] = original

    wanted = (normalize_col(cand) for cand in candidates)
    return next((lookup[key] for key in wanted if key in lookup), None)

def download_file(url, out_path, chunk=1024*256, timeout=10, headless=False, user_agent=None):
    """Download ``url`` to ``out_path``, falling back to headless Chrome.

    A plain HTTP request is tried first. The response is streamed to disk when
    it succeeded (status < 400) and looks like a file: a PDF or octet-stream
    content type, an ``attachment`` Content-Disposition, or a URL ending in
    ``.pdf`` that was not answered with an HTML page. If that does not produce
    a file, the URL is opened in Chrome via Selenium and the function waits
    for a browser download, clicking likely download links if nothing starts
    on its own.

    Args:
        url: Address to download; ``http://`` is prepended when it has no scheme.
        out_path: Destination file path, or an existing directory in which the
            file is stored under the name reported by the server/browser.
        chunk: Chunk size in bytes used when streaming the HTTP response.
        timeout: Seconds allowed for the HTTP request and for each wait for a
            browser download to appear.
        headless: Accepted for backward compatibility but not consulted;
            Chrome is always started with ``--headless=new``.
        user_agent: User-Agent header for the HTTP request
            (defaults to ``"Mozilla/5.0"``).

    Returns:
        Absolute path of the saved file.

    Raises:
        RuntimeError: If neither the HTTP request nor the browser produced a file.
        Exception: Errors from importing or starting Selenium/Chrome propagate.
    """
    import contextlib
    import os
    import re
    import shutil
    import tempfile
    import time
    from urllib.parse import unquote, urlparse

    import requests

    partial_suffixes = (".crdownload", ".part", ".partial", ".tmp")

    def _safe_filename_from_cd(cd):
        """Extract the file name from a Content-Disposition header, if any."""
        if not cd:
            return None
        m = re.search(r'filename\*=UTF-8\'\'([^;]+)', cd, re.IGNORECASE)
        if m:
            return unquote(m.group(1))
        m = re.search(r'filename=\"?([^\";]+)\"?', cd)
        if m:
            return m.group(1)
        return None

    def _target_for(name):
        """Return the final path for a file the server/browser called ``name``."""
        if os.path.isdir(out_path):
            # basename() keeps a server-supplied name from escaping out_path.
            return os.path.join(out_path, os.path.basename(name))
        return out_path

    def _stream_save(resp, path, chunk_size):
        """Stream ``resp`` into ``path`` through a temporary ``.part`` file."""
        tmp = path + ".part"
        try:
            with open(tmp, "wb") as f:
                for chunk_ in resp.iter_content(chunk_size=chunk_size):
                    if chunk_:
                        f.write(chunk_)
            os.replace(tmp, path)
        except BaseException:
            # Never leave a half-written temporary file behind.
            with contextlib.suppress(OSError):
                os.remove(tmp)
            raise
        return os.path.abspath(path)

    def _wait_for_download(tmpdir, before_set, wait_timeout):
        """Poll ``tmpdir`` until a new, finished file appears or time runs out."""
        end = time.time() + wait_timeout
        last_candidate = None
        while time.time() < end:
            new = set(os.listdir(tmpdir)) - before_set
            ready = [f for f in new if not f.endswith(partial_suffixes)]
            if ready:
                try:
                    ready.sort(key=lambda x: os.path.getmtime(os.path.join(tmpdir, x)))
                    candidate = os.path.join(tmpdir, ready[-1])
                    # A non-empty file whose size is stable is treated as complete.
                    s1 = os.path.getsize(candidate)
                    time.sleep(0.1)
                    s2 = os.path.getsize(candidate)
                except OSError:
                    # The browser renamed/removed the file between listing and stat.
                    time.sleep(0.1)
                    continue
                if s1 == s2 and s1 > 0:
                    return candidate
                last_candidate = candidate
            time.sleep(0.1)
        return last_candidate if last_candidate and os.path.exists(last_candidate) else None

    parsed = urlparse(url)
    if not parsed.scheme:
        url = "http://" + url
        parsed = urlparse(url)

    out_path = os.path.abspath(out_path)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    headers = {"User-Agent": user_agent or "Mozilla/5.0"}

    # --- Attempt 1: plain HTTP download -------------------------------------
    try:
        with requests.Session() as session:
            with session.get(url, stream=True, timeout=timeout,
                             allow_redirects=True, headers=headers) as resp:
                ctype = resp.headers.get("Content-Type", "").lower()
                cd = resp.headers.get("Content-Disposition", "")
                looks_like_file = (
                    "application/pdf" in ctype
                    or "application/octet-stream" in ctype
                    or "attachment" in cd.lower()
                    # Trust a ".pdf" URL only if the server did not answer with
                    # an HTML page (error, login or challenge page).
                    or (url.lower().endswith(".pdf") and "text/html" not in ctype)
                )
                # resp.ok is False for 4xx/5xx, whose body must not be saved.
                if resp.ok and looks_like_file:
                    fname = (
                        os.path.basename(_safe_filename_from_cd(cd) or "")
                        or os.path.basename(urlparse(url).path)
                        or f"file_{int(time.time())}.pdf"
                    )
                    return _stream_save(resp, _target_for(fname), chunk)
    except Exception:
        # Best effort: any failure here falls through to the browser attempt.
        pass

    # --- Attempt 2: let headless Chrome perform the download ----------------
    # Imported only now so the HTTP path above works without Selenium installed.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager

    tmpdir = tempfile.mkdtemp(prefix="selenium_dl_")
    driver = None
    try:
        prefs = {
            "download.default_directory": os.path.abspath(tmpdir),
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "plugins.always_open_pdf_externally": True,
        }
        opts = Options()
        opts.add_argument("--headless=new")
        opts.add_argument("--no-sandbox")
        opts.add_argument("--disable-dev-shm-usage")
        opts.add_argument("--window-size=1400,900")
        opts.add_experimental_option("prefs", prefs)

        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=opts)

        origin = f"{parsed.scheme}://{parsed.netloc}"
        before = set(os.listdir(tmpdir))

        # Visit the site root first, then the actual URL.
        driver.get(origin)
        time.sleep(0.5)
        driver.get(url)
        time.sleep(1)

        maybe = _wait_for_download(tmpdir, before, timeout)
        if maybe:
            target = _target_for(maybe)
            shutil.move(maybe, target)
            return os.path.abspath(target)

        # Nothing downloaded on its own: click elements that look like download links.
        xpaths = [
            "//a[contains(@href, '.pdf')]",
            "//a[@download]",
            "//a[contains(translate(text(),'PDF','pdf'),'pdf')]",
            "//button[contains(translate(text(),'DOWNLOAD','download'),'download')]",
        ]
        for xp in xpaths:
            for el in driver.find_elements("xpath", xp):
                try:
                    el.click()
                    maybe = _wait_for_download(tmpdir, before, timeout)
                    if maybe:
                        target = _target_for(maybe)
                        shutil.move(maybe, target)
                        return os.path.abspath(target)
                except Exception:
                    # An element that cannot be clicked is skipped; try the next one.
                    continue

        raise RuntimeError("no file")

    finally:
        try:
            if driver:
                time.sleep(0.1)
                driver.quit()
        finally:
            # Remove the scratch directory even if quitting the browser failed.
            shutil.rmtree(tmpdir, ignore_errors=True)

In [None]:
import numpy as np
def download_from_pmcid(pmcid, pmid, out_dir, session):
    """Download the main PDF and the supplementary files of one PMC article.

    The article page is fetched once with ``session``. If it contains a link
    to the article PDF, the PDF is saved as ``{pmid}.pdf``. Every distinct
    link containing both ``/articles/instance/`` and ``/bin/`` is treated as a
    supplementary file and saved as ``{pmid}_supp_{i}_{original name}``.
    Supplementary downloads that fail are appended to ``failed.txt`` in the
    current working directory.

    Args:
        pmcid: PMC identifier used to build the article URL.
        pmid: Identifier used as the prefix of the saved file names.
        out_dir: Directory in which the files are stored.
        session: ``requests.Session`` used to fetch the article page.

    Returns:
        dict with the keys ``pmcid``, ``pmid``, ``pdf`` (True when a PDF link
        was found on the page, regardless of whether the download succeeded)
        and ``supp`` (number of supplementary files downloaded successfully).
    """
    base_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/"
    pdf_url = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmcid}/pdf"
    supp_dir = Path(out_dir)
    main_pdf_path = supp_dir / f"{pmid}.pdf"

    results = {"pmcid": pmcid, "pmid": pmid, "pdf": False, "supp": 0}

    # Fetch the article page once; a network error must not abort the batch.
    try:
        r = session.get(base_url, timeout=10)
    except requests.RequestException as e:
        print(f"cannot reach {base_url}: {e}")
        return results
    if r.status_code != 200:
        print(f"cannot reach ({r.status_code}): {base_url}")
        return results

    soup = BeautifulSoup(r.text, "html.parser")
    hrefs = [a["href"] for a in soup.find_all("a", href=True)]

    # --- main PDF ------------------------------------------------------------
    results["pdf"] = any(
        pdf_url in href or re.search(r"pdf/[^/]+\.pdf", href, re.IGNORECASE)
        for href in hrefs
    )
    if results["pdf"]:
        try:
            # download_file manages its own HTTP session; its third positional
            # parameter is the chunk size, so the session must not be passed.
            download_file(pdf_url, main_pdf_path)
        except Exception as e:
            print(f"pdf download false {pmcid}: {e}")

    # --- supplementary files ---------------------------------------------------
    # dict.fromkeys de-duplicates while preserving the order on the page.
    supp_links = list(dict.fromkeys(
        href for href in hrefs
        if "/articles/instance/" in href and "/bin/" in href
    ))

    for i, link in enumerate(supp_links, 1):
        if link.startswith("//"):
            url = "https:" + link
        elif link.startswith("/pmc/"):
            url = "https://www.ncbi.nlm.nih.gov" + link
        elif not link.startswith("http"):
            # Site-relative link; lstrip also copes with a missing leading slash.
            url = "https://pmc.ncbi.nlm.nih.gov/" + link.lstrip("/")
        else:
            url = link

        orig_name = os.path.basename(url.split("?")[0])
        supp_name = supp_dir / f"{pmid}_supp_{i}_{orig_name}"

        try:
            download_file(url, supp_name)
            results["supp"] += 1
        except Exception as e:
            print(f"   supp {i} download false：{url} ({e})")
            # Keep a record so failed files can be fetched manually later.
            with open("failed.txt", "a", encoding="utf-8") as f:
                f.write(f"{pmid}:supp{i}:{url}\n")

    return results


In [None]:
# Batch driver: download every article listed in the input CSV and write a
# per-article summary of what was found.
csv_path = "missing_rows.csv"
out_dir = "pmc_papers"
sleep = 0.1  # pause between articles, in seconds

session = requests.Session()
session.headers.update({
    "User-Agent": "PMCDownloader/1.0"
})

report = []

# Read the identifier columns as text: a numeric column containing missing
# values is otherwise parsed as float, which would yield names like "123.0.pdf".
df = pd.read_csv(csv_path, dtype={"PMCID": str, "PMID": str})
# df = df.head(3)
i = 0
for pmcid, pmid in tqdm(zip(df["PMCID"], df["PMID"]), total=len(df)):
    if pd.isna(pmcid):
        continue
    i += 1
    # To resume an interrupted run, skip the articles already processed:
    # if i<2191:
    #     continue
    try:
        result = download_from_pmcid(pmcid, pmid, out_dir, session)
    except Exception as e:
        # One failing article must not abort the batch; record it as empty.
        print(f"article failed {pmcid}: {e}")
        result = {"pmcid": pmcid, "pmid": pmid, "pdf": False, "supp": 0}
    report.append(result)

    # Checkpoint the summary every 10 articles.
    if i % 10 == 0:
        temp = pd.DataFrame(report)
        temp.to_csv("missing_row_pmcid_results_temp.csv", index=False)
    time.sleep(sleep)

session.close()

report = pd.DataFrame(report)
report.to_csv("missing_row_pmcid_results.csv", index=False)