In [None]:
"""
Function: download_sciencedirect_supplementaries

Description:
    Uses the ScienceDirect/Elsevier pattern to fetch supplementary materials (mmc files)
    by first obtaining the PII from a DOI via the ScienceDirect API, then constructing a
    compact download URL for each attachment.

    Base URL template (compact form):
        https://ars-els-cdn-com.ezproxy.med.cornell.edu/content/image/1-s2.0-{pii}-mmc{i}.{ext}

    Given a cleaned PII, the function iterates attachment indices (i = 1, 2, 3, ...)
    and tries multiple candidate extensions (e.g., pdf, docx, xlsx, jpg, png, zip, etc.).
    If a URL responds successfully, it is treated as the (i)-th supplementary file and is
    downloaded. The loop continues with i+1. When no extension matches for the next i,
    the function concludes that all supplements have been downloaded.

Inputs:
    pii : str
        Cleaned Elsevier PII (e.g., "S0167739X24012345"). Must not include spaces or prefixes.
    out_dir : str | Path
        Directory where supplementary files will be saved. Files are named:
        {PMID}_supp_{i}.{ext} or {PII}_supp_{i}.{ext}, depending on your naming strategy.
    base_url_template : str
        URL format string containing {pii}, {i}, and {ext} placeholders.
        Defaults to:
        "https://ars-els-cdn-com.ezproxy.med.cornell.edu/content/image/1-s2.0-{pii}-mmc{i}.{ext}"

Process:
    1) Starting at i = 1, build candidate URLs by substituting {pii}, {i}, and each {ext}.
    2) Issue a GET for each candidate URL until one succeeds (HTTP 200).
    3) On first success:
         - Save the response content as the i-th supplementary file and break the ext loop.
         - Increment i and repeat step 1.
    4) If no extension succeeds for a given i (e.g., 404/403 for all), stop the iteration.

Outputs:
    - Downloaded supplementary files saved under out_dir.
    - Optional log of successes/failures, depending on your implementation.

Auth & Access Notes:
    - Some articles require institutional access. On the very first run, you will have ~1 minute
      to complete your university library login (via the ezproxy flow). Use a persistent
      requests.Session and keep it alive so subsequent downloads remain authenticated.


"""


In [None]:
# science direct
import requests, re, time, pandas as pd

import requests, re, pandas as pd

def get_pii_from_doi(doi):
    """Look up the Elsevier PII for a DOI via the Elsevier article API.

    Parameters
    ----------
    doi : str
        The article DOI. Surrounding whitespace is ignored.

    Returns
    -------
    str or None
        The text of the first ``<pii>`` element in the XML response, exactly
        as returned by the API. ``None`` when ``doi`` is not a non-blank
        string, when the request fails, when the status is not 200, or when
        the response contains no ``<pii>`` element.
    """
    # Non-string values (e.g. NaN from an empty CSV cell) and blank strings
    # cannot identify an article, so skip the network round-trip entirely.
    if not isinstance(doi, str) or not doi.strip():
        return None
    doi = doi.strip()

    url = f"https://api.elsevier.com/content/article/doi/{doi}"
    try:
        r = requests.get(url, headers={"Accept": "application/xml"}, timeout=15)
    except requests.RequestException as exc:
        # One unreachable lookup should not abort a whole batch of lookups;
        # report it and signal "no PII", like any other failed lookup.
        print(f"  PII lookup failed for {doi}: {exc}")
        return None

    if r.status_code != 200:
        return None
    match = re.search(r"<pii>(.*?)</pii>", r.text)
    return match.group(1) if match else None

# Load the literature list with every column read as text.
df = pd.read_csv("last_literature list - pubmed.csv", dtype=str)

import os
pdf_dir = "sci-hub"
# PMIDs for which a PDF is already present in pdf_dir (file name without the extension).
existing_pmids = {os.path.splitext(f)[0] for f in os.listdir(pdf_dir) if f.lower().endswith(".pdf")}
# Keep only those articles. The explicit .copy() makes the filtered frame
# independent, so the column assignments that follow do not write into a
# slice of the original frame (which triggers SettingWithCopyWarning).
df = df[df["PMID"].astype(str).isin(existing_pmids)].copy()

# One API lookup per row; rows whose lookup fails get None.
df["PII"] = df["DOI"].apply(get_pii_from_doi)
df["PII"]

In [None]:
# Reduce every PII to bare uppercase letters and digits, dropping the
# punctuation of the formatted form.
# NOTE(review): a missing PII presumably ends up as "" here, because
# str(NaN) is the lowercase "nan" and the pattern strips it entirely -- confirm.
_NON_PII_CHARS = re.compile(r'[^0-9A-Z]')


def _clean_pii(value):
    """Return ``str(value)`` with everything except 0-9 and A-Z removed."""
    return _NON_PII_CHARS.sub('', str(value))


df["PII_clean"] = df["PII"].str.upper().map(_clean_pii)
df["PII_clean"]

In [None]:
import os, time, shutil
from pathlib import Path
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# URL pattern of one supplementary attachment; {pii}, {i} and {ext} are
# filled in per candidate with str.format.
BASE = "https://ars-els-cdn-com.ezproxy.med.cornell.edu/content/image/1-s2.0-{pii}-mmc{i}.{ext}"

# Candidate extensions, grouped by kind. They are tried in exactly this order.
_IMAGE_EXTS = ["jpg", "jpeg", "png", "tif", "tiff", "bmp", "gif", "svg"]
_DOCUMENT_EXTS = [
    "pdf", "docx", "doc", "xlsx", "xls", "pptx", "ppt",
    "txt", "csv", "tex", "xml", "json",
]
_ARCHIVE_EXTS = ["zip", "tar", "gz", "rar", "7z", "tar.gz"]
_MEDIA_EXTS = ["mp4", "avi", "mov", "mpg", "mpeg", "wav", "mp3"]
EXTS = _IMAGE_EXTS + _DOCUMENT_EXTS + _ARCHIVE_EXTS + _MEDIA_EXTS

# Highest attachment index probed per article.
MAX_I = 30
# Number of misses after which the loop moves on to the next article.
MISS_STOP = 3

# Browser download directory and final output directory; both are created
# in the current working directory if they do not exist yet.
DOWNLOAD_DIR = Path("downloads_tmp")
OUT_DIR = Path("supp_outputs")
for _folder in (DOWNLOAD_DIR, OUT_DIR):
    _folder.mkdir(exist_ok=True)

def init_driver():
    """Create a Chrome WebDriver that saves downloads into DOWNLOAD_DIR.

    Returns
    -------
    selenium.webdriver.Chrome
        A driver configured to download without prompting into the resolved
        ``DOWNLOAD_DIR`` and to download PDFs instead of displaying them.
    """
    opts = Options()
    # Headless mode ("--headless=new") is deliberately not enabled.
    # NOTE(review): presumably so the ezproxy login can be completed in the
    # visible browser window -- confirm before switching it on.
    for flag in ("--disable-gpu", "--no-sandbox", "--log-level=3"):
        opts.add_argument(flag)

    prefs = {
        "download.default_directory": str(DOWNLOAD_DIR.resolve()),
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
        # By default Chrome renders PDFs in its built-in viewer, so opening a
        # .pdf URL would never create a file in the download directory.
        "plugins.always_open_pdf_externally": True,
    }
    opts.add_experimental_option("prefs", prefs)
    return webdriver.Chrome(options=opts)

def snapshot(p: Path):
    """Return the set of paths currently found directly inside directory ``p``."""
    return {entry for entry in p.glob("*")}

def wait_new_file(p: Path, before, timeout=25, poll=0.5, partial_suffixes=(".crdownload",)):
    """Wait for a finished new file to appear in directory ``p``.

    Parameters
    ----------
    p : Path
        Directory to watch.
    before : iterable of Path
        Entries that were already present (e.g. the result of ``snapshot(p)``
        taken before the download was triggered); these are ignored.
    timeout : float
        Maximum number of seconds to wait. The directory is always inspected
        at least once, even when ``timeout`` is zero or negative.
    poll : float
        Seconds to sleep between two inspections.
    partial_suffixes : iterable of str
        File-name endings that mark a download still in progress; such files
        are never returned.

    Returns
    -------
    Path or None
        The most recently modified new file, or ``None`` if none showed up
        before the timeout expired.
    """
    known = set(before)
    partial = tuple(partial_suffixes)
    # A monotonic clock keeps the deadline immune to system clock adjustments.
    deadline = time.monotonic() + timeout

    while True:
        newest, newest_mtime = None, None
        for f in set(p.glob("*")) - known:
            if f.name.endswith(partial):
                continue
            try:
                mtime = f.stat().st_mtime
            except OSError:
                # The entry vanished between listing and stat (e.g. a
                # temporary file that was renamed); ignore it on this pass.
                continue
            if newest is None or mtime > newest_mtime:
                newest, newest_mtime = f, mtime
        if newest is not None:
            return newest

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return None
        time.sleep(min(poll, remaining))

# Seconds to wait for a download to show up after opening a candidate URL.
PROBE_TIMEOUT = 2
# Seconds to wait for a download that has visibly started (a .crdownload
# file appeared) to finish.
FINISH_TIMEOUT = 300


def _fetch_supplement(driver, pmid, pii, i):
    """Try every extension in EXTS for attachment ``i`` of one article.

    Opens each candidate URL in ``driver``. The first download that lands in
    DOWNLOAD_DIR is moved to ``OUT_DIR/{pmid}_supp_{i}.{ext}``.

    Returns the saved Path, or None when no extension produced a file.
    """
    for ext in EXTS:
        url = BASE.format(pii=pii, i=i, ext=ext)
        print(f"  - try: {url}")

        before = snapshot(DOWNLOAD_DIR)
        try:
            driver.get(url)
        except Exception as e:
            print(f"  false opening: {e}")
            continue

        f = wait_new_file(DOWNLOAD_DIR, before, timeout=PROBE_TIMEOUT)
        if f is None:
            # A large file may still be downloading after the short probe.
            # Wait for it here; otherwise it would complete during a later
            # attempt and be attributed to the wrong index/extension.
            in_progress = any(
                entry.name.endswith(".crdownload")
                for entry in snapshot(DOWNLOAD_DIR) - before
            )
            if in_progress:
                f = wait_new_file(DOWNLOAD_DIR, before, timeout=FINISH_TIMEOUT)
        if f is None:
            continue

        # Name the file after the extension that matched (Path.suffix would
        # reduce "tar.gz" to ".gz") and move it out of the temp directory.
        target = OUT_DIR / f"{pmid}_supp_{i}.{ext}"
        shutil.move(str(f), str(target))
        print(f"  + saved: {target.name}")
        return target
    return None


driver = init_driver()
try:
    for _, row in df.iterrows():
        pmid = str(row["PMID"]).strip()
        pii = str(row["PII_clean"]).strip()
        # Rows without a PMID or without a usable PII cannot be downloaded.
        if not pmid or not pii:
            continue

        print(f"\n=== PMID {pmid} | PII {pii} ===")
        miss = 0

        for i in range(1, MAX_I + 1):
            # Skip attachments saved by an earlier run. An existing file is
            # a hit, so it also resets the miss counter.
            if any(OUT_DIR.glob(f"{pmid}_supp_{i}.*")):
                print(f"  - i={i}: already existed")
                miss = 0
                continue

            if _fetch_supplement(driver, pmid, pii, i) is not None:
                miss = 0
                continue

            miss += 1
            print(f"  - miss {miss}")
            if miss >= MISS_STOP:
                print("  -> all supple downloaded, next")
                break
finally:
    # Close the browser even when the loop is interrupted or raises.
    driver.quit()
