In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Combined Sci-Hub + OA/PMC PDF Fetcher (v6, standalone)
------------------------------------------------------
Reads PMIDs (and optional DOIs) from an Excel file,
downloads PDFs via Sci-Hub in parallel, then for any
remaining failures performs a parallel Open Access
fallback (Unpaywall + PubMed Central). Improved logging:
  • Reports which PMIDs lacked DOIs.
  • Times each phase (DOI fetch, Sci-Hub, OA fallback).
  • Summarizes successes/failures per phase.
  • Removes byte‐count logs; instead logs per‐article result.
Maintains fast performance with 4 threads and minimal delays.
"""

import os, time, re, random, logging, requests, xml.etree.ElementTree as ET
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, quote_plus
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from concurrent.futures import ThreadPoolExecutor, as_completed

# === CONFIGURATION ===
EXCEL_FILE_PATH     = r"C:\Users\Galaxy\Downloads\screening_ERAS.xlsx"
OUTPUT_PDF_DIR      = "downloaded_pdfs_v6"
OA_DOWNLOAD_DIR     = "oa_downloads_v6"

SCI_HUB_DOMAINS         = [
    "https://sci-hub.se","https://sci-hub.ru","https://sci-hub.ren",
    "https://sci-hub.wf","https://sci-hub.ee","https://sci-hub.st",
]
REQUEST_DELAY_SCIHUB    = 1.5
MAX_WORKERS_SCIHUB      = 4
PREFERRED_DOMAINS_COUNT = 2
FALLBACK_DOMAINS_COUNT  = 2

NCBI_API_KEY        = "YOUR_API_KEY_HERE"
CROSSREF_MAILTO     = "your_email@example.com"
EFETCH_BATCH_SIZE   = 100
REQUEST_DELAY_NCBI  = 0.35

MY_EMAIL_FOR_APIS   = "levi4328@gmail.com"

# === LOGGING ===
logger = logging.getLogger("PDFFetcherV6")
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
ch.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - [%(threadName)s] - %(message)s'))
logger.addHandler(ch)

# === HTTP SESSIONS ===
def make_session(user_agent):
    s = requests.Session()
    retries = Retry(total=3, backoff_factor=0.5,
                    status_forcelist=[429,500,502,503,504],
                    allowed_methods=frozenset(['GET','POST']))
    adapter = HTTPAdapter(max_retries=retries, pool_connections=15, pool_maxsize=30)
    s.mount("https://", adapter)
    s.mount("http://", adapter)
    s.headers.update({'User-Agent': user_agent})
    return s

session_ncbi   = make_session(f"PDFFetcher/v6 (mailto:{CROSSREF_MAILTO})")
session_scihub = make_session("Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
session_http   = make_session(f"OAFetcher/v1.0 (mailto:{MY_EMAIL_FOR_APIS})")

# === UTILITIES ===
def sanitize_filename(name: str) -> str:
    s = re.sub(r'[\\/*?:"<>|]',"", name)
    s = s.replace(" ", "_").replace("__","_")
    return s[:200]

def _get_ncbi_params(extra=None):
    params = {"tool":"combined_pdf_fetcher_v6","email":CROSSREF_MAILTO}
    if NCBI_API_KEY!="YOUR_API_KEY_HERE":
        params["api_key"] = NCBI_API_KEY
    if CROSSREF_MAILTO=="your_email@example.com":
        params.pop("email",None)
    if extra: params.update(extra)
    return params

# === DOI FETCHING ===
def fetch_dois_for_pmids(pmids):
    pmid2doi = {pm: None for pm in pmids}
    valid = [pm for pm in pmids if pm]
    logger.info(f"Fetching DOIs for {len(valid)} PMIDs...")
    for i in range(0, len(valid), EFETCH_BATCH_SIZE):
        batch = valid[i:i+EFETCH_BATCH_SIZE]
        data = _get_ncbi_params({"db":"pubmed","retmode":"xml","id":",".join(batch)})
        try:
            r = session_ncbi.post(
                "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
                data=data, timeout=60
            ); r.raise_for_status()
            root = ET.fromstring(r.content)
            for art in root.findall(".//PubmedArticle"):
                pmid_elem = art.find(".//MedlineCitation/PMID")
                pm = pmid_elem.text if pmid_elem is not None else None
                if not pm: continue
                doi_elem = art.find(".//ArticleId[@IdType='doi']")
                if doi_elem is None:
                    doi_elem = art.find(".//ELocationID[@EIdType='doi'][@ValidYN='Y']")
                if doi_elem is not None and doi_elem.text:
                    pmid2doi[pm] = doi_elem.text.strip()
        except Exception as e:
            logger.warning(f"DOI batch error: {e}")
        time.sleep(REQUEST_DELAY_NCBI)
    found = sum(1 for v in pmid2doi.values() if v)
    missing = [pm for pm,d in pmid2doi.items() if not d]
    logger.info(f"DOI fetch: {found}/{len(valid)} found; missing DOIs for PMIDs: {missing}")
    return pmid2doi

# === Sci-Hub HELPERS ===
def test_scihub_domain(domain, test_doi="10.1000/182"):
    try:
        r = session_scihub.get(f"{domain.rstrip('/')}/{test_doi}", timeout=7)
        if r.status_code in (200,404) and 'html' in r.headers.get('Content-Type','').lower():
            return True
    except:
        pass
    return False

def init_scihub_domains():
    logger.info("Testing Sci-Hub domains...")
    working = []
    for d in random.sample(SCI_HUB_DOMAINS, len(SCI_HUB_DOMAINS)):
        if test_scihub_domain(d):
            working.append(d)
            if len(working) >= PREFERRED_DOMAINS_COUNT:
                break
    # backfill if needed
    if len(working) < PREFERRED_DOMAINS_COUNT:
        for d in SCI_HUB_DOMAINS:
            if d not in working:
                working.append(d)
                if len(working) >= PREFERRED_DOMAINS_COUNT:
                    break
    if not working:
        logger.error("No working Sci-Hub domains found!")
    else:
        logger.info(f"Sci-Hub domains: {working}")
    return working

def find_pdf_in_scihub(html, base_url):
    soup = BeautifulSoup(html, 'html.parser')
    base = f"{urlparse(base_url).scheme}://{urlparse(base_url).netloc}"
    # iframe#pdf, iframe#article
    for id_ in ('pdf','article'):
        ifr = soup.find('iframe', id=id_)
        if ifr and ifr.get('src'):
            return urljoin(base, ifr['src'])
    # any iframe with .pdf or downloads or tree
    for ifr in soup.find_all('iframe'):
        src = ifr.get('src','')
        if any(k in src.lower() for k in ('.pdf','downloads/','tree/')):
            return urljoin(base, src)
    # embed[type=application/pdf]
    emb = soup.find('embed', type='application/pdf')
    if emb and emb.get('src'):
        return urljoin(base, emb['src'])
    # selectors
    sels = ['button#download','a#download','div#download',
            'button[onclick*="location.href"]','a[onclick*="location.href"]',
            'a[href*=".pdf"]','a[href*="downloads/"]','a[href*="/tree/"]']
    for sel in sels:
        el = soup.select_one(sel)
        if el:
            href = el.get('href')
            if not href and el.get('onclick'):
                m = re.search(r"location\.href=['\"]([^'\"]+)", el['onclick'])
                href = m.group(1) if m else None
            if href:
                return urljoin(base, href)
    # text-pattern fallback
    for tag in soup.find_all(['a','button']):
        txt = (tag.text or "").lower()
        if any(w in txt for w in ('download','save','pdf','get article')):
            href = tag.get('href')
            if not href and tag.get('onclick'):
                m = re.search(r"location\.href=['\"]([^'\"]+)", tag['onclick'])
                href = m.group(1) if m else None
            if href:
                return urljoin(base, href)
    return None

def download_scihub(identifier, pmid, author, year, out_dir, working):
    fname = sanitize_filename(f"{year}_{author}_{pmid}") + ".pdf"
    path = os.path.join(out_dir, fname)
    if os.path.exists(path):
        return True, None
    domains = working[:] + random.sample(
        [d for d in SCI_HUB_DOMAINS if d not in working],
        k=min(FALLBACK_DOMAINS_COUNT, len(SCI_HUB_DOMAINS)-len(working))
    )
    for dom in domains:
        try:
            time.sleep(REQUEST_DELAY_SCIHUB)
            r = session_scihub.get(f"{dom.rstrip('/')}/{identifier}", timeout=20)
            r.raise_for_status()
            ct = r.headers.get('Content-Type','').lower()
            if 'application/pdf' in ct:
                with open(path,'wb') as f: f.write(r.content)
                return True, dom
            if 'html' in ct:
                link = find_pdf_in_scihub(r.content, r.url)
                if link:
                    pr = session_scihub.get(link, timeout=45, stream=True); pr.raise_for_status()
                    if 'application/pdf' in pr.headers.get('Content-Type','').lower():
                        with open(path,'wb') as f:
                            for c in pr.iter_content(8192):
                                f.write(c)
                        return True, dom
        except:
            continue
    return False, None

# === OA / PMC FALLBACK ===
def download_url(pdf_url, filepath, pmid, source):
    try:
        r = session_http.get(pdf_url, timeout=(10,60), stream=True, allow_redirects=True)
        r.raise_for_status()
        if 'application/pdf' not in r.headers.get('Content-Type','').lower() and not r.url.lower().endswith('.pdf'):
            logger.warning(f"PMID {pmid}: {source} link not PDF")
            return False
        with open(filepath,'wb') as f:
            for chunk in r.iter_content(65536):
                f.write(chunk)
        return True
    except Exception as e:
        logger.warning(f"PMID {pmid}: {source} download error: {e}")
        return False

def unpaywall_url(doi):
    try:
        api = f"https://api.unpaywall.org/v2/{quote_plus(doi)}?email={MY_EMAIL_FOR_APIS}"
        r = session_http.get(api, timeout=20)
        if r.status_code!=200: return None
        j = r.json()
        if j.get("is_oa"):
            loc = j.get("best_oa_location") or {}
            if loc.get("url_for_pdf"):
                return loc["url_for_pdf"]
            for o in j.get("oa_locations",[]):
                if o.get("url_for_pdf"):
                    return o["url_for_pdf"]
    except:
        pass
    return None

def pmc_id(pmid):
    params = _get_ncbi_params({"dbfrom":"pubmed","db":"pmc","id":pmid,"cmd":"neighbor_score"})
    for linkname in ("pubmed_pmc_refs","pubmed_pmc"):
        params["linkname"] = linkname
        try:
            r = session_ncbi.post("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi",
                                  data=params, timeout=15); r.raise_for_status()
            root = ET.fromstring(r.content)
            ids = root.findall(".//Link/Id")
            if ids:
                text = ids[0].text.strip()
                return text if text.upper().startswith("PMC") else "PMC"+text
        except:
            continue
    return None

def pmc_download(pmcid, filepath, pmid):
    try:
        url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/pdf/"
        r = session_http.get(url, timeout=(10,30), allow_redirects=True); r.raise_for_status()
        if r.url.lower().endswith('.pdf'):
            return download_url(r.url, filepath, pmid, f"PMC {pmcid}")
        if 'html' in r.headers.get('Content-Type',''):
            soup = BeautifulSoup(r.content,'html.parser')
            sel = soup.select_one('a[href$=".pdf"]')
            if sel and sel.get('href'):
                link = urljoin(r.url, sel['href'])
                return download_url(link, filepath, pmid, f"PMC {pmcid}")
    except Exception:
        pass
    return False

def process_oa(article):
    pmid, doi, title = article["pmid"], article.get("doi"), article.get("title_for_file", article["pmid"])
    fname = sanitize_filename(title) + ".pdf"
    path = os.path.join(OA_DOWNLOAD_DIR, fname)
    if os.path.exists(path):
        return pmid, "skipped"
    if doi:
        url = unpaywall_url(doi)
        if url and download_url(url, path, pmid, "Unpaywall"):
            return pmid, "Unpaywall"
    pmcid = pmc_id(pmid)
    if pmcid and pmc_download(pmcid, path, pmid):
        return pmid, "PMC"
    return pmid, "failed"

# === MAIN ===
def main():
    t0 = time.time()
    logger.info("=== Starting Combined PDF Fetcher V6 ===")

    # read
    try:
        df = pd.read_excel(EXCEL_FILE_PATH)
    except Exception as e:
        logger.error(f"Cannot read Excel: {e}")
        return

    pmids = [str(x).split('.')[0] for x in df['PMID'].dropna().unique()]

    # DOI fetch
    t1 = time.time()
    pmid2doi = fetch_dois_for_pmids(pmids)
    t2 = time.time()

    # Sci-Hub phase
    os.makedirs(OUTPUT_PDF_DIR, exist_ok=True)
    working = init_scihub_domains()
    jobs = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS_SCIHUB) as exec:
        for _, row in df.iterrows():
            pmid = str(row.get('PMID','')).split('.')[0]
            if not pmid: continue
            doi = pmid2doi.get(pmid)
            ident = doi if doi else pmid
            author = str(row.get('First Author','')).strip() or "Unknown"
            year   = str(row.get('Year','')).strip() or "Unknown"
            jobs.append(exec.submit(download_scihub, ident, pmid, author, year, OUTPUT_PDF_DIR, working))
        sci_results = [f.result() for f in as_completed(jobs)]
    succeeded = [res for res,dom in sci_results if res]
    failed_pmids = [str(row.get('PMID','')).split('.')[0] for (res,dom), (_,row) in zip(sci_results, df.iterrows()) if not res]
    t3 = time.time()
    logger.info(f"Sci-Hub phase: {len(succeeded)}/{len(pmids)} succeeded in {t3-t2:.2f}s; failures: {failed_pmids}")

    # OA fallback phase (parallel)
    os.makedirs(OA_DOWNLOAD_DIR, exist_ok=True)
    oa_jobs = [{"pmid": pm, "doi": pmid2doi.get(pm), "title_for_file": pm} for pm in failed_pmids]
    with ThreadPoolExecutor(max_workers=MAX_WORKERS_SCIHUB) as exec:
        futs = {exec.submit(process_oa, art): art["pmid"] for art in oa_jobs}
        oa_results = [f.result() for f in as_completed(futs)]
    oa_summary = {}
    for pm, src in oa_results:
        oa_summary[src] = oa_summary.get(src, 0) + 1
    t4 = time.time()
    logger.info(f"OA fallback: completed {len(oa_results)} in {t4-t3:.2f}s; breakdown: {oa_summary}")

    logger.info(f"Total runtime: {time.time()-t0:.2f}s")
    logger.info("=== Fetcher V6 Completed ===")

if __name__ == "__main__":
    main()


2025-05-21 07:21:27,271 - INFO - [MainThread] - === Starting Combined PDF Fetcher V6 ===
2025-05-21 07:21:27,500 - INFO - [MainThread] - Fetching DOIs for 32 PMIDs...
2025-05-21 07:21:28,761 - INFO - [MainThread] - DOI fetch: 31/32 found; missing DOIs for PMIDs: ['26888001']
2025-05-21 07:21:28,762 - INFO - [MainThread] - Testing Sci-Hub domains...
2025-05-21 07:21:31,439 - INFO - [MainThread] - Sci-Hub domains: ['https://sci-hub.ren', 'https://sci-hub.ru']
2025-05-21 07:22:22,402 - INFO - [MainThread] - Sci-Hub phase: 14/32 succeeded in 53.64s; failures: ['39955421', '40340819', '39068053', '39384309', '32145713', '30518491', '33401363', '39185540', '37802689', '37062759', '35790215', '34059337', '27083963', '26888001', '39520824', '39489669', '39083294', '31859070']
2025-05-21 07:22:47,266 - INFO - [MainThread] - OA fallback: completed 18 in 24.86s; breakdown: {'PMC': 10, 'Unpaywall': 4, 'failed': 4}
2025-05-21 07:22:47,266 - INFO - [MainThread] - Total runtime: 80.00s
2025-05-21 07:

In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
PDF Fetcher v9  (single-file, no external imports beyond pip-installables)
=========================================================================
• Reads PMIDs from an Excel file
• Gets DOI + Title + Year + First-author from PubMed (one EFetch)
• Sci-Hub download first  →  OA fallback (Unpaywall → PMC)
• One output folder, one filename pattern:
      [Year]-[PMID]-[FirstAuthor]-[Title].pdf
• Every download validated (%PDF- header) ; bad files deleted & URL logged
• Logs ALL URLs hit (Sci-Hub HTML + direct, Unpaywall API, PMC landing & PDF)

Dependencies: pandas, requests, beautifulsoup4, openpyxl, urllib3
"""

# ---------------------------- USER SETTINGS -----------------------------------
EXCEL_FILE_PATH      = r"C:\Users\Galaxy\Downloads\screening_ERAS.xlsx"
OUTPUT_PDF_DIR       = "downloaded_pdfs"

NCBI_API_KEY         = "YOUR_API_KEY_HERE"        # optional
CROSSREF_MAILTO      = "your_email@example.com"   # put a real e-mail
MY_EMAIL_FOR_APIS    = "levi4328@gmail.com"       # Unpaywall UA

SCI_HUB_DOMAINS      = [
    "https://sci-hub.ren","https://sci-hub.se","https://sci-hub.ru",
    "https://sci-hub.ee","https://sci-hub.wf","https://sci-hub.st"
]
PREFERRED            = 2     # probe until this many mirrors respond
FALLBACK             = 2     # extra mirrors to shuffle in per article
DELAY_SCIHUB         = 0.8   # s between attempts per article
MAX_THREADS          = 6
# ------------------------------------------------------------------------------

import os, time, re, random, logging, requests, xml.etree.ElementTree as ET
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, quote_plus
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from concurrent.futures import ThreadPoolExecutor, as_completed

# ------------------------------- LOGGING --------------------------------------
log = logging.getLogger("PDF_FETCHER_v9")
log.setLevel(logging.INFO)
_hdl = logging.StreamHandler()
_hdl.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - [%(threadName)s] - %(message)s"))
log.addHandler(_hdl)

# --------------------------- HTTP SESSIONS ------------------------------------
def sess(user_agent: str, retries:int=3) -> requests.Session:
    s = requests.Session()
    r = Retry(total=retries, backoff_factor=0.5,
              status_forcelist=[429,500,502,503,504],
              allowed_methods=frozenset(["GET","POST"]))
    ad = HTTPAdapter(max_retries=r, pool_connections=20, pool_maxsize=40)
    s.mount("https://", ad);  s.mount("http://", ad)
    s.headers.update({"User-Agent": user_agent})
    return s

s_ncbi   = sess(f"PDFFetcher/9 (mailto:{CROSSREF_MAILTO})")
s_scihub = sess("Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
s_http   = sess(f"OAFetcher/9 (mailto:{MY_EMAIL_FOR_APIS})", retries=2)

# ------------------------------ HELPERS ---------------------------------------
def sanitize(txt:str)->str:
    txt = re.sub(r'[\\/*?:"<>|]', '', txt or '')
    txt = re.sub(r'\s+', '_', txt).strip('_')
    return txt[:160]

def ncbi_params(extra=None):
    p={"tool":"pdf_fetcher_v9","email":CROSSREF_MAILTO}
    if NCBI_API_KEY and NCBI_API_KEY!="YOUR_API_KEY_HERE": p["api_key"]=NCBI_API_KEY
    if CROSSREF_MAILTO=="your_email@example.com": p.pop("email",None)
    if extra: p.update(extra)
    return p

def valid_pdf(path:str)->bool:
    try:
        with open(path,'rb') as f:
            hdr=f.read(4096)
        return b'%PDF-' in hdr[:8]
    except: return False

# ----------------------  PUBMED METADATA / DOI --------------------------------
def fetch_meta(pmids:list[str])->dict[str,dict]:
    meta = {pm:{'doi':None,'title':pm,'author':'Unknown','year':'Unknown'} for pm in pmids}
    chunk = 150
    log.info(f"EFetch metadata for {len(pmids)} PMIDs …")
    for i in range(0,len(pmids),chunk):
        ids = pmids[i:i+chunk]
        try:
            r=s_ncbi.post("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
                          data=ncbi_params({"db":"pubmed","retmode":"xml","id":",".join(ids)}),
                          timeout=60); r.raise_for_status()
            root=ET.fromstring(r.content)
            for art in root.findall(".//PubmedArticle"):
                pm = art.findtext(".//PMID");  meta.setdefault(pm,{})
                meta[pm].update({
                    'title' : art.findtext(".//ArticleTitle") or pm,
                    'year'  : art.findtext(".//PubDate/Year") or art.findtext(".//ArticleDate/Year") or "Unknown",
                    'author': art.findtext(".//AuthorList/Author[1]/LastName") or "Unknown",
                    'doi'   : (art.findtext(".//ArticleId[@IdType='doi']") or
                               art.findtext(".//ELocationID[@EIdType='doi'][@ValidYN='Y']"))
                })
        except Exception as e: log.warning(f"EFetch chunk error: {e}")
        time.sleep(0.25 if NCBI_API_KEY else 0.35)
    missing=[pm for pm,d in meta.items() if not d.get('doi')]
    if missing: log.info(f"Missing DOI: {missing}")
    return meta

# ------------------------------  SCI-HUB --------------------------------------
def probe(domain:str)->bool:
    try: return requests.head(f"{domain.rstrip('/')}/10.1000/xyz",timeout=5).status_code in (200,404)
    except: return False

def working_domains()->list[str]:
    log.info("Probing Sci-Hub mirrors …")
    ok=[d for d in random.sample(SCI_HUB_DOMAINS,len(SCI_HUB_DOMAINS)) if probe(d)][:PREFERRED]
    rest=[d for d in SCI_HUB_DOMAINS if d not in ok]
    ok+=random.sample(rest,k=min(FALLBACK,len(rest)))
    log.info(f"Using mirrors: {ok}")
    return ok

def pdf_link(html:bytes,base:str)->str|None:
    soup=BeautifulSoup(html,'html.parser')
    base=f"{urlparse(base).scheme}://{urlparse(base).netloc}"
    for sel in ['iframe#pdf','iframe#article','embed[type="application/pdf"]']:
        tag=soup.select_one(sel);  src=tag['src'] if tag and tag.get('src') else None
        if src: return urljoin(base,src)
    for tag in soup.find_all(['iframe','a']):
        src=tag.get('src') or tag.get('href','')
        if any(k in src.lower() for k in ('.pdf','downloads','tree')):
            return urljoin(base,src)
    return None

def sci_worker(article:dict, mirrors:list[str])->bool:
    pm, doi = article['pmid'], article['doi']
    name=f"{article['year']}-{pm}-{sanitize(article['author'])}-{sanitize(article['title'])}.pdf"
    path=os.path.join(OUTPUT_PDF_DIR,name)
    if os.path.exists(path) and valid_pdf(path):
        log.info(f"Sci-Hub ✓ {pm} (cached)")
        return True
    for mirror in mirrors:
        for ident in (doi, pm) if doi else (pm,):
            try:
                time.sleep(DELAY_SCIHUB)
                url=f"{mirror.rstrip('/')}/{ident}"
                r=s_scihub.get(url,timeout=15); r.raise_for_status()
                if 'application/pdf' in r.headers.get('Content-Type',''):
                    with open(path,'wb') as f: f.write(r.content)
                else:
                    link=pdf_link(r.content,r.url)
                    if not link: continue
                    r2=s_scihub.get(link,timeout=40,stream=True); r2.raise_for_status()
                    with open(path,'wb') as f:
                        for c in r2.iter_content(65536): f.write(c)
                if valid_pdf(path):
                    log.info(f"Sci-Hub ✓ {pm} via {mirror}")
                    return True
                os.remove(path)
            except Exception: pass
        # after first mirror failure switch ident order to PMID only
    log.warning(f"Sci-Hub ✗ {pm}")
    return False

# ------------------------  OPEN-ACCESS: Unpaywall + PMC -----------------------
def log_get(tag:str,pmid:str,url:str): log.info(f"{tag} -> PMID {pmid} : GET {url}")

def save_stream(resp, path):
    with open(path,'wb') as f:
        for chunk in resp.iter_content(65536):
            f.write(chunk)

def unpaywall_pdf(doi:str)->str|None:
    api=f"https://api.unpaywall.org/v2/{quote_plus(doi)}?email={MY_EMAIL_FOR_APIS}"
    log_get("OA-API",doi,api)
    try:
        r=s_http.get(api,timeout=20); r.raise_for_status()
        j=r.json()
        if not j.get("is_oa"): return None
        loc=j.get("best_oa_location") or {}
        if loc.get("url_for_pdf"): return loc["url_for_pdf"]
        for o in j.get("oa_locations",[]):
            if o.get("url_for_pdf"): return o["url_for_pdf"]
    except Exception as e: log.debug(f"Unpaywall API error {e}")
    return None

def pmcid(pmid:str)->str|None:
    for link in ("pubmed_pmc_refs","pubmed_pmc"):
        try:
            r=s_ncbi.post("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi",
                          data=ncbi_params({"dbfrom":"pubmed","db":"pmc","id":pmid,
                                            "cmd":"neighbor_score","linkname":link}),
                          timeout=15); r.raise_for_status()
            node=ET.fromstring(r.content).find(".//Link/Id")
            if node is not None:
                txt=node.text.strip()
                return txt if txt.upper().startswith("PMC") else "PMC"+txt
        except Exception: pass
    return None

def download_url(tag:str,pmid:str,url:str,path:str)->bool:
    log_get(tag,pmid,url)
    try:
        r=s_http.get(url,timeout=(10,60),stream=True,allow_redirects=True)
        r.raise_for_status(); save_stream(r,path)
        if valid_pdf(path):
            log.info(f"{tag} ✓ {pmid}")
            return True
    except Exception as e:
        log.warning(f"{tag} ✗ {pmid}: {e}")
    if os.path.exists(path): os.remove(path)
    return False

def pmc_pdf(pmcid:str,pmid:str,path:str)->bool:
    landing=f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/pdf/"
    log_get("OA-PMC",pmid,landing)
    try:
        r=s_http.get(landing,timeout=(10,30),allow_redirects=True); r.raise_for_status()
        if r.url.lower().endswith(".pdf"):
            return download_url("PMC",pmid,r.url,path)
        soup=BeautifulSoup(r.content,'html.parser')
        a=soup.select_one('a[href$=".pdf"]')
        if a and a.get('href'):
            link=urljoin(r.url,a['href'])
            return download_url("PMC",pmid,link,path)
    except Exception as e: log.warning(f"PMC ✗ {pmid}: {e}")
    return False

def oa_worker(article:dict)->bool:
    pm=article['pmid']
    name=f"{article['year']}-{pm}-{sanitize(article['author'])}-{sanitize(article['title'])}.pdf"
    path=os.path.join(OUTPUT_PDF_DIR,name)
    if os.path.exists(path) and valid_pdf(path): return True
    if article['doi']:
        pdf=unpaywall_pdf(article['doi'])
        if pdf and download_url("Unpaywall",pm,pdf,path): return True
    pid=pmcid(pm)
    if pid and pmc_pdf(pid,pm,path): return True
    log.warning(f"OA ✗ {pm}")
    return False

# ----------------------------------- MAIN -------------------------------------
def main():
    t0=time.time(); log.info("=== PDF Fetcher v9 started ===")
    df=pd.read_excel(EXCEL_FILE_PATH)
    pmids=[str(x).split('.')[0] for x in df['PMID'].dropna() if str(x).strip()]
    if not pmids: log.error("No PMIDs in sheet."); return
    meta=fetch_meta(pmids)
    articles=[{'pmid':pm,**meta[pm]} for pm in pmids]

    os.makedirs(OUTPUT_PDF_DIR,exist_ok=True)
    mirrors=working_domains()

    # ----- Sci-Hub -----
    sci_start=time.time()
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as pool:
        futs={pool.submit(sci_worker,a,mirrors):a['pmid'] for a in articles}
        sci_ok=[pm for f,pm in futs.items() if f.result()]
    sci_fail=[pm for pm in pmids if pm not in sci_ok]
    log.info(f"Sci-Hub → {len(sci_ok)}/{len(pmids)} OK  (failed: {sci_fail})  [{time.time()-sci_start:.1f}s]")

    # ----- OA fallback -----
    oa_start=time.time()
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as pool:
        futs={pool.submit(oa_worker,{**meta[pm],'pmid':pm}):pm for pm in sci_fail}
        oa_ok=[pm for f,pm in futs.items() if f.result()]
    oa_fail=[pm for pm in sci_fail if pm not in oa_ok]
    log.info(f"OA  → {len(oa_ok)}/{len(sci_fail)} recovered  (still missing: {oa_fail})  [{time.time()-oa_start:.1f}s]")

    log.info(f"TOTAL: {(len(sci_ok)+len(oa_ok))}/{len(pmids)} PDFs  "
             f"[{time.time()-t0:.1f}s total]")
    log.info("=== PDF Fetcher v9 finished ===")

if __name__=="__main__":
    main()


2025-05-21 07:26:29,430 - INFO - [MainThread] - === PDF Fetcher v9 started ===
2025-05-21 07:26:29,449 - INFO - [MainThread] - EFetch metadata for 32 PMIDs …
2025-05-21 07:26:30,644 - INFO - [MainThread] - Missing DOI: ['26888001']
2025-05-21 07:26:30,645 - INFO - [MainThread] - Probing Sci-Hub mirrors …
2025-05-21 07:26:33,331 - INFO - [MainThread] - Using mirrors: ['https://sci-hub.ru', 'https://sci-hub.se', 'https://sci-hub.wf', 'https://sci-hub.st']
2025-05-21 07:26:43,454 - INFO - [ThreadPoolExecutor-4_3] - Sci-Hub ✓ 32145713 (cached)
2025-05-21 07:26:43,455 - INFO - [ThreadPoolExecutor-4_3] - Sci-Hub ✓ 30518491 (cached)
2025-05-21 07:26:44,709 - INFO - [ThreadPoolExecutor-4_2] - Sci-Hub ✓ 33888360 (cached)
2025-05-21 07:26:53,093 - INFO - [ThreadPoolExecutor-4_3] - Sci-Hub ✓ 34089071 (cached)
2025-05-21 07:26:53,095 - INFO - [ThreadPoolExecutor-4_3] - Sci-Hub ✓ 33210165 (cached)
2025-05-21 07:26:53,667 - INFO - [ThreadPoolExecutor-4_4] - Sci-Hub ✓ 32696123 (cached)
2025-05-21 07:

In [4]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
PDF Fetcher v10  —  single‐file, no intermediate imports, robust logging, always fresh fetches.
Changes from v9:
  • Removed file‐exists “cache” checks — every download re-runs.
  • Added log.info for every URL hit: Sci-Hub HTML, Sci-Hub PDF, Unpaywall API, PMC landing & PDF.
  • Explicitly log when a PMCID is found or not.
  • Tweaked Sci-Hub HTML → PDF selector to match both iframe#pdf and common <a href> patterns.
  • Ensured OA‐PMC parser tries all <a href$=".pdf"> links.
"""

import os, time, re, random, logging, requests, xml.etree.ElementTree as ET
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, quote_plus
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from concurrent.futures import ThreadPoolExecutor, as_completed

# ---------------------------- SETTINGS ---------------------------------------
EXCEL_FILE_PATH   = r"C:\Users\Galaxy\Downloads\screening_ERAS.xlsx"
OUTPUT_PDF_DIR    = "downloaded_pdfs"
NCBI_API_KEY      = "YOUR_API_KEY_HERE"
CROSSREF_MAILTO   = "your_email@example.com"
MY_EMAIL_FOR_APIS = "levi4328@gmail.com"

SCI_HUB_DOMAINS = [
    "https://sci-hub.ren","https://sci-hub.se","https://sci-hub.ru",
    "https://sci-hub.ee","https://sci-hub.wf","https://sci-hub.st"
]
PREFERRED       = 2
FALLBACK        = 2
DELAY_SCIHUB    = 0.5
MAX_THREADS     = 10

# ----------------------------- LOGGING ---------------------------------------
log = logging.getLogger("PDF_FETCHER_v10")
log.setLevel(logging.INFO)
h = logging.StreamHandler()
h.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - [%(threadName)s] - %(message)s"))
log.addHandler(h)

# ------------------------- HTTP SESSION FACTORY ------------------------------
def session(user_agent, retries=3):
    s = requests.Session()
    r = Retry(total=retries, backoff_factor=0.5,
              status_forcelist=[429,500,502,503,504],
              allowed_methods=frozenset(["GET","POST"]))
    a = HTTPAdapter(max_retries=r, pool_connections=20, pool_maxsize=40)
    s.mount("https://", a)
    s.mount("http://", a)
    s.headers.update({"User-Agent": user_agent})
    return s

s_ncbi   = session(f"PDFFetcher/10 (mailto:{CROSSREF_MAILTO})")
s_scihub = session("Mozilla/5.0 (Sci-Hub)")
s_http   = session(f"OAFetcher/10 (mailto:{MY_EMAIL_FOR_APIS})", retries=2)

# ----------------------------- UTILITIES -------------------------------------
def sanitize(text):
    t = re.sub(r'[\\/*?:"<>|]', '', text or "")
    t = re.sub(r'\s+', '_', t).strip('_')
    return t[:160]

def ncbi_params(extra=None):
    p = {"tool":"pdf_fetcher_v10", "email":CROSSREF_MAILTO}
    if NCBI_API_KEY and NCBI_API_KEY!="YOUR_API_KEY_HERE":
        p["api_key"] = NCBI_API_KEY
    if CROSSREF_MAILTO=="your_email@example.com":
        p.pop("email",None)
    if extra: p.update(extra)
    return p

def valid_pdf(path):
    try:
        with open(path, 'rb') as f:
            hdr = f.read(4096)
        return b'%PDF-' in hdr
    except:
        return False

# --------------------- PUBMED METADATA + DOI FETCH --------------------------
def fetch_meta(pmids):
    meta = {pm:{'doi':None,'title':pm,'author':'Unknown','year':'Unknown'} for pm in pmids}
    log.info(f"EFetch metadata for {len(pmids)} PMIDs …")
    for i in range(0, len(pmids), 150):
        batch = pmids[i:i+150]
        try:
            r = s_ncbi.post(
                "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
                data=ncbi_params({"db":"pubmed","retmode":"xml","id":",".join(batch)}),
                timeout=60
            )
            r.raise_for_status()
            root = ET.fromstring(r.content)
            for art in root.findall(".//PubmedArticle"):
                pmid = art.findtext(".//PMID")
                d = meta.get(pmid, {})
                d['title']  = art.findtext(".//ArticleTitle") or pmid
                d['year']   = (art.findtext(".//PubDate/Year")
                               or art.findtext(".//ArticleDate/Year")
                               or "Unknown")
                d['author'] = art.findtext(".//AuthorList/Author[1]/LastName") or "Unknown"
                doi = art.findtext(".//ArticleId[@IdType='doi']") or \
                      art.findtext(".//ELocationID[@EIdType='doi'][@ValidYN='Y']")
                d['doi'] = doi
                meta[pmid] = d
        except Exception as e:
            log.warning(f"EFetch batch error: {e}")
        time.sleep(0.25 if NCBI_API_KEY else 0.35)

    missing = [pm for pm,d in meta.items() if not d['doi']]
    if missing:
        log.info(f"Missing DOI for: {missing}")
    return meta

# ----------------------------- SCI-HUB LOGIC ---------------------------------
def probe(domain):
    try:
        r = s_scihub.head(f"{domain.rstrip('/')}/10.1000/xyz", timeout=5)
        return r.status_code in (200,404)
    except:
        return False

def pick_domains():
    log.info("Probing Sci-Hub mirrors …")
    ok = [d for d in random.sample(SCI_HUB_DOMAINS,len(SCI_HUB_DOMAINS)) if probe(d)][:PREFERRED]
    rest = [d for d in SCI_HUB_DOMAINS if d not in ok]
    ok += random.sample(rest, k=min(FALLBACK,len(rest)))
    log.info(f"Using mirrors: {ok}")
    return ok

def find_pdf_link(html, base_url):
    soup = BeautifulSoup(html, 'html.parser')
    base = f"{urlparse(base_url).scheme}://{urlparse(base_url).netloc}"
    # try iframe#pdf / iframe#article / embed
    for sel in ('iframe#pdf','iframe#article','embed[type="application/pdf"]'):
        tag = soup.select_one(sel)
        if tag and tag.get('src'):
            return urljoin(base, tag['src'])
    # fallback to any <a> or <iframe> with .pdf
    for tag in soup.find_all(['a','iframe']):
        href = tag.get('href') or tag.get('src') or ""
        if '.pdf' in href.lower():
            return urljoin(base, href)
    return None

def sci_worker(article, mirrors):
    pmid, doi = article['pmid'], article['doi']
    fn = f"{article['year']}-{pmid}-{sanitize(article['author'])}-{sanitize(article['title'])}.pdf"
    path = os.path.join(OUTPUT_PDF_DIR, fn)

    for mirror in mirrors:
        for ident in ((doi,pmid) if doi else (pmid,)):
            url_html = f"{mirror.rstrip('/')}/{ident}"
            log.info(f"Sci-Hub-HTML -> PMID {pmid}: GET {url_html}")
            try:
                time.sleep(DELAY_SCIHUB)
                r = s_scihub.get(url_html, timeout=15); r.raise_for_status()
                # direct PDF?
                if 'application/pdf' in r.headers.get('Content-Type','').lower():
                    log.info(f"Sci-Hub-PDF -> PMID {pmid}: GET {url_html}")
                    open(path,'wb').write(r.content)
                else:
                    link = find_pdf_link(r.content, r.url)
                    if not link:
                        continue
                    log.info(f"Sci-Hub-PDF -> PMID {pmid}: GET {link}")
                    r2 = s_scihub.get(link, timeout=40, stream=True); r2.raise_for_status()
                    with open(path,'wb') as f:
                        for chunk in r2.iter_content(65536):
                            f.write(chunk)
                if valid_pdf(path):
                    log.info(f"Sci-Hub ✓ {pmid} via {mirror}")
                    return True
                else:
                    os.remove(path)
            except Exception as e:
                log.debug(f"Sci-Hub error for {pmid}@{mirror}: {e}")
        # next mirror
    log.warning(f"Sci-Hub ✗ {pmid}")
    return False

# --------------------------- OPEN-ACCESS LOGIC -------------------------------
def log_get(tag, pmid, url):
    log.info(f"{tag} -> PMID {pmid}: GET {url}")

def save_stream(resp, path):
    with open(path,'wb') as f:
        for chunk in resp.iter_content(65536):
            f.write(chunk)

def unpaywall_pdf(doi, pmid):
    api = f"https://api.unpaywall.org/v2/{quote_plus(doi)}?email={MY_EMAIL_FOR_APIS}"
    log_get("OA-API", pmid, api)
    try:
        r = s_http.get(api, timeout=20); r.raise_for_status()
        j = r.json()
        if not j.get("is_oa"): return None
        best = j.get("best_oa_location") or {}
        if best.get("url_for_pdf"): return best["url_for_pdf"]
        for loc in j.get("oa_locations",[]):
            if loc.get("url_for_pdf"):
                return loc["url_for_pdf"]
    except Exception as e:
        log.debug(f"Unpaywall API error {pmid}: {e}")
    return None

def find_pmcid(pmid):
    for linkname in ("pubmed_pmc_refs","pubmed_pmc"):
        try:
            r = s_ncbi.post("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi",
                            data=ncbi_params({
                                "dbfrom":"pubmed","db":"pmc","id":pmid,
                                "cmd":"neighbor_score","linkname":linkname
                            }), timeout=15)
            r.raise_for_status()
            node = ET.fromstring(r.content).find(".//Link/Id")
            if node is not None:
                pmc = node.text.strip()
                pmcid = pmc if pmc.upper().startswith("PMC") else "PMC"+pmc
                log.info(f"Found PMCID {pmcid} for PMID {pmid}")
                return pmcid
        except Exception as e:
            log.debug(f"PMC elink error for {pmid}@{linkname}: {e}")
    log.info(f"No PMCID for PMID {pmid}")
    return None

def download_url(tag, pmid, url, path):
    log_get(tag, pmid, url)
    try:
        resp = s_http.get(url, timeout=(10,60), stream=True, allow_redirects=True)
        resp.raise_for_status()
        save_stream(resp, path)
        if valid_pdf(path):
            log.info(f"{tag} ✓ {pmid}")
            return True
    except Exception as e:
        log.warning(f"{tag} ✗ {pmid}: {e}")
    if os.path.exists(path): os.remove(path)
    return False

def pmc_worker(pmid, path):
    pmcid = find_pmcid(pmid)
    if not pmcid: return False
    landing = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/pdf/"
    log_get("OA-PMC", pmid, landing)
    try:
        r = s_http.get(landing, timeout=(10,30), allow_redirects=True)
        r.raise_for_status()
        # Redirected to direct PDF?
        if r.url.lower().endswith(".pdf"):
            return download_url("PMC", pmid, r.url, path)
        soup = BeautifulSoup(r.content, 'html.parser')
        a = soup.select_one('a[href$=".pdf"]')
        if a and a.get('href'):
            link = urljoin(r.url, a['href'])
            return download_url("PMC", pmid, link, path)
    except Exception as e:
        log.warning(f"PMC landing ✗ {pmid}: {e}")
    return False

def oa_worker(article):
    pmid, doi = article['pmid'], article['doi']
    fn = f"{article['year']}-{pmid}-{sanitize(article['author'])}-{sanitize(article['title'])}.pdf"
    path = os.path.join(OUTPUT_PDF_DIR, fn)

    if doi:
        pdf_url = unpaywall_pdf(doi, pmid)
        if pdf_url and download_url("Unpaywall", pmid, pdf_url, path):
            return True

    # then PMC fallback
    return pmc_worker(pmid, path)

# ---------------------------------- MAIN --------------------------------------
def main():
    t0 = time.time()
    log.info("=== PDF Fetcher v10 started ===")

    # read PMIDs
    df = pd.read_excel(EXCEL_FILE_PATH)
    pmids = [str(x).split('.')[0] for x in df['PMID'].dropna() if str(x).strip()]
    if not pmids:
        log.error("No PMIDs found in Excel.")
        return

    # fetch metadata
    meta = fetch_meta(pmids)
    articles = [{'pmid':pm, **meta[pm]} for pm in pmids]

    os.makedirs(OUTPUT_PDF_DIR, exist_ok=True)

    # Sci-Hub phase
    mirrors = pick_domains()
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as pool:
        sci_futs = {pool.submit(sci_worker,a,mirrors): a['pmid'] for a in articles}
        sci_ok  = [pm for fut,pm in sci_futs.items() if fut.result()]
    sci_fail = [pm for pm in pmids if pm not in sci_ok]
    log.info(f"Sci-Hub → {len(sci_ok)}/{len(pmids)} OK (failed: {sci_fail}) [{time.time()-t0:.1f}s]")

    # OA fallback
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as pool:
        oa_futs = {pool.submit(oa_worker, {'pmid':pm,**meta[pm]}): pm for pm in sci_fail}
        oa_ok   = [pm for fut,pm in oa_futs.items() if fut.result()]
    oa_fail = [pm for pm in sci_fail if pm not in oa_ok]
    log.info(f"OA       → {len(oa_ok)}/{len(sci_fail)} recovered (still missing: {oa_fail})")

    log.info(f"TOTAL    → {len(sci_ok)+len(oa_ok)}/{len(pmids)} PDFs  [total {(time.time()-t0):.1f}s]")
    log.info("=== PDF Fetcher v10 finished ===")

if __name__ == "__main__":
    main()


2025-05-21 07:33:08,856 - INFO - [MainThread] - === PDF Fetcher v10 started ===
2025-05-21 07:33:08,872 - INFO - [MainThread] - EFetch metadata for 32 PMIDs …
2025-05-21 07:33:10,050 - INFO - [MainThread] - Missing DOI for: ['26888001']
2025-05-21 07:33:10,053 - INFO - [MainThread] - Probing Sci-Hub mirrors …
2025-05-21 07:33:14,873 - INFO - [MainThread] - Using mirrors: ['https://sci-hub.ren', 'https://sci-hub.se', 'https://sci-hub.ru', 'https://sci-hub.st']
2025-05-21 07:33:14,874 - INFO - [ThreadPoolExecutor-6_0] - Sci-Hub-HTML -> PMID 39955421: GET https://sci-hub.ren/10.1007/s00383-025-05977-0
2025-05-21 07:33:14,876 - INFO - [ThreadPoolExecutor-6_1] - Sci-Hub-HTML -> PMID 40340819: GET https://sci-hub.ren/10.1136/bmjpo-2024-003280
2025-05-21 07:33:14,877 - INFO - [ThreadPoolExecutor-6_2] - Sci-Hub-HTML -> PMID 39068053: GET https://sci-hub.ren/10.1016/j.jpedsurg.2024.06.021
2025-05-21 07:33:14,879 - INFO - [ThreadPoolExecutor-6_3] - Sci-Hub-HTML -> PMID 39384309: GET https://sci-

In [5]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Combined Sci-Hub + OA/PMC PDF Fetcher (v10, standalone)
-------------------------------------------------------
Reads PMIDs from an Excel file, fetches metadata (year, first author, title, DOI)
via NCBI EFetch, then downloads PDFs:
  1) In parallel via Sci-Hub (preferred).
  2) For any failures, in parallel via Open Access fallback:
       a) Unpaywall API
       b) PubMed Central (PMC) with HEAD→GET to follow to the real PDF URL.
  
Key features and fixes since v6:
  • Single OUTPUT_PDF_DIR for all PDFs (no separate OA folder).
  • Filenames standardized: {year}-{pmid}-{first_author}-{title}.pdf
  • Detailed logging of every URL fetched (Sci-Hub HTML, Sci-Hub PDF, Unpaywall API, PMC HEAD, PMC GET).
  • Proper PMC two-step redirect: HEAD to landing page → follow to actual PDF.
  • PubMed metadata (year, first author, title, DOI) fetched once up front.
  • Configurable delays and thread counts for performance tuning.
"""

import os
import re
import time
import logging
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, quote_plus
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from concurrent.futures import ThreadPoolExecutor, as_completed

# === CONFIGURATION ===
EXCEL_FILE_PATH     = r"C:\Users\Galaxy\Downloads\screening_ERAS.xlsx"
OUTPUT_PDF_DIR      = "downloaded_pdfs_v10"
SCI_HUB_DOMAINS     = [
    "https://sci-hub.se", "https://sci-hub.ru", "https://sci-hub.ren",
    "https://sci-hub.wf", "https://sci-hub.ee", "https://sci-hub.st"
]
DELAY_SCIHUB        = 0.5   # seconds between Sci-Hub requests
MAX_THREADS         = 10
NCBI_API_KEY        = "YOUR_API_KEY_HERE"
CROSSREF_EMAIL      = "your_email@example.com"
EFETCH_BATCH_SIZE   = 100
DELAY_NCBI          = 0.35  # seconds between NCBI batches
UNPAYWALL_EMAIL     = "levi4328@gmail.com"

# === LOGGING SETUP ===
logger = logging.getLogger("PDFFetcherV10")
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
ch.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - [%(threadName)s] - %(message)s'))
logger.addHandler(ch)

# === HTTP SESSIONS WITH RETRIES ===
def make_session(user_agent):
    s = requests.Session()
    retries = Retry(total=3, backoff_factor=0.5,
                    status_forcelist=[429,500,502,503,504],
                    allowed_methods=frozenset(['GET','POST']))
    adapter = HTTPAdapter(max_retries=retries, pool_connections=MAX_THREADS, pool_maxsize=MAX_THREADS)
    s.mount("https://", adapter)
    s.mount("http://", adapter)
    s.headers.update({'User-Agent': user_agent})
    return s

session_ncbi   = make_session(f"PDFFetcherV10 (mailto:{CROSSREF_EMAIL})")
session_scihub = make_session("Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
session_oa     = make_session(f"PDFFetcherV10-OA (mailto:{UNPAYWALL_EMAIL})")

# === UTILITIES ===
def sanitize_filename(s: str) -> str:
    s = re.sub(r'[\\/*?:"<>|]', "", s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s.replace(' ', '_')[:200]

def _get_ncbi_params(extra=None):
    params = {"tool":"PDFFetcherV10","email":CROSSREF_EMAIL}
    if NCBI_API_KEY and NCBI_API_KEY!="YOUR_API_KEY_HERE":
        params["api_key"] = NCBI_API_KEY
    if CROSSREF_EMAIL=="your_email@example.com":
        params.pop("email",None)
    if extra:
        params.update(extra)
    return params

# === STEP 1: FETCH METADATA FROM PUBMED ===
def fetch_metadata(pmids):
    """
    Returns dict pmid -> { 'doi','year','author','title' }
    """
    meta = {}
    batches = [pmids[i:i+EFETCH_BATCH_SIZE] for i in range(0, len(pmids), EFETCH_BATCH_SIZE)]
    logger.info(f"EFetch PubMed metadata for {len(pmids)} PMIDs in {len(batches)} batch(es)...")
    for batch in batches:
        data = _get_ncbi_params({"db":"pubmed","retmode":"xml","id":",".join(batch)})
        logger.info(f"NCBI EFetch POST -> IDs={','.join(batch)}")
        try:
            resp = session_ncbi.post("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
                                     data=data, timeout=60)
            resp.raise_for_status()
            root = ET.fromstring(resp.content)
            for art in root.findall(".//PubmedArticle"):
                pmid_el = art.find(".//PMID")
                pmid = pmid_el.text.strip()
                # DOI
                doi_el = art.find(".//ArticleId[@IdType='doi']") or \
                         art.find(".//ELocationID[@EIdType='doi'][@ValidYN='Y']")
                doi = doi_el.text.strip() if doi_el is not None else None
                # Year
                year_el = art.find(".//Journal/JournalIssue/PubDate/Year")
                year = year_el.text.strip() if year_el is not None else art.find(".//Article/Journal/JournalIssue/PubDate/MedlineDate").text[:4]
                # First author last name
                fa = art.find(".//AuthorList/Author")
                author = fa.find("LastName").text.strip() if fa is not None and fa.find("LastName") is not None else "Unknown"
                # Title
                title_el = art.find(".//ArticleTitle")
                title = "".join(title_el.itertext()).strip() if title_el is not None else pmid
                meta[pmid] = {'doi':doi, 'year':year, 'author':author, 'title':title}
        except Exception as e:
            logger.warning(f"NCBI EFetch batch error: {e}")
        time.sleep(DELAY_NCBI)
    missing = [pm for pm in pmids if pm not in meta]
    if missing:
        logger.warning(f"Metadata missing for PMIDs: {missing}")
    return meta

# === STEP 2: TEST & INITIALIZE SCI-HUB DOMAINS ===
def test_scihub_domain(domain):
    test_doi = "10.1000/182"
    url = f"{domain.rstrip('/')}/{test_doi}"
    logger.info(f"Sci-Hub TEST GET -> {url}")
    try:
        r = session_scihub.get(url, timeout=7)
        if r.status_code in (200,404) and 'html' in r.headers.get('Content-Type','').lower():
            return True
    except:
        pass
    return False

def init_scihub_domains():
    logger.info("Probing Sci-Hub mirrors for availability...")
    working = []
    for d in SCI_HUB_DOMAINS:
        if test_scihub_domain(d):
            working.append(d)
            if len(working) >= 2:
                break
    if not working:
        logger.error("No working Sci-Hub domains found!")
    else:
        logger.info(f"Using Sci-Hub domains: {working}")
    return working

# === STEP 3: DOWNLOAD VIA SCI-HUB ===
def find_scihub_pdf(html, base_url):
    soup = BeautifulSoup(html, 'html.parser')
    base = f"{urlparse(base_url).scheme}://{urlparse(base_url).netloc}"
    # try iframe#pdf / iframe#article
    for id_ in ('pdf','article'):
        ifr = soup.find('iframe', id=id_)
        if ifr and ifr.get('src'):
            return urljoin(base, ifr['src'])
    # fallback: any iframe with .pdf
    for ifr in soup.find_all('iframe'):
        src = ifr.get('src','')
        if '.pdf' in src:
            return urljoin(base, src)
    # embed
    emb = soup.find('embed', type='application/pdf')
    if emb and emb.get('src'):
        return urljoin(base, emb['src'])
    # selectors
    for sel in ['a[href$=".pdf"]','button[onclick*="location.href"]','a[onclick*="location.href"]']:
        el = soup.select_one(sel)
        if el:
            href = el.get('href')
            if not href and el.get('onclick'):
                m = re.search(r"location\.href=['\"]([^'\"]+)", el['onclick'])
                href = m.group(1) if m else None
            if href:
                return urljoin(base, href)
    return None

def download_scihub(identifier, pmid, md, domains):
    """
    identifier: DOI or PMID
    md: metadata dict for pmid
    domains: list of Sci-Hub domains to try
    """
    fname = sanitize_filename(f"{md['year']}-{pmid}-{md['author']}-{md['title']}") + ".pdf"
    path = os.path.join(OUTPUT_PDF_DIR, fname)
    if os.path.exists(path):
        logger.info(f"Sci-Hub ✓ {pmid}: already exists (cached)")
        return True
    for dom in domains:
        url_html = f"{dom.rstrip('/')}/{identifier}"
        logger.info(f"Sci-Hub HTML GET -> PMID {pmid}: {url_html}")
        try:
            r = session_scihub.get(url_html, timeout=20)
            r.raise_for_status()
            ctype = r.headers.get('Content-Type','').lower()
            if 'application/pdf' in ctype:
                logger.info(f"Sci-Hub PDF GET  -> PMID {pmid}: {url_html}")
                with open(path,'wb') as f: f.write(r.content)
                return True
            elif 'html' in ctype:
                pdf_url = find_scihub_pdf(r.content, r.url)
                if pdf_url:
                    logger.info(f"Sci-Hub PDF GET  -> PMID {pmid}: {pdf_url}")
                    pr = session_scihub.get(pdf_url, timeout=45, stream=True)
                    pr.raise_for_status()
                    if 'application/pdf' in pr.headers.get('Content-Type','').lower():
                        with open(path,'wb') as f:
                            for chunk in pr.iter_content(8192):
                                f.write(chunk)
                        return True
        except Exception as e:
            logger.warning(f"Sci-Hub ✗ {pmid} via {dom}: {e}")
        time.sleep(DELAY_SCIHUB)
    return False

# === STEP 4: OA / PMC FALLBACK ===
def unpaywall_pdf_url(doi):
    api = f"https://api.unpaywall.org/v2/{quote_plus(doi)}?email={UNPAYWALL_EMAIL}"
    logger.info(f"Unpaywall API GET -> DOI {doi}: {api}")
    try:
        r = session_oa.get(api, timeout=20)
        r.raise_for_status()
        j = r.json()
        if j.get("is_oa"):
            best = j.get("best_oa_location") or {}
            url = best.get("url_for_pdf") or \
                  next((loc.get("url_for_pdf") for loc in j.get("oa_locations",[]) if loc.get("url_for_pdf")), None)
            return url
    except Exception as e:
        logger.warning(f"Unpaywall ✗ DOI {doi}: {e}")
    return None

def pmc_id_for_pmid(pmid):
    params = _get_ncbi_params({"dbfrom":"pubmed","db":"pmc","id":pmid,"cmd":"neighbor_score"})
    for linkname in ("pubmed_pmc_refs","pubmed_pmc"):
        params["linkname"] = linkname
        try:
            r = session_ncbi.post("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi",
                                  data=params, timeout=15)
            r.raise_for_status()
            root = ET.fromstring(r.content)
            ids = [el.text.strip() for el in root.findall(".//Link/Id") if el.text]
            if ids:
                pmc = ids[0]
                return pmc if pmc.upper().startswith("PMC") else "PMC"+pmc
        except:
            pass
    return None

def pmc_download(pmcid, pmid, md):
    fname = sanitize_filename(f"{md['year']}-{pmid}-{md['author']}-{md['title']}") + ".pdf"
    path  = os.path.join(OUTPUT_PDF_DIR, fname)
    if os.path.exists(path):
        logger.info(f"PMC ✓ {pmid}: already exists")
        return True

    landing = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/pdf/"
    logger.info(f"PMC HEAD    -> PMID {pmid}: {landing}")
    try:
        head = session_oa.head(landing, timeout=(5,15), allow_redirects=True)
        head.raise_for_status()
    except Exception as e:
        logger.warning(f"PMC HEAD ✗ {pmid}: {e}")
        return False

    final_url = head.url
    logger.info(f"PMC PDF GET -> PMID {pmid}: {final_url}")
    try:
        r = session_oa.get(final_url, timeout=(10,60), stream=True)
        r.raise_for_status()
        ctype = r.headers.get('Content-Type','').lower()
        if 'application/pdf' not in ctype and not final_url.lower().endswith('.pdf'):
            logger.warning(f"PMC GET ✗ {pmid}: bad content-type {ctype}")
            return False
        with open(path, 'wb') as f:
            for chunk in r.iter_content(65536):
                f.write(chunk)
        return True
    except Exception as e:
        logger.warning(f"PMC GET ✗ {pmid}: {e}")
        return False

def oa_worker(pmid, md):
    """
    Try Unpaywall → PMC fallback for PMID, using metadata md.
    Returns (pmid, success_bool).
    """
    doi = md.get('doi')
    # 1) Unpaywall
    if doi:
        url = unpaywall_pdf_url(doi)
        if url:
            logger.info(f"Unpaywall PDF GET -> PMID {pmid}: {url}")
            try:
                r = session_oa.get(url, timeout=(10,60), stream=True)
                r.raise_for_status()
                if 'application/pdf' in r.headers.get('Content-Type','').lower() or url.lower().endswith('.pdf'):
                    fname = sanitize_filename(f"{md['year']}-{pmid}-{md['author']}-{md['title']}") + ".pdf"
                    path = os.path.join(OUTPUT_PDF_DIR, fname)
                    with open(path,'wb') as f:
                        for chunk in r.iter_content(65536):
                            f.write(chunk)
                    return pmid, True
            except Exception as e:
                logger.warning(f"Unpaywall GET ✗ {pmid}: {e}")
    # 2) PMC fallback
    pmcid = pmc_id_for_pmid(pmid)
    if pmcid:
        success = pmc_download(pmcid, pmid, md)
        return pmid, success
    logger.warning(f"OA ✗ {pmid}: no DOI or PMCID available")
    return pmid, False

# === MAIN ===
def main():
    t0 = time.time()
    logger.info("=== PDF Fetcher v10 started ===")

    # load PMIDs
    try:
        df = pd.read_excel(EXCEL_FILE_PATH)
    except Exception as e:
        logger.error(f"Cannot read Excel: {e}")
        return
    pmids = [str(x).split('.')[0] for x in df['PMID'].dropna().unique()]
    logger.info(f"Loaded {len(pmids)} unique PMIDs")

    # metadata fetch
    meta = fetch_metadata(pmids)

    # ensure output dir
    os.makedirs(OUTPUT_PDF_DIR, exist_ok=True)

    # Sci-Hub phase
    domains = init_scihub_domains()
    sci_futures = {}
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as exe:
        for pm in pmids:
            md = meta.get(pm)
            if not md:
                continue
            ident = md['doi'] if md['doi'] else pm
            sci_futures[exe.submit(download_scihub, ident, pm, md, domains)] = pm
        sci_success = []
        sci_failed = []
        for fut in as_completed(sci_futures):
            pm = sci_futures[fut]
            try:
                ok = fut.result()
                (sci_success if ok else sci_failed).append(pm)
            except Exception as e:
                logger.warning(f"Sci-Hub thread error for {pm}: {e}")
                sci_failed.append(pm)
    logger.info(f"Sci-Hub phase: {len(sci_success)}/{len(pmids)} succeeded; failures: {sci_failed}")

    # OA fallback phase
    oa_futures = {}
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as exe:
        for pm in sci_failed:
            md = meta.get(pm)
            if not md:
                continue
            oa_futures[exe.submit(oa_worker, pm, md)] = pm
        oa_success = []
        oa_failed  = []
        for fut in as_completed(oa_futures):
            pm = oa_futures[fut]
            try:
                pmid, ok = fut.result()
                (oa_success if ok else oa_failed).append(pmid)
            except Exception as e:
                logger.warning(f"OA thread error for {pm}: {e}")
                oa_failed.append(pm)
    logger.info(f"OA fallback: {len(oa_success)}/{len(sci_failed)} recovered; still missing: {oa_failed}")

    # summary
    total = len(sci_success) + len(oa_success)
    logger.info(f"TOTAL: {total}/{len(pmids)} PDFs fetched in {time.time()-t0:.2f}s")
    logger.info("=== PDF Fetcher v10 completed ===")

if __name__ == "__main__":
    main()


2025-05-21 08:22:32,727 - INFO - [MainThread] - === PDF Fetcher v10 started ===
2025-05-21 08:22:32,743 - INFO - [MainThread] - Loaded 32 unique PMIDs
2025-05-21 08:22:32,744 - INFO - [MainThread] - EFetch PubMed metadata for 32 PMIDs in 1 batch(es)...
2025-05-21 08:22:32,745 - INFO - [MainThread] - NCBI EFetch POST -> IDs=39955421,40340819,39068053,39384309,38673038,36788057,32145713,30518491,33401363,39185540,37802689,37062759,36969299,33888360,35790215,34089071,33210165,34059337,32696123,32385680,30686518,30922685,31274269,27679510,27810148,27083963,26888001,39520824,39489669,39083294,39342249,31859070
2025-05-21 08:22:34,037 - INFO - [MainThread] - Probing Sci-Hub mirrors for availability...
2025-05-21 08:22:34,037 - INFO - [MainThread] - Sci-Hub TEST GET -> https://sci-hub.se/10.1000/182
2025-05-21 08:22:34,668 - INFO - [MainThread] - Sci-Hub TEST GET -> https://sci-hub.ru/10.1000/182
2025-05-21 08:22:35,408 - INFO - [MainThread] - Using Sci-Hub domains: ['https://sci-hub.se', 'ht

In [6]:
import os
import hashlib
import logging
import requests

# === CONFIG ===
OUTPUT_DIR = "debug_pdf_fetch"
LOG_FILE   = "debug_fetch.log"
REQUEST_TIMEOUT = (5, 30)  # (connect, read) seconds
HEADERS = {"User-Agent": "PDFDebugHarness/1.0"}

# Ensure output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

# === LOGGING ===
logger = logging.getLogger("PDFDebug")
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler(LOG_FILE, mode="w")
fmt = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
fh.setFormatter(fmt)
logger.addHandler(fh)
ch = logging.StreamHandler()
ch.setFormatter(fmt)
logger.addHandler(ch)

# === TEST CASES ===
# Fill this list with tuples of (pmid, identifier, expected_good_url)
# identifier = DOI or PMID that your fetcher uses for Sci-Hub
# expected_good_url = a URL you know works (e.g. direct PMC link)
tests = [
    ("38673038", "10.3390/jpm14040411", "https://pmc.ncbi.nlm.nih.gov/articles/PMC11051180/pdf/jpm-14-00411.pdf"),
    ("31274269", "31274269",        "https://pmc.ncbi.nlm.nih.gov/articles/PMC9726462/pdf/kja-22279.pdf"),
    ("39384309", "10.1136/bmjpo-2024-002824", "https://pmc.ncbi.nlm.nih.gov/articles/PMC12060878/pdf/bmjpo-9-1.pdf"),
    ("39068053", "10.1016/j.jpedsurg.2024.06.021", "https://jpedasurg.org/article/S0022-3468(24)162046/pdf"),
    # add more...
]

session = requests.Session()
session.headers.update(HEADERS)

def sha256(data: bytes) -> str:
    return hashlib.sha256(data).hexdigest()

def inspect_url(pmid, url, label):
    """
    Fetch `url`, record status, headers, content length, first bytes, checksum.
    Save raw bytes to OUTPUT_DIR/<pmid>-<label>.bin
    """
    try:
        logger.info(f"[{pmid}] FETCH → {label}: {url}")
        r = session.get(url, timeout=REQUEST_TIMEOUT, allow_redirects=True)
        status = r.status_code
        ctype  = r.headers.get("Content-Type", "<none>")
        clen_h = r.headers.get("Content-Length", "<none>")
        data   = r.content
        clen_a = len(data)
        first_bytes = data[:256]
        checksum    = sha256(data)

        logger.info(f"[{pmid}] {label} STATUS  : {status}")
        logger.info(f"[{pmid}] {label} CT       : {ctype}")
        logger.info(f"[{pmid}] {label} CL(header): {clen_h}")
        logger.info(f"[{pmid}] {label} CL(actual): {clen_a}")
        logger.info(f"[{pmid}] {label} SHA256   : {checksum}")
        logger.info(f"[{pmid}] {label} First256 : {first_bytes.hex()}\n")

        # save raw bytes for manual inspection
        fname = f"{pmid}-{label}.bin"
        path  = os.path.join(OUTPUT_DIR, fname)
        with open(path, "wb") as fw:
            fw.write(data)

        return {
            "pmid": pmid,
            "label": label,
            "url": url,
            "status": status,
            "ctype": ctype,
            "clen_header": clen_h,
            "clen_actual": clen_a,
            "sha256": checksum,
            "first256_hex": first_bytes.hex(),
            "file": path
        }

    except Exception as e:
        logger.error(f"[{pmid}] {label} ERROR: {e}")
        return {
            "pmid": pmid,
            "label": label,
            "url": url,
            "error": str(e)
        }

def run_debug():
    all_results = []
    for pmid, ident, good_url in tests:
        # 1) Inspect “good” URL
        res_good = inspect_url(pmid, good_url, "GOOD")
        all_results.append(res_good)

        # 2) Inspect what your script would have fetched from PMC landing
        landing = f"https://pmc.ncbi.nlm.nih.gov/articles/PMC{res_good['url'].split('PMC')[-1]}/pdf/"
        res_landing = inspect_url(pmid, landing, "LANDING")
        all_results.append(res_landing)

        # 3) Inspect the DOI-based redirect (simulate Sci-Hub HTML fetch)
        doi_url = f"https://sci-hub.se/{quote_plus(ident)}"
        res_html = inspect_url(pmid, doi_url, "SCIHTML")
        all_results.append(res_html)

        # 4) Attempt to parse out direct PDF link from the HTML
        if res_html.get("status")==200 and "html" in res_html.get("ctype",""):
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(open(res_html["file"], "rb"), "html.parser")
            iframe = soup.find("iframe", {"id":"pdf"}) or soup.find("iframe")
            if iframe and iframe.get("src"):
                pdf_link = urljoin(doi_url, iframe["src"])
                res_pdf = inspect_url(pmid, pdf_link, "SCIPDF")
                all_results.append(res_pdf)

    return all_results

if __name__ == "__main__":
    results = run_debug()
    logger.info("=== DEBUG RUN COMPLETE ===")


2025-05-21 08:31:53,964 - INFO - [38673038] FETCH → GOOD: https://pmc.ncbi.nlm.nih.gov/articles/PMC11051180/pdf/jpm-14-00411.pdf
2025-05-21 08:31:54,241 - INFO - [38673038] GOOD STATUS  : 200
2025-05-21 08:31:54,242 - INFO - [38673038] GOOD CT       : text/html; charset=utf-8
2025-05-21 08:31:54,243 - INFO - [38673038] GOOD CL(header): 1285
2025-05-21 08:31:54,243 - INFO - [38673038] GOOD CL(actual): 1285
2025-05-21 08:31:54,244 - INFO - [38673038] GOOD SHA256   : f5ce7d2c71fedae9e6055566c5d20e665d6e6ec21ec140ac1d2806b92e6d1c63
2025-05-21 08:31:54,244 - INFO - [38673038] GOOD First256 : 0a0a0a0a3c68746d6c3e0a20203c686561643e0a202020203c6d657461206e616d653d2276696577706f72742220636f6e74656e743d2277696474683d6465766963652d77696474682c20696e697469616c2d7363616c653d312e30223e0a202020203c7469746c653e507265706172696e6720746f20646f776e6c6f6164202e2e2e3c2f7469746c653e0a202020203c7374796c6520747970653d22746578742f637373223e0a202020202020626f64797b666f6e742d73697a653a3172656d3b6c696e652d68656967

In [7]:
import requests
from bs4 import BeautifulSoup
import re

TEST_URL = "https://pmc.ncbi.nlm.nih.gov/articles/PMC11051180/pdf/jpm-14-00411.pdf"

def inspect_pmc_pdf(url):
    print(f"→ Fetching: {url}")
    r = requests.get(url, allow_redirects=True, timeout=30)
    print("Status :", r.status_code)
    print("CT     :", r.headers.get("Content-Type"))
    print("Body[0:200]:", r.content[:200])
    if "text/html" in r.headers.get("Content-Type", ""):
        print("\n→ Detected HTML wrapper, parsing for PDF link…")
        soup = BeautifulSoup(r.content, "html.parser")
        # Look for <embed src="…"> or <iframe src="…pdf">
        candidates = []
        for tag in soup.find_all(["embed","iframe"]):
            src = tag.get("src") or tag.get("data")
            if src and src.lower().endswith(".pdf"):
                candidates.append(src)
        # Fallback: search any href=".pdf"
        for a in soup.find_all("a", href=True):
            if a["href"].lower().endswith(".pdf"):
                candidates.append(a["href"])
        if not candidates:
            print("! No obvious PDF link found in HTML.")
            return
        # Normalize and try each
        for link in set(candidates):
            link = requests.compat.urljoin(r.url, link)
            print(f"\n→ Trying real PDF URL: {link}")
            r2 = requests.get(link, timeout=30)
            print("  Status:", r2.status_code)
            print("  CT    :", r2.headers.get("Content-Type"))
            print("  Size  :", len(r2.content))
            if "application/pdf" in r2.headers.get("Content-Type", ""):
                print("  ✅ Found real PDF!")
                return
        print("❌ All candidates failed to deliver a PDF.")
    else:
        print("✅ It was already a PDF!")

if __name__ == "__main__":
    inspect_pmc_pdf(TEST_URL)


→ Fetching: https://pmc.ncbi.nlm.nih.gov/articles/PMC11051180/pdf/jpm-14-00411.pdf
Status : 403
CT     : text/html; charset=UTF-8
Body[0:200]: b'<!doctype html><meta charset="utf-8"><meta name=viewport content="width=device-width, initial-scale=1"><title>403</title>403 Forbidden'

→ Detected HTML wrapper, parsing for PDF link…
! No obvious PDF link found in HTML.


In [8]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Combined OA-first + Sci-Hub PDF Fetcher (v11, standalone)
-------------------------------------------------------
1) Reads PMIDs from an Excel file.
2) Fetches metadata (year, first author, title, DOI) via NCBI EFetch.
3) Tries Open Access first (Unpaywall → PMC with proper Accept headers).
4) Any remaining PMIDs are tried via Sci-Hub in parallel.
5) All PDFs land in one OUTPUT_PDF_DIR named `{year}-{pmid}-{author}-{title}.pdf`.
6) Detailed logging of every URL fetched and why it failed or succeeded.
"""

import os
import re
import time
import logging
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, quote_plus
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from concurrent.futures import ThreadPoolExecutor, as_completed

# === CONFIGURATION ===
EXCEL_FILE_PATH     = r"C:\Users\Galaxy\Downloads\screening_ERAS.xlsx"
OUTPUT_PDF_DIR      = "downloaded_pdfs_v11"

# Sci-Hub
SCI_HUB_DOMAINS     = [
    "https://sci-hub.se", "https://sci-hub.ru", "https://sci-hub.ren",
    "https://sci-hub.wf", "https://sci-hub.ee", "https://sci-hub.st"
]
DELAY_SCIHUB        = 0.5   # seconds between Sci-Hub tries

# Threads / batching
MAX_THREADS         = 10
EFETCH_BATCH_SIZE   = 100
DELAY_NCBI          = 0.35  # seconds between NCBI batches

# API credentials
NCBI_API_KEY        = "YOUR_API_KEY_HERE"
CROSSREF_EMAIL      = "your_email@example.com"
UNPAYWALL_EMAIL     = "levi4328@gmail.com"

# === LOGGING SETUP ===
logger = logging.getLogger("PDFFetcherV11")
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
ch.setFormatter(logging.Formatter(
    '%(asctime)s - %(levelname)s - [%(threadName)s] - %(message)s'
))
logger.addHandler(ch)

# === HTTP SESSIONS WITH RETRIES ===
def make_session(user_agent):
    s = requests.Session()
    retries = Retry(
        total=3,
        backoff_factor=0.5,
        status_forcelist=[429,500,502,503,504],
        allowed_methods=frozenset(['GET','POST','HEAD'])
    )
    adapter = HTTPAdapter(
        max_retries=retries,
        pool_connections=MAX_THREADS,
        pool_maxsize=MAX_THREADS
    )
    s.mount("https://", adapter)
    s.mount("http://", adapter)
    s.headers.update({'User-Agent': user_agent})
    return s

session_ncbi   = make_session(f"PDFFetcherV11 (mailto:{CROSSREF_EMAIL})")
session_oa     = make_session(f"PDFFetcherV11-OA (mailto:{UNPAYWALL_EMAIL})")
session_scihub = make_session("Mozilla/5.0 (Windows NT 10.0; Win64; x64)")

# === UTILITIES ===
def sanitize_filename(s: str) -> str:
    s = re.sub(r'[\\/*?:"<>|]', "", s)
    s = re.sub(r'\s+', " ", s).strip()
    return s.replace(" ", "_")[:200]

def _get_ncbi_params(extra=None):
    params = {"tool":"PDFFetcherV11","email":CROSSREF_EMAIL}
    if NCBI_API_KEY and NCBI_API_KEY!="YOUR_API_KEY_HERE":
        params["api_key"] = NCBI_API_KEY
    if CROSSREF_EMAIL=="your_email@example.com":
        params.pop("email",None)
    if extra:
        params.update(extra)
    return params

# === STEP 1: FETCH METADATA FROM PUBMED ===
def fetch_metadata(pmids):
    meta = {}
    batches = [pmids[i:i+EFETCH_BATCH_SIZE] for i in range(0, len(pmids), EFETCH_BATCH_SIZE)]
    logger.info(f"EFetch PubMed metadata for {len(pmids)} PMIDs in {len(batches)} batch(es)...")
    for batch in batches:
        data = _get_ncbi_params({"db":"pubmed","retmode":"xml","id":",".join(batch)})
        logger.info(f"NCBI EFetch POST → IDs={','.join(batch)}")
        try:
            resp = session_ncbi.post(
                "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
                data=data,
                timeout=60
            )
            resp.raise_for_status()
            root = ET.fromstring(resp.content)
            for art in root.findall(".//PubmedArticle"):
                pmid_el = art.find(".//PMID")
                pmid = pmid_el.text.strip()
                doi_el = art.find(".//ArticleId[@IdType='doi']") or \
                         art.find(".//ELocationID[@EIdType='doi'][@ValidYN='Y']")
                doi = doi_el.text.strip() if doi_el is not None else None
                year_el = art.find(".//Journal/JournalIssue/PubDate/Year")
                if year_el is not None and year_el.text:
                    year = year_el.text.strip()
                else:
                    med = art.find(".//Article/Journal/JournalIssue/PubDate/MedlineDate")
                    year = med.text[:4] if med is not None else "Unknown"
                fa = art.find(".//AuthorList/Author")
                author = fa.find("LastName").text.strip() if fa is not None and fa.find("LastName") is not None else "Unknown"
                title_el = art.find(".//ArticleTitle")
                title = "".join(title_el.itertext()).strip() if title_el is not None else pmid
                meta[pmid] = {'doi':doi, 'year':year, 'author':author, 'title':title}
        except Exception as e:
            logger.warning(f"NCBI EFetch batch error: {e}")
        time.sleep(DELAY_NCBI)
    missing = [pm for pm in pmids if pm not in meta]
    if missing:
        logger.warning(f"Metadata missing for PMIDs: {missing}")
    return meta

# === STEP 2: OPEN ACCESS → Unpaywall then PMC (with proper Accept) ===
def unpaywall_pdf_url(doi):
    api = f"https://api.unpaywall.org/v2/{quote_plus(doi)}?email={UNPAYWALL_EMAIL}"
    logger.info(f"Unpaywall API GET → DOI {doi}: {api}")
    try:
        r = session_oa.get(api, timeout=20)
        r.raise_for_status()
        j = r.json()
        if j.get("is_oa"):
            best = j.get("best_oa_location") or {}
            url = best.get("url_for_pdf") or next(
                (loc.get("url_for_pdf") for loc in j.get("oa_locations",[]) if loc.get("url_for_pdf")),
                None
            )
            return url
    except Exception as e:
        logger.warning(f"Unpaywall ✗ DOI {doi}: {e}")
    return None

def pmc_id_for_pmid(pmid):
    params = _get_ncbi_params({"dbfrom":"pubmed","db":"pmc","id":pmid,"cmd":"neighbor_score"})
    for linkname in ("pubmed_pmc_refs","pubmed_pmc"):
        params["linkname"] = linkname
        try:
            r = session_ncbi.post(
                "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi",
                data=params,
                timeout=15
            )
            r.raise_for_status()
            root = ET.fromstring(r.content)
            ids = [el.text.strip() for el in root.findall(".//Link/Id") if el.text]
            if ids:
                pmc = ids[0]
                return pmc if pmc.upper().startswith("PMC") else "PMC"+pmc
        except:
            continue
    return None

def pmc_download(pmcid, pmid, md):
    fname = sanitize_filename(f"{md['year']}-{pmid}-{md['author']}-{md['title']}") + ".pdf"
    path  = os.path.join(OUTPUT_PDF_DIR, fname)
    if os.path.exists(path):
        logger.info(f"PMC ✓ {pmid}: already exists")
        return True

    landing = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/pdf/"
    # 1) HEAD with Accept: application/pdf so we get redirected to the real .pdf
    logger.info(f"PMC HEAD → PMID {pmid}: {landing}")
    try:
        head = session_oa.head(
            landing,
            headers={"Accept":"application/pdf"},
            timeout=(5,15),
            allow_redirects=True
        )
        head.raise_for_status()
    except Exception as e:
        logger.warning(f"PMC HEAD ✗ {pmid}: {e}")
        return False

    final_url = head.url
    logger.info(f"PMC GET → PMID {pmid}: {final_url}")
    try:
        r = session_oa.get(
            final_url,
            headers={"Accept":"application/pdf"},
            timeout=(10,60),
            stream=True
        )
        r.raise_for_status()
        ctype = r.headers.get('Content-Type','').lower()
        if 'application/pdf' not in ctype and not final_url.lower().endswith('.pdf'):
            logger.warning(f"PMC GET ✗ {pmid}: bad content-type {ctype}")
            return False
        with open(path, 'wb') as f:
            for chunk in r.iter_content(65536):
                f.write(chunk)
        return True
    except Exception as e:
        logger.warning(f"PMC GET ✗ {pmid}: {e}")
        return False

def oa_worker(pmid, md):
    doi = md.get('doi')
    # 1) Unpaywall
    if doi:
        url = unpaywall_pdf_url(doi)
        if url:
            logger.info(f"Unpaywall GET → PMID {pmid}: {url}")
            try:
                r = session_oa.get(url, timeout=(10,60), stream=True)
                r.raise_for_status()
                ctype = r.headers.get('Content-Type','').lower()
                if 'application/pdf' in ctype or url.lower().endswith('.pdf'):
                    fname = sanitize_filename(f"{md['year']}-{pmid}-{md['author']}-{md['title']}") + ".pdf"
                    path  = os.path.join(OUTPUT_PDF_DIR, fname)
                    with open(path,'wb') as f:
                        for chunk in r.iter_content(65536):
                            f.write(chunk)
                    return pmid, True
            except Exception as e:
                logger.warning(f"Unpaywall GET ✗ {pmid}: {e}")
    # 2) PMC fallback
    pmcid = pmc_id_for_pmid(pmid)
    if pmcid:
        ok = pmc_download(pmcid, pmid, md)
        return pmid, ok

    logger.warning(f"OA ✗ {pmid}: no DOI or PMCID available")
    return pmid, False

# === STEP 3: TEST & INIT SCI-HUB DOMAINS ===
def test_scihub_domain(domain):
    test_doi = "10.1000/182"
    url = f"{domain.rstrip('/')}/{test_doi}"
    logger.info(f"Sci-Hub TEST GET → {url}")
    try:
        r = session_scihub.get(url, timeout=7)
        if r.status_code in (200,404) and 'html' in r.headers.get('Content-Type','').lower():
            return True
    except:
        pass
    return False

def init_scihub_domains():
    logger.info("Probing Sci-Hub mirrors for availability...")
    working = []
    for d in SCI_HUB_DOMAINS:
        if test_scihub_domain(d):
            working.append(d)
            if len(working) >= 2:
                break
    if not working:
        logger.error("No working Sci-Hub domains found!")
    else:
        logger.info(f"Using Sci-Hub domains: {working}")
    return working

def find_scihub_pdf(html, base_url):
    soup = BeautifulSoup(html, 'html.parser')
    base = f"{urlparse(base_url).scheme}://{urlparse(base_url).netloc}"
    # iframe#pdf / #article
    for id_ in ('pdf','article'):
        ifr = soup.find('iframe', id=id_)
        if ifr and ifr.get('src'):
            return urljoin(base, ifr['src'])
    # any iframe with .pdf
    for ifr in soup.find_all('iframe'):
        if '.pdf' in ifr.get('src',''):
            return urljoin(base, ifr['src'])
    # embed
    emb = soup.find('embed', type='application/pdf')
    if emb and emb.get('src'):
        return urljoin(base, emb['src'])
    # fallback anchors
    for sel in ['a[href$=".pdf"]','button[onclick*="location.href"]']:
        el = soup.select_one(sel)
        if el:
            href = el.get('href')
            if not href and el.get('onclick'):
                m = re.search(r"location\.href=['\"]([^'\"]+)", el['onclick'])
                href = m.group(1) if m else None
            if href:
                return urljoin(base, href)
    return None

def download_scihub(identifier, pmid, md, domains):
    fname = sanitize_filename(f"{md['year']}-{pmid}-{md['author']}-{md['title']}") + ".pdf"
    path = os.path.join(OUTPUT_PDF_DIR, fname)
    if os.path.exists(path):
        logger.info(f"Sci-Hub ✓ {pmid}: already exists")
        return True

    for dom in domains:
        url_html = f"{dom.rstrip('/')}/{identifier}"
        logger.info(f"Sci-Hub HTML GET → PMID {pmid}: {url_html}")
        try:
            r = session_scihub.get(url_html, timeout=20)
            r.raise_for_status()
            ctype = r.headers.get('Content-Type','').lower()
            # direct PDF?
            if 'application/pdf' in ctype:
                logger.info(f"Sci-Hub PDF GET  → PMID {pmid}: {url_html}")
                with open(path,'wb') as f: f.write(r.content)
                return True
            # HTML viewer → extract PDF link
            if 'html' in ctype:
                pdf_url = find_scihub_pdf(r.content, r.url)
                if pdf_url:
                    logger.info(f"Sci-Hub PDF GET  → PMID {pmid}: {pdf_url}")
                    pr = session_scihub.get(pdf_url, timeout=45, stream=True)
                    pr.raise_for_status()
                    if 'application/pdf' in pr.headers.get('Content-Type','').lower():
                        with open(path,'wb') as f:
                            for chunk in pr.iter_content(8192):
                                f.write(chunk)
                        return True
        except Exception as e:
            logger.warning(f"Sci-Hub ✗ {pmid} via {dom}: {e}")
        time.sleep(DELAY_SCIHUB)

    return False

# === MAIN ===
def main():
    t0 = time.time()
    logger.info("=== PDF Fetcher v11 started ===")

    # 1) Read PMIDs
    try:
        df = pd.read_excel(EXCEL_FILE_PATH)
    except Exception as e:
        logger.error(f"Cannot read Excel: {e}")
        return
    pmids = [str(x).split('.')[0] for x in df['PMID'].dropna().unique()]
    logger.info(f"Loaded {len(pmids)} unique PMIDs")

    # 2) Metadata
    meta = fetch_metadata(pmids)

    # 3) Prepare output
    os.makedirs(OUTPUT_PDF_DIR, exist_ok=True)

    # 4) OA-first
    oa_success, oa_failed = [], []
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as exe:
        futures = {exe.submit(oa_worker, pm, meta[pm]): pm for pm in pmids if pm in meta}
        for fut in as_completed(futures):
            pm = futures[fut]
            try:
                pmid, ok = fut.result()
                (oa_success if ok else oa_failed).append(pmid)
            except Exception as e:
                logger.warning(f"OA thread error for {pm}: {e}")
                oa_failed.append(pm)
    logger.info(f"OA phase: {len(oa_success)}/{len(pmids)} succeeded; will Sci-Hub the remaining {len(oa_failed)}")

    # 5) Sci-Hub on remaining
    domains = init_scihub_domains()
    sci_success, sci_failed = [], []
    with ThreadPoolExecutor(max_workers=MAX_THREADS) as exe:
        futures = {
            exe.submit(
                download_scihub,
                (meta[pm]['doi'] if meta[pm]['doi'] else pm),
                pm,
                meta[pm],
                domains
            ): pm
            for pm in oa_failed
        }
        for fut in as_completed(futures):
            pm = futures[fut]
            try:
                ok = fut.result()
                (sci_success if ok else sci_failed).append(pm)
            except Exception as e:
                logger.warning(f"Sci-Hub thread error for {pm}: {e}")
                sci_failed.append(pm)
    logger.info(f"Sci-Hub phase: {len(sci_success)}/{len(oa_failed)} succeeded; still missing: {sci_failed}")

    total = len(oa_success) + len(sci_success)
    logger.info(f"TOTAL: {total}/{len(pmids)} PDFs fetched in {time.time()-t0:.2f}s")
    logger.info("=== PDF Fetcher v11 completed ===")

if __name__ == "__main__":
    main()


2025-05-21 08:45:10,886 - INFO - [MainThread] - === PDF Fetcher v11 started ===
2025-05-21 08:45:10,901 - INFO - [MainThread] - Loaded 32 unique PMIDs
2025-05-21 08:45:10,902 - INFO - [MainThread] - EFetch PubMed metadata for 32 PMIDs in 1 batch(es)...
2025-05-21 08:45:10,902 - INFO - [MainThread] - NCBI EFetch POST → IDs=39955421,40340819,39068053,39384309,38673038,36788057,32145713,30518491,33401363,39185540,37802689,37062759,36969299,33888360,35790215,34089071,33210165,34059337,32696123,32385680,30686518,30922685,31274269,27679510,27810148,27083963,26888001,39520824,39489669,39083294,39342249,31859070
2025-05-21 08:45:12,239 - INFO - [ThreadPoolExecutor-10_0] - Unpaywall API GET → DOI 10.1007/s00383-025-05977-0: https://api.unpaywall.org/v2/10.1007%2Fs00383-025-05977-0?email=levi4328@gmail.com
2025-05-21 08:45:12,245 - INFO - [ThreadPoolExecutor-10_1] - Unpaywall API GET → DOI 10.1136/bmjpo-2024-003280: https://api.unpaywall.org/v2/10.1136%2Fbmjpo-2024-003280?email=levi4328@gmail.co

In [12]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Combined OA-first + Sci-Hub PDF Fetcher (v11.1, standalone, with enhanced PMC/FTP fallback)
-----------------------------------------------------------------------------------------
1) Reads PMIDs from an Excel file.
2) Fetches metadata (year, first author, title, DOI) via NCBI EFetch.
3.1) Tries Open Access via Unpaywall (validates PDF content).
3.2) Tries Open Access via PMC:
    a) Standard PMC PDF URL (validates PDF content).
    b) If (a) fails or returns non-PDF, attempts fallback to NCBI FTP mirror for PMC.
4) Any remaining PMIDs are tried via Sci-Hub in parallel (validates PDF content).
5) All PDFs land in one OUTPUT_PDF_DIR named `{year}-{pmid}-{author}-{title}.pdf`.
6) Detailed logging of every URL fetched and why it failed or succeeded.
"""

import os
import re
import time
import logging
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, quote_plus
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from concurrent.futures import ThreadPoolExecutor, as_completed

# === CONFIGURATION ===
EXCEL_FILE_PATH     = r"C:\Users\Galaxy\Downloads\screening_ERAS.xlsx" # Update this path
OUTPUT_PDF_DIR      = "downloaded_pdfs_v11_fixed" # Changed output dir name slightly

# Sci-Hub
SCI_HUB_DOMAINS     = [
    "https://sci-hub.se", "https://sci-hub.ru", "https://sci-hub.ren",
    "https://sci-hub.wf", "https://sci-hub.ee", "https://sci-hub.st"
]
DELAY_SCIHUB        = 0.5   # seconds between Sci-Hub tries for a single paper

# Threads / batching
MAX_THREADS         = 10
EFETCH_BATCH_SIZE   = 100
DELAY_NCBI          = 0.35  # seconds between NCBI batch requests

# API credentials - PLEASE FILL THESE IN
NCBI_API_KEY        = "YOUR_API_KEY_HERE" # Get from NCBI settings
CROSSREF_EMAIL      = "your_email@example.com" # For polite API use
UNPAYWALL_EMAIL     = "your_email@example.com" # For polite API use (was hardcoded)

# === LOGGING SETUP ===
logger = logging.getLogger("PDFFetcherV11Fixed")
logger.setLevel(logging.INFO) # Set to logging.DEBUG for more verbose output
ch = logging.StreamHandler()
ch.setFormatter(logging.Formatter(
    '%(asctime)s - %(levelname)s - [%(threadName)s] - %(message)s'
))
logger.addHandler(ch)

# === HTTP SESSIONS WITH RETRIES ===
def make_session(user_agent):
    s = requests.Session()
    retries = Retry(
        total=3,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(['GET', 'POST', 'HEAD'])
    )
    adapter = HTTPAdapter(
        max_retries=retries,
        pool_connections=MAX_THREADS,
        pool_maxsize=MAX_THREADS * 2 # Allow more connections in the pool
    )
    s.mount("https://", adapter)
    s.mount("http://", adapter)
    s.headers.update({'User-Agent': user_agent})
    return s

session_ncbi   = make_session(f"PDFFetcherV11Fixed/1.0 (mailto:{CROSSREF_EMAIL if CROSSREF_EMAIL != 'your_email@example.com' else 'anonymous'}) NCBI-EUtils-Client")
session_oa     = make_session(f"PDFFetcherV11Fixed/1.0 (mailto:{UNPAYWALL_EMAIL if UNPAYWALL_EMAIL != 'your_email@example.com' else 'anonymous'}) OpenAccess-Client")
session_scihub = make_session("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36")


# === UTILITIES ===
def sanitize_filename(s: str) -> str:
    s = re.sub(r'[\\/*?:"<>|]', "", s)
    s = re.sub(r'\s+', " ", s).strip()
    return s.replace(" ", "_")[:200] # Max 200 chars for the 'prefix' part, .pdf added later

def _get_ncbi_params(extra=None):
    params = {"tool": "PDFFetcherV11Fixed"}
    if CROSSREF_EMAIL and CROSSREF_EMAIL != "your_email@example.com":
        params["email"] = CROSSREF_EMAIL
    if NCBI_API_KEY and NCBI_API_KEY != "YOUR_API_KEY_HERE":
        params["api_key"] = NCBI_API_KEY
    if extra:
        params.update(extra)
    return params

# === PDF DOWNLOAD AND VERIFICATION HELPER ===
def download_and_verify_pdf(session, url, pmid, source_name, output_path):
    logger.info(f"{source_name} GET → PMID {pmid}: Attempting download from {url}")
    try:
        # Use specific Accept header for PDF
        r = session.get(url, timeout=(15, 60), stream=True, headers={"Accept": "application/pdf,application/octet-stream"})
        r.raise_for_status() # Check for HTTP errors
        
        content_type = r.headers.get('Content-Type', '').lower()
        
        first_bytes = b""
        can_peek = hasattr(r.raw, 'peek') and callable(r.raw.peek)
        
        if can_peek:
            try:
                peeked_content = r.raw.peek(512) # Peek enough for magic number and some HTML tags
                first_bytes = peeked_content[:32] # Use a portion for checks
            except Exception as e_peek:
                logger.warning(f"{source_name} Peek ✗ PMID {pmid}: Error peeking stream from {url}: {e_peek}")
                can_peek = False 
        else:
            logger.warning(f"{source_name} Peek ? PMID {pmid}: Stream from {url} does not support peek(). Relying on Content-Type/URL.")
        
        is_likely_pdf = first_bytes.startswith(b'%PDF-') if first_bytes else False
        is_html = False
        if first_bytes:
            fb_lower = first_bytes.lower() # Check a small portion
            is_html = b'<html' in fb_lower or b'<!doctype' in fb_lower or b'<head' in fb_lower or b'<body' in fb_lower
        
        # Decision logic for saving
        pdf_confirmed = False
        if is_likely_pdf and ('application/pdf' in content_type or 'application/x-pdf' in content_type):
            logger.info(f"{source_name} ✓ PMID {pmid}: Valid PDF (magic number, type '{content_type}') from {url}.")
            pdf_confirmed = True
        elif 'application/pdf' in content_type and not is_html and url.lower().endswith('.pdf'):
            log_msg_detail = f"(No peek data available)" if not can_peek else f"(Magic number missing in peek: {first_bytes!r})"
            logger.info(f"{source_name} ✓ PMID {pmid}: PDF content type '{content_type}', not HTML, URL ends .pdf from {url}. {log_msg_detail}. Assuming PDF.")
            pdf_confirmed = True
        elif is_likely_pdf and not is_html: # Magic number present, not HTML, content_type might be generic (e.g. octet-stream)
             logger.warning(f"{source_name} ✓ PMID {pmid}: PDF magic number found (in {first_bytes!r}), Content-Type '{content_type}', not HTML. Assuming PDF from {url}.")
             pdf_confirmed = True
        
        if pdf_confirmed:
            with open(output_path, 'wb') as f:
                for chunk in r.iter_content(65536): # 64KB chunks
                    f.write(chunk)
            logger.info(f"{source_name} ✓ PMID {pmid}: Successfully saved PDF to {output_path}")
            return True
        else:
            # Detailed failure log
            details = []
            if not first_bytes and not can_peek: details.append("cannot peek stream")
            if first_bytes: details.append(f"first_bytes: {first_bytes!r}")
            if not is_likely_pdf and first_bytes: details.append("no PDF magic number")
            if is_html: details.append("detected HTML structure")
            if 'application/pdf' not in content_type : details.append(f"Content-Type not PDF ('{content_type}')")
            if not url.lower().endswith('.pdf'): details.append("URL does not end with .pdf")

            logger.warning(f"{source_name} ✗ PMID {pmid}: Not a PDF from {url}. Reasons: {'; '.join(details) if details else 'unknown criteria mismatch'}.")
            
            if is_html or 'text/html' in content_type:
                debug_html_path = output_path + ".html_error_debug.html"
                try:
                    with open(debug_html_path, 'wb') as f_html:
                        # Write initial peeked part if available and then rest of stream
                        if can_peek and peeked_content: # peeked_content is the full peeked data
                             f_html.write(peeked_content)
                        for chunk in r.iter_content(8192):
                            f_html.write(chunk)
                            if f_html.tell() > 2 * 1024 * 1024: # Limit debug HTML size to 2MB
                                logger.warning(f"{source_name} ✗ PMID {pmid}: Truncated debug HTML at 2MB for {debug_html_path}")
                                break
                    logger.warning(f"{source_name} ✗ PMID {pmid}: Saved debug HTML (or suspected HTML) to {debug_html_path}")
                except Exception as e_debug_save:
                    logger.error(f"Error saving debug HTML for {pmid} from {url}: {e_debug_save}")
            return False
            
    except requests.exceptions.RequestException as e:
        logger.warning(f"{source_name} GET ✗ PMID {pmid} from {url}: RequestException {e}")
    except Exception as e: # Catch-all for other unexpected errors during download/verification
        logger.warning(f"{source_name} GET ✗ PMID {pmid} from {url}: Unexpected error {e.__class__.__name__}: {e}")
    return False

# === STEP 1: FETCH METADATA FROM PUBMED ===
def fetch_metadata(pmids):
    meta = {}
    batches = [pmids[i:i+EFETCH_BATCH_SIZE] for i in range(0, len(pmids), EFETCH_BATCH_SIZE)]
    logger.info(f"EFetch PubMed metadata for {len(pmids)} PMIDs in {len(batches)} batch(es)...")
    
    for i, batch in enumerate(batches):
        data = _get_ncbi_params({"db":"pubmed", "retmode":"xml", "id":",".join(map(str,batch))})
        logger.info(f"NCBI EFetch POST (batch {i+1}/{len(batches)}) → IDs={','.join(map(str,batch))}")
        try:
            resp = session_ncbi.post(
                "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
                data=data,
                timeout=60 # Increased timeout for potentially large batches
            )
            resp.raise_for_status()
            root = ET.fromstring(resp.content)
            for art_node in root.findall(".//PubmedArticle"):
                pmid_el = art_node.find(".//PMID")
                pmid = pmid_el.text.strip() if pmid_el is not None and pmid_el.text else None
                if not pmid: continue

                doi_el = art_node.find(".//ArticleId[@IdType='doi']") or \
                         art_node.find(".//ELocationID[@EIdType='doi'][@ValidYN='Y']")
                doi = doi_el.text.strip() if doi_el is not None and doi_el.text else None
                
                year_el = art_node.find(".//Journal/JournalIssue/PubDate/Year")
                year_text = None
                if year_el is not None and year_el.text:
                    year_text = year_el.text.strip()
                else:
                    medline_date_el = art_node.find(".//Journal/JournalIssue/PubDate/MedlineDate")
                    if medline_date_el is not None and medline_date_el.text:
                        year_text = medline_date_el.text.strip()[:4] # Extract first 4 chars for year

                year = year_text if year_text and year_text.isdigit() and len(year_text) == 4 else "UnknownYear"

                author_el = art_node.find(".//AuthorList/Author[1]/LastName") # First author's last name
                author = author_el.text.strip() if author_el is not None and author_el.text else "UnknownAuthor"
                
                title_el = art_node.find(".//ArticleTitle")
                # Join all text nodes within ArticleTitle, handles tags like <i>, <b>, <sup> etc.
                title = "".join(title_el.itertext()).strip() if title_el is not None else f"UnknownTitle_PMID_{pmid}"
                
                meta[pmid] = {'doi': doi, 'year': year, 'author': author, 'title': title}
        except requests.exceptions.RequestException as e:
            logger.error(f"NCBI EFetch batch error (RequestException): {e} for batch {batch}")
        except ET.ParseError as e:
            logger.error(f"NCBI EFetch batch error (XML ParseError): {e} for batch {batch}")
        except Exception as e:
            logger.error(f"NCBI EFetch batch error (General Exception {e.__class__.__name__}): {e} for batch {batch}")
        
        if i < len(batches) - 1: # Don't sleep after the last batch
            time.sleep(DELAY_NCBI)
            
    missing_pmids = [p for p in pmids if p not in meta]
    if missing_pmids:
        logger.warning(f"Metadata retrieval failed or incomplete for {len(missing_pmids)} PMIDs: {missing_pmids[:10]}...") # Log first 10
    return meta

# === STEP 2: OPEN ACCESS (Unpaywall, PMC with FTP fallback) ===
def unpaywall_api_get_pdf_url(doi): # Renamed to clarify it calls API
    if not UNPAYWALL_EMAIL or UNPAYWALL_EMAIL == "your_email@example.com":
        logger.warning(f"Unpaywall ✗ DOI {doi}: Email not configured. Skipping Unpaywall.")
        return None
        
    api_call_url = f"https://api.unpaywall.org/v2/{quote_plus(doi)}?email={UNPAYWALL_EMAIL}"
    logger.info(f"Unpaywall API GET → DOI {doi}: {api_call_url}")
    try:
        r = session_oa.get(api_call_url, timeout=20)
        r.raise_for_status()
        data = r.json()
        if data.get("is_oa"):
            best_oa_loc = data.get("best_oa_location")
            pdf_url = None
            if best_oa_loc and best_oa_loc.get("url_for_pdf"):
                pdf_url = best_oa_loc.get("url_for_pdf")
            else: # Check other OA locations
                for loc in data.get("oa_locations", []):
                    if loc.get("url_for_pdf"):
                        pdf_url = loc.get("url_for_pdf")
                        break
            if pdf_url:
                logger.info(f"Unpaywall API ✓ DOI {doi}: Found PDF URL: {pdf_url}")
                return pdf_url
            else:
                logger.info(f"Unpaywall API ✓ DOI {doi}: Article is OA, but no direct PDF URL found in Unpaywall response.")
        else:
            logger.info(f"Unpaywall API ✓ DOI {doi}: Not OA according to Unpaywall.")
    except requests.exceptions.RequestException as e:
        logger.warning(f"Unpaywall API ✗ DOI {doi}: RequestException: {e}")
    except ValueError as e: # Handles JSONDecodeError
        logger.warning(f"Unpaywall API ✗ DOI {doi}: JSON Decode Error: {e}")
    except Exception as e:
        logger.warning(f"Unpaywall API ✗ DOI {doi}: Unexpected error {e.__class__.__name__}: {e}")
    return None

def pmc_id_for_pmid(pmid):
    params = _get_ncbi_params({"dbfrom":"pubmed", "db":"pmc", "id":pmid, "cmd":"neighbor_score"})
    for linkname in ("pubmed_pmc_refs", "pubmed_pmc"):
        params["linkname"] = linkname
        logger.debug(f"PMC ID ELINK → PMID {pmid} (linkname: {linkname})")
        try:
            r = session_ncbi.post(
                "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi",
                data=params,
                timeout=20 # Increased timeout
            )
            r.raise_for_status()
            root = ET.fromstring(r.content)
            # More precise XPath for typical neighbor_score results
            id_el = root.find(".//LinkSetDb[DbTo='pmc']/Link/Id") 
            if id_el is not None and id_el.text:
                pmc_id_raw = id_el.text.strip()
                if pmc_id_raw.upper().startswith("PMC"):
                    logger.info(f"PMC ID ELINK ✓ PMID {pmid}: Found PMCID {pmc_id_raw}")
                    return pmc_id_raw
                elif pmc_id_raw.isdigit(): # If just number, prefix with PMC
                    corrected_pmcid = "PMC" + pmc_id_raw
                    logger.info(f"PMC ID ELINK ✓ PMID {pmid}: Found numeric ID {pmc_id_raw}, corrected to {corrected_pmcid}")
                    return corrected_pmcid
                else: # Non-standard ID, log and return as is
                    logger.warning(f"PMC ID ELINK ? PMID {pmid}: Found non-standard ID '{pmc_id_raw}'. Using as is.")
                    return pmc_id_raw
        except requests.exceptions.RequestException as e:
            logger.warning(f"PMC ID ELINK ✗ PMID {pmid} (linkname {linkname}): RequestException {e}")
        except ET.ParseError as e:
            logger.warning(f"PMC ID ELINK ✗ PMID {pmid} (linkname {linkname}): XML ParseError {e}")
        except Exception as e:
            logger.warning(f"PMC ID ELINK ✗ PMID {pmid} (linkname {linkname}): Unexpected error {e.__class__.__name__}: {e}")
        
        time.sleep(DELAY_NCBI / 2) # Short delay between elink attempts for the same PMID

    logger.warning(f"PMC ID ELINK ✗ PMID {pmid}: No PMCID found after trying linknames.")
    return None

def pmc_download(pmcid, pmid, md):
    fname_base = sanitize_filename(f"{md['year']}-{pmid}-{md['author']}-{md['title']}")
    output_path = os.path.join(OUTPUT_PDF_DIR, fname_base + ".pdf")

    if os.path.exists(output_path):
        logger.info(f"PMC ✓ {pmid} ({pmcid}): already exists at {output_path}")
        return True

    pmc_article_pdf_landing_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/pdf/"
    final_pdf_url_from_pmc_site = None
    pdf_filename_segment_for_ftp = None

    logger.info(f"PMC HEAD → PMID {pmid} ({pmcid}): Probing {pmc_article_pdf_landing_url} for final PDF URL.")
    try:
        head_resp = session_oa.head(
            pmc_article_pdf_landing_url,
            headers={"Accept": "application/pdf, */*"}, # Be broad for HEAD
            timeout=(10, 25), # connect, read timeouts
            allow_redirects=True
        )
        head_resp.raise_for_status()
        final_pdf_url_from_pmc_site = head_resp.url
        
        parsed_url = urlparse(final_pdf_url_from_pmc_site)
        if parsed_url.path and parsed_url.path.lower().endswith(".pdf"):
            pdf_filename_segment_for_ftp = os.path.basename(parsed_url.path)
            logger.info(f"PMC HEAD ✓ PMID {pmid} ({pmcid}): Redirected to PDF-like URL: {final_pdf_url_from_pmc_site} (Filename for FTP: {pdf_filename_segment_for_ftp})")
        else:
            logger.warning(f"PMC HEAD ? PMID {pmid} ({pmcid}): Redirected URL {final_pdf_url_from_pmc_site} does not end with .pdf. FTP fallback might not be possible if filename unknown.")
            # Still, final_pdf_url_from_pmc_site might be downloadable itself

    except requests.exceptions.RequestException as e:
        logger.warning(f"PMC HEAD ✗ {pmid} ({pmcid}) for {pmc_article_pdf_landing_url}: {e}. Will proceed to FTP if filename was previously known or can be guessed (not implemented).")
    except Exception as e:
        logger.warning(f"PMC HEAD ✗ {pmid} ({pmcid}) unexpected error for {pmc_article_pdf_landing_url}: {e.__class__.__name__} {e}")

    # Attempt 1: Download from the URL obtained via HEAD (if successful)
    if final_pdf_url_from_pmc_site:
        logger.info(f"PMC Attempt 1 (Direct from PMC site) → PMID {pmid} ({pmcid}) URL: {final_pdf_url_from_pmc_site}")
        if download_and_verify_pdf(session_oa, final_pdf_url_from_pmc_site, pmid, f"PMC Direct ({pmcid})", output_path):
            return True
        else: # Failed, will try FTP
            logger.info(f"PMC Attempt 1 (Direct from PMC site) ✗ PMID {pmid} ({pmcid}): Failed. Will try FTP fallback if possible.")
    else:
        logger.info(f"PMC Attempt 1 (Direct from PMC site) skipped for PMID {pmid} ({pmcid}): No final PDF URL resolved from HEAD operation.")

    # Attempt 2: FTP Fallback
    if pmcid and pdf_filename_segment_for_ftp:
        ftp_url = f"https://ftp.ncbi.nlm.nih.gov/pub/pmc/articles/{pmcid}/pdf/{pdf_filename_segment_for_ftp}"
        logger.info(f"PMC Attempt 2 (FTP Fallback) → PMID {pmid} ({pmcid}) URL: {ftp_url}")
        if download_and_verify_pdf(session_oa, ftp_url, pmid, f"PMC FTP ({pmcid})", output_path):
            return True
        else:
            logger.warning(f"PMC Attempt 2 (FTP Fallback) ✗ PMID {pmid} ({pmcid}): Failed for {ftp_url}.")
    elif not pdf_filename_segment_for_ftp:
         logger.warning(f"PMC Attempt 2 (FTP Fallback) ✗ PMID {pmid} ({pmcid}): Skipped, PDF filename for FTP path unknown (final URL from HEAD was {final_pdf_url_from_pmc_site}).")

    logger.warning(f"PMC ✗ PMID {pmid} ({pmcid}): All download attempts (direct, FTP) failed.")
    return False

def oa_worker(pmid, md): # md is the metadata dictionary
    doi = md.get('doi')
    # Generate base filename once, used by download_and_verify_pdf indirectly via output_path
    fname_base = sanitize_filename(f"{md['year']}-{pmid}-{md['author']}-{md['title']}")
    pdf_output_path = os.path.join(OUTPUT_PDF_DIR, fname_base + ".pdf")

    if os.path.exists(pdf_output_path): # Check if already downloaded
        logger.info(f"OA ✓ {pmid}: PDF already exists at {pdf_output_path}")
        return pmid, True

    # 1) Unpaywall
    if doi:
        direct_pdf_url_from_unpaywall = unpaywall_api_get_pdf_url(doi)
        if direct_pdf_url_from_unpaywall:
            if download_and_verify_pdf(session_oa, direct_pdf_url_from_unpaywall, pmid, f"Unpaywall (DOI:{doi})", pdf_output_path):
                return pmid, True
            # else: Unpaywall download failed or wasn't a PDF, proceed to PMC
    
    # 2) PMC fallback (uses its own pmc_download logic including FTP)
    pmcid = pmc_id_for_pmid(pmid)
    if pmcid:
        logger.info(f"OA: Trying PMC for PMID {pmid} (PMCID: {pmcid}) as Unpaywall attempt was insufficient or skipped.")
        # pmc_download handles its own file path generation and existence check based on md
        if pmc_download(pmcid, pmid, md):
            return pmid, True
    
    logger.warning(f"OA ✗ {pmid}: No PDF found via Unpaywall or PMC.")
    return pmid, False

# === STEP 3: SCI-HUB (Test domains & Download) ===
def test_scihub_domain(domain_url):
    # Use a known valid DOI that usually works on Sci-Hub
    test_doi_for_scihub = "10.1016/j.cell.2020.03.036" # Example DOI
    target_url = f"{domain_url.rstrip('/')}/{test_doi_for_scihub}"
    logger.debug(f"Sci-Hub TEST GET → {target_url}")
    try:
        r = session_scihub.get(target_url, timeout=10) # Increased timeout for test
        # Check for status 200 and if it looks like an HTML page (Sci-Hub viewer)
        # Some mirrors might directly 404 if DOI not found, that's also "working" in a sense of being responsive
        if r.status_code in (200, 404) and 'html' in r.headers.get('Content-Type','').lower():
            logger.info(f"Sci-Hub TEST ✓ {domain_url} is responsive.")
            return True
        logger.warning(f"Sci-Hub TEST ? {domain_url} responded with status {r.status_code}, Content-Type: {r.headers.get('Content-Type','')}")
    except requests.exceptions.Timeout:
        logger.warning(f"Sci-Hub TEST ✗ {domain_url} timed out.")
    except requests.exceptions.RequestException as e:
        logger.warning(f"Sci-Hub TEST ✗ {domain_url} error: {e}")
    return False

def init_scihub_domains():
    logger.info("Probing Sci-Hub mirrors for availability...")
    working_domains = []
    # Use a ThreadPoolExecutor to test domains in parallel for speed
    with ThreadPoolExecutor(max_workers=min(len(SCI_HUB_DOMAINS), 5)) as executor:
        future_to_domain = {executor.submit(test_scihub_domain, d): d for d in SCI_HUB_DOMAINS}
        for future in as_completed(future_to_domain):
            domain = future_to_domain[future]
            try:
                if future.result():
                    working_domains.append(domain)
            except Exception as exc:
                logger.error(f"Sci-Hub TEST Generation ✗ Error testing domain {domain}: {exc}")
    
    if not working_domains:
        logger.error("CRITICAL: No working Sci-Hub domains found after testing!")
    else:
        logger.info(f"Using Sci-Hub domains: {working_domains}")
    return working_domains

def find_scihub_pdf_in_html(html_content, base_page_url): # Renamed for clarity
    soup = BeautifulSoup(html_content, 'html.parser')
    # Construct absolute base URL (scheme + netloc) for resolving relative links
    parsed_base_url = urlparse(base_page_url)
    absolute_base = f"{parsed_base_url.scheme}://{parsed_base_url.netloc}"

    # Try common patterns for finding PDF links on Sci-Hub
    # 1. iframe with id 'pdf' or 'article'
    for iframe_id in ('pdf', 'article'):
        iframe = soup.find('iframe', id=iframe_id)
        if iframe and iframe.get('src'):
            src = iframe['src']
            # Handle protocol-relative URLs (e.g., //example.com/file.pdf)
            if src.startswith("//"):
                src = f"{parsed_base_url.scheme}:{src}"
            return urljoin(absolute_base, src) # urljoin handles relative/absolute

    # 2. Any iframe whose src contains '.pdf'
    for iframe in soup.find_all('iframe'):
        src = iframe.get('src', '')
        if '.pdf' in src.lower():
            if src.startswith("//"):
                src = f"{parsed_base_url.scheme}:{src}"
            return urljoin(absolute_base, src)

    # 3. Embed tag with type 'application/pdf'
    embed_tag = soup.find('embed', type='application/pdf')
    if embed_tag and embed_tag.get('src'):
        src = embed_tag['src']
        if src.startswith("//"):
            src = f"{parsed_base_url.scheme}:{src}"
        return urljoin(absolute_base, src)

    # 4. Direct anchor links ending in .pdf or buttons with onclick redirecting to PDF
    # Prefer links with 'download' or 'plugin' in text or attributes for Sci-Hub
    for selector in ['a[href*=".pdf"]', 'button[onclick*="location.href"]', 'a#download']:
        elements = soup.select(selector)
        for el in elements:
            href = None
            if el.name == 'a' and el.get('href'):
                href = el['href']
            elif el.name == 'button' and el.get('onclick'):
                match = re.search(r"location\.href=['\"]([^'\"]+\.pdf[^'\"]*)['\"]", el['onclick'], re.IGNORECASE)
                if match: href = match.group(1)
            
            if href:
                if href.startswith("//"): href = f"{parsed_base_url.scheme}:{href}"
                # Check if it really points to a PDF (simple check)
                if ".pdf" in href.lower() or "sci-hub.st/downloads" in href.lower(): # Common Sci-Hub download pattern
                    return urljoin(absolute_base, href)
    
    logger.debug(f"Sci-Hub HTML Parse: No obvious PDF link found in HTML from {base_page_url}")
    return None

def download_scihub(identifier, pmid, md, active_domains): # md is metadata
    fname_base = sanitize_filename(f"{md['year']}-{pmid}-{md['author']}-{md['title']}")
    output_path = os.path.join(OUTPUT_PDF_DIR, fname_base + ".pdf")

    if os.path.exists(output_path):
        logger.info(f"Sci-Hub ✓ {pmid}: PDF already exists at {output_path}")
        return True
    
    if not active_domains:
        logger.error(f"Sci-Hub ✗ {pmid}: No active Sci-Hub domains to try for identifier '{identifier}'.")
        return False

    for domain_url in active_domains:
        # This is the URL to Sci-Hub's page for the identifier (DOI or PMID)
        scihub_page_url = f"{domain_url.rstrip('/')}/{quote_plus(identifier)}"
        logger.info(f"Sci-Hub HTML GET → PMID {pmid} from {scihub_page_url} (Identifier: {identifier})")
        
        try:
            r_page = session_scihub.get(scihub_page_url, timeout=25) # Timeout for fetching the HTML page
            r_page.raise_for_status()
            page_content_type = r_page.headers.get('Content-Type','').lower()

            # Case 1: Sci-Hub page directly serves the PDF (very rare, but check)
            # This check now relies on download_and_verify_pdf implicitly
            if 'application/pdf' in page_content_type:
                 logger.info(f"Sci-Hub ? PMID {pmid}: URL {scihub_page_url} claims to be PDF directly. Verifying...")
                 # We need to re-GET it with stream=True for download_and_verify_pdf or adapt.
                 # For simplicity, if this rare case happens and it's small, we can try to save it directly after basic check.
                 # Or, better, just pass the URL to download_and_verify_pdf.
                 if download_and_verify_pdf(session_scihub, r_page.url, pmid, f"Sci-Hub DirectPDF ({domain_url})", output_path):
                     return True
                 else: # Failed direct PDF validation
                     logger.warning(f"Sci-Hub ✗ PMID {pmid}: URL {scihub_page_url} claimed PDF but validation failed. Trying to parse as HTML just in case.")
                     # Fall through to HTML parsing if it might be misidentified HTML

            # Case 2: Sci-Hub serves an HTML viewer page (most common)
            if 'html' in page_content_type or r_page.text.strip().lower().startswith(('<html', '<!doctype')):
                # Pass r_page.content (bytes) for BeautifulSoup, and r_page.url for base URL resolution
                pdf_url_from_page = find_scihub_pdf_in_html(r_page.content, r_page.url)
                
                if pdf_url_from_page:
                    logger.info(f"Sci-Hub HTML ✓ PMID {pmid}: Found potential PDF link: {pdf_url_from_page}")
                    # Now download and verify this extracted PDF link
                    if download_and_verify_pdf(session_scihub, pdf_url_from_page, pmid, f"Sci-Hub ExtractedPDF ({domain_url})", output_path):
                        return True # Success!
                    # else: download_and_verify_pdf failed for this extracted link, try next domain
                else:
                    logger.warning(f"Sci-Hub HTML ✗ {pmid} via {domain_url}: Found HTML page but no PDF link within it from {scihub_page_url}")
            
            else: # Unexpected content type from initial Sci-Hub URL
                logger.warning(f"Sci-Hub ✗ {pmid} via {domain_url}: Unexpected Content-Type '{page_content_type}' from {scihub_page_url}. Content preview (first 200b): {r_page.content[:200]!r}")

        except requests.exceptions.RequestException as e:
            logger.warning(f"Sci-Hub ✗ {pmid} via {domain_url} ({scihub_page_url}): RequestException: {e}")
        except Exception as e: # Catch broader exceptions for one domain attempt
            logger.warning(f"Sci-Hub ✗ {pmid} via {domain_url} ({scihub_page_url}): General error {e.__class__.__name__}: {e}")
        
        if len(active_domains) > 1 : # Only sleep if there are more domains to try
             time.sleep(DELAY_SCIHUB) 

    logger.error(f"Sci-Hub ✗ {pmid}: Failed to download PDF for identifier '{identifier}' after trying all active Sci-Hub domains.")
    return False

# === MAIN ===
def main():
    t_start = time.time()
    logger.info(f"=== PDF Fetcher v11.1 (Fixed PMC/FTP) started at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")

    # 1) Read PMIDs from Excel
    try:
        df = pd.read_excel(EXCEL_FILE_PATH)
        if 'PMID' not in df.columns:
            logger.error(f"Excel file {EXCEL_FILE_PATH} must contain a 'PMID' column.")
            return
    except FileNotFoundError:
        logger.error(f"Excel file not found: {EXCEL_FILE_PATH}")
        return
    except Exception as e:
        logger.error(f"Cannot read Excel file {EXCEL_FILE_PATH}: {e}")
        return
    
    # Ensure PMIDs are strings, handle potential float conversion from Excel (e.g., 12345.0)
    pmids_raw = df['PMID'].dropna().unique()
    pmids = []
    for p_raw in pmids_raw:
        try:
            pmids.append(str(int(float(str(p_raw))))) # Robust conversion
        except ValueError:
            logger.warning(f"Skipping invalid PMID format: {p_raw}")
    
    if not pmids:
        logger.error("No valid PMIDs found in the Excel file.")
        return
    logger.info(f"Loaded {len(pmids)} unique, valid PMIDs from {EXCEL_FILE_PATH}")

    # 2) Fetch metadata for all PMIDs
    # This is done sequentially with batching and delays as per NCBI guidelines
    metadata_dict = fetch_metadata(pmids)
    valid_pmids_with_meta = [p for p in pmids if p in metadata_dict]
    logger.info(f"Successfully fetched metadata for {len(valid_pmids_with_meta)} PMIDs.")
    if len(valid_pmids_with_meta) == 0:
        logger.error("No metadata could be fetched. Cannot proceed with PDF downloads.")
        return

    # 3) Prepare output directory
    os.makedirs(OUTPUT_PDF_DIR, exist_ok=True)
    logger.info(f"PDFs will be saved to: {os.path.abspath(OUTPUT_PDF_DIR)}")

    # 4) Attempt Open Access downloads (Unpaywall, PMC with FTP fallback)
    logger.info("--- Starting Open Access Download Phase ---")
    oa_succeeded_pmids, oa_failed_pmids = [], []
    with ThreadPoolExecutor(max_workers=MAX_THREADS, thread_name_prefix="OA_Worker") as executor:
        future_to_pmid = {
            executor.submit(oa_worker, pmid, metadata_dict[pmid]): pmid
            for pmid in valid_pmids_with_meta # Only process PMIDs for which we have metadata
        }
        for future in as_completed(future_to_pmid):
            pmid_processed = future_to_pmid[future]
            try:
                _, success_status = future.result() # oa_worker returns (pmid, status)
                if success_status:
                    oa_succeeded_pmids.append(pmid_processed)
                else:
                    oa_failed_pmids.append(pmid_processed)
            except Exception as e:
                logger.error(f"OA Thread ✗ Error processing PMID {pmid_processed}: {e.__class__.__name__} - {e}")
                oa_failed_pmids.append(pmid_processed)
    
    logger.info(f"Open Access Phase Summary: {len(oa_succeeded_pmids)} PDFs successfully downloaded.")
    logger.info(f"{len(oa_failed_pmids)} PMIDs could not be fetched via Open Access methods.")

    # 5) Attempt Sci-Hub for remaining PMIDs
    if oa_failed_pmids:
        logger.info("--- Starting Sci-Hub Download Phase for Remaining PMIDs ---")
        active_scihub_domains = init_scihub_domains() # Test and get working domains
        
        if not active_scihub_domains:
            logger.error("Sci-Hub phase skipped: No active Sci-Hub domains found.")
            sci_hub_succeeded_pmids = []
            sci_hub_still_failed_pmids = list(oa_failed_pmids) # All previously failed are still failed
        else:
            sci_hub_succeeded_pmids, sci_hub_still_failed_pmids = [], []
            with ThreadPoolExecutor(max_workers=MAX_THREADS, thread_name_prefix="SciHub_Worker") as executor:
                future_to_pmid_scihub = {}
                for pmid_to_try_scihub in oa_failed_pmids:
                    meta_for_pmid = metadata_dict[pmid_to_try_scihub]
                    # Sci-Hub prefers DOI, falls back to PMID if DOI is not available
                    identifier_for_scihub = meta_for_pmid.get('doi') or pmid_to_try_scihub
                    
                    future_to_pmid_scihub[executor.submit(
                        download_scihub, 
                        identifier_for_scihub, 
                        pmid_to_try_scihub, 
                        meta_for_pmid, 
                        active_scihub_domains
                    )] = pmid_to_try_scihub
                
                for future in as_completed(future_to_pmid_scihub):
                    pmid_processed_scihub = future_to_pmid_scihub[future]
                    try:
                        success_status_scihub = future.result()
                        if success_status_scihub:
                            sci_hub_succeeded_pmids.append(pmid_processed_scihub)
                        else:
                            sci_hub_still_failed_pmids.append(pmid_processed_scihub)
                    except Exception as e:
                        logger.error(f"Sci-Hub Thread ✗ Error processing PMID {pmid_processed_scihub}: {e.__class__.__name__} - {e}")
                        sci_hub_still_failed_pmids.append(pmid_processed_scihub)

            logger.info(f"Sci-Hub Phase Summary: {len(sci_hub_succeeded_pmids)} PDFs successfully downloaded.")
            logger.info(f"{len(sci_hub_still_failed_pmids)} PMIDs still missing after Sci-Hub attempts.")
            if sci_hub_still_failed_pmids:
                 logger.info(f"List of PMIDs still missing: {sci_hub_still_failed_pmids}")
    else:
        logger.info("--- Sci-Hub Download Phase Skipped: No PMIDs remaining after Open Access phase. ---")
        sci_hub_succeeded_pmids = []
        sci_hub_still_failed_pmids = []


    total_succeeded = len(oa_succeeded_pmids) + len(sci_hub_succeeded_pmids)
    total_time_taken = time.time() - t_start
    logger.info("--- Overall Summary ---")
    logger.info(f"Processed {len(pmids)} unique input PMIDs.")
    logger.info(f"Successfully fetched metadata for {len(valid_pmids_with_meta)} PMIDs.")
    logger.info(f"Total PDFs successfully downloaded: {total_succeeded} / {len(valid_pmids_with_meta)} (from those with metadata).")
    logger.info(f"  - Via Open Access (Unpaywall/PMC): {len(oa_succeeded_pmids)}")
    logger.info(f"  - Via Sci-Hub: {len(sci_hub_succeeded_pmids)}")
    if sci_hub_still_failed_pmids: # PMIDs that failed both OA and SciHub
        logger.info(f"Total PMIDs still NOT downloaded: {len(sci_hub_still_failed_pmids)}")
        logger.info(f"Sample of missing PMIDs: {sci_hub_still_failed_pmids[:20]}") # Log a sample
    
    logger.info(f"Total execution time: {total_time_taken:.2f} seconds.")
    logger.info(f"=== PDF Fetcher v11.1 (Fixed PMC/FTP) completed at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")

if __name__ == "__main__":
    # Example: Configure file logging in addition to stream logging
    # log_file_path = os.path.join(OUTPUT_PDF_DIR, "pdf_fetcher_v11_fixed.log")
    # os.makedirs(OUTPUT_PDF_DIR, exist_ok=True) # Ensure dir exists for log file
    # fh = logging.FileHandler(log_file_path)
    # fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - [%(threadName)s] - %(message)s'))
    # logger.addHandler(fh)
    
    main()

2025-05-21 09:06:39,959 - INFO - [MainThread] - === PDF Fetcher v11.1 (Fixed PMC/FTP) started at 2025-05-21 09:06:39 ===
2025-05-21 09:06:39,959 - INFO - === PDF Fetcher v11.1 (Fixed PMC/FTP) started at 2025-05-21 09:06:39 ===
2025-05-21 09:06:39,977 - INFO - [MainThread] - Loaded 32 unique, valid PMIDs from C:\Users\Galaxy\Downloads\screening_ERAS.xlsx
2025-05-21 09:06:39,977 - INFO - Loaded 32 unique, valid PMIDs from C:\Users\Galaxy\Downloads\screening_ERAS.xlsx
2025-05-21 09:06:39,978 - INFO - [MainThread] - EFetch PubMed metadata for 32 PMIDs in 1 batch(es)...
2025-05-21 09:06:39,978 - INFO - EFetch PubMed metadata for 32 PMIDs in 1 batch(es)...
2025-05-21 09:06:39,979 - INFO - [MainThread] - NCBI EFetch POST (batch 1/1) → IDs=39955421,40340819,39068053,39384309,38673038,36788057,32145713,30518491,33401363,39185540,37802689,37062759,36969299,33888360,35790215,34089071,33210165,34059337,32696123,32385680,30686518,30922685,31274269,27679510,27810148,27083963,26888001,39520824,394896

In [13]:
import cloudscraper # Import cloudscraper
import logging

logging.basicConfig(level=logging.DEBUG)
logging.getLogger("urllib3").setLevel(logging.INFO)

# cloudscraper creates a session that mimics requests.Session
# It will automatically try to solve JS challenges.
scraper = cloudscraper.create_scraper() 
# You can pass a browser dict to be more specific if needed:
# scraper = cloudscraper.create_scraper(browser={'custom': 'ScraperBot/1.0'}) 
# Or just use its defaults, which are usually good.

# Use the scraper session like you would a requests.Session
# Its default headers are usually good for bypassing challenges.
# We can add our specific Accept for PDF.

pmcid = "PMC11474870"
# Start with the URL that your browser first hits for the PDF path,
# allowing cloudscraper to handle any JS challenges on intermediate pages if necessary.
# The one with the trailing slash before the filename often works.
initial_pmc_url_for_challenge = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/pdf/"

# The final PDF URL (as per your browser trace)
final_pdf_url = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmcid}/pdf/bmjpo-8-1.pdf"

print(f"--- Testing GET request directly to FINAL PDF URL with cloudscraper: {final_pdf_url} ---")

# For cloudscraper, often just a direct GET to the resource is enough if it
# can handle challenges on that domain.
# It's also good to try the page that *issues* the challenge if known.
# Let's try the final PDF URL first.

try:
    # Cloudscraper's GET. It will try to solve JS challenges.
    # It handles redirects by default.
    get_resp = scraper.get(final_pdf_url, timeout=(15, 60), stream=True) 
    get_resp.raise_for_status()
    print(f"Cloudscraper GET request successful to {get_resp.url}")
    print(f"Cloudscraper GET response Content-Type: {get_resp.headers.get('Content-Type')}")
    
    first_bytes = b""
    if hasattr(get_resp.raw, 'peek') and callable(get_resp.raw.peek):
        first_bytes = get_resp.raw.peek(64)
    else: # cloudscraper responses might not always have peekable raw attribute
        # Fallback: read a small chunk directly if stream=True
        try:
            first_bytes = next(get_resp.iter_content(chunk_size=64, decode_unicode=False))
        except StopIteration:
            first_bytes = b"Could not read initial chunk"
        except Exception as e_chunk:
            first_bytes = f"Error reading chunk: {e_chunk}".encode()

    print(f"First 64 bytes peeked/read: {first_bytes!r}")

    is_pdf_magic = first_bytes.startswith(b'%PDF-')
    is_html_detected = any(tag in first_bytes.lower() for tag in [b'<html', b'<!doctype', b'<script'])

    print(f"Is PDF (magic number)? {is_pdf_magic}")
    print(f"Is HTML detected? {is_html_detected}")

    if is_pdf_magic and not is_html_detected:
        print("Cloudscraper LIKELY GOT A VALID PDF!")
        # with open("test_pmc_cloudscraper.pdf", "wb") as f:
        #     if first_bytes != b"Could not read initial chunk" and not (f"Error reading chunk:".encode() in first_bytes) and hasattr(get_resp.raw, 'peek'):
        #         # If peek worked, write the peeked content then iterate
        #         # This logic needs care if peek "consumes" for iter_content
        #         # For now, just iterate the whole thing for simplicity if saving
        #         pass # Avoid complex write logic for test
        #     for chunk in get_resp.iter_content(chunk_size=8192):
        #         f.write(chunk)
        # print("Saved to test_pmc_cloudscraper.pdf")
    else:
        print("Cloudscraper does NOT look like it got a PDF.")
        print(f"Received content from cloudscraper (first 200 bytes if available):")
        # Re-fetch without stream to see content easily for debug, or iterate
        try:
            # To see content if it wasn't PDF:
            # Ensure you're reading the response content from the original get_resp
            # This might be tricky if stream=True and you already peeked/iterated.
            # For simplicity, let's assume we'd save it.
            # with open("error_content_cloudscraper.html", "wb") as f_err_cs:
            #    # similar save logic as before
            # print("Saved unexpected content to error_content_cloudscraper.html")
            pass
        except Exception as e_read_err:
            print(f"Error trying to read full error content: {e_read_err}")


except requests.exceptions.RequestException as e: # cloudscraper can raise requests exceptions
    print(f"Cloudscraper GET request failed: {e}")
    if hasattr(e, 'response') and e.response is not None:
        print(f"Cloudscraper GET response content (if error): {e.response.content[:500]}")
except Exception as e_cs: # Other cloudscraper specific errors
    print(f"Cloudscraper encountered an error: {e_cs.__class__.__name__}: {e_cs}")

--- Testing GET request directly to FINAL PDF URL with cloudscraper: https://pmc.ncbi.nlm.nih.gov/articles/PMC11474870/pdf/bmjpo-8-1.pdf ---
Cloudscraper GET request successful to https://pmc.ncbi.nlm.nih.gov/articles/PMC11474870/pdf/bmjpo-8-1.pdf
Cloudscraper GET response Content-Type: text/html; charset=utf-8
First 64 bytes peeked/read: b'\n\n\n\n<html>\n  <head>\n    <meta name="viewport" content="width=dev'
Is PDF (magic number)? False
Is HTML detected? True
Cloudscraper does NOT look like it got a PDF.
Received content from cloudscraper (first 200 bytes if available):


In [14]:
import cloudscraper
import logging
import time

logging.basicConfig(level=logging.DEBUG)
# Keep urllib3 and other libs less noisy for this specific test
logging.getLogger("urllib3").setLevel(logging.INFO)
logging.getLogger("charset_normalizer").setLevel(logging.INFO)


# --- Configuration ---
pmcid = "PMC11474870" # The PMCID you are testing with
# The article abstract page - often a good candidate for initial interaction / challenge
article_abstract_url = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmcid}/"
# The intermediate PDF "directory" URL, which often redirects
intermediate_pdf_landing_url = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmcid}/pdf/"
# The final PDF URL that your browser successfully fetched with cookies
final_pdf_url = f"https://pmc.ncbi.nlm.nih.gov/articles/{pmcid}/pdf/bmjpo-8-1.pdf" # Ensure filename is correct for this PMCID

# --- Cloudscraper Setup ---
# Try with more browser-like properties if default fails.
# Forcing a specific, common user agent.
ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
scraper = cloudscraper.create_scraper(
    browser={
        'browser': 'chrome', # Mimic Chrome
        'platform': 'windows',
        'mobile': False,
        'custom': ua # Pass your specific UA if desired, though cloudscraper has good defaults
    },
    delay=10 # Add a delay in seconds to allow JS challenges more time if needed
)
# Update scraper's default headers to be more browser-like for all requests in this session
scraper.headers.update({
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'en-US,en;q=0.9,pt-BR;q=0.8,pt;q=0.7', # Match your browser's language prio
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none', # For the very first request
    'Sec-Fetch-User': '?1',
    'Cache-Control': 'no-cache', # Mimic your browser's no-cache behavior
    'Pragma': 'no-cache'
})

# --- Test Steps ---

success_stage = 0 # 0: initial, 1: abstract visited, 2: PDF obtained

# Step 1: Visit the article abstract page to potentially trigger challenges/cookie setup
print(f"--- STAGE 1: Visiting Article Abstract Page: {article_abstract_url} ---")
try:
    # This request will use the general headers set above
    resp_abstract = scraper.get(article_abstract_url, timeout=(15, 30))
    resp_abstract.raise_for_status()
    print(f"Abstract Page - Status: {resp_abstract.status_code}, URL: {resp_abstract.url}")
    print(f"Abstract Page - Content-Type: {resp_abstract.headers.get('Content-Type')}")
    print(f"Abstract Page - Cookies in scraper session after this GET: {scraper.cookies.get_dict()}")
    # Check if PoW cookie is now in the session
    if "cloudpmc-viewer-pow" in scraper.cookies.get_dict():
        print("SUCCESS: 'cloudpmc-viewer-pow' cookie found in session after visiting abstract page!")
        success_stage = 1
    else:
        print("WARNING: 'cloudpmc-viewer-pow' cookie NOT found after visiting abstract page.")
        print(f"Abstract Page - Response content (first 300 bytes): {resp_abstract.content[:300]}")

except Exception as e:
    print(f"Error visiting abstract page: {e}")
    if hasattr(e, 'response') and e.response is not None:
         print(f"Response content (if error): {e.response.content[:300]}")


# Step 2: (Optional but good) Visit the intermediate PDF landing page if abstract didn't yield PoW
# This page might be the one that more directly issues the challenge if the abstract page is too "general"
if success_stage < 1: # Only try if PoW not yet obtained
    print(f"\n--- STAGE 1.5: Visiting Intermediate PDF Landing Page: {intermediate_pdf_landing_url} ---")
    scraper.headers.update({'Sec-Fetch-Site': 'same-origin', 'Referer': article_abstract_url}) # Update for same-origin nav
    try:
        resp_intermediate = scraper.get(intermediate_pdf_landing_url, timeout=(15,30))
        resp_intermediate.raise_for_status()
        print(f"Intermediate PDF Page - Status: {resp_intermediate.status_code}, URL: {resp_intermediate.url}")
        print(f"Intermediate PDF Page - Content-Type: {resp_intermediate.headers.get('Content-Type')}")
        print(f"Intermediate PDF Page - Cookies in scraper session: {scraper.cookies.get_dict()}")
        if "cloudpmc-viewer-pow" in scraper.cookies.get_dict():
            print("SUCCESS: 'cloudpmc-viewer-pow' cookie found after visiting intermediate PDF page!")
            success_stage = 1 # or 1.5
        else:
            print("WARNING: 'cloudpmc-viewer-pow' cookie NOT found after intermediate PDF page.")
            print(f"Intermediate PDF Page - Response content (first 300 bytes): {resp_intermediate.content[:300]}")
    except Exception as e:
        print(f"Error visiting intermediate PDF page: {e}")
        if hasattr(e, 'response') and e.response is not None:
             print(f"Response content (if error): {e.response.content[:300]}")


# Step 3: Attempt to GET the final PDF URL, relying on cookies set in the scraper's session
if success_stage >= 1: # Only if we think we got the PoW cookie
    print(f"\n--- STAGE 2: Attempting to GET Final PDF URL: {final_pdf_url} ---")
    # Update headers for this specific request
    pdf_get_headers = scraper.headers.copy() # Start with session defaults
    pdf_get_headers['Accept'] = 'application/pdf,application/octet-stream,*/*;q=0.8' # Prioritize PDF
    # Set Referer to the page that linked to the PDF (either abstract or intermediate)
    pdf_get_headers['Referer'] = intermediate_pdf_landing_url if scraper.get(intermediate_pdf_landing_url, stream=True).url == intermediate_pdf_landing_url else article_abstract_url
    pdf_get_headers['Sec-Fetch-Site'] = 'same-origin'


    try:
        resp_pdf = scraper.get(final_pdf_url, headers=pdf_get_headers, stream=True, timeout=(15, 60))
        resp_pdf.raise_for_status()
        print(f"Final PDF URL - Status: {resp_pdf.status_code}, URL: {resp_pdf.url}")
        final_content_type = resp_pdf.headers.get('Content-Type','').lower()
        print(f"Final PDF URL - Content-Type: {final_content_type}")

        first_bytes = b""
        try:
            # Try to read first few bytes without consuming too much if it's a large PDF
            # iter_content is safer for potentially large files
            chunk_iter = resp_pdf.iter_content(chunk_size=64, decode_unicode=False)
            first_bytes = next(chunk_iter)
        except StopIteration:
            first_bytes = b"Empty response"
        except Exception as e_chunk:
            first_bytes = f"Error reading chunk: {e_chunk}".encode()
        
        print(f"Final PDF URL - First 64 bytes: {first_bytes!r}")

        is_pdf_magic = first_bytes.startswith(b'%PDF-')
        # A more robust HTML check on the actual bytes
        is_html_detected = b"<html" in first_bytes.lower() or \
                           b"<!doctype" in first_bytes.lower() or \
                           b"<head" in first_bytes.lower()

        if is_pdf_magic and 'application/pdf' in final_content_type and not is_html_detected:
            print("SUCCESS: Cloudscraper LIKELY GOT THE PDF!")
            success_stage = 2
            # with open("test_pmc_cloudscraper_final.pdf", "wb") as f:
            #     f.write(first_bytes) # Write the first chunk we already read
            #     for chunk in chunk_iter: # Write remaining chunks
            #         f.write(chunk)
            # print("Saved to test_pmc_cloudscraper_final.pdf")
        else:
            print("FAILURE: Cloudscraper did NOT get a PDF at the final stage.")
            print(f"Is PDF Magic: {is_pdf_magic}, Is HTML: {is_html_detected}, Content-Type: {final_content_type}")
            # print(f"Full content if not PDF (first 500 bytes): {resp_pdf.content[:500]}") # Careful with stream=True
            full_content_debug = first_bytes
            try:
                for chunk in chunk_iter: # Read a bit more for debugging
                    full_content_debug += chunk
                    if len(full_content_debug) > 1024: break
                print(f"Debug content (up to 1KB): {full_content_debug[:1024]!r}")
            except: pass


    except Exception as e:
        print(f"Error getting final PDF: {e}")
        if hasattr(e, 'response') and e.response is not None:
             print(f"Response content (if error): {e.response.content[:500]}")
else:
    print("\nSkipping final PDF GET because PoW cookie was likely not obtained in prior stages.")

if success_stage == 2:
    print("\nOverall SUCCESS: PDF was likely obtained after navigating prerequisite pages.")
else:
    print("\nOverall FAILURE: Could not obtain PDF.")

--- STAGE 1: Visiting Article Abstract Page: https://pmc.ncbi.nlm.nih.gov/articles/PMC11474870/ ---
Abstract Page - Status: 200, URL: https://pmc.ncbi.nlm.nih.gov/articles/PMC11474870/
Abstract Page - Content-Type: text/html; charset=utf-8
Abstract Page - Cookies in scraper session after this GET: {'cloudpmc-viewer-csrftoken': 'pDSwqYDv2ifYtmJEz3q8LiZ2yd8agXXg', 'ncbi_sid': '5BB8D1D782DC2ED3_24219SID'}
Abstract Page - Response content (first 300 bytes): b'\n<!DOCTYPE html>\n<html lang="en" >\n    <head >\n\n        <meta charset="UTF-8" />\n        <meta http-equiv="X-UA-Compatible" content="IE=edge" />\n        <meta name="HandheldFriendly" content="True" />\n        <meta name="MobileOptimized" content="320" />\n        <meta name="viewport" content="width='

--- STAGE 1.5: Visiting Intermediate PDF Landing Page: https://pmc.ncbi.nlm.nih.gov/articles/PMC11474870/pdf/ ---
Intermediate PDF Page - Status: 200, URL: https://pmc.ncbi.nlm.nih.gov/articles/PMC11474870/pdf/bmjpo-8-1.pdf
Inte

In [15]:
# Single Jupyter Cell for PMC PoW Test

import requests
import re
import hashlib
import time
import logging
import os
from urllib.parse import urlparse, urljoin

# === CONFIGURATION (for this test cell) ===
TARGET_PMCID = "PMC11474870"  # The PMCID we are testing
# Construct the final PDF filename if known, otherwise HEAD request is needed first
# For PMC11474870, the filename was bmjpo-8-1.pdf
KNOWN_PDF_FILENAME = "bmjpo-8-1.pdf" # If you know it for the PMCID
OUTPUT_DIR_TEST = "pmc_pow_test_output" # Directory to save any output
os.makedirs(OUTPUT_DIR_TEST, exist_ok=True)

# === LOGGING SETUP ===
logger = logging.getLogger("PMCPoWTest")
logger.handlers = [] # Clear existing handlers if any (important in Jupyter)
logger.setLevel(logging.DEBUG) # Detailed logging for the test
ch = logging.StreamHandler() # Output to console/Jupyter output
ch.setFormatter(logging.Formatter(
    '%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s'
))
logger.addHandler(ch)

# === BROWSER-LIKE HEADERS (Simplified for test, add more if needed) ===
BROWSER_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
BASE_HEADERS = {
    'User-Agent': BROWSER_UA,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
}

# === PoW HELPER FUNCTIONS ===

def extract_pow_params_from_html(html_content: str) -> tuple[str, int, str, str, str] | None:
    """
    Parses the PMC challenge HTML to extract PoW parameters.
    Returns: (challenge_string, difficulty, cookie_name, cookie_expiration_str, cookie_path) or None
    """
    # const POW_CHALLENGE = "..."
    # const POW_DIFFICULTY = "..."
    # const POW_COOKIE_NAME = "..."
    # const POW_COOKIE_EXPIRATION = "..." (as string)
    # const POW_COOKIE_PATH = "..."
    
    challenge_match = re.search(r'const\s+POW_CHALLENGE\s*=\s*"(.*?)"', html_content)
    difficulty_match = re.search(r'const\s+POW_DIFFICULTY\s*=\s*"(.*?)"', html_content)
    cookie_name_match = re.search(r'const\s+POW_COOKIE_NAME\s*=\s*"(.*?)"', html_content)
    cookie_exp_match = re.search(r'const\s+POW_COOKIE_EXPIRATION\s*=\s*"(.*?)"', html_content)
    cookie_path_match = re.search(r'const\s+POW_COOKIE_PATH\s*=\s*"(.*?)"', html_content)

    if challenge_match and difficulty_match and cookie_name_match and cookie_exp_match and cookie_path_match:
        challenge_string = challenge_match.group(1)
        cookie_name = cookie_name_match.group(1)
        cookie_exp_str = cookie_exp_match.group(1)
        cookie_path = cookie_path_match.group(1)
        try:
            difficulty = int(difficulty_match.group(1))
            logger.info(f"Extracted PoW params: Challenge='{challenge_string[:20]}...', Difficulty={difficulty}, CookieName='{cookie_name}'")
            return challenge_string, difficulty, cookie_name, cookie_exp_str, cookie_path
        except ValueError:
            logger.error(f"Could not parse PoW difficulty as int: {difficulty_match.group(1)}")
    else:
        missing = []
        if not challenge_match: missing.append("POW_CHALLENGE")
        if not difficulty_match: missing.append("POW_DIFFICULTY")
        if not cookie_name_match: missing.append("POW_COOKIE_NAME")
        if not cookie_exp_match: missing.append("POW_COOKIE_EXPIRATION")
        if not cookie_path_match: missing.append("POW_COOKIE_PATH")
        logger.warning(f"Could not find all PoW parameters in HTML. Missing: {', '.join(missing)}")
    return None

def solve_pmc_pow(challenge_string: str, difficulty: int) -> tuple[int, str] | None:
    """
    Solves the PMC Proof-of-Work challenge.
    Returns: A tuple (nonce, resulting_hash_hex) if successful, None otherwise.
    """
    logger.info(f"Attempting to solve PoW: challenge='{challenge_string[:20]}...', difficulty={difficulty}")
    target_prefix = "0" * difficulty
    nonce = 0
    # Max nonce adjusted based on common PoW difficulties.
    # Difficulty 4 avg ~65k, Difficulty 5 avg ~1M, Difficulty 6 avg ~16M
    # Setting a higher limit just in case, but typical web PoWs are not that extreme.
    max_nonce_map = {1: 200, 2: 4000, 3: 100000, 4: 2000000, 5: 35000000} # Approx upper bounds
    max_nonce = max_nonce_map.get(difficulty, 200000000) # Default high if difficulty unknown

    start_time = time.time()
    while nonce <= max_nonce:
        test_string = challenge_string + str(nonce)
        hash_object = hashlib.sha256(test_string.encode('utf-8'))
        hex_digest = hash_object.hexdigest()

        if hex_digest.startswith(target_prefix):
            duration = time.time() - start_time
            logger.info(f"PoW SOLVED! Nonce: {nonce}, Hash: {hex_digest}, Time: {duration:.4f}s")
            return nonce, hex_digest
        
        if nonce > 0 and nonce % 500000 == 0 : # Log progress for difficult challenges
            logger.debug(f"PoW progress: nonce {nonce}, current hash {hex_digest[:10]}...")

        nonce += 1
    
    duration = time.time() - start_time
    logger.error(f"PoW FAILED to solve within max_nonce ({max_nonce}). Time: {duration:.2f}s")
    return None

# === MAIN TEST LOGIC ===

# 1. Setup Session
session = requests.Session()
session.headers.update(BASE_HEADERS)

# 2. Determine Final PDF URL
# Option A: If filename is known (as per our TARGET_PMCID example)
if KNOWN_PDF_FILENAME:
    final_pdf_url = f"https://pmc.ncbi.nlm.nih.gov/articles/{TARGET_PMCID}/pdf/{KNOWN_PDF_FILENAME}"
    logger.info(f"Using known final PDF URL: {final_pdf_url}")
else:
    # Option B: Resolve via HEAD request (more robust if filename unknown)
    intermediate_pdf_landing_url = f"https://pmc.ncbi.nlm.nih.gov/articles/{TARGET_PMCID}/pdf/"
    logger.info(f"Attempting HEAD request to resolve final PDF URL from: {intermediate_pdf_landing_url}")
    try:
        head_resp = session.head(intermediate_pdf_landing_url, allow_redirects=True, timeout=15)
        head_resp.raise_for_status()
        final_pdf_url = head_resp.url
        logger.info(f"Resolved final PDF URL via HEAD: {final_pdf_url}")
        if not final_pdf_url.lower().endswith(".pdf"):
            logger.warning("Resolved URL does not end with .pdf, might still be a challenge page.")
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to resolve final PDF URL via HEAD: {e}")
        final_pdf_url = None # Cannot proceed

pdf_saved = False
if final_pdf_url:
    # 3. Initial GET Attempt
    logger.info(f"--- STAGE 1: Initial GET to {final_pdf_url} ---")
    try:
        # Allow HTML in this first attempt to catch the challenge page
        current_headers = session.headers.copy()
        current_headers['Accept'] = 'text/html,application/pdf;q=0.9,*/*;q=0.8' 
        
        r_initial = session.get(final_pdf_url, headers=current_headers, stream=True, timeout=(10,30))
        r_initial.raise_for_status() # Check for HTTP errors

        initial_content_type = r_initial.headers.get('Content-Type', '').lower()
        logger.info(f"Initial GET - Status: {r_initial.status_code}, Content-Type: {initial_content_type}")

        # Check if it's PDF directly (unlikely without PoW)
        first_bytes_initial = b""
        is_pdf_initially = False
        if 'application/pdf' in initial_content_type:
            # Peek to confirm magic number
            if hasattr(r_initial.raw, 'peek') and callable(r_initial.raw.peek):
                first_bytes_initial = r_initial.raw.peek(16)
                if first_bytes_initial.startswith(b'%PDF-'):
                    is_pdf_initially = True
            if is_pdf_initially: # If really PDF, save it
                logger.info("SUCCESS! Received PDF on initial GET (unexpected, PoW might have been cached by session from other tests).")
                pdf_output_path = os.path.join(OUTPUT_DIR_TEST, f"{TARGET_PMCID}_initial.pdf")
                with open(pdf_output_path, 'wb') as f:
                    if first_bytes_initial: f.write(first_bytes_initial) # if peeked
                    for chunk in r_initial.iter_content(chunk_size=8192):
                        f.write(chunk)
                logger.info(f"Saved PDF to {pdf_output_path}")
                pdf_saved = True

        # If HTML, it's likely the challenge page
        if not pdf_saved and 'text/html' in initial_content_type:
            logger.info("Received HTML, assuming PoW challenge page.")
            html_content_bytes = b""
            # Read the entire HTML content
            for chunk in r_initial.iter_content(chunk_size=8192, decode_unicode=False):
                html_content_bytes += chunk
                if len(html_content_bytes) > 2 * 1024 * 1024: # Safety break for huge pages
                    logger.warning("HTML challenge page content is very large, truncating read.")
                    break
            html_content_str = html_content_bytes.decode('utf-8', errors='replace')

            # Save the challenge HTML for inspection
            challenge_html_path = os.path.join(OUTPUT_DIR_TEST, f"{TARGET_PMCID}_challenge.html")
            with open(challenge_html_path, "w", encoding="utf-8") as f_html:
                f_html.write(html_content_str)
            logger.info(f"Saved challenge HTML to {challenge_html_path}")
            
            # Extract PoW parameters
            pow_params = extract_pow_params_from_html(html_content_str)
            if pow_params:
                challenge_str, difficulty_val, cookie_name, cookie_exp_str, cookie_path = pow_params
                
                # Solve PoW
                solution = solve_pmc_pow(challenge_str, difficulty_val)
                if solution:
                    nonce_found, solved_hash = solution
                    pow_cookie_value = f"{challenge_str},{nonce_found}" # Format: "CHALLENGE_STRING,nonce"
                    
                    # Set the PoW cookie in the session
                    # Determine cookie domain. Your browser trace showed ".nih.gov" but also pmc.ncbi.nlm.nih.gov
                    # Using the specific hostname is safer for requests.
                    parsed_uri = urlparse(final_pdf_url)
                    cookie_domain_to_set = parsed_uri.hostname # e.g., pmc.ncbi.nlm.nih.gov

                    # Calculate expiration for requests.cookies.set if needed (not strictly necessary for this test)
                    # Cookie expiration from JS was a float representing days.
                    # requests expects seconds since epoch for 'expires' or None for session cookie.
                    # For this test, a session cookie (deleted when session closes) is fine.
                    
                    session.cookies.set(
                        name=cookie_name,
                        value=pow_cookie_value,
                        domain=cookie_domain_to_set,
                        path=cookie_path
                    )
                    logger.info(f"Set PoW cookie '{cookie_name}' = '{pow_cookie_value[:20]}...' in session for domain '{cookie_domain_to_set}', path '{cookie_path}'.")
                    logger.debug(f"Full session cookies: {session.cookies.get_dict()}")

                    # 4. Second GET Attempt (with PoW cookie)
                    logger.info(f"--- STAGE 2: Re-attempting GET to {final_pdf_url} with PoW cookie ---")
                    current_headers_stage2 = session.headers.copy()
                    current_headers_stage2['Accept'] = 'application/pdf,application/octet-stream,*/*;q=0.8' # Expect PDF
                    current_headers_stage2['Referer'] = final_pdf_url # Referer is the challenge page URL itself

                    try:
                        r_second = session.get(final_pdf_url, headers=current_headers_stage2, stream=True, timeout=(10,60))
                        r_second.raise_for_status()
                        
                        second_content_type = r_second.headers.get('Content-Type', '').lower()
                        logger.info(f"Second GET - Status: {r_second.status_code}, Content-Type: {second_content_type}")
                        
                        # Verify if it's a PDF
                        first_bytes_second = b""
                        is_pdf_second = False
                        if 'application/pdf' in second_content_type:
                            if hasattr(r_second.raw, 'peek') and callable(r_second.raw.peek):
                                first_bytes_second = r_second.raw.peek(16) # Peek first 16 bytes
                                if first_bytes_second.startswith(b'%PDF-'):
                                    is_pdf_second = True
                            else: # If peek not available, trust content-type for now (can read first chunk)
                                try:
                                    first_bytes_second = next(r_second.iter_content(chunk_size=16,decode_unicode=False))
                                    if first_bytes_second.startswith(b'%PDF-'):
                                        is_pdf_second = True
                                except StopIteration: # Empty response
                                    pass
                        
                        if is_pdf_second:
                            logger.info("SUCCESS! Received PDF on second GET after PoW solve.")
                            pdf_output_path = os.path.join(OUTPUT_DIR_TEST, f"{TARGET_PMCID}_final.pdf")
                            with open(pdf_output_path, 'wb') as f:
                                if first_bytes_second: f.write(first_bytes_second) # Write peeked/read chunk
                                for chunk in r_second.iter_content(chunk_size=8192): # Write rest
                                    f.write(chunk)
                            logger.info(f"Saved PDF to {pdf_output_path}")
                            pdf_saved = True
                        else:
                            logger.error(f"FAILURE: Still not PDF after PoW solve. First bytes: {first_bytes_second!r}")
                            # Save this unexpected response
                            unexpected_path = os.path.join(OUTPUT_DIR_TEST, f"{TARGET_PMCID}_after_pow_unexpected.html")
                            with open(unexpected_path, 'wb') as f_unexp:
                                if first_bytes_second: f_unexp.write(first_bytes_second)
                                for chunk in r_second.iter_content(chunk_size=8192):
                                    f_unexp.write(chunk)
                            logger.info(f"Saved unexpected content after PoW to {unexpected_path}")

                    except requests.exceptions.RequestException as e2:
                        logger.error(f"RequestException during second GET attempt: {e2}")
                else: # PoW solve failed
                    logger.error("PoW solution failed, cannot re-attempt GET.")
            else: # PoW params not extracted
                logger.error("Could not extract PoW parameters, cannot solve PoW.")
        elif not pdf_saved: # Not PDF and not HTML
             logger.error(f"Initial GET did not return PDF or HTML. Content-Type: {initial_content_type}")

    except requests.exceptions.RequestException as e1:
        logger.error(f"RequestException during initial GET: {e1}")

if pdf_saved:
    logger.info(f"--- TEST COMPLETED: PDF for {TARGET_PMCID} was successfully downloaded. ---")
else:
    logger.info(f"--- TEST COMPLETED: FAILED to download PDF for {TARGET_PMCID}. Check logs and output files in '{OUTPUT_DIR_TEST}'. ---")

2025-05-22 10:17:49,333 - INFO - [<module>] - Using known final PDF URL: https://pmc.ncbi.nlm.nih.gov/articles/PMC11474870/pdf/bmjpo-8-1.pdf
2025-05-22 10:17:49,333 - INFO - Using known final PDF URL: https://pmc.ncbi.nlm.nih.gov/articles/PMC11474870/pdf/bmjpo-8-1.pdf
2025-05-22 10:17:49,335 - INFO - [<module>] - --- STAGE 1: Initial GET to https://pmc.ncbi.nlm.nih.gov/articles/PMC11474870/pdf/bmjpo-8-1.pdf ---
2025-05-22 10:17:49,335 - INFO - --- STAGE 1: Initial GET to https://pmc.ncbi.nlm.nih.gov/articles/PMC11474870/pdf/bmjpo-8-1.pdf ---
2025-05-22 10:17:49,744 - INFO - [<module>] - Initial GET - Status: 200, Content-Type: text/html; charset=utf-8
2025-05-22 10:17:49,744 - INFO - Initial GET - Status: 200, Content-Type: text/html; charset=utf-8
2025-05-22 10:17:49,748 - INFO - [<module>] - Received HTML, assuming PoW challenge page.
2025-05-22 10:17:49,748 - INFO - Received HTML, assuming PoW challenge page.
2025-05-22 10:17:49,756 - INFO - [<module>] - Saved challenge HTML to pmc_

In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Combined OA-first + Sci-Hub PDF Fetcher (v12-pow, standalone)
-----------------------------------------------------------
1) Reads PMIDs from an Excel file.
2) Fetches metadata (year, first author, title, DOI) via NCBI EFetch.
3) Tries Open Access first:
    a) Unpaywall.
    b) PMC (handles Proof-of-Work challenge, resolves final URL).
4) Any remaining PMIDs are tried via Sci-Hub in parallel.
5) All PDFs are validated (size, content) and land in OUTPUT_PDF_DIR
   named `{year}-{pmid}-{author}-{title}.pdf`.
6) Detailed logging of every URL fetched and why it failed or succeeded.
"""

import os
import re
import time
import logging
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, quote_plus
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from concurrent.futures import ThreadPoolExecutor, as_completed
import hashlib # For PoW
try:
    from pypdf import PdfReader # Preferred (PyPDF2 successor)
    from pypdf.errors import PdfReadError
except ImportError:
    try:
        from PyPDF2 import PdfReader # Fallback
        from PyPDF2.errors import PdfReadError
    except ImportError:
        print("Please install pypdf or PyPDF2: pip install pypdf")
        exit()


# === CONFIGURATION ===
EXCEL_FILE_PATH     = r"C:\Users\Galaxy\Downloads\screening_ERAS.xlsx" # UPDATE THIS
OUTPUT_PDF_DIR      = "downloaded_pdfs_v12_pow"
SUSPICIOUS_PDF_SUBDIR = "suspicious_pdfs" # Subdirectory for failed validation PDFs

# Sci-Hub
SCI_HUB_DOMAINS     = [
    "https://sci-hub.se", "https://sci-hub.ru", "https://sci-hub.ren",
    "https://sci-hub.wf", "https://sci-hub.ee", "https://sci-hub.st"
]
DELAY_SCIHUB        = 0.5

# Threads / batching
MAX_THREADS         = 5 # Reduced default, be kind to servers
EFETCH_BATCH_SIZE   = 100
DELAY_NCBI          = 0.35

# PDF Validation Thresholds
MIN_PDF_SIZE_KB         = 20    # PDFs smaller than this are suspicious
MIN_PDF_PAGES           = 1     # PDFs with fewer pages are suspicious
MIN_TEXT_LENGTH_CHARS   = 300   # Min chars expected from first few pages of a real article

# API credentials - PLEASE FILL THESE IN
NCBI_API_KEY        = "YOUR_API_KEY_HERE"
CROSSREF_EMAIL      = "your_email@example.com" # For polite API use
UNPAYWALL_EMAIL     = "levi4328@gmail.com" # Your email for Unpaywall

# === LOGGING SETUP ===
logger = logging.getLogger("PDFFetcherV12")
logger.handlers = [] # Clear existing handlers if any (Jupyter)
logger.setLevel(logging.INFO) # DEBUG for more verbosity
ch = logging.StreamHandler()
ch.setFormatter(logging.Formatter(
    '%(asctime)s - %(levelname)s - [%(threadName)s] - %(funcName)s - %(message)s'
))
logger.addHandler(ch)

# File logging (optional, but recommended for long runs)
# log_file_path = os.path.join(OUTPUT_PDF_DIR, "pdf_fetcher_v12.log")
# os.makedirs(OUTPUT_PDF_DIR, exist_ok=True) 
# fh = logging.FileHandler(log_file_path, mode='a') 
# fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - [%(threadName)s] - %(funcName)s - %(message)s'))
# logger.addHandler(fh)


# === HTTP SESSIONS WITH RETRIES ===
# Using a common, recent Chrome User-Agent string
CHROME_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

def make_session(user_agent, is_scihub_session=False):
    s = requests.Session()
    retries = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(['GET', 'POST', 'HEAD'])
    )
    pool_connections = MAX_THREADS // 2 if is_scihub_session and MAX_THREADS > 1 else MAX_THREADS
    pool_maxsize = pool_connections * 2

    adapter = HTTPAdapter(
        max_retries=retries,
        pool_connections=pool_connections,
        pool_maxsize=pool_maxsize 
    )
    s.mount("https://", adapter)
    s.mount("http://", adapter)
    s.headers.update({'User-Agent': user_agent})
    return s

session_ncbi   = make_session(f"PDFFetcherV12/1.0 (NCBI-EUtils-Client; mailto:{CROSSREF_EMAIL if CROSSREF_EMAIL != 'your_email@example.com' else 'anonymous'})")
session_oa     = make_session(CHROME_UA)
session_scihub = make_session(CHROME_UA, is_scihub_session=True)

# Shared BROWSER_LIKE_HEADERS for GET requests that might hit web pages
BROWSER_LIKE_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Cache-Control': 'no-cache',
    'Pragma': 'no-cache'
}

# === UTILITIES ===
def sanitize_filename(s: str) -> str:
    s = re.sub(r'[\\/*?:"<>|]', "", s)
    s = re.sub(r'\s+', " ", s).strip()
    if not s: s = "untitled_article"
    return s.replace(" ", "_")[:150]

def _get_ncbi_params(extra=None):
    params = {"tool": "PDFFetcherV12"}
    if CROSSREF_EMAIL and CROSSREF_EMAIL != "your_email@example.com":
        params["email"] = CROSSREF_EMAIL
    if NCBI_API_KEY and NCBI_API_KEY != "YOUR_API_KEY_HERE":
        params["api_key"] = NCBI_API_KEY
    if extra:
        params.update(extra)
    return params

# === PROOF-OF-WORK (PoW) SOLVING LOGIC for PMC ===
def extract_pow_params_from_html(html_content: str) -> tuple[str, int, str, str, str] | None:
    challenge_match = re.search(r'const\s+POW_CHALLENGE\s*=\s*"(.*?)"', html_content)
    difficulty_match = re.search(r'const\s+POW_DIFFICULTY\s*=\s*"(.*?)"', html_content)
    cookie_name_match = re.search(r'const\s+POW_COOKIE_NAME\s*=\s*"(.*?)"', html_content)
    cookie_exp_match = re.search(r'const\s+POW_COOKIE_EXPIRATION\s*=\s*"(.*?)"', html_content) # Currently unused
    cookie_path_match = re.search(r'const\s+POW_COOKIE_PATH\s*=\s*"(.*?)"', html_content)

    if challenge_match and difficulty_match and cookie_name_match and cookie_path_match: # exp_match not strictly needed for solving
        challenge_string = challenge_match.group(1)
        cookie_name = cookie_name_match.group(1)
        cookie_exp_str = cookie_exp_match.group(1) if cookie_exp_match else "0.208333" # Default if somehow missing
        cookie_path = cookie_path_match.group(1)
        try:
            difficulty = int(difficulty_match.group(1))
            logger.info(f"Extracted PoW params: Challenge='{challenge_string[:20]}...', Diff={difficulty}, Name='{cookie_name}'")
            return challenge_string, difficulty, cookie_name, cookie_exp_str, cookie_path
        except ValueError:
            logger.error(f"Could not parse PoW difficulty as int: '{difficulty_match.group(1)}'")
    else:
        logger.warning("Could not find all required PoW parameters in HTML content.")
    return None

def solve_pmc_pow(challenge_string: str, difficulty: int) -> tuple[int, str] | None:
    logger.info(f"Solving PoW: challenge='{challenge_string[:20]}...', difficulty={difficulty}")
    target_prefix = "0" * difficulty
    nonce = 0
    max_nonce_map = {4: 2000000, 5: 35000000} # Rough max attempts for common difficulties
    max_nonce = max_nonce_map.get(difficulty, 100000000) # High default

    start_time = time.time()
    while nonce <= max_nonce:
        test_string = challenge_string + str(nonce)
        hash_object = hashlib.sha256(test_string.encode('utf-8'))
        hex_digest = hash_object.hexdigest()
        if hex_digest.startswith(target_prefix):
            duration = time.time() - start_time
            logger.info(f"PoW SOLVED! Nonce: {nonce}, Hash: {hex_digest[:10]}..., Time: {duration:.4f}s")
            return nonce, hex_digest
        if nonce > 0 and nonce % 1000000 == 0: # Log progress
            logger.debug(f"PoW progress: nonce {nonce}...")
        nonce += 1
    duration = time.time() - start_time
    logger.error(f"PoW FAILED to solve (max_nonce {max_nonce} reached). Time: {duration:.2f}s")
    return None

# === PDF VALIDATION (with slightly enhanced logging) ===
def validate_downloaded_pdf(pdf_path: str, pmid_for_log: str) -> bool:
    """Validates a downloaded PDF for basic integrity."""
    failure_reason = ""
    try:
        # 1. Check file size
        file_size_kb = os.path.getsize(pdf_path) / 1024
        if file_size_kb < MIN_PDF_SIZE_KB:
            failure_reason = f"File size {file_size_kb:.2f} KB < threshold {MIN_PDF_SIZE_KB} KB"
            logger.warning(f"PDF Validation ✗ {pmid_for_log}: {failure_reason} for '{os.path.basename(pdf_path)}'.")
            return False

        # 2. Try to open with pypdf and check pages
        reader = None # Initialize reader
        num_pages = 0
        try:
            reader = PdfReader(pdf_path)
            num_pages = len(reader.pages)
            if num_pages < MIN_PDF_PAGES:
                failure_reason = f"Page count {num_pages} < threshold {MIN_PDF_PAGES}"
                logger.warning(f"PDF Validation ✗ {pmid_for_log}: {failure_reason} for '{os.path.basename(pdf_path)}' (Size: {file_size_kb:.2f} KB).")
                return False
        except PdfReadError as e:
            failure_reason = f"pypdf PdfReadError: {e}"
            logger.warning(f"PDF Validation ✗ {pmid_for_log}: {failure_reason} for '{os.path.basename(pdf_path)}' (Size: {file_size_kb:.2f} KB).")
            return False
        except Exception as e_open: # Other unexpected errors during open
            failure_reason = f"pypdf unexpected open error: {e_open}"
            logger.warning(f"PDF Validation ✗ {pmid_for_log}: {failure_reason} for '{os.path.basename(pdf_path)}' (Size: {file_size_kb:.2f} KB).")
            return False

        # 3. Check text content length (from first few pages)
        extracted_text_len = 0
        max_pages_to_check_text = 3 # Check first 3 pages for text
        if reader: # Ensure reader was successfully created
            for i in range(min(num_pages, max_pages_to_check_text)):
                try:
                    page = reader.pages[i]
                    extracted_text_len += len(page.extract_text() or "")
                except Exception as e_text_extract: # Catch potential errors during text extraction itself
                    logger.warning(f"PDF Validation ? {pmid_for_log}: Error extracting text from page {i+1} of '{os.path.basename(pdf_path)}': {e_text_extract}. Continuing text check.")
        
        if extracted_text_len < MIN_TEXT_LENGTH_CHARS:
            failure_reason = f"Insufficient text ({extracted_text_len} chars from first {min(num_pages, max_pages_to_check_text)} page(s) < threshold {MIN_TEXT_LENGTH_CHARS})"
            logger.warning(f"PDF Validation ✗ {pmid_for_log}: {failure_reason} for '{os.path.basename(pdf_path)}' (Size: {file_size_kb:.2f} KB).")
            return False

        logger.info(f"PDF Validation ✓ {pmid_for_log}: File '{os.path.basename(pdf_path)}' (Size: {file_size_kb:.2f}KB, Pages: {num_pages}, TextLen: {extracted_text_len} from first {min(num_pages, max_pages_to_check_text)} page(s)) passed validation.")
        return True

    except FileNotFoundError:
        logger.error(f"PDF Validation ✗ {pmid_for_log}: File not found at {pdf_path} for validation.")
        return False
    except Exception as e:
        failure_reason = f"Unexpected error during validation setup: {e}"
        logger.error(f"PDF Validation ✗ {pmid_for_log}: {failure_reason} for {pdf_path}", exc_info=True)
        return False

# === CENTRALIZED PDF DOWNLOADER (handles PoW for PMC) ===
def download_and_save_pdf(
    session: requests.Session, 
    pdf_url: str, 
    output_path: str, 
    pmid_for_log: str, 
    source_name: str, # e.g., "PMC", "Unpaywall", "SciHub"
    referer: str | None = None
    ) -> bool:
    
    logger.info(f"{source_name} → PMID {pmid_for_log}: Attempting download from {pdf_url}")
    
    # Initial GET request
    current_headers = session.headers.copy() # Start with session's base User-Agent
    current_headers.update(BROWSER_LIKE_HEADERS) # Add general browser-like headers
    # For first attempt, accept HTML too, in case it's a challenge page
    current_headers['Accept'] = 'application/pdf,text/html;q=0.9,application/xhtml+xml,application/xml;q=0.8,*/*;q=0.5'
    if referer:
        current_headers['Referer'] = referer
    if 'pmc.ncbi.nlm.nih.gov' in urlparse(pdf_url).netloc:
        current_headers['Sec-Fetch-Site'] = 'same-origin' if referer else 'none'

    try:
        r = session.get(pdf_url, headers=current_headers, stream=True, timeout=(15, 60), allow_redirects=True)
        r.raise_for_status()
        final_url_after_redirects = r.url # Could have redirected
        content_type = r.headers.get('Content-Type', '').lower()
        logger.debug(f"{source_name} → PMID {pmid_for_log}: Initial GET to {pdf_url} (final: {final_url_after_redirects}), Content-Type: {content_type}")

        # Handle PMC PoW Challenge if HTML is received from a PMC URL
        if "pmc.ncbi.nlm.nih.gov" in urlparse(final_url_after_redirects).netloc and 'text/html' in content_type:
            logger.info(f"{source_name} → PMID {pmid_for_log}: Received HTML from PMC URL, attempting PoW solve.")
            html_content_bytes = b"".join(chunk for chunk in r.iter_content(chunk_size=8192, decode_unicode=False)) # Read full HTML
            html_content_str = html_content_bytes.decode('utf-8', errors='replace')
            
            debug_html_path = os.path.join(OUTPUT_PDF_DIR, SUSPICIOUS_PDF_SUBDIR, f"{pmid_for_log}_{source_name}_challenge.html")
            os.makedirs(os.path.dirname(debug_html_path), exist_ok=True)
            with open(debug_html_path, "w", encoding="utf-8") as f_debug: f_debug.write(html_content_str)
            logger.info(f"Saved PMC challenge HTML to {debug_html_path}")

            pow_params = extract_pow_params_from_html(html_content_str)
            if not pow_params:
                logger.error(f"{source_name} → PMID {pmid_for_log}: Failed to extract PoW params from PMC HTML.")
                return False
            
            challenge_str, difficulty_val, cookie_name, _, cookie_path = pow_params
            solution = solve_pmc_pow(challenge_str, difficulty_val)
            if not solution:
                logger.error(f"{source_name} → PMID {pmid_for_log}: Failed to solve PMC PoW.")
                return False

            nonce_found, _ = solution
            pow_cookie_value = f"{challenge_str},{nonce_found}"
            
            parsed_uri = urlparse(final_url_after_redirects)
            session.cookies.set(name=cookie_name, value=pow_cookie_value, domain=parsed_uri.hostname, path=cookie_path)
            logger.info(f"Set PoW cookie '{cookie_name}' in session for {parsed_uri.hostname}.")

            # Re-attempt GET with PoW cookie
            logger.info(f"{source_name} → PMID {pmid_for_log}: Re-attempting GET to {final_url_after_redirects} WITH PoW cookie.")
            current_headers['Accept'] = 'application/pdf,application/octet-stream,*/*;q=0.8' # Now strictly expect PDF
            current_headers['Referer'] = final_url_after_redirects # Referer is the challenge page URL

            r = session.get(final_url_after_redirects, headers=current_headers, stream=True, timeout=(15, 60))
            r.raise_for_status()
            content_type = r.headers.get('Content-Type', '').lower() # Update content_type from new response
            logger.debug(f"{source_name} → PMID {pmid_for_log}: Second GET (post-PoW), Content-Type: {content_type}")


        # Check content type of the (potentially second) response
        if 'application/pdf' in content_type or final_url_after_redirects.lower().endswith(".pdf"):
            # Save the PDF content
            temp_pdf_path = output_path + ".tmp" # Save to temp first
            with open(temp_pdf_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=65536): # 64KB
                    f.write(chunk)
            
            # Validate the downloaded temporary PDF
            if validate_downloaded_pdf(temp_pdf_path, pmid_for_log):
                os.rename(temp_pdf_path, output_path) # Move to final path if valid
                logger.info(f"{source_name} ✓ PMID {pmid_for_log}: Successfully downloaded and validated PDF to {output_path}")
                return True
            else:
                logger.warning(f"{source_name} ✗ PMID {pmid_for_log}: PDF from {final_url_after_redirects} failed validation.")
                # Optionally move suspicious file to a specific directory
                suspicious_dir = os.path.join(OUTPUT_PDF_DIR, SUSPICIOUS_PDF_SUBDIR)
                os.makedirs(suspicious_dir, exist_ok=True)
                suspicious_path = os.path.join(suspicious_dir, os.path.basename(output_path) + ".suspicious_validation_failed.pdf")
                try:
                    os.rename(temp_pdf_path, suspicious_path)
                    logger.info(f"Moved suspicious PDF to {suspicious_path}")
                except OSError: # e.g. if temp_pdf_path was already moved/deleted by another thread for same file
                    if os.path.exists(temp_pdf_path): os.remove(temp_pdf_path) # Clean up temp
                return False
        else:
            logger.warning(f"{source_name} ✗ PMID {pmid_for_log}: Non-PDF content from {final_url_after_redirects}. Content-Type: {content_type}")
            # Save non-PDF content for debugging
            debug_content_path = os.path.join(OUTPUT_PDF_DIR, SUSPICIOUS_PDF_SUBDIR, f"{pmid_for_log}_{source_name}_unexpected_content.dat")
            os.makedirs(os.path.dirname(debug_content_path), exist_ok=True)
            with open(debug_content_path, 'wb') as f_debug:
                 for chunk in r.iter_content(chunk_size=8192): f_debug.write(chunk)
            logger.info(f"Saved unexpected content to {debug_content_path}")
            return False

    except requests.exceptions.RequestException as e:
        logger.warning(f"{source_name} ✗ PMID {pmid_for_log}: RequestException for {pdf_url}: {e}")
    except Exception as e_main:
        logger.error(f"{source_name} ✗ PMID {pmid_for_log}: Unexpected error for {pdf_url}: {e_main}", exc_info=True)
    
    # Cleanup temp file if it exists and we failed before validation
    if 'temp_pdf_path' in locals() and os.path.exists(temp_pdf_path):
        try:
            os.remove(temp_pdf_path)
        except OSError:
            pass
    return False


# === STEP 1: FETCH METADATA FROM PUBMED ===
def fetch_metadata(pmids):
    # (This function can remain largely the same as your v11, just ensure logger name matches)
    meta = {}
    batches = [pmids[i:i+EFETCH_BATCH_SIZE] for i in range(0, len(pmids), EFETCH_BATCH_SIZE)]
    logger.info(f"EFetch PubMed metadata for {len(pmids)} PMIDs in {len(batches)} batch(es)...")
    for i_batch, batch in enumerate(batches): # Added index for logging
        data = _get_ncbi_params({"db":"pubmed","retmode":"xml","id":",".join(map(str,batch))})
        logger.info(f"NCBI EFetch POST (batch {i_batch+1}/{len(batches)}) → IDs={','.join(map(str,batch[:5]))}...")
        try:
            resp = session_ncbi.post(
                "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
                data=data,
                timeout=60
            )
            resp.raise_for_status()
            root = ET.fromstring(resp.content)
            for art in root.findall(".//PubmedArticle"):
                pmid_el = art.find(".//PMID")
                pmid = pmid_el.text.strip() if pmid_el is not None and pmid_el.text else None
                if not pmid: continue

                doi_el = art.find(".//ArticleId[@IdType='doi']") or \
                         art.find(".//ELocationID[@EIdType='doi'][@ValidYN='Y']")
                doi = doi_el.text.strip() if doi_el is not None and doi_el.text else None
                
                year_el = art.find(".//PubDate/Year") # More general Year path
                if year_el is None: year_el = art.find(".//Journal/JournalIssue/PubDate/Year") # Fallback

                if year_el is not None and year_el.text:
                    year = year_el.text.strip()
                else:
                    med_el = art.find(".//PubDate/MedlineDate") # More general MedlineDate
                    if med_el is None: med_el = art.find(".//Article/Journal/JournalIssue/PubDate/MedlineDate") # Fallback
                    year = med_el.text[:4].strip() if med_el is not None and med_el.text else "UnknownYear"
                
                year = year if year.isdigit() and len(year) == 4 else "UnknownYear"


                fa_el = art.find(".//AuthorList/Author[1]/LastName") # First author
                author = fa_el.text.strip() if fa_el is not None and fa_el.text else "UnknownAuthor"
                
                title_el = art.find(".//ArticleTitle")
                title = "".join(title_el.itertext()).strip() if title_el is not None else f"NoTitle_{pmid}"
                meta[pmid] = {'doi':doi, 'year':year, 'author':author, 'title':title}
        except Exception as e:
            logger.warning(f"NCBI EFetch batch {i_batch+1} error: {e}")
        
        if i_batch < len(batches) -1 : time.sleep(DELAY_NCBI)

    missing = [pm for pm in pmids if pm not in meta]
    if missing:
        logger.warning(f"Metadata missing for {len(missing)} PMIDs: {missing[:10]}...")
    return meta

# === STEP 2: OPEN ACCESS (Unpaywall, PMC with PoW) ===
def unpaywall_get_pdf_url(doi): # Renamed for clarity
    # (This function can remain largely the same as your v11, just ensure logger name matches)
    # Make sure UNPAYWALL_EMAIL is set for this to work effectively
    if not UNPAYWALL_EMAIL or UNPAYWALL_EMAIL == "your_email@example.com":
        logger.debug(f"Unpaywall API skipped for DOI {doi}: UNPAYWALL_EMAIL not configured.")
        return None

    api_url = f"https://api.unpaywall.org/v2/{quote_plus(doi)}?email={UNPAYWALL_EMAIL}"
    logger.info(f"Unpaywall API GET → DOI {doi}: {api_url}")
    try:
        r = session_oa.get(api_url, timeout=20) # Use session_oa
        r.raise_for_status()
        data = r.json()
        if data.get("is_oa"):
            best_loc = data.get("best_oa_location")
            pdf_url = None
            if best_loc and best_loc.get("url_for_pdf"):
                pdf_url = best_loc.get("url_for_pdf")
            else: # Check other OA locations
                for loc in data.get("oa_locations", []):
                    if loc.get("url_for_pdf"):
                        pdf_url = loc.get("url_for_pdf")
                        break
            if pdf_url:
                logger.info(f"Unpaywall API ✓ DOI {doi}: Found PDF URL: {pdf_url}")
                return pdf_url
            else:
                logger.info(f"Unpaywall API ? DOI {doi}: Article is OA, but no direct PDF URL in Unpaywall response.")
        else:
            logger.info(f"Unpaywall API ~ DOI {doi}: Not OA according to Unpaywall.")
    except requests.exceptions.RequestException as e:
        logger.warning(f"Unpaywall API ✗ DOI {doi}: RequestException: {e}")
    except ValueError as e: # Handles JSONDecodeError
        logger.warning(f"Unpaywall API ✗ DOI {doi}: JSON Decode Error: {e}")
    return None


def pmc_id_for_pmid(pmid: str, article_metadata: dict) -> str | None: # article_metadata is md from oa_worker
    """
    Finds the PMCID for a given PMID using Entrez ELink.
    Prioritizes direct 'pubmed_pmc' link.
    """
    linkname_to_try = "pubmed_pmc" # Query only for direct PMC links

    params = _get_ncbi_params({
        "dbfrom": "pubmed",
        "db": "pmc",
        "id": pmid,
        "cmd": "neighbor_score",
        "linkname": linkname_to_try
    })

    logger.info(f"PMC ID ELink → PMID {pmid}: Querying with linkname '{linkname_to_try}'.")
    try:
        # It's good practice to have a specific timeout for network requests
        r = session_ncbi.post(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi",
            data=params,
            timeout=20 # Timeout for this specific call (connect, read)
        )
        r.raise_for_status() # Will raise an HTTPError for bad responses (4XX or 5XX)
        root = ET.fromstring(r.content)

        # Find the LinkSetDb specifically for the linkname we used
        link_set_db_el = root.find(f".//LinkSetDb[LinkName='{linkname_to_try}']")

        if link_set_db_el is None:
            # This case means the specific LinkSetDb for 'pubmed_pmc' was not found.
            # It could be that there are no links, and NCBI returns a LinkSet without LinkSetDb,
            # or with a general <Info> tag at a higher level.
            info_el_general = root.find(".//Info") # Check for any general Info tag in the response
            if info_el_general is not None:
                 logger.info(f"PMC ID ELink ~ PMID {pmid}: General Info from NCBI for ELink query: {info_el_general.text}")
            else:
                # Log a snippet of the response if the expected structure is missing
                logger.debug(f"PMC ID ELink ~ PMID {pmid}: No <LinkSetDb> found for linkname '{linkname_to_try}'. Response snippet: {r.text[:250]}")
            return None

        ids = [el.text.strip() for el in link_set_db_el.findall("./Link/Id") if el.text]

        if ids:
            # For 'pubmed_pmc', we typically expect at most one ID.
            # If multiple are returned, taking the first one is a common approach,
            # but one might investigate why multiple direct links would exist.
            pmc_candidate_id_num = ids[0]

            returned_pmcid = ""
            if pmc_candidate_id_num.upper().startswith("PMC"):
                returned_pmcid = pmc_candidate_id_num
            elif pmc_candidate_id_num.isdigit():
                # NCBI often returns just the number part for PMCID in ELink results
                returned_pmcid = "PMC" + pmc_candidate_id_num
            else:
                logger.warning(f"PMC ID ELink ? PMID {pmid}: Found non-standard ID format '{pmc_candidate_id_num}' via '{linkname_to_try}'.")
                return None # Invalid format

            # At this point, you have a potential PMCID.
            # The more robust (but optional) verification step against article_metadata (DOI, title) would go here.
            # For now, we are trusting the 'pubmed_pmc' link if it returns an ID.
            # Example (if you choose to implement verify_pmcid_against_metadata):
            # if not verify_pmcid_against_metadata(returned_pmcid, pmid, article_metadata.get('doi'), article_metadata.get('title')):
            #     logger.warning(f"PMC ID ELink ✗ PMID {pmid}: PMCID {returned_pmcid} (from '{linkname_to_try}') FAILED verification against original metadata. Discarding.")
            #     return None

            logger.info(f"PMC ID ELink ✓ PMID {pmid}: Found PMCID {returned_pmcid} via '{linkname_to_try}'.")
            return returned_pmcid
        else:
            # No <Id> elements found under the specific LinkSetDb for this linkname.
            # Check for an <Info> tag within this LinkSetDb.
            info_el = link_set_db_el.find("./Info")
            if info_el is not None:
                logger.info(f"PMC ID ELink ~ PMID {pmid}: Info from NCBI for linkname '{linkname_to_try}': {info_el.text}")
            else:
                logger.info(f"PMC ID ELink ~ PMID {pmid}: No specific PMCID <Id> elements found for linkname '{linkname_to_try}'.")
            return None # No IDs found

    except requests.exceptions.RequestException as e_req:
        logger.warning(f"PMC ID ELink ✗ PMID {pmid} (linkname '{linkname_to_try}'): RequestException: {e_req}")
    except ET.ParseError as e_xml:
        # Added 'r' in locals() check for safety, though it should be defined if post() was successful
        response_text_snippet = r.text[:250] if 'r' in locals() and hasattr(r, 'text') else "Response object not available or no text."
        logger.warning(f"PMC ID ELink ✗ PMID {pmid} (linkname '{linkname_to_try}'): XML ParseError: {e_xml}. Content snippet: {response_text_snippet}")
    except Exception as e_generic: # Catch any other unexpected errors
        logger.warning(f"PMC ID ELink ✗ PMID {pmid} (linkname '{linkname_to_try}'): Unexpected error: {e_generic}", exc_info=True)

    # Fallthrough: if any error occurred or no ID was found and returned.
    logger.warning(f"PMC ID ELink ✗ PMID {pmid}: No confirmed PMCID found after attempting with '{linkname_to_try}'.")
    return None

def pmc_attempt_download(pmcid: str, pmid: str, md: dict) -> bool:
    """Attempts to download a PDF from PMC, handling PoW."""
    fname_base = sanitize_filename(f"{md['year']}-{pmid}-{md['author']}-{md['title']}")
    output_pdf_path  = os.path.join(OUTPUT_PDF_DIR, fname_base + ".pdf")

    if os.path.exists(output_pdf_path):
        logger.info(f"PMC ✓ {pmid} ({pmcid}): PDF already exists at {output_pdf_path}")
        return True

    # This is the "landing page" for the PDF, which should redirect to the actual PDF file URL
    pmc_article_pdf_landing_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/pdf/"
    final_pdf_url_from_pmc = None
    
    logger.info(f"PMC HEAD → PMID {pmid} ({pmcid}): Probing {pmc_article_pdf_landing_url} for final PDF URL.")
    head_req_headers = session_oa.headers.copy()
    head_req_headers.update(BROWSER_LIKE_HEADERS)
    head_req_headers['Accept'] = 'application/pdf, text/html;q=0.9, */*;q=0.8' # Accept PDF or HTML for HEAD

    try:
        head_resp = session_oa.head(
            pmc_article_pdf_landing_url,
            headers=head_req_headers,
            timeout=(10, 25), 
            allow_redirects=True
        )
        head_resp.raise_for_status()
        final_pdf_url_from_pmc = head_resp.url
        
        if not (final_pdf_url_from_pmc.lower().endswith(".pdf") or "pdf" in final_pdf_url_from_pmc.lower()): # Broader check
            logger.warning(f"PMC HEAD ? PMID {pmid} ({pmcid}): Resolved URL {final_pdf_url_from_pmc} doesn't strongly indicate PDF. Proceeding cautiously.")
        else:
            logger.info(f"PMC HEAD ✓ PMID {pmid} ({pmcid}): Resolved potential PDF URL: {final_pdf_url_from_pmc}")

    except requests.exceptions.RequestException as e:
        logger.warning(f"PMC HEAD ✗ PMID {pmid} ({pmcid}) for {pmc_article_pdf_landing_url}: {e}. Cannot resolve final PDF URL via HEAD.")
        # Fallback: try to construct a plausible final URL if filename pattern is known (e.g. from other PMCID for same journal)
        # For now, if HEAD fails, we might not have a reliable final_pdf_url_from_pmc to try.
        # One could try the landing page itself for the GET if HEAD fails.
        final_pdf_url_from_pmc = pmc_article_pdf_landing_url # Risky, but might trigger PoW on landing
        logger.warning(f"PMC HEAD ✗ PMID {pmid} ({pmcid}): Using landing page URL {final_pdf_url_from_pmc} as fallback for GET.")


    if final_pdf_url_from_pmc:
        return download_and_save_pdf(
            session_oa, 
            final_pdf_url_from_pmc, 
            output_pdf_path, 
            pmid, 
            source_name=f"PMC({pmcid})",
            referer=pmc_article_pdf_landing_url # Referer for the GET
        )
    else:
        logger.error(f"PMC ✗ PMID {pmid} ({pmcid}): No URL determined for download attempt.")
        return False


def oa_worker(pmid: str, md: dict) -> tuple[str, bool]:
    """Worker for Open Access PDF fetching (Unpaywall then PMC)."""
    doi = md.get('doi')
    fname_base = sanitize_filename(f"{md['year']}-{pmid}-{md['author']}-{md['title']}")
    output_pdf_path  = os.path.join(OUTPUT_PDF_DIR, fname_base + ".pdf")

    if os.path.exists(output_pdf_path): # Check once at the beginning of worker
        logger.info(f"OA ✓ {pmid}: PDF already exists at {output_pdf_path}")
        return pmid, True

    # 1) Unpaywall
    if doi:
        unpaywall_url = unpaywall_get_pdf_url(doi)
        if unpaywall_url:
            if download_and_save_pdf(session_oa, unpaywall_url, output_pdf_path, pmid, source_name=f"Unpaywall(DOI:{doi})", referer=f"https://doi.org/{doi}"):
                return pmid, True
            else:
                logger.info(f"OA: Unpaywall attempt for PMID {pmid} (DOI {doi}) failed download/validation. Trying PMC.")
        else:
            logger.info(f"OA: No PDF URL from Unpaywall for PMID {pmid} (DOI {doi}). Trying PMC.")
    else:
        logger.info(f"OA: No DOI for PMID {pmid}. Skipping Unpaywall, trying PMC.")
    
    # 2) PMC fallback
    pmcid = pmc_id_for_pmid(pmid)
    if pmcid:
        if pmc_attempt_download(pmcid, pmid, md):
            return pmid, True
    
    logger.warning(f"OA ✗ {pmid}: No PDF found via Unpaywall or PMC.")
    return pmid, False

# === STEP 3: SCI-HUB ===
def test_scihub_domain(domain):
    # (This function can remain largely the same as your v11, just ensure logger name matches)
    test_doi = "10.1000/182" # A generic test DOI
    url = f"{domain.rstrip('/')}/{test_doi}"
    logger.debug(f"Sci-Hub TEST GET → {url}")
    try:
        r = session_scihub.get(url, timeout=10) # Slightly longer timeout for test
        # Sci-Hub can return 200 with HTML, or 404 if DOI not found, both indicate responsiveness
        if r.status_code in (200, 404) and ('html' in r.headers.get('Content-Type','').lower() or 'sci-hub' in r.text.lower()):
            logger.info(f"Sci-Hub TEST ✓ {domain} is responsive.")
            return True
        else:
            logger.warning(f"Sci-Hub TEST ? {domain} responded status {r.status_code}, CT: {r.headers.get('Content-Type','')}")
    except requests.exceptions.Timeout:
        logger.warning(f"Sci-Hub TEST ✗ {domain} timed out.")
    except requests.exceptions.RequestException as e:
        logger.warning(f"Sci-Hub TEST ✗ {domain} error: {e}")
    return False

def init_scihub_domains():
    # (This function can remain largely the same as your v11, parallel testing is good)
    logger.info("Probing Sci-Hub mirrors for availability...")
    working_domains = []
    # Use a ThreadPoolExecutor to test domains in parallel for speed
    with ThreadPoolExecutor(max_workers=min(len(SCI_HUB_DOMAINS), 5)) as executor:
        future_to_domain = {executor.submit(test_scihub_domain, d): d for d in SCI_HUB_DOMAINS}
        for future in as_completed(future_to_domain):
            domain = future_to_domain[future]
            try:
                if future.result():
                    working_domains.append(domain)
            except Exception as exc: # Catch any exception during future.result()
                logger.error(f"Sci-Hub domain test for {domain} generated an exception: {exc}")
    
    if not working_domains:
        logger.error("CRITICAL: No working Sci-Hub domains found after testing!")
    else:
        # Prioritize .se if available as it's often reliable
        if "https://sci-hub.se" in working_domains:
            working_domains.insert(0, working_domains.pop(working_domains.index("https://sci-hub.se")))
        logger.info(f"Using Sci-Hub domains: {working_domains}")
    return working_domains


def find_scihub_pdf_in_html(html_content: bytes, base_page_url: str) -> str | None:
    # (This function can remain largely the same, ensure it uses bytes for BS)
    soup = BeautifulSoup(html_content, 'html.parser')
    parsed_base_url = urlparse(base_page_url)
    absolute_base = f"{parsed_base_url.scheme}://{parsed_base_url.netloc}"

    selectors_and_attrs = [
        ('iframe#pdf', 'src'), ('iframe#article', 'src'), # Specific iframes
        ('iframe[src*=".pdf"]', 'src'), # Iframes with .pdf in src
        ('embed[type="application/pdf"]', 'src'), # Embed tags
        ('a[href*=".pdf"]', 'href'), # Anchor tags directly linking PDFs
        ('a#download', 'href'), # Common download button ID
        ('div.buttons > a', 'href') # General buttons that might be download links
    ]

    for selector, attr in selectors_and_attrs:
        element = soup.select_one(selector)
        if element and element.get(attr):
            src = element.get(attr)
            if src.startswith("//"): src = f"{parsed_base_url.scheme}:{src}"
            # Simple check to ensure it's not a data URI or JS link for PDFs
            if not src.lower().startswith(('data:', 'javascript:')):
                resolved_url = urljoin(absolute_base, src)
                if ".pdf" in resolved_url.lower() or "sci-hub" in urlparse(resolved_url).netloc: # Basic check
                    return resolved_url

    # Fallback for onclick attributes (less reliable)
    buttons_onclick = soup.select('button[onclick*="location.href"], button[onclick*=".pdf"]')
    for button in buttons_onclick:
        onclick_val = button.get('onclick', '')
        match = re.search(r"location\.href=['\"]([^'\"]+\.pdf[^'\"]*)['\"]", onclick_val, re.IGNORECASE)
        if match:
            href = match.group(1)
            if href.startswith("//"): href = f"{parsed_base_url.scheme}:{href}"
            return urljoin(absolute_base, href)
            
    logger.debug(f"Sci-Hub HTML Parse: No obvious PDF link found in HTML from {base_page_url}")
    return None

def scihub_worker(identifier: str, pmid: str, md: dict, active_domains: list) -> tuple[str, bool]:
    """Worker for Sci-Hub PDF fetching."""
    fname_base = sanitize_filename(f"{md['year']}-{pmid}-{md['author']}-{md['title']}")
    output_pdf_path = os.path.join(OUTPUT_PDF_DIR, fname_base + ".pdf")

    if os.path.exists(output_pdf_path):
        logger.info(f"Sci-Hub ✓ {pmid}: PDF already exists at {output_pdf_path}")
        return pmid, True
    
    if not active_domains:
        logger.error(f"Sci-Hub ✗ {pmid}: No active Sci-Hub domains to try for identifier '{identifier}'.")
        return pmid, False

    for domain_url in active_domains:
        scihub_page_url = f"{domain_url.rstrip('/')}/{quote_plus(identifier)}"
        logger.info(f"Sci-Hub HTML GET → PMID {pmid} from {scihub_page_url}")
        
        sh_headers = session_scihub.headers.copy()
        sh_headers.update(BROWSER_LIKE_HEADERS)
        sh_headers['Sec-Fetch-Site'] = 'none' # Initial request to SH

        try:
            r_page = session_scihub.get(scihub_page_url, headers=sh_headers, timeout=25)
            r_page.raise_for_status()
            page_content_type = r_page.headers.get('Content-Type','').lower()

            # Sci-Hub sometimes directly serves PDF if it's cached that way
            if 'application/pdf' in page_content_type:
                 logger.info(f"Sci-Hub ? PMID {pmid}: URL {scihub_page_url} claims PDF directly. Attempting download...")
                 if download_and_save_pdf(session_scihub, r_page.url, output_pdf_path, pmid, source_name=f"SciHub_Direct({domain_url})", referer=domain_url):
                     return pmid, True
                 else:
                     logger.warning(f"Sci-Hub ✗ PMID {pmid}: Direct PDF from {scihub_page_url} failed validation or download.")
                     continue # Try next domain

            # Most common: HTML viewer page
            if 'html' in page_content_type or r_page.content[:15].lower().startswith((b'<!doctype html', b'<html')):
                pdf_url_from_html = find_scihub_pdf_in_html(r_page.content, r_page.url) # Pass bytes content
                
                if pdf_url_from_html:
                    logger.info(f"Sci-Hub HTML ✓ PMID {pmid}: Found potential PDF link: {pdf_url_from_html}")
                    if download_and_save_pdf(session_scihub, pdf_url_from_html, output_pdf_path, pmid, source_name=f"SciHub_Extracted({domain_url})", referer=scihub_page_url):
                        return pmid, True
                else:
                    logger.warning(f"Sci-Hub HTML ✗ {pmid} via {domain_url}: No PDF link found within {scihub_page_url}")
            else: 
                logger.warning(f"Sci-Hub ✗ {pmid} via {domain_url}: Unexpected Content-Type '{page_content_type}' from {scihub_page_url}.")

        except requests.exceptions.RequestException as e:
            logger.warning(f"Sci-Hub ✗ {pmid} via {domain_url} ({scihub_page_url}): RequestException: {e}")
        except Exception as e_sh_domain: # Catch broader exceptions for one domain attempt
            logger.error(f"Sci-Hub ✗ {pmid} via {domain_url} ({scihub_page_url}): General error {e_sh_domain.__class__.__name__}: {e_sh_domain}", exc_info=True)
        
        if len(active_domains) > 1: time.sleep(DELAY_SCIHUB) 

    logger.error(f"Sci-Hub ✗ {pmid}: Failed for identifier '{identifier}' after trying all active domains.")
    return pmid, False


# === MAIN SCRIPT EXECUTION ===
def main():
    t_start = time.time()
    logger.info(f"=== PDF Fetcher v12-pow started at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")

    # Add file logging if handler was configured (example shown in LOGGING_SETUP)
    # if any(isinstance(h, logging.FileHandler) for h in logger.handlers):
    #    logger.info(f"Logging also to file: {log_file_path}") # Assuming log_file_path is global

    try:
        df = pd.read_excel(EXCEL_FILE_PATH)
        if 'PMID' not in df.columns:
            logger.error(f"Excel file {EXCEL_FILE_PATH} must contain a 'PMID' column.")
            return
    except FileNotFoundError:
        logger.error(f"Excel file not found: {EXCEL_FILE_PATH}")
        return
    except Exception as e:
        logger.error(f"Cannot read Excel file {EXCEL_FILE_PATH}: {e}", exc_info=True)
        return
    
    pmids_raw = df['PMID'].dropna().unique()
    pmids = []
    for p_raw in pmids_raw:
        try:
            pmids.append(str(int(float(str(p_raw))))) 
        except ValueError:
            logger.warning(f"Skipping invalid PMID format in Excel: '{p_raw}'")
    
    if not pmids:
        logger.error("No valid PMIDs found in the Excel file.")
        return
    logger.info(f"Loaded {len(pmids)} unique, valid PMIDs from {EXCEL_FILE_PATH}")

    metadata_dict = fetch_metadata(pmids)
    valid_pmids_with_meta = [p for p in pmids if p in metadata_dict] # PMIDs for which we got metadata
    logger.info(f"Successfully fetched metadata for {len(valid_pmids_with_meta)} PMIDs.")
    if not valid_pmids_with_meta:
        logger.error("No metadata could be fetched. Cannot proceed with PDF downloads.")
        return

    os.makedirs(OUTPUT_PDF_DIR, exist_ok=True)
    os.makedirs(os.path.join(OUTPUT_PDF_DIR, SUSPICIOUS_PDF_SUBDIR), exist_ok=True)
    logger.info(f"PDFs will be saved to: {os.path.abspath(OUTPUT_PDF_DIR)}")
    logger.info(f"Suspicious/failed validation files will be in: {os.path.abspath(os.path.join(OUTPUT_PDF_DIR, SUSPICIOUS_PDF_SUBDIR))}")


    logger.info("--- Starting Open Access Download Phase ---")
    oa_succeeded_pmids, oa_failed_pmids = [], []
    with ThreadPoolExecutor(max_workers=MAX_THREADS, thread_name_prefix="OA_Worker") as executor:
        future_to_pmid_oa = {
            executor.submit(oa_worker, pmid, metadata_dict[pmid]): pmid
            for pmid in valid_pmids_with_meta 
        }
        for future in as_completed(future_to_pmid_oa):
            pmid_processed = future_to_pmid_oa[future]
            try:
                _, success_status = future.result() 
                if success_status:
                    oa_succeeded_pmids.append(pmid_processed)
                else:
                    oa_failed_pmids.append(pmid_processed)
            except Exception as e_thread:
                logger.error(f"OA Thread ✗ Error processing PMID {pmid_processed}: {e_thread}", exc_info=True)
                oa_failed_pmids.append(pmid_processed)
    
    logger.info(f"Open Access Phase Summary: {len(oa_succeeded_pmids)} PDFs successfully downloaded and validated.")
    if oa_failed_pmids:
        logger.info(f"{len(oa_failed_pmids)} PMIDs not fetched via OA or failed validation: {oa_failed_pmids[:10]}...")


    if oa_failed_pmids:
        logger.info("--- Starting Sci-Hub Download Phase for Remaining PMIDs ---")
        active_scihub_domains = init_scihub_domains()
        
        if not active_scihub_domains:
            logger.error("Sci-Hub phase skipped: No active Sci-Hub domains found.")
            sci_hub_succeeded_pmids = []
            # All previously failed in OA are now the ones that are definitely failed
            final_still_failed_pmids = list(oa_failed_pmids) 
        else:
            sci_hub_succeeded_pmids = [] 
            final_still_failed_pmids = [] # PMIDs that fail both OA and SciHub
            with ThreadPoolExecutor(max_workers=max(1, MAX_THREADS // 2), thread_name_prefix="SciHub_Worker") as executor_sh:
                future_to_pmid_scihub = {}
                for pmid_to_try_scihub in oa_failed_pmids: # Only try those that failed OA
                    meta_for_pmid = metadata_dict[pmid_to_try_scihub]
                    # Sci-Hub prefers DOI, falls back to PMID if DOI is not available
                    identifier_for_scihub = meta_for_pmid.get('doi') or pmid_to_try_scihub
                    
                    future_to_pmid_scihub[executor_sh.submit(
                        scihub_worker, 
                        identifier_for_scihub, 
                        pmid_to_try_scihub, 
                        meta_for_pmid, 
                        active_scihub_domains
                    )] = pmid_to_try_scihub
                
                for future_sh in as_completed(future_to_pmid_scihub):
                    pmid_processed_scihub = future_to_pmid_scihub[future_sh]
                    try:
                        _, success_status_scihub = future_sh.result()
                        if success_status_scihub:
                            sci_hub_succeeded_pmids.append(pmid_processed_scihub)
                        else:
                            final_still_failed_pmids.append(pmid_processed_scihub)
                    except Exception as e_sh_thread:
                        logger.error(f"Sci-Hub Thread ✗ Error processing PMID {pmid_processed_scihub}: {e_sh_thread}", exc_info=True)
                        final_still_failed_pmids.append(pmid_processed_scihub)

            logger.info(f"Sci-Hub Phase Summary: {len(sci_hub_succeeded_pmids)} PDFs successfully downloaded and validated.")
            if final_still_failed_pmids:
                 logger.info(f"{len(final_still_failed_pmids)} PMIDs still missing after Sci-Hub attempts or validation: {final_still_failed_pmids[:10]}...")
    else: # All succeeded in OA phase
        logger.info("--- Sci-Hub Download Phase Skipped: All PMIDs successfully processed in Open Access phase. ---")
        sci_hub_succeeded_pmids = []
        final_still_failed_pmids = []


    total_succeeded = len(oa_succeeded_pmids) + len(sci_hub_succeeded_pmids)
    total_time_taken = time.time() - t_start
    logger.info("--- Overall Summary ---")
    logger.info(f"Processed {len(pmids)} unique input PMIDs.")
    logger.info(f"Attempted downloads for {len(valid_pmids_with_meta)} PMIDs (those with metadata).")
    logger.info(f"Total PDFs successfully downloaded & validated: {total_succeeded} / {len(valid_pmids_with_meta)}.")
    logger.info(f"  - Via Open Access (Unpaywall/PMC): {len(oa_succeeded_pmids)}")
    logger.info(f"  - Via Sci-Hub: {len(sci_hub_succeeded_pmids)}")
    
    if final_still_failed_pmids:
        logger.info(f"Total PMIDs ultimately NOT downloaded or failed validation: {len(final_still_failed_pmids)}")
        # Log all failed PMIDs for easier review
        failed_pmids_log_path = os.path.join(OUTPUT_PDF_DIR, "failed_pmids_v12.log")
        with open(failed_pmids_log_path, "w") as f_failed:
            for p_fail in sorted(list(set(final_still_failed_pmids))): # Unique and sorted
                f_failed.write(f"{p_fail}\n")
        logger.info(f"List of all failed/missing PMIDs saved to: {failed_pmids_log_path}")
    else:
        if total_succeeded == len(valid_pmids_with_meta) and len(valid_pmids_with_meta) > 0:
             logger.info("All requested PDFs (with metadata) successfully downloaded and validated!")
        elif len(valid_pmids_with_meta) == 0:
            logger.info("No PMIDs with metadata were available to attempt download.")


    logger.info(f"Total execution time: {total_time_taken:.2f} seconds.")
    logger.info(f"=== PDF Fetcher v12-pow completed at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")

if __name__ == "__main__":
    main()

2025-05-22 12:14:42,284 - INFO - [MainThread] - main - === PDF Fetcher v12-pow started at 2025-05-22 12:14:42 ===
2025-05-22 12:14:42,300 - INFO - [MainThread] - main - Loaded 32 unique, valid PMIDs from C:\Users\Galaxy\Downloads\screening_ERAS.xlsx
2025-05-22 12:14:42,301 - INFO - [MainThread] - fetch_metadata - EFetch PubMed metadata for 32 PMIDs in 1 batch(es)...
2025-05-22 12:14:42,301 - INFO - [MainThread] - fetch_metadata - NCBI EFetch POST (batch 1/1) → IDs=39955421,40340819,39068053,39384309,38673038...
2025-05-22 12:14:43,196 - INFO - [MainThread] - main - Successfully fetched metadata for 32 PMIDs.
2025-05-22 12:14:43,197 - INFO - [MainThread] - main - PDFs will be saved to: c:\Users\Galaxy\LEVI\jupyter\litscape\downloaded_pdfs_v12_pow
2025-05-22 12:14:43,198 - INFO - [MainThread] - main - Suspicious/failed validation files will be in: c:\Users\Galaxy\LEVI\jupyter\litscape\downloaded_pdfs_v12_pow\suspicious_pdfs
2025-05-22 12:14:43,199 - INFO - [MainThread] - main - --- Start

In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Test script for pmc_id_for_pmid function.
"""
import os
import re
import time
import logging
import requests
import xml.etree.ElementTree as ET
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# === CONFIGURATION (subset from main script) ===
NCBI_API_KEY = "YOUR_API_KEY_HERE"  # IMPORTANT: Fill if you have one
CROSSREF_EMAIL = "your_email@example.com" # IMPORTANT: Fill with your email for polite API use

# === LOGGING SETUP ===
logger = logging.getLogger("PMCIDResolverTest")
logger.handlers = []
logger.setLevel(logging.DEBUG) # Use DEBUG for verbose output during testing
ch = logging.StreamHandler()
ch.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - [%(threadName)s] - %(funcName)s - %(message)s'))
logger.addHandler(ch)

# === HTTP SESSION (simplified from main script) ===
def make_session(user_agent_detail="PMCIDResolverTest/1.0"):
    s = requests.Session()
    retries = Retry(total=3, backoff_factor=0.5, status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retries)
    s.mount("https://", adapter)
    s.mount("http://", adapter)
    email_for_ua = CROSSREF_EMAIL if CROSSREF_EMAIL != "your_email@example.com" else "anonymous_user"
    s.headers.update({'User-Agent': f"PythonScript/{user_agent_detail} (mailto:{email_for_ua})"})
    return s

session_ncbi = make_session()

def _get_ncbi_params(extra=None):
    params = {"tool": "PMCIDResolverTestScript"}
    if CROSSREF_EMAIL and CROSSREF_EMAIL != "your_email@example.com":
        params["email"] = CROSSREF_EMAIL
    if NCBI_API_KEY and NCBI_API_KEY != "YOUR_API_KEY_HERE":
        params["api_key"] = NCBI_API_KEY
    if extra:
        params.update(extra)
    return params

# === Paste the chosen pmc_id_for_pmid function here ===
# For example, the one modified above that uses only "pubmed_pmc"
# And if you use it, paste verify_pmcid_against_metadata too.

def pmc_id_for_pmid(pmid: str, article_metadata: dict) -> str | None:
    """
    Finds the PMCID for a given PMID using Entrez ELink.
    Prioritizes direct 'pubmed_pmc' link.
    """
    linkname_to_try = "pubmed_pmc"
    params = _get_ncbi_params({
        "dbfrom": "pubmed", "db": "pmc", "id": pmid,
        "cmd": "neighbor_score", "linkname": linkname_to_try
    })
    logger.info(f"PMC ID ELink → PMID {pmid}: Querying with linkname '{linkname_to_try}'.")
    try:
        r = session_ncbi.post(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi",
            data=params, timeout=20
        )
        r.raise_for_status()
        root = ET.fromstring(r.content)
        link_set_db_el = root.find(f".//LinkSetDb[LinkName='{linkname_to_try}']")
        if link_set_db_el is None:
            info_el_general = root.find(".//Info")
            if info_el_general is not None: logger.info(f"PMC ID ELink ~ PMID {pmid}: General Info: {info_el_general.text}")
            else: logger.debug(f"PMC ID ELink ~ PMID {pmid}: No <LinkSetDb> for '{linkname_to_try}'. Resp: {r.text[:200]}")
            return None
        ids = [el.text.strip() for el in link_set_db_el.findall("./Link/Id") if el.text]
        if ids:
            pmc_candidate_id_num = ids[0]
            returned_pmcid = ""
            if pmc_candidate_id_num.upper().startswith("PMC"): returned_pmcid = pmc_candidate_id_num
            elif pmc_candidate_id_num.isdigit(): returned_pmcid = "PMC" + pmc_candidate_id_num
            else:
                logger.warning(f"PMC ID ELink ? PMID {pmid}: Non-standard ID '{pmc_candidate_id_num}' from '{linkname_to_try}'.")
                return None
            # Optional: Verification (would need verify_pmcid_against_metadata and article_metadata)
            # if not verify_pmcid_against_metadata(returned_pmcid, pmid, article_metadata.get('doi'), article_metadata.get('title')):
            #     logger.warning(f"PMC ID ELink ✗ PMID {pmid}: PMCID {returned_pmcid} FAILED verification.")
            #     return None
            logger.info(f"PMC ID ELink ✓ PMID {pmid}: Found PMCID {returned_pmcid} via '{linkname_to_try}'.")
            return returned_pmcid
        else:
            info_el = link_set_db_el.find("./Info")
            if info_el is not None: logger.info(f"PMC ID ELink ~ PMID {pmid}: Info for '{linkname_to_try}': {info_el.text}")
            else: logger.info(f"PMC ID ELink ~ PMID {pmid}: No PMCID <Id> for '{linkname_to_try}'.")
            return None
    except requests.exceptions.RequestException as e_req: logger.warning(f"PMC ID ELink ✗ PMID {pmid} ('{linkname_to_try}'): RequestException: {e_req}")
    except ET.ParseError as e_xml: logger.warning(f"PMC ID ELink ✗ PMID {pmid} ('{linkname_to_try}'): XML ParseError: {e_xml}. Content: {r.text[:200] if 'r' in locals() else 'N/A'}")
    except Exception as e_generic: logger.warning(f"PMC ID ELink ✗ PMID {pmid} ('{linkname_to_try}'): Error: {e_generic}", exc_info=True)
    logger.warning(f"PMC ID ELink ✗ PMID {pmid}: No PMCID from '{linkname_to_try}'.")
    return None

# Example verification function (simplified, needs testing and error handling)
# def verify_pmcid_against_metadata(pmcid_to_check: str, original_pmid: str, original_doi: str | None, original_title: str | None) -> bool:
#     logger.debug(f"Verifying PMCID {pmcid_to_check} against original PMID {original_pmid} (DOI: {original_doi})")
#     # This is a placeholder for the actual verification logic using efetch/esummary for the PMCID
#     # For now, let's assume if 'pubmed_pmc' gives it, it's correct enough for this test script's version of pmc_id_for_pmid
#     return True


if __name__ == "__main__":
    # Test case 1: The problematic PMID from your example
    pmid_problem = "37802689"
    # Mock metadata, as DOI/Title might be used in a more robust verification
    metadata_problem = {
        'doi': "10.1053/j.jvca.2023.09.006",
        'title': "The Year in Thoracic Anesthesia: Selected Highlights From 2022."
    }
    logger.info(f"--- Testing PMID: {pmid_problem} (expected: No direct PMCID or careful handling) ---")
    resolved_pmcid_problem = pmc_id_for_pmid(pmid_problem, metadata_problem)
    logger.info(f"Resolved PMCID for {pmid_problem}: {resolved_pmcid_problem}\n")

    # Test case 2: A PMID known to have a direct PMCID
    pmid_direct = "38862955" # This is PMC11167804
    metadata_direct = {
        'doi': "10.1186/s12890-024-03086-7",
        'title': "Predictive significance of systemic immune-inflammation index combined with prealbumin for postoperative pneumonia following lung resection surgery"
    }
    logger.info(f"--- Testing PMID: {pmid_direct} (expected: PMC11167804) ---")
    resolved_pmcid_direct = pmc_id_for_pmid(pmid_direct, metadata_direct)
    logger.info(f"Resolved PMCID for {pmid_direct}: {resolved_pmcid_direct}\n")

    # Test case 3: Another PMID known to have a direct PMCID
    pmid_direct_2 = "35409309" # Example: PMC9002001
    metadata_direct_2 = {'doi': "10.3389/fpsyt.2022.844297", 'title': "Example Title"}
    logger.info(f"--- Testing PMID: {pmid_direct_2} (expected: PMC9002001 or similar) ---")
    resolved_pmcid_direct_2 = pmc_id_for_pmid(pmid_direct_2, metadata_direct_2)
    logger.info(f"Resolved PMCID for {pmid_direct_2}: {resolved_pmcid_direct_2}\n")

    # Test case 4: A PMID known NOT to have a direct PMCID
    pmid_no_pmc = "12345" # A very old or non-OA article unlikely to be on PMC
    metadata_no_pmc = {'doi': "10.xxxx/someold.doi", 'title': "Old Non-PMC Article"}
    logger.info(f"--- Testing PMID: {pmid_no_pmc} (expected: None) ---")
    resolved_pmcid_no_pmc = pmc_id_for_pmid(pmid_no_pmc, metadata_no_pmc)
    logger.info(f"Resolved PMCID for {pmid_no_pmc}: {resolved_pmcid_no_pmc}\n")

2025-05-22 12:28:10,519 - INFO - [MainThread] - <module> - --- Testing PMID: 37802689 (expected: No direct PMCID or careful handling) ---
2025-05-22 12:28:10,521 - INFO - [MainThread] - pmc_id_for_pmid - PMC ID ELink → PMID 37802689: Querying with linkname 'pubmed_pmc'.
2025-05-22 12:28:11,123 - DEBUG - [MainThread] - pmc_id_for_pmid - PMC ID ELink ~ PMID 37802689: No <LinkSetDb> for 'pubmed_pmc'. Resp: <?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE eLinkResult PUBLIC "-//NLM//DTD elink 20101123//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20101123/elink.dtd">
<eLinkResult>

  <LinkSet>
    <D
2025-05-22 12:28:11,124 - INFO - [MainThread] - <module> - Resolved PMCID for 37802689: None

2025-05-22 12:28:11,124 - INFO - [MainThread] - <module> - --- Testing PMID: 38862955 (expected: PMC11167804) ---
2025-05-22 12:28:11,124 - INFO - [MainThread] - pmc_id_for_pmid - PMC ID ELink → PMID 38862955: Querying with linkname 'pubmed_pmc'.
2025-05-22 12:28:11,416 - INFO - [MainThread] - pmc