In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Combined OA-first + Sci-Hub PDF Fetcher (v12-pow, standalone)
-----------------------------------------------------------
1) Reads PMIDs from an Excel file.
2) Fetches metadata (year, first author, title, DOI) via NCBI EFetch.
3) Tries Open Access first:
    a) Unpaywall.
    b) PMC (handles Proof-of-Work challenge, resolves final URL).
4) Any remaining PMIDs are tried via Sci-Hub in parallel.
5) All PDFs are validated (size, content) and land in OUTPUT_PDF_DIR
   named `{year}-{pmid}-{author}-{title}.pdf`.
6) Detailed logging of every URL fetched and why it failed or succeeded.
"""

import os
import re
import time
import logging
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, quote_plus
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from concurrent.futures import ThreadPoolExecutor, as_completed
import hashlib # For PoW
try:
    from pypdf import PdfReader # Preferred (PyPDF2 successor)
    from pypdf.errors import PdfReadError
except ImportError:
    try:
        from PyPDF2 import PdfReader # Fallback
        from PyPDF2.errors import PdfReadError
    except ImportError:
        print("Please install pypdf or PyPDF2: pip install pypdf")
        exit()


# === CONFIGURATION ===
# Excel workbook the PMIDs are read from (see module docstring). UPDATE THIS per machine.
EXCEL_FILE_PATH     = r"C:\Users\Galaxy\Downloads\screening_ERAS.xlsx" # UPDATE THIS
# Directory that receives every validated PDF.
OUTPUT_PDF_DIR      = "downloaded_pdfs_v12_pow"
# Created under OUTPUT_PDF_DIR; receives PDFs that fail validation, saved PMC
# challenge pages and unexpected non-PDF payloads (see download_and_save_pdf).
SUSPICIOUS_PDF_SUBDIR = "suspicious_pdfs" # Subdirectory for failed validation PDFs

# Sci-Hub
# Mirror base URLs. NOTE(review): the code consuming this list is outside this
# part of the file - presumably mirrors are tried in order; confirm.
SCI_HUB_DOMAINS     = [
    "https://sci-hub.se", "https://sci-hub.ru", "https://sci-hub.ren",
    "https://sci-hub.wf", "https://sci-hub.ee", "https://sci-hub.st"
]
# NOTE(review): presumably a pause in seconds between Sci-Hub requests - confirm at the call site.
DELAY_SCIHUB        = 0.5

# Threads / batching
# Sizes the HTTP connection pools in make_session(); presumably also the worker
# count for the parallel Sci-Hub stage - confirm at the executor call site.
MAX_THREADS         = 5 # Reduced default, be kind to servers
# Number of PMIDs sent per EFetch POST in fetch_metadata().
EFETCH_BATCH_SIZE   = 100
# Seconds slept between consecutive EFetch batches in fetch_metadata().
DELAY_NCBI          = 0.35

# PDF Validation Thresholds (all consumed by validate_downloaded_pdf)
MIN_PDF_SIZE_KB         = 20    # PDFs smaller than this are suspicious
MIN_PDF_PAGES           = 1     # PDFs with fewer pages are suspicious
MIN_TEXT_LENGTH_CHARS   = 300   # Min chars expected from the first (up to 3) pages of a real article

# API credentials - PLEASE FILL THESE IN
# Sent to NCBI as `api_key`; omitted while it still holds the placeholder value.
NCBI_API_KEY        = "YOUR_API_KEY_HERE" # IMPORTANT: Fill if you have one
# Despite the name, this is the contact address sent to NCBI (the `email`
# request parameter and the NCBI session's User-Agent); the placeholder is skipped.
CROSSREF_EMAIL      = "your_email@example.com" # IMPORTANT: Fill with your email for polite API use
# Appended to every Unpaywall API URL; Unpaywall lookups are skipped when this
# is empty or equal to "your_email@example.com".
UNPAYWALL_EMAIL     = "levi4328@gmail.com" # Your email for Unpaywall

# === LOGGING SETUP ===
# Single format string shared by the console handler (and the optional file handler below).
_LOG_LINE_FORMAT = '%(asctime)s - %(levelname)s - [%(threadName)s] - %(funcName)s - %(message)s'

logger = logging.getLogger("PDFFetcherV12")
logger.handlers.clear()  # re-running this cell (Jupyter) must not stack duplicate handlers
logger.setLevel(logging.INFO)  # switch to logging.DEBUG for more verbosity

ch = logging.StreamHandler()
ch.setFormatter(logging.Formatter(_LOG_LINE_FORMAT))
logger.addHandler(ch)

# Optional file logging (recommended for long runs) - uncomment to enable:
# log_file_path = os.path.join(OUTPUT_PDF_DIR, "pdf_fetcher_v12.log")
# os.makedirs(OUTPUT_PDF_DIR, exist_ok=True)
# fh = logging.FileHandler(log_file_path, mode='a')
# fh.setFormatter(logging.Formatter(_LOG_LINE_FORMAT))
# logger.addHandler(fh)


# === HTTP SESSIONS WITH RETRIES ===
# Desktop Chrome 120 (Windows 10, x64) User-Agent string. It is passed to
# make_session() for the open-access and Sci-Hub sessions so their requests
# carry a browser-like User-Agent.
CHROME_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

def make_session(user_agent, is_scihub_session=False):
    """Build a ``requests.Session`` with automatic retries and a fixed User-Agent.

    Args:
        user_agent: Value installed as the session's default ``User-Agent`` header.
        is_scihub_session: When true (and ``MAX_THREADS`` > 2) the pool settings
            are derived from ``MAX_THREADS // 2`` instead of ``MAX_THREADS``.

    Returns:
        A session whose ``http://`` and ``https://`` adapters retry GET/POST/HEAD
        up to 3 times (backoff factor 1) on HTTP 429/500/502/503/504.
    """
    # Per the requests documentation, ``pool_connections`` is the number of
    # per-host urllib3 pools the adapter caches and ``pool_maxsize`` is the
    # number of connections kept in each of those pools.
    if is_scihub_session and MAX_THREADS > 2:
        cached_pools = MAX_THREADS // 2
    else:
        cached_pools = MAX_THREADS

    retry_policy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(['GET', 'POST', 'HEAD']),
    )
    adapter = HTTPAdapter(
        max_retries=retry_policy,
        pool_connections=cached_pools,
        pool_maxsize=cached_pools * 2,
    )

    session = requests.Session()
    # The same adapter instance serves both schemes, exactly as before.
    for scheme in ("https://", "http://"):
        session.mount(scheme, adapter)
    session.headers['User-Agent'] = user_agent
    return session

# One session per service family so each can carry its own User-Agent and pool.
# The NCBI session identifies the tool and, when CROSSREF_EMAIL has been changed
# from its placeholder, embeds that address as a mailto: contact; otherwise it
# falls back to "anonymous_user".
ncbi_ua_email_part = CROSSREF_EMAIL if CROSSREF_EMAIL and CROSSREF_EMAIL != "your_email@example.com" else "anonymous_user"
session_ncbi   = make_session(f"PDFFetcherV12/1.0 (NCBI-EUtils-Client; mailto:{ncbi_ua_email_part})")
session_oa     = make_session(CHROME_UA) # Unpaywall / PMC / other open-access requests: browser-like UA
session_scihub = make_session(CHROME_UA, is_scihub_session=True) # Sci-Hub: browser-like UA, smaller pool settings

# Browser-style request headers layered on top of a session's defaults by the
# download helpers; individual requests override 'Accept', 'Referer' and
# 'Sec-Fetch-Site' as needed.
BROWSER_LIKE_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'en-US,en;q=0.9',
    # NOTE(review): advertising 'br' assumes the HTTP stack can decode Brotli
    # (needs a brotli package installed) - confirm for the target environment.
    'Accept-Encoding': 'gzip, deflate, br', # Allow compressed responses
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none', # Default; download_and_save_pdf overrides it for PMC URLs
    'Sec-Fetch-User': '?1',
    'Cache-Control': 'no-cache', # Ask intermediaries for fresh content
    'Pragma': 'no-cache'        # Legacy equivalent for HTTP/1.0 caches
}

# === UTILITIES ===
def sanitize_filename(s: str) -> str:
    """Turn arbitrary text into a filesystem-friendly name fragment.

    Steps, in order: coerce to ``str``; drop the characters ``\\ / * ? : " < > |``;
    collapse every whitespace run to one space and trim the ends; substitute
    ``"untitled_article"`` if nothing is left; replace spaces with underscores;
    truncate to 150 characters.
    """
    forbidden = str.maketrans("", "", '\\/*?:"<>|')
    cleaned = str(s).translate(forbidden)
    cleaned = re.sub(r'\s+', " ", cleaned).strip()
    if not cleaned:
        cleaned = "untitled_article"
    # Truncation happens last so the 150-character cap applies to the final form.
    return cleaned.replace(" ", "_")[:150]

def _get_ncbi_params(extra=None):
    """Return the base parameter dict for an NCBI E-utilities request.

    Always contains ``tool``. ``email`` and ``api_key`` are added only when
    ``CROSSREF_EMAIL`` / ``NCBI_API_KEY`` are non-empty and differ from their
    placeholder values. Entries from ``extra`` are merged last, so they win on
    key collisions.
    """
    email_configured = bool(CROSSREF_EMAIL) and CROSSREF_EMAIL != "your_email@example.com"
    api_key_configured = bool(NCBI_API_KEY) and NCBI_API_KEY != "YOUR_API_KEY_HERE"

    query = {"tool": "PDFFetcherV12"}
    if email_configured:
        query["email"] = CROSSREF_EMAIL
    if api_key_configured:
        query["api_key"] = NCBI_API_KEY
    if extra:
        query.update(extra)
    return query

# === PROOF-OF-WORK (PoW) SOLVING LOGIC for PMC ===
def extract_pow_params_from_html(html_content: str) -> tuple[str, int, str, str, str] | None:
    """Pull the proof-of-work settings out of a challenge page.

    Looks for JavaScript declarations of the form ``const NAME = "value"`` for
    POW_CHALLENGE, POW_DIFFICULTY, POW_COOKIE_NAME, POW_COOKIE_EXPIRATION and
    POW_COOKIE_PATH.

    Returns:
        ``(challenge, difficulty, cookie_name, cookie_expiration, cookie_path)``
        or ``None`` when a required constant is missing or the difficulty is
        not an integer. The expiration is optional and defaults to
        ``"0.208333"``.
    """
    challenge_m = re.search(r'const\s+POW_CHALLENGE\s*=\s*"(.*?)"', html_content)
    difficulty_m = re.search(r'const\s+POW_DIFFICULTY\s*=\s*"(.*?)"', html_content)
    name_m = re.search(r'const\s+POW_COOKIE_NAME\s*=\s*"(.*?)"', html_content)
    expiry_m = re.search(r'const\s+POW_COOKIE_EXPIRATION\s*=\s*"(.*?)"', html_content)
    path_m = re.search(r'const\s+POW_COOKIE_PATH\s*=\s*"(.*?)"', html_content)

    # The expiration is the only optional constant.
    if not (challenge_m and difficulty_m and name_m and path_m):
        logger.warning("Could not find all required PoW parameters in HTML content.")
        return None

    raw_difficulty = difficulty_m.group(1)
    try:
        difficulty = int(raw_difficulty)
    except ValueError:
        logger.error(f"Could not parse PoW difficulty as int: '{raw_difficulty}'")
        return None

    challenge = challenge_m.group(1)
    cookie_name = name_m.group(1)
    cookie_expiry = expiry_m.group(1) if expiry_m else "0.208333"
    cookie_path = path_m.group(1)
    logger.info(f"Extracted PoW params: Challenge='{challenge[:20]}...', Diff={difficulty}, Name='{cookie_name}'")
    return challenge, difficulty, cookie_name, cookie_expiry, cookie_path

def solve_pmc_pow(challenge_string: str, difficulty: int) -> tuple[int, str] | None:
    """Brute-force a nonce for the proof-of-work challenge.

    Searches nonces 0, 1, 2, ... for the first one where
    ``sha256(challenge_string + str(nonce))`` has a hex digest starting with
    ``difficulty`` zeros.

    Returns:
        ``(nonce, hex_digest)`` on success, or ``None`` once the nonce budget
        for this difficulty is exhausted (2M / 35M / 500M for difficulty
        4 / 5 / 6, 100M otherwise).
    """
    logger.info(f"Solving PoW: challenge='{challenge_string[:20]}...', difficulty={difficulty}")
    nonce_budget = {4: 2_000_000, 5: 35_000_000, 6: 500_000_000}.get(difficulty, 100_000_000)
    required_prefix = "0" * difficulty

    started_at = time.time()
    for nonce in range(nonce_budget + 1):  # inclusive upper bound, like the original `<=`
        candidate = challenge_string + str(nonce)
        digest = hashlib.sha256(candidate.encode('utf-8')).hexdigest()
        if digest.startswith(required_prefix):
            elapsed = time.time() - started_at
            logger.info(f"PoW SOLVED! Nonce: {nonce}, Hash: {digest[:10]}..., Time: {elapsed:.4f}s")
            return nonce, digest
        if nonce > 0 and nonce % 1_000_000 == 0:
            logger.debug(f"PoW progress: nonce {nonce}...")

    elapsed = time.time() - started_at
    logger.error(f"PoW FAILED to solve (max_nonce {nonce_budget} reached for difficulty {difficulty}). Time: {elapsed:.2f}s")
    return None

# === PDF VALIDATION (returns failure reason string or None for success) ===
def validate_downloaded_pdf(pdf_path: str, pmid_for_log: str) -> str | None:
    """Check that a downloaded file looks like a real article PDF.

    Checks run in order and stop at the first failure: file size against
    ``MIN_PDF_SIZE_KB``; the file opens with ``PdfReader``; page count against
    ``MIN_PDF_PAGES``; total text extracted from the first (up to 3) pages
    against ``MIN_TEXT_LENGTH_CHARS``.

    Args:
        pdf_path: Path of the file to inspect.
        pmid_for_log: PMID used only to label log lines.

    Returns:
        ``None`` when every check passes, otherwise a human-readable reason.
    """
    try:
        size_kb = os.path.getsize(pdf_path) / 1024
        file_label = os.path.basename(pdf_path)

        def reject(reason: str, include_size: bool = True) -> str:
            # Log the failure in the shared format and hand the reason back.
            size_note = f" (Size: {size_kb:.2f} KB)" if include_size else ""
            logger.warning(f"PDF Validation ✗ {pmid_for_log}: {reason} for '{file_label}'{size_note}.")
            return reason

        if size_kb < MIN_PDF_SIZE_KB:
            return reject(f"File size {size_kb:.2f} KB < threshold {MIN_PDF_SIZE_KB} KB", include_size=False)

        try:
            reader = PdfReader(pdf_path)
            page_count = len(reader.pages)
        except PdfReadError as e:
            return reject(f"pypdf PdfReadError: {e}")
        except Exception as e_open:
            return reject(f"pypdf unexpected open error: {e_open}")

        if page_count < MIN_PDF_PAGES:
            return reject(f"Page count {page_count} < threshold {MIN_PDF_PAGES}")

        pages_to_scan = min(3, page_count)
        text_chars = 0
        if reader:  # defensive guard kept from the original implementation
            for page_idx in range(pages_to_scan):
                try:
                    page_text = reader.pages[page_idx].extract_text()
                    if page_text:
                        text_chars += len(page_text)
                except Exception as e_text_extract:
                    # A page that cannot be parsed simply contributes no text.
                    logger.warning(f"PDF Validation ? {pmid_for_log}: Error extracting text from page {page_idx+1} of '{file_label}': {e_text_extract}. Continuing.")

        if text_chars < MIN_TEXT_LENGTH_CHARS:
            return reject(f"Insufficient text ({text_chars} chars from first {pages_to_scan} page(s) < threshold {MIN_TEXT_LENGTH_CHARS})")

        logger.info(f"PDF Validation ✓ {pmid_for_log}: File '{file_label}' (Size: {size_kb:.2f}KB, Pages: {page_count}, TextLen: {text_chars} from first {pages_to_scan} page(s)) passed validation.")
        return None

    except FileNotFoundError:
        logger.error(f"PDF Validation ✗ {pmid_for_log}: File not found at {pdf_path} for validation.")
        return "File not found for validation."
    except Exception as e:
        setup_failure = f"Unexpected error during validation setup: {e}"
        logger.error(f"PDF Validation ✗ {pmid_for_log}: {setup_failure} for {pdf_path}", exc_info=True)
        return setup_failure

# === CENTRALIZED PDF DOWNLOADER (handles PoW for PMC) ===
def download_and_save_pdf(
    session: requests.Session,
    pdf_url: str,
    output_path: str,
    pmid_for_log: str,
    source_name: str,
    article_metadata: dict,
    referer: str | None = None
    ) -> bool:
    """Download ``pdf_url``, validate it, and store it at ``output_path``.

    Flow:
      1. GET the URL with browser-like headers (following redirects).
      2. If the final URL is on ``pmc.ncbi.nlm.nih.gov`` and the body is HTML,
         treat it as a proof-of-work challenge: save the page for debugging,
         solve the challenge, set the resulting cookie on ``session`` and GET
         the final URL again.
      3. If the response is a PDF (by Content-Type or a ``.pdf`` URL suffix),
         stream it to ``output_path + ".tmp"`` and run
         ``validate_downloaded_pdf``. A valid file is moved to ``output_path``;
         an invalid one is moved into the suspicious sub-directory and
         recorded via ``log_enhanced_failure_details``.
      4. Any other content is saved as ``*_unexpected_content.dat`` in the
         suspicious sub-directory and recorded the same way.

    Args:
        session: Session used for every request; it may gain a PoW cookie.
        pdf_url: URL to fetch.
        output_path: Final destination of a validated PDF.
        pmid_for_log: PMID used in log lines and debug filenames.
        source_name: Label of the source (e.g. "PMC"); used in logs/filenames.
        article_metadata: Passed through to ``log_enhanced_failure_details``.
        referer: Optional ``Referer`` header value.

    Returns:
        ``True`` only when a validated PDF now exists at ``output_path``.
        Request and unexpected errors are logged and reported as ``False``.

    NOTE(review): ``log_enhanced_failure_details`` and
    ``LOGGED_SUSPICIOUS_PMIDS`` are defined outside this part of the file.
    """
    logger.info(f"{source_name} → PMID {pmid_for_log}: Attempting download from {pdf_url}")

    current_headers = session.headers.copy()
    current_headers.update(BROWSER_LIKE_HEADERS)
    current_headers['Accept'] = 'application/pdf,text/html;q=0.9,application/xhtml+xml,application/xml;q=0.8,*/*;q=0.5'
    if referer:
        current_headers['Referer'] = referer

    parsed_pdf_url = urlparse(pdf_url)
    if 'pmc.ncbi.nlm.nih.gov' in parsed_pdf_url.netloc:
        is_same_origin_pmc = referer and 'pmc.ncbi.nlm.nih.gov' in urlparse(referer).netloc
        current_headers['Sec-Fetch-Site'] = 'same-origin' if is_same_origin_pmc else 'cross-site'

    temp_pdf_path = output_path + ".tmp"
    # Most recent response. With stream=True the connection is only handed back
    # to the pool once the body is consumed or the response is closed, so it is
    # closed in ``finally`` on every exit path.
    r = None

    try:
        r = session.get(pdf_url, headers=current_headers, stream=True, timeout=(15, 60), allow_redirects=True)
        r.raise_for_status()
        final_url_after_redirects = r.url
        content_type = r.headers.get('Content-Type', '').lower()
        logger.debug(f"{source_name} → PMID {pmid_for_log}: Initial GET to {pdf_url} (final: {final_url_after_redirects}), Content-Type: {content_type}")

        html_content_bytes = None  # challenge page bytes, reused if the retry still is not a PDF

        if "pmc.ncbi.nlm.nih.gov" in urlparse(final_url_after_redirects).netloc and 'text/html' in content_type:
            logger.info(f"{source_name} → PMID {pmid_for_log}: Received HTML from PMC URL, attempting PoW solve.")
            html_content_bytes = r.content
            html_content_str = html_content_bytes.decode('utf-8', errors='replace')

            # Keep a copy of the challenge page for troubleshooting.
            sanitized_source_detail_for_html = sanitize_filename(source_name)
            debug_html_path = os.path.join(OUTPUT_PDF_DIR, SUSPICIOUS_PDF_SUBDIR, f"{pmid_for_log}_{sanitized_source_detail_for_html}_challenge.html")
            os.makedirs(os.path.dirname(debug_html_path), exist_ok=True)
            with open(debug_html_path, "w", encoding="utf-8") as f_debug:
                f_debug.write(html_content_str)
            logger.info(f"Saved PMC challenge HTML to {debug_html_path}")

            pow_params = extract_pow_params_from_html(html_content_str)
            if not pow_params:
                logger.error(f"{source_name} → PMID {pmid_for_log}: Failed to extract PoW params from PMC HTML.")
                return False  # nothing was downloaded, so no enhanced failure record

            challenge_str, difficulty_val, cookie_name, _, cookie_path = pow_params
            solution = solve_pmc_pow(challenge_str, difficulty_val)
            if not solution:
                logger.error(f"{source_name} → PMID {pmid_for_log}: Failed to solve PMC PoW.")
                return False  # nothing was downloaded yet

            nonce_found, _ = solution
            pow_cookie_value = f"{challenge_str},{nonce_found}"

            parsed_uri = urlparse(final_url_after_redirects)
            session.cookies.set(name=cookie_name, value=pow_cookie_value, domain=parsed_uri.hostname, path=cookie_path)
            logger.info(f"Set PoW cookie '{cookie_name}' in session for {parsed_uri.hostname}.")

            logger.info(f"{source_name} → PMID {pmid_for_log}: Re-attempting GET to {final_url_after_redirects} WITH PoW cookie.")
            current_headers['Accept'] = 'application/pdf,application/octet-stream,*/*;q=0.8'
            current_headers['Referer'] = final_url_after_redirects
            current_headers['Sec-Fetch-Site'] = 'same-origin'

            r.close()  # done with the challenge response before replacing it
            r = session.get(final_url_after_redirects, headers=current_headers, stream=True, timeout=(15, 60))
            r.raise_for_status()
            content_type = r.headers.get('Content-Type', '').lower()
            logger.debug(f"{source_name} → PMID {pmid_for_log}: Second GET (post-PoW), Content-Type: {content_type}")

        if 'application/pdf' in content_type or final_url_after_redirects.lower().endswith(".pdf"):
            with open(temp_pdf_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=81920):
                    f.write(chunk)

            validation_failure_reason = validate_downloaded_pdf(temp_pdf_path, pmid_for_log)
            if not validation_failure_reason:
                # os.replace overwrites an existing destination on every
                # platform; os.rename raises FileExistsError on Windows.
                os.replace(temp_pdf_path, output_path)
                logger.info(f"{source_name} ✓ PMID {pmid_for_log}: Successfully downloaded and validated PDF to {output_path}")
                return True

            logger.warning(f"{source_name} ✗ PMID {pmid_for_log}: PDF from {final_url_after_redirects} failed validation: {validation_failure_reason}")
            suspicious_dir = os.path.join(OUTPUT_PDF_DIR, SUSPICIOUS_PDF_SUBDIR)
            os.makedirs(suspicious_dir, exist_ok=True)

            base_output_filename = sanitize_filename(os.path.basename(output_path).replace(".pdf", ""))
            sanitized_source_detail = sanitize_filename(source_name)
            suspicious_filename = f"{base_output_filename}.{sanitized_source_detail}.validation_failed.pdf"
            if len(os.path.join(suspicious_dir, suspicious_filename)) > 250:  # basic path-length guard
                suspicious_filename = f"{base_output_filename[:100]}.{sanitized_source_detail[:50]}.validation_failed.pdf"

            suspicious_path = os.path.join(suspicious_dir, suspicious_filename)
            try:
                if os.path.exists(temp_pdf_path):
                    # A copy from a previous run may already be there; replace
                    # it so the failure is still recorded on re-runs.
                    os.replace(temp_pdf_path, suspicious_path)
                    logger.info(f"Moved suspicious PDF to {suspicious_path}")
                    log_enhanced_failure_details(
                        "suspicious_articles_details.log",
                        pmid_for_log,
                        article_metadata,
                        "Suspicious PDF (Validation Failed)",
                        details=validation_failure_reason,
                        file_path=suspicious_path,
                        logged_set=LOGGED_SUSPICIOUS_PMIDS
                    )
                else:
                    logger.warning(f"Temporary PDF {temp_pdf_path} not found for moving to suspicious.")
            except OSError as e_rename:
                logger.error(f"OSError moving suspicious PDF {temp_pdf_path} to {suspicious_path}: {e_rename}")
                if os.path.exists(temp_pdf_path):
                    os.remove(temp_pdf_path)
            return False

        # Non-PDF content: keep the payload for inspection and record the failure.
        logger.warning(f"{source_name} ✗ PMID {pmid_for_log}: Non-PDF content from {final_url_after_redirects}. Content-Type: {content_type}")

        sanitized_source_detail_for_debug = sanitize_filename(source_name)
        debug_content_filename = f"{pmid_for_log}_{sanitized_source_detail_for_debug}_unexpected_content.dat"
        debug_content_path = os.path.join(OUTPUT_PDF_DIR, SUSPICIOUS_PDF_SUBDIR, debug_content_filename)
        os.makedirs(os.path.dirname(debug_content_path), exist_ok=True)
        try:
            # Prefer the PoW challenge page when one was captured, else this response's body.
            content_to_save = html_content_bytes if html_content_bytes is not None else r.content
            with open(debug_content_path, 'wb') as f_debug:
                f_debug.write(content_to_save)
            logger.info(f"Saved unexpected content ({len(content_to_save)} bytes) to {debug_content_path}")
            log_enhanced_failure_details(
                "suspicious_articles_details.log",
                pmid_for_log,
                article_metadata,
                "Non-PDF Content Received",
                details=f"Content-Type: {content_type}, URL: {final_url_after_redirects}",
                file_path=debug_content_path,
                logged_set=LOGGED_SUSPICIOUS_PMIDS
            )
        except Exception as e_save_debug:
            logger.error(f"Error saving unexpected content for PMID {pmid_for_log}: {e_save_debug}")
        return False

    except requests.exceptions.RequestException as e:
        logger.warning(f"{source_name} ✗ PMID {pmid_for_log}: RequestException for {pdf_url}: {e}")
    except Exception as e_main:
        logger.error(f"{source_name} ✗ PMID {pmid_for_log}: Unexpected error during download from {pdf_url}: {e_main}", exc_info=True)
    finally:
        if r is not None:
            r.close()

    # Only reached after an exception: drop any partially written temp file.
    if os.path.exists(temp_pdf_path):
        try:
            os.remove(temp_pdf_path)
        except OSError as e_remove:
            logger.warning(f"Could not remove temp PDF {temp_pdf_path}: {e_remove}")
    return False

# === STEP 1: FETCH METADATA FROM PUBMED ===
def _find_first(node, *paths):
    """Return the first element matched by any of ``paths`` (tried in order), else None.

    ``node.find(a) or node.find(b)`` must not be used for this: an Element
    without child elements is falsy, so a successfully found leaf element
    (such as ``<ArticleId>`` or ``<Year>``) would be discarded.
    """
    for path in paths:
        found = node.find(path)
        if found is not None:
            return found
    return None


def _parse_pubmed_article(art_node):
    """Convert one ``<PubmedArticle>`` element into ``(pmid, record)``.

    Returns ``None`` when the article carries no PMID. ``record`` has the keys
    ``doi``, ``year``, ``author``, ``title``, ``abstract`` and ``mesh_terms``.
    """
    pmid_el = art_node.find(".//PMID")
    pmid = pmid_el.text.strip() if pmid_el is not None and pmid_el.text else None
    if not pmid:
        return None

    # The ArticleId lookup is anchored to the article's own ArticleIdList so a
    # DOI belonging to an entry of the reference list can never be picked up.
    doi_el = _find_first(
        art_node,
        "./PubmedData/ArticleIdList/ArticleId[@IdType='doi']",
        ".//ELocationID[@EIdType='doi'][@ValidYN='Y']",
    )
    doi = doi_el.text.strip() if doi_el is not None and doi_el.text else None

    year = "UnknownYear"
    year_el = _find_first(art_node, ".//PubDate/Year", ".//Journal/JournalIssue/PubDate/Year")
    year_text = year_el.text.strip() if year_el is not None and year_el.text else ""
    if year_text.isdigit() and len(year_text) == 4:
        year = year_text
    else:
        # Fall back to the leading four digits of a free-form MedlineDate.
        medline_date_el = _find_first(
            art_node,
            ".//PubDate/MedlineDate",
            ".//Article/Journal/JournalIssue/PubDate/MedlineDate",
        )
        if medline_date_el is not None and medline_date_el.text:
            year_match = re.match(r"^\d{4}", medline_date_el.text.strip())
            if year_match:
                year = year_match.group(0)

    author_el = art_node.find(".//AuthorList/Author[1]/LastName")
    author = author_el.text.strip() if author_el is not None and author_el.text else "UnknownAuthor"

    title_el = art_node.find(".//ArticleTitle")
    title = "".join(title_el.itertext()).strip() if title_el is not None else f"NoTitle_{pmid}"

    # itertext() keeps text that follows inline markup (<i>, <sup>, ...);
    # reading only .text would cut each section at its first nested tag.
    abstract_parts = []
    for abstract_text_node in art_node.findall(".//Abstract/AbstractText"):
        section_text = "".join(abstract_text_node.itertext()).strip()
        if not section_text:
            continue
        label = abstract_text_node.get("Label")
        abstract_parts.append(f"[{label.upper()}] {section_text}" if label else section_text)
    abstract = "\n".join(abstract_parts) if abstract_parts else "N/A"

    mesh_terms = []
    for mesh_heading_node in art_node.findall(".//MeshHeadingList/MeshHeading"):
        descriptor_name_node = mesh_heading_node.find("./DescriptorName")
        if descriptor_name_node is not None and descriptor_name_node.text:
            mesh_terms.append(descriptor_name_node.text.strip())
    mesh_terms_str = "; ".join(mesh_terms) if mesh_terms else "N/A"

    return pmid, {
        'doi': doi, 'year': year, 'author': author, 'title': title,
        'abstract': abstract, 'mesh_terms': mesh_terms_str
    }


def fetch_metadata(pmids):
    """Fetch PubMed metadata for ``pmids`` through NCBI EFetch.

    PMIDs are POSTed in batches of ``EFETCH_BATCH_SIZE`` with a pause of
    ``DELAY_NCBI`` seconds between batches. A failing batch is logged and
    skipped; it never aborts the remaining batches.

    Args:
        pmids: Sequence of PMIDs (strings or integers).

    Returns:
        Dict keyed by PMID **string** with ``doi`` (``None`` if absent),
        ``year`` (``"UnknownYear"`` if absent), ``author`` (first author's last
        name or ``"UnknownAuthor"``), ``title``, ``abstract`` and
        ``mesh_terms`` (the last two are ``"N/A"`` when absent).
    """
    meta = {}
    batches = [pmids[i:i+EFETCH_BATCH_SIZE] for i in range(0, len(pmids), EFETCH_BATCH_SIZE)]
    logger.info(f"EFetch PubMed metadata for {len(pmids)} PMIDs in {len(batches)} batch(es)...")

    for i_batch, batch in enumerate(batches):
        efetch_payload = _get_ncbi_params({
            "db": "pubmed",
            "retmode": "xml",
            "id": ",".join(map(str, batch))
        })
        logger.info(f"NCBI EFetch POST (batch {i_batch+1}/{len(batches)}) → IDs={','.join(map(str,batch[:3]))}...")
        resp = None
        try:
            resp = session_ncbi.post(
                "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
                data=efetch_payload,
                timeout=60
            )
            resp.raise_for_status()
            root = ET.fromstring(resp.content)

            for art_node in root.findall(".//PubmedArticle"):
                parsed = _parse_pubmed_article(art_node)
                if parsed is not None:
                    pmid, record = parsed
                    meta[pmid] = record
        except requests.exceptions.RequestException as e_req:
            logger.warning(f"NCBI EFetch batch {i_batch+1} RequestException: {e_req}")
        except ET.ParseError as e_xml:
            response_text_snippet = resp.text[:200] if resp is not None else "N/A"
            logger.warning(f"NCBI EFetch batch {i_batch+1} XML ParseError: {e_xml}. Content: {response_text_snippet}")
        except Exception as e_generic:
            logger.error(f"NCBI EFetch batch {i_batch+1} unexpected error: {e_generic}", exc_info=True)

        if i_batch < len(batches) - 1:
            time.sleep(DELAY_NCBI)

    # Keys of ``meta`` are strings, so compare on the string form; otherwise
    # integer PMIDs (e.g. read from Excel) would all be reported as missing.
    missing_meta_pmids = [p for p in pmids if str(p).strip() not in meta]
    if missing_meta_pmids:
        logger.warning(f"Metadata missing for {len(missing_meta_pmids)} PMIDs: {missing_meta_pmids[:10]}...")
    return meta

# === STEP 2: OPEN ACCESS (Unpaywall, PMC with PoW) ===
def unpaywall_get_pdf_url(doi: str) -> str | None:
    if not doi: return None # Skip if no DOI
    if not UNPAYWALL_EMAIL or UNPAYWALL_EMAIL == "your_email@example.com":
        logger.debug(f"Unpaywall API skipped for DOI {doi}: UNPAYWALL_EMAIL not configured.")
        return None

    # Use quote_plus for proper URL encoding of the DOI
    api_url = f"https://api.unpaywall.org/v2/{quote_plus(doi)}?email={UNPAYWALL_EMAIL}"
    logger.info(f"Unpaywall API GET → DOI {doi}") # Removed full URL from log for brevity
    try:
        r = session_oa.get(api_url, timeout=20) # Use the general OA session
        r.raise_for_status()
        data = r.json()
        
        if data.get("is_oa"):
            pdf_url = None
            # Prioritize best_oa_location
            best_loc = data.get("best_oa_location")
            if best_loc and best_loc.get("url_for_pdf"):
                pdf_url = best_loc.get("url_for_pdf")
            
            # Fallback to checking all oa_locations if best_oa_location has no PDF URL
            if not pdf_url:
                for loc in data.get("oa_locations", []):
                    if loc.get("url_for_pdf"):
                        pdf_url = loc.get("url_for_pdf")
                        logger.debug(f"Unpaywall API: Found PDF URL in other oa_locations: {pdf_url}")
                        break # Take the first one found
            
            if pdf_url:
                logger.info(f"Unpaywall API ✓ DOI {doi}: Found PDF URL: {pdf_url.split('?')[0]}...") # Log base URL
                return pdf_url
            else:
                logger.info(f"Unpaywall API ? DOI {doi}: Article is OA, but no direct PDF URL in Unpaywall response.")
        else:
            logger.info(f"Unpaywall API ~ DOI {doi}: Not OA according to Unpaywall.")
            
    except requests.exceptions.RequestException as e_req:
        logger.warning(f"Unpaywall API ✗ DOI {doi}: RequestException: {e_req}")
    except ValueError as e_json: # Handles JSONDecodeError
        logger.warning(f"Unpaywall API ✗ DOI {doi}: JSON Decode Error: {e_json}. Response: {r.text[:200] if 'r' in locals() else 'N/A'}")
    except Exception as e_generic:
        logger.error(f"Unpaywall API ✗ DOI {doi}: Unexpected error: {e_generic}", exc_info=True)
    return None


def pmc_id_for_pmid(pmid: str, article_metadata: dict) -> str | None:
    """Resolve the PMCID linked to ``pmid`` through NCBI ELink.

    Only the direct ``pubmed_pmc`` link set is queried. The first linked id is
    returned, prefixed with ``"PMC"`` when ELink reports it as bare digits.

    Args:
        pmid: PubMed identifier to look up.
        article_metadata: Currently unused; accepted so a later verification
            step can compare the PMCID against the article's DOI/title.

    Returns:
        A PMCID such as ``"PMC1234567"``, or ``None`` when no link exists, the
        id has an unexpected form, or the request/parsing fails (all logged).
    """
    linkname_to_try = "pubmed_pmc"
    elink_query = _get_ncbi_params({
        "dbfrom": "pubmed",
        "db": "pmc",
        "id": pmid,
        "cmd": "neighbor_score",
        "linkname": linkname_to_try
    })

    logger.info(f"PMC ID ELink → PMID {pmid}: Querying with linkname '{linkname_to_try}'.")
    response = None
    try:
        response = session_ncbi.post(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi",
            data=elink_query,
            timeout=20
        )
        response.raise_for_status()
        tree = ET.fromstring(response.content)

        link_set = tree.find(f".//LinkSetDb[LinkName='{linkname_to_try}']")
        if link_set is None:
            # No link set at all: surface any <Info> message NCBI sent instead.
            general_info = tree.find(".//Info")
            if general_info is not None and general_info.text:
                logger.info(f"PMC ID ELink ~ PMID {pmid}: NCBI Info: {general_info.text.strip()}")
            else:
                logger.debug(f"PMC ID ELink ~ PMID {pmid}: No <LinkSetDb> for '{linkname_to_try}'. XML: {response.text[:250]}")
            return None

        linked_ids = [id_el.text.strip() for id_el in link_set.findall("./Link/Id") if id_el.text]
        if not linked_ids:
            scoped_info = link_set.find("./Info")  # <Info> scoped to this link set
            if scoped_info is not None and scoped_info.text:
                logger.info(f"PMC ID ELink ~ PMID {pmid}: NCBI Info for '{linkname_to_try}': {scoped_info.text.strip()}")
            else:
                logger.info(f"PMC ID ELink ~ PMID {pmid}: No PMCID <Id> elements for '{linkname_to_try}'.")
            return None

        candidate = linked_ids[0]  # only the first linked id is considered
        if candidate.upper().startswith("PMC"):
            pmcid = candidate
        elif candidate.isdigit():
            pmcid = "PMC" + candidate
        else:
            logger.warning(f"PMC ID ELink ? PMID {pmid}: Non-standard ID '{candidate}' from '{linkname_to_try}'.")
            return None

        logger.info(f"PMC ID ELink ✓ PMID {pmid}: Found PMCID {pmcid} via '{linkname_to_try}'.")
        return pmcid

    except requests.exceptions.RequestException as e_req:
        logger.warning(f"PMC ID ELink ✗ PMID {pmid} ('{linkname_to_try}'): RequestException: {e_req}")
    except ET.ParseError as e_xml:
        body_snippet = response.text[:250] if response is not None and hasattr(response, 'text') else "N/A"
        logger.warning(f"PMC ID ELink ✗ PMID {pmid} ('{linkname_to_try}'): XML ParseError: {e_xml}. Content: {body_snippet}")
    except Exception as e_generic:
        logger.error(f"PMC ID ELink ✗ PMID {pmid} ('{linkname_to_try}'): Unexpected error: {e_generic}", exc_info=True)

    # Only reached when one of the handlers above swallowed an exception.
    logger.warning(f"PMC ID ELink ✗ PMID {pmid}: No PMCID from '{linkname_to_try}' after full attempt.")
    return None


def pmc_attempt_download(pmcid: str, pmid: str, md: dict) -> bool:
    """Try to fetch the PDF of a PMC article into OUTPUT_PDF_DIR.

    A HEAD request against the article's ``/pdf/`` landing URL follows
    redirects to learn the final URL; the transfer itself is delegated to
    ``download_and_save_pdf``.

    Args:
        pmcid: PMC identifier, interpolated verbatim into the landing URL.
        pmid: PubMed ID, used for the file name and in log messages.
        md: Article metadata; ``year``, ``author`` and ``title`` are read for
            the file name, with placeholders substituted for missing keys.

    Returns:
        True if the target file already exists or the download helper
        reports success, otherwise False.
    """
    stem = sanitize_filename(
        "{}-{}-{}-{}".format(
            md.get('year', 'UnknownYear'),
            pmid,
            md.get('author', 'UnknownAuthor'),
            md.get('title', f'NoTitle_{pmid}'),
        )
    )
    target_path = os.path.join(OUTPUT_PDF_DIR, stem + ".pdf")

    # Defensive re-check; the calling worker performs the same test.
    if os.path.exists(target_path):
        logger.info(f"PMC ✓ {pmid} ({pmcid}): PDF already exists at {target_path} (checked in pmc_attempt_download).")
        return True

    # The /pdf/ landing URL is expected to redirect to the real PDF location.
    landing_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/pdf/"
    logger.info(f"PMC HEAD → PMID {pmid} ({pmcid}): Probing {landing_url} for final PDF URL.")

    probe_headers = session_oa.headers.copy()
    probe_headers.update(BROWSER_LIKE_HEADERS)
    probe_headers.update({
        'Accept': 'application/pdf, text/html;q=0.9, */*;q=0.8',
        'Sec-Fetch-Site': 'cross-site',
    })

    download_url = None
    try:
        probe = session_oa.head(
            landing_url,
            headers=probe_headers,
            timeout=(10, 25),      # (connect, read)
            allow_redirects=True,  # we want the URL after every redirect
        )
        probe.raise_for_status()
        download_url = probe.url

        lowered = download_url.lower()
        looks_like_pdf = (
            lowered.endswith(".pdf")
            or "format=pdf" in lowered
            or "/pdf/" in lowered
        )
        if looks_like_pdf:
            logger.info(f"PMC HEAD ✓ PMID {pmid} ({pmcid}): Resolved potential PDF URL: {download_url}")
        else:
            logger.warning(f"PMC HEAD ? PMID {pmid} ({pmcid}): Resolved URL {download_url} doesn't strongly indicate PDF. Proceeding cautiously.")
    except requests.exceptions.RequestException as e:
        # A failed probe is not fatal: fall back to a GET on the landing URL.
        logger.warning(f"PMC HEAD ✗ PMID {pmid} ({pmcid}) for {landing_url}: {e}. Will attempt GET on landing URL.")
        download_url = landing_url
    except Exception as e_head_generic:
        logger.error(f"PMC HEAD ✗ PMID {pmid} ({pmcid}): Unexpected error during HEAD request: {e_head_generic}", exc_info=True)
        return False

    if not download_url:
        logger.error(f"PMC ✗ PMID {pmid} ({pmcid}): No URL determined for download attempt after HEAD request.")
        return False

    return download_and_save_pdf(
        session_oa,
        download_url,
        target_path,
        pmid,
        source_name=f"PMC({pmcid})",
        referer=landing_url,
    )


def oa_worker(pmid: str, md: dict) -> tuple[str, bool]:
    """Fetch one article through Open Access sources: Unpaywall, then PMC.

    Args:
        pmid: PubMed ID being processed.
        md: Article metadata; ``doi``, ``year``, ``author`` and ``title`` are
            read, with placeholders substituted for missing name parts.

    Returns:
        ``(pmid, True)`` if the PDF already exists or was obtained from one of
        the sources, ``(pmid, False)`` otherwise. Any unexpected exception is
        logged and reported as a failure, so a tuple is always returned.
    """
    try:
        name_parts = (
            md.get('year', 'UnknownYear'),
            pmid,
            md.get('author', 'UnknownAuthor'),
            md.get('title', f'NoTitle_{pmid}'),
        )
        doi = md.get('doi')
        stem = sanitize_filename("{}-{}-{}-{}".format(*name_parts))
        target_path = os.path.join(OUTPUT_PDF_DIR, stem + ".pdf")

        # Nothing to do when a previous run already produced the file.
        if os.path.exists(target_path):
            logger.info(f"OA ✓ {pmid}: PDF already exists at {target_path}")
            return pmid, True

        # 1) Unpaywall (requires a DOI)
        if not doi:
            logger.info(f"OA: No DOI for PMID {pmid}. Skipping Unpaywall, trying PMC.")
        else:
            candidate_url = unpaywall_get_pdf_url(doi)
            if not candidate_url:
                logger.info(f"OA: No PDF URL from Unpaywall for PMID {pmid} (DOI {doi}). Trying PMC.")
            elif download_and_save_pdf(
                session_oa,
                candidate_url,
                target_path,
                pmid,
                source_name=f"Unpaywall(DOI:{doi})",
                article_metadata=md,
                referer=f"https://doi.org/{quote_plus(doi)}",  # doi.org link as Referer
            ):
                return pmid, True
            else:
                logger.info(f"OA: Unpaywall attempt for PMID {pmid} (DOI {doi}) failed download/validation. Trying PMC.")

        # 2) PMC fallback
        pmcid = pmc_id_for_pmid(pmid, md)
        if pmcid and pmc_attempt_download(pmcid, pmid, md):
            return pmid, True

        logger.warning(f"OA ✗ {pmid}: No PDF found via Unpaywall or PMC.")
        return pmid, False

    except Exception as e_oa_worker:
        # Last line of defence so the thread pool always receives a tuple.
        logger.error(f"OA Worker UNHANDLED EXCEPTION for PMID {pmid}: {e_oa_worker}", exc_info=True)
        return pmid, False

# === STEP 3: SCI-HUB ===
def test_scihub_domain(domain: str) -> bool:
    """Report whether a Sci-Hub mirror answers a probe request.

    The mirror counts as responsive when it returns HTTP 404, or HTTP 200
    with either an HTML content type or a body containing one of a few
    marker strings. Timeouts and request errors are logged and yield False.
    """
    probe_url = f"{domain.rstrip('/')}/10.1000/182"  # fixed test DOI
    logger.debug(f"Sci-Hub TEST GET → {probe_url}")
    try:
        # Short timeout: this is only a liveness check.
        resp = session_scihub.get(probe_url, timeout=10, headers=BROWSER_LIKE_HEADERS)
        status = resp.status_code

        if status == 404:
            # The server answered, it just doesn't know the test DOI.
            logger.info(f"Sci-Hub TEST ✓ {domain} is responsive (status 404, expected for non-existent test DOI).")
            return True

        if status == 200:
            served_html = 'html' in resp.headers.get('Content-Type', '').lower()
            # The body is only inspected when the content type is inconclusive.
            if not served_html:
                body = resp.text.lower()
                markers = ('sci-hub', 'save', 'download', '<button id="download">')
                served_html = any(marker in body for marker in markers)
            if served_html:
                logger.info(f"Sci-Hub TEST ✓ {domain} is responsive (status {resp.status_code}).")
                return True

        logger.warning(f"Sci-Hub TEST ? {domain} responded status {resp.status_code}, CT: {resp.headers.get('Content-Type','')}. Text snippet: {resp.text[:100]}")
    except requests.exceptions.Timeout:
        logger.warning(f"Sci-Hub TEST ✗ {domain} timed out.")
    except requests.exceptions.RequestException as e:
        logger.warning(f"Sci-Hub TEST ✗ {domain} error: {e}")
    except Exception as e_generic_test:
        logger.error(f"Sci-Hub TEST ✗ {domain} unexpected error: {e_generic_test}", exc_info=True)
    return False

def init_scihub_domains() -> list[str]:
    """Probe the configured Sci-Hub mirrors and return the responsive ones.

    Mirrors are tested concurrently (at most three at a time) with
    ``test_scihub_domain``. The result lists working mirrors in completion
    order, except that ``https://sci-hub.se`` is moved to the front when it
    is among them.

    Returns:
        The working mirror URLs; an empty list when none responded or when
        ``SCI_HUB_DOMAINS`` is empty.
    """
    logger.info("Probing Sci-Hub mirrors for availability...")

    # Drop repeated entries (order-preserving) so no mirror is probed, or
    # later tried, twice.
    candidates = list(dict.fromkeys(SCI_HUB_DOMAINS))
    working_domains = []

    # ThreadPoolExecutor rejects max_workers=0 with ValueError, so an empty
    # mirror list must bypass the pool entirely.
    if candidates:
        pool_size = min(len(candidates), 3)  # keep the probing gentle
        with ThreadPoolExecutor(max_workers=pool_size, thread_name_prefix="SciHub_Domain_Test") as executor:
            future_to_domain = {executor.submit(test_scihub_domain, d): d for d in candidates}
            for future in as_completed(future_to_domain):
                domain = future_to_domain[future]
                try:
                    if future.result():
                        working_domains.append(domain)
                except Exception as exc:
                    logger.error(f"Sci-Hub domain test for {domain} generated an exception during result retrieval: {exc}")

    if not working_domains:
        logger.error("CRITICAL: No working Sci-Hub domains found after testing!")
        return working_domains

    # Simple heuristic: try the .se mirror first when it is available.
    preferred = "https://sci-hub.se"
    if preferred in working_domains:
        working_domains.remove(preferred)
        working_domains.insert(0, preferred)
    logger.info(f"Using Sci-Hub domains: {working_domains}")
    return working_domains


def find_scihub_pdf_in_html(html_content: bytes, base_page_url: str) -> str | None:
    """Extract a PDF URL from a Sci-Hub viewer page.

    Args:
        html_content: Page body handed to BeautifulSoup (the caller passes
            raw bytes so the parser can detect the encoding).
        base_page_url: URL the page was fetched from; only its scheme and
            host are used to absolutise the links that are found.

    Returns:
        The first acceptable absolute URL, or None when nothing matched.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    page = urlparse(base_page_url)
    site_root = f"{page.scheme}://{page.netloc}"

    def _absolutise(link: str) -> str:
        # Protocol-relative links inherit the page's scheme.
        if link.startswith("//"):
            link = f"{page.scheme}:{link}"
        return link

    def _plausible_pdf(url: str) -> bool:
        # Accept URLs that mention .pdf or that live on a configured mirror.
        return ".pdf" in url.lower() or any(
            host in url for host in [urlparse(d).netloc for d in SCI_HUB_DOMAINS]
        )

    # (CSS selector, attribute) pairs, most specific first.
    lookup_rules = (
        ('iframe#pdf', 'src'),
        ('iframe#article', 'src'),
        ('embed[type="application/pdf"]', 'src'),
        ('iframe[src*=".pdf"]', 'src'),
        ('a#download', 'href'),
        ('div.buttons > a[href*=".pdf"]', 'href'),
        ('div#buttons > a[href*=".pdf"]', 'href'),
        ('a[href*=".pdf"]', 'href'),
    )

    for selector, attr in lookup_rules:
        node = soup.select_one(selector)
        if not node:
            continue
        link = node.get(attr)
        if not link:
            continue
        link = _absolutise(link)
        # data: and javascript: URIs can never be downloaded.
        if link.lower().startswith(('data:', 'javascript:')):
            continue
        resolved_url = urljoin(site_root, link)
        if _plausible_pdf(resolved_url):
            logger.debug(f"Sci-Hub HTML Parse: Found PDF link '{resolved_url}' using selector '{selector}'")
            return resolved_url

    # Last resort: buttons/anchors that navigate through an onclick handler.
    for clickable in soup.select('button[onclick*="location.href"], a[onclick*="location.href"]'):
        found = re.search(
            r"location\.href\s*=\s*['\"]([^'\"]+\.pdf[^'\"]*)['\"]",
            clickable.get('onclick', ''),
            re.IGNORECASE,
        )
        if not found:
            continue
        resolved_url = urljoin(site_root, _absolutise(found.group(1).strip()))
        logger.debug(f"Sci-Hub HTML Parse: Found PDF link '{resolved_url}' from onclick attribute.")
        return resolved_url

    logger.debug(f"Sci-Hub HTML Parse: No obvious PDF link found in HTML from {base_page_url}")
    return None

def _scihub_try_domain(domain_url: str, page_url: str, request_headers, pmid: str, output_pdf_path: str) -> bool:
    """Make one attempt against a single mirror; True only if a PDF was saved."""
    try:
        r_page = session_scihub.get(page_url, headers=request_headers, timeout=30)
        r_page.raise_for_status()
        page_content_type = r_page.headers.get('Content-Type', '').lower()

        # Case 1: the mirror answered with the PDF itself.
        if 'application/pdf' in page_content_type:
            logger.info(f"Sci-Hub ? PMID {pmid}: URL {page_url} served PDF directly. Attempting download...")
            # r_page.url is used because the request may have been redirected.
            if download_and_save_pdf(session_scihub, r_page.url, output_pdf_path, pmid, source_name=f"SciHub_Direct({domain_url})", referer=domain_url):
                return True
            logger.warning(f"Sci-Hub ✗ PMID {pmid}: Direct PDF from {page_url} failed validation or download.")
            return False

        # Case 2: HTML viewer page. The body is sniffed as well because the
        # Content-Type header alone can be misleading.
        is_html = 'html' in page_content_type or r_page.content[:100].strip().lower().startswith((b'<!doctype html', b'<html'))
        if not is_html:
            logger.warning(f"Sci-Hub ✗ {pmid} via {domain_url}: Unexpected Content-Type '{page_content_type}' from {page_url}. Snippet: {r_page.text[:100]}")
            return False

        pdf_url_from_html = find_scihub_pdf_in_html(r_page.content, r_page.url)
        if not pdf_url_from_html:
            logger.warning(f"Sci-Hub HTML ✗ {pmid} via {domain_url}: No PDF link found within HTML from {page_url}")
            return False

        logger.info(f"Sci-Hub HTML ✓ PMID {pmid}: Found potential PDF link: {pdf_url_from_html.split('?')[0]}...")
        if download_and_save_pdf(session_scihub, pdf_url_from_html, output_pdf_path, pmid, source_name=f"SciHub_Extracted({domain_url})", referer=page_url):
            return True
        return False

    except requests.exceptions.RequestException as e_req:
        logger.warning(f"Sci-Hub ✗ {pmid} via {domain_url} ({page_url}): RequestException: {e_req}")
    except Exception as e_sh_domain:
        logger.error(f"Sci-Hub ✗ {pmid} via {domain_url} ({page_url}): General error {e_sh_domain.__class__.__name__}: {e_sh_domain}", exc_info=True)
    return False


def scihub_worker(identifier: str, pmid: str, md: dict, active_domains: list) -> tuple[str, bool]:
    """Try to obtain one article's PDF from the given Sci-Hub mirrors.

    Mirrors are tried in order until one yields a saved PDF. After every
    failed attempt except the last, the worker sleeps for ``DELAY_SCIHUB``
    seconds. This includes the case where a mirror served a PDF directly but
    the download helper rejected it, which previously skipped the pause.

    Args:
        identifier: DOI or PMID appended (URL-quoted) to the mirror URL.
        pmid: PubMed ID, used for the file name and in log messages.
        md: Article metadata; ``year``, ``author`` and ``title`` are read for
            the file name, with placeholders substituted for missing keys.
        active_domains: Mirror base URLs to try, in priority order.

    Returns:
        ``(pmid, True)`` if the file already exists or a download succeeded,
        ``(pmid, False)`` otherwise.
    """
    year_val = md.get('year', 'UnknownYear')
    author_val = md.get('author', 'UnknownAuthor')
    title_val = md.get('title', f'NoTitle_{pmid}')
    fname_base = sanitize_filename(f"{year_val}-{pmid}-{author_val}-{title_val}")
    output_pdf_path = os.path.join(OUTPUT_PDF_DIR, fname_base + ".pdf")

    if os.path.exists(output_pdf_path):
        logger.info(f"Sci-Hub ✓ {pmid}: PDF already exists at {output_pdf_path} (checked in scihub_worker).")
        return pmid, True

    if not active_domains:  # normally prevented by the caller
        logger.error(f"Sci-Hub ✗ {pmid}: No active Sci-Hub domains to try for identifier '{identifier}'.")
        return pmid, False

    # Loop-invariant request pieces are prepared once.
    quoted_identifier = quote_plus(identifier)
    sh_headers = session_scihub.headers.copy()
    sh_headers.update(BROWSER_LIKE_HEADERS)
    sh_headers['Sec-Fetch-Site'] = 'none'

    total = len(active_domains)
    for attempt, domain_url in enumerate(active_domains, start=1):
        scihub_page_url = f"{domain_url.rstrip('/')}/{quoted_identifier}"
        logger.info(f"Sci-Hub HTML GET → PMID {pmid} from {scihub_page_url} (Attempt {attempt}/{total})")

        if _scihub_try_domain(domain_url, scihub_page_url, sh_headers, pmid, output_pdf_path):
            return pmid, True

        # Throttle between mirrors, whatever the reason for the failure.
        if attempt < total:
            time.sleep(DELAY_SCIHUB)

    logger.error(f"Sci-Hub ✗ {pmid}: Failed for identifier '{identifier}' after trying all active domains.")
    return pmid, False

def log_enhanced_failure_details(log_filename: str, pmid: str, metadata: dict, 
                                 failure_type: str, details: str | None = None, 
                                 file_path: str | None = None, logged_set: set | None = None):
    """Append a structured record about one PMID to a log file in OUTPUT_PDF_DIR.

    Args:
        log_filename: File name, joined onto OUTPUT_PDF_DIR.
        pmid: PubMed ID the record is about.
        metadata: Article metadata; ``None`` is treated as an empty dict and
            missing keys are written as ``N/A``.
        failure_type: Short label written into the record.
        details: Optional free-text explanation.
        file_path: Optional path of a related file; written as an absolute path.
        logged_set: Optional set of PMIDs already recorded. When given, a PMID
            found in it is skipped, and the PMID is added after a successful
            write. Pass ``None`` to disable this duplicate check.
    """
    already_recorded = logged_set is not None and pmid in logged_set
    if already_recorded:
        logger.debug(f"PMID {pmid} already logged in {log_filename}. Skipping duplicate entry.")
        return

    if metadata is None:
        metadata = {}
        logger.warning(f"No metadata provided for PMID {pmid} during detailed logging to {log_filename}.")

    separator = "----------------------------------------"
    record = [separator, f"PMID: {pmid}", f"Failure Type: {failure_type}"]
    if details:
        record.append(f"Details: {details}")
    if file_path:
        record.append(f"File Path: {os.path.abspath(file_path)}")

    record.append(f"DOI: {metadata.get('doi', 'N/A')}")
    record.append(f"Year: {metadata.get('year', 'N/A')}")
    record.append(f"Author: {metadata.get('author', 'N/A')}")
    record.append(f"Title: {metadata.get('title', 'N/A')}")
    # The abstract starts on its own line because it may span several lines.
    record.append(f"Abstract:\n{metadata.get('abstract', 'N/A')}")
    record.append(f"MeSH Terms: {metadata.get('mesh_terms', 'N/A')}")
    # The trailing newline keeps consecutive records visually separated.
    record.append(separator + "\n")

    destination = os.path.join(OUTPUT_PDF_DIR, log_filename)
    try:
        with open(destination, "a", encoding="utf-8") as f_log:
            f_log.write("\n".join(record))
        # Only remember the PMID once the record is actually on disk.
        if logged_set is not None:
            logged_set.add(pmid)
    except IOError as e:
        logger.error(f"Could not write to enhanced log file {destination}: {e}")
        
# === MAIN SCRIPT EXECUTION ===
def _normalize_pmids(raw_values) -> list[str]:
    """Convert raw spreadsheet cells into unique, canonical PMID strings.

    Values such as ``12345``, ``12345.0`` and ``"12345"`` all normalise to
    ``"12345"``. De-duplication happens after normalisation so the same
    article is never queued twice. Cells that are non-numeric, infinite,
    fractional or non-positive are skipped with a warning. First-seen order
    is preserved.
    """
    normalized: list[str] = []
    seen: set[str] = set()
    for raw in raw_values:
        try:
            as_float = float(str(raw))
            # int() raises ValueError for NaN and OverflowError for infinity.
            as_int = int(as_float)
        except (ValueError, OverflowError):
            logger.warning(f"Skipping invalid PMID format in Excel: '{raw}'")
            continue
        if as_int <= 0 or as_float != as_int:
            logger.warning(f"Skipping invalid PMID format in Excel: '{raw}'")
            continue
        pmid = str(as_int)
        if pmid not in seen:
            seen.add(pmid)
            normalized.append(pmid)
    return normalized


def _collect_worker_results(future_to_pmid: dict, phase_label: str) -> tuple[list[str], list[str]]:
    """Drain a ``{future: pmid}`` map into ``(succeeded, failed)`` PMID lists.

    Workers are expected to return ``(pmid, success)``. Anything else, or an
    exception raised by the future, is logged under ``phase_label`` and
    counted as a failure for the PMID the future was submitted for.
    """
    succeeded: list[str] = []
    failed: list[str] = []
    for future in as_completed(future_to_pmid):
        pmid = future_to_pmid[future]
        try:
            result = future.result()
        except Exception as exc:
            logger.error(f"{phase_label} Thread ✗ Exception processing PMID {pmid}: {exc}", exc_info=True)
            failed.append(pmid)
            continue

        if isinstance(result, tuple) and len(result) == 2:
            returned_pmid, success = result
            if returned_pmid != pmid:
                logger.warning(f"{phase_label} Thread ? Mismatch in returned PMID: expected {pmid}, got {returned_pmid}. Processing with {pmid}.")
            (succeeded if success else failed).append(pmid)
        elif result is None:
            logger.error(f"{phase_label} Thread ✗ For PMID {pmid}, future.result() was None. Treating as failure.")
            failed.append(pmid)
        else:
            logger.error(f"{phase_label} Thread ✗ For PMID {pmid}, worker returned unexpected result: {result}. Treating as failure.")
            failed.append(pmid)
    return succeeded, failed


def main():
    """Run the whole pipeline: load PMIDs, fetch metadata, download PDFs.

    Steps:
        1. Read the ``PMID`` column of ``EXCEL_FILE_PATH`` and normalise it.
        2. Fetch metadata with ``fetch_metadata``; PMIDs without metadata are
           not attempted.
        3. Run ``oa_worker`` for every remaining PMID in a thread pool.
        4. Run ``scihub_worker`` for the PMIDs that failed step 3, provided at
           least one Sci-Hub mirror is responsive.
        5. Log a summary and write the PMIDs that were never obtained to
           ``failed_pmids_v12.log`` inside ``OUTPUT_PDF_DIR``.

    Unrecoverable set-up problems (unreadable spreadsheet, no valid PMIDs, no
    metadata, output directory cannot be created) are logged and end the run
    early; no exception is raised for them.
    """
    t_start = time.time()
    logger.info(f"=== PDF Fetcher v12-pow started at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")

    # NOTE: file logging is optional; attach a logging.FileHandler to `logger`
    # here if a persistent run log is wanted.

    # --- 1. Load PMIDs ---
    try:
        df = pd.read_excel(EXCEL_FILE_PATH)
    except FileNotFoundError:
        logger.error(f"Excel file not found: {EXCEL_FILE_PATH}")
        return
    except Exception as e_excel:
        logger.error(f"Cannot read Excel file {EXCEL_FILE_PATH}: {e_excel}", exc_info=True)
        return
    if 'PMID' not in df.columns:
        logger.error(f"Excel file {EXCEL_FILE_PATH} must contain a 'PMID' column.")
        return

    pmids = _normalize_pmids(df['PMID'].dropna().unique())
    if not pmids:
        logger.error("No valid PMIDs found in the Excel file.")
        return
    logger.info(f"Loaded {len(pmids)} unique, valid PMIDs from {EXCEL_FILE_PATH}")

    # --- 2. Metadata ---
    metadata_dict = fetch_metadata(pmids)
    valid_pmids_with_meta = [p for p in pmids if p in metadata_dict and metadata_dict[p]]
    logger.info(f"Successfully fetched metadata for {len(valid_pmids_with_meta)} PMIDs.")
    if not valid_pmids_with_meta:
        logger.error("No metadata could be fetched for any valid PMIDs. Cannot proceed.")
        return

    suspicious_dir = os.path.join(OUTPUT_PDF_DIR, SUSPICIOUS_PDF_SUBDIR)
    try:
        os.makedirs(OUTPUT_PDF_DIR, exist_ok=True)
        os.makedirs(suspicious_dir, exist_ok=True)
    except OSError as e_mkdir:
        logger.error(f"Could not create output directories: {e_mkdir}. Exiting.")
        return
    logger.info(f"PDFs will be saved to: {os.path.abspath(OUTPUT_PDF_DIR)}")
    logger.info(f"Suspicious/failed validation files will be in: {os.path.abspath(suspicious_dir)}")

    # --- 3. Open Access phase ---
    logger.info("--- Starting Open Access Download Phase ---")
    with ThreadPoolExecutor(max_workers=MAX_THREADS, thread_name_prefix="OA_Worker") as executor:
        future_to_pmid_oa = {
            executor.submit(oa_worker, pmid, metadata_dict[pmid]): pmid
            for pmid in valid_pmids_with_meta
        }
        oa_succeeded_pmids, oa_failed_pmids = _collect_worker_results(future_to_pmid_oa, "OA")

    logger.info(f"Open Access Phase Summary: {len(oa_succeeded_pmids)} PDFs successfully downloaded and validated.")
    if oa_failed_pmids:
        logger.info(f"{len(oa_failed_pmids)} PMIDs not fetched via OA or failed validation: {sorted(oa_failed_pmids)[:10]}...")

    # --- 4. Sci-Hub phase (only for what Open Access could not deliver) ---
    sci_hub_succeeded_pmids: list[str] = []
    if not oa_failed_pmids:
        logger.info("--- Sci-Hub Download Phase Skipped: No PMIDs failed the Open Access phase. ---")
    else:
        logger.info("--- Starting Sci-Hub Download Phase for Remaining PMIDs ---")
        active_scihub_domains = init_scihub_domains()

        if not active_scihub_domains:
            logger.error("Sci-Hub phase skipped: No active Sci-Hub domains found.")
        else:
            # Fewer threads than the OA phase, but never fewer than one.
            scihub_max_workers = max(1, MAX_THREADS // 2)
            with ThreadPoolExecutor(max_workers=scihub_max_workers, thread_name_prefix="SciHub_Worker") as executor_sh:
                future_to_pmid_scihub = {}
                for pmid in oa_failed_pmids:
                    meta_for_pmid = metadata_dict[pmid]
                    # Prefer the DOI; fall back to the PMID when there is none.
                    identifier_for_scihub = meta_for_pmid.get('doi') or pmid
                    future = executor_sh.submit(
                        scihub_worker,
                        identifier_for_scihub,
                        pmid,
                        meta_for_pmid,
                        active_scihub_domains,
                    )
                    future_to_pmid_scihub[future] = pmid
                sci_hub_succeeded_pmids, scihub_failed_pmids = _collect_worker_results(future_to_pmid_scihub, "Sci-Hub")

            logger.info(f"Sci-Hub Phase Summary: {len(sci_hub_succeeded_pmids)} PDFs successfully downloaded and validated.")
            if scihub_failed_pmids:
                logger.info(f"{len(scihub_failed_pmids)} PMIDs still missing after Sci-Hub attempts or validation: {sorted(scihub_failed_pmids)[:10]}...")

    # --- 5. Summary ---
    total_succeeded = len(oa_succeeded_pmids) + len(sci_hub_succeeded_pmids)
    total_time_taken = time.time() - t_start
    logger.info("--- Overall Summary ---")
    logger.info(f"Processed {len(pmids)} unique input PMIDs.")
    logger.info(f"Attempted downloads for {len(valid_pmids_with_meta)} PMIDs (those with metadata).")
    logger.info(f"Total PDFs successfully downloaded & validated: {total_succeeded} / {len(valid_pmids_with_meta)}.")
    logger.info(f"  - Via Open Access (Unpaywall/PMC): {len(oa_succeeded_pmids)}")
    logger.info(f"  - Via Sci-Hub: {len(sci_hub_succeeded_pmids)}")

    # Everything attempted that is in neither success list is a failure,
    # whichever phase it was last seen in.
    all_succeeded_pmids = set(oa_succeeded_pmids) | set(sci_hub_succeeded_pmids)
    final_truly_failed_pmids = sorted(set(valid_pmids_with_meta) - all_succeeded_pmids)

    if final_truly_failed_pmids:
        logger.info(f"Total PMIDs ultimately NOT downloaded or failed validation: {len(final_truly_failed_pmids)}")
        failed_pmids_log_path = os.path.join(OUTPUT_PDF_DIR, "failed_pmids_v12.log")
        try:
            with open(failed_pmids_log_path, "w", encoding="utf-8") as f_failed:
                f_failed.writelines(f"{p_fail}\n" for p_fail in final_truly_failed_pmids)
            logger.info(f"List of all failed/missing PMIDs saved to: {failed_pmids_log_path}")
        except IOError as e_io_failed:
            logger.error(f"Could not write failed PMIDs log to {failed_pmids_log_path}: {e_io_failed}")
    else:
        logger.info("All requested PDFs (with metadata) successfully downloaded and validated!")

    logger.info(f"Total execution time: {total_time_taken:.2f} seconds.")
    logger.info(f"=== PDF Fetcher v12-pow completed at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")

# Entry point: run the pipeline only when this module is the executed
# top-level script (`__name__` is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

2025-05-22 13:15:32,840 - INFO - [MainThread] - main - === PDF Fetcher v12-pow started at 2025-05-22 13:15:32 ===
2025-05-22 13:15:32,867 - INFO - [MainThread] - main - Loaded 32 unique, valid PMIDs from C:\Users\Galaxy\Downloads\screening_ERAS.xlsx
2025-05-22 13:15:32,867 - INFO - [MainThread] - fetch_metadata - EFetch PubMed metadata for 32 PMIDs in 1 batch(es)...
2025-05-22 13:15:32,869 - INFO - [MainThread] - fetch_metadata - NCBI EFetch POST (batch 1/1) → IDs=39955421,40340819,39068053...
2025-05-22 13:15:34,391 - INFO - [MainThread] - main - Successfully fetched metadata for 32 PMIDs.
2025-05-22 13:15:34,392 - INFO - [MainThread] - main - PDFs will be saved to: c:\Users\Galaxy\LEVI\jupyter\litscape\downloaded_pdfs_v12_pow
2025-05-22 13:15:34,392 - INFO - [MainThread] - main - Suspicious/failed validation files will be in: c:\Users\Galaxy\LEVI\jupyter\litscape\downloaded_pdfs_v12_pow\suspicious_pdfs
2025-05-22 13:15:34,393 - INFO - [MainThread] - main - --- Starting Open Access Do

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Combined OA-first + Sci-Hub PDF Fetcher (v12-pow, standalone)
-----------------------------------------------------------
1) Reads PMIDs from an Excel file.
2) Fetches metadata (year, first author, title, DOI, Abstract, MeSH) via NCBI EFetch.
3) Tries Open Access first:
    a) Unpaywall.
    b) PMC (handles Proof-of-Work challenge, resolves final URL).
4) Any remaining PMIDs are tried via Sci-Hub in parallel.
5) All PDFs are validated (size, content) and land in OUTPUT_PDF_DIR
   named `{year}-{pmid}-{author}-{title}.pdf`.
6) Detailed logging of every URL fetched and why it failed or succeeded.
7) Enhanced, structured logging for PMIDs that fail retrieval or have suspicious PDFs.
"""

import os
import re
import time
import logging
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, quote_plus
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from concurrent.futures import ThreadPoolExecutor, as_completed
import hashlib # For PoW
try:
    from pypdf import PdfReader # Preferred (PyPDF2 successor)
    from pypdf.errors import PdfReadError
except ImportError:
    try:
        from PyPDF2 import PdfReader # Fallback
        from PyPDF2.errors import PdfReadError
    except ImportError:
        print("Please install pypdf or PyPDF2: pip install pypdf")
        exit()


# === CONFIGURATION ===
# Excel workbook the PMIDs are read from (see module docstring, step 1).
EXCEL_FILE_PATH     = r"C:\Users\Galaxy\Downloads\screening_ERAS.xlsx" # UPDATE THIS
# Directory that receives validated PDFs and the enhanced failure logs.
OUTPUT_PDF_DIR      = "downloaded_pdfs_v12_pow"
# Sub-directory of OUTPUT_PDF_DIR; besides PDFs that fail validation it also
# receives saved PMC challenge pages and unexpected non-PDF responses.
SUSPICIOUS_PDF_SUBDIR = "suspicious_pdfs" # Subdirectory for failed validation PDFs

# Sci-Hub
# Candidate mirror base URLs.
SCI_HUB_DOMAINS     = [
    "https://sci-hub.se", "https://sci-hub.ru", "https://sci-hub.ren",
    "https://sci-hub.wf", "https://sci-hub.ee", "https://sci-hub.st"
]
# NOTE(review): presumably a pause in seconds between Sci-Hub requests; the
# code that consumes it is not part of this section - confirm.
DELAY_SCIHUB        = 0.5

# Threads / batching
# Sizes the HTTP connection pools in make_session(); presumably also the
# worker-thread count used by the download stages - confirm in main().
MAX_THREADS         = 5 
# Number of PMIDs sent in a single EFetch request by fetch_metadata().
EFETCH_BATCH_SIZE   = 100
# fetch_metadata() sleeps this long between consecutive EFetch batches.
DELAY_NCBI          = 0.35 # Seconds between NCBI EUtils calls

# PDF Validation Thresholds (enforced by validate_downloaded_pdf)
# Files smaller than this many KB are rejected.
MIN_PDF_SIZE_KB         = 20    
# Files with fewer pages than this are rejected.
MIN_PDF_PAGES           = 1     
# Minimum number of characters that must be extractable from the first
# (up to three) pages.
MIN_TEXT_LENGTH_CHARS   = 300   

# API credentials - PLEASE FILL THESE IN
# Sent to NCBI as `api_key` only once changed from the placeholder value.
NCBI_API_KEY        = "YOUR_API_KEY_HERE" 
# Sent to NCBI as `email` (and embedded in the NCBI User-Agent) only once
# changed from the placeholder value.
CROSSREF_EMAIL      = "your_email@example.com" 
# Appended to every Unpaywall API request; Unpaywall lookups are skipped when
# this is empty or equal to "your_email@example.com".
UNPAYWALL_EMAIL     = "levi4328@gmail.com" 

# === Global sets for tracking logged PMIDs to avoid duplicates in detailed logs ===
# PMIDs already written to the suspicious-articles detail log; passed as
# `logged_set` to log_enhanced_failure_details().
LOGGED_SUSPICIOUS_PMIDS = set()
# NOTE(review): presumably the counterpart for the failed-retrieval detail
# log; its use is outside this section - confirm.
LOGGED_FAILED_PMIDS = set()

# === LOGGING SETUP ===
logger = logging.getLogger("PDFFetcherV12")
# Discard any handlers already attached to this named logger before adding
# ours (presumably so re-running the notebook cell does not duplicate every
# log line).
logger.handlers = [] 
logger.setLevel(logging.INFO) 
ch = logging.StreamHandler()
# Each record carries the thread name and the name of the calling function.
ch.setFormatter(logging.Formatter(
    '%(asctime)s - %(levelname)s - [%(threadName)s] - %(funcName)s - %(message)s'
))
logger.addHandler(ch)

# === HTTP SESSIONS WITH RETRIES ===
# Desktop-Chrome User-Agent string used for the Open Access and Sci-Hub
# sessions created below.
CHROME_UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

def make_session(user_agent, is_scihub_session=False):
    """Build a ``requests.Session`` with retrying, pooled HTTP(S) adapters.

    Args:
        user_agent: Value installed as the session's ``User-Agent`` header.
        is_scihub_session: When true (and ``MAX_THREADS`` is greater than 2)
            the connection pool is sized to half of ``MAX_THREADS`` instead
            of the full value.

    Returns:
        The configured session. A single adapter instance is mounted for
        both the ``https://`` and ``http://`` prefixes.
    """
    # Retry up to three times, with backoff, on throttling and transient
    # server errors for the three HTTP methods this script uses.
    retry_policy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(['GET', 'POST', 'HEAD']),
    )

    if is_scihub_session and MAX_THREADS > 2:
        pool_size = MAX_THREADS // 2
    else:
        pool_size = MAX_THREADS

    shared_adapter = HTTPAdapter(
        max_retries=retry_policy,
        pool_connections=pool_size,
        pool_maxsize=pool_size * 2,
    )

    session = requests.Session()
    for scheme in ("https://", "http://"):
        session.mount(scheme, shared_adapter)
    session.headers.update({'User-Agent': user_agent})
    return session

# Contact part of the NCBI User-Agent: the configured e-mail, or
# "anonymous_user" while CROSSREF_EMAIL is empty or still the placeholder.
ncbi_ua_email_part = CROSSREF_EMAIL if CROSSREF_EMAIL and CROSSREF_EMAIL != "your_email@example.com" else "anonymous_user"
# Session for NCBI E-utilities calls (EFetch / ELink); identifies the tool.
session_ncbi   = make_session(f"PDFFetcherV12/1.0 (NCBI-EUtils-Client; mailto:{ncbi_ua_email_part})")
# Session for Unpaywall and PMC requests; uses the browser User-Agent.
session_oa     = make_session(CHROME_UA) 
# Session for Sci-Hub requests; same User-Agent, smaller connection pool.
session_scihub = make_session(CHROME_UA, is_scihub_session=True) 

# Extra request headers merged on top of a session's own headers for PDF
# downloads and the PMC HEAD probe. Callers overwrite 'Accept' (and, for PMC,
# 'Sec-Fetch-Site') on their own copy before sending.
BROWSER_LIKE_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br', 
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none', 
    'Sec-Fetch-User': '?1',
    # Ask intermediaries not to serve a cached copy.
    'Cache-Control': 'no-cache', 
    'Pragma': 'no-cache'        
}

# === UTILITIES ===
def sanitize_filename(s: str) -> str:
    """Turn arbitrary text into a name fragment that is safe to use in a file name.

    The characters ``\\ / * ? : " < > |`` are removed, every run of
    whitespace collapses to a single space, leading/trailing whitespace is
    trimmed, spaces become underscores and the result is cut to at most 150
    characters. Input that ends up empty yields ``"untitled_article"``.
    Non-string input is converted with ``str()`` first.
    """
    forbidden = str.maketrans("", "", '\\/*?:"<>|')
    without_forbidden = str(s).translate(forbidden)
    collapsed = re.sub(r'\s+', " ", without_forbidden).strip()
    if collapsed == "":
        collapsed = "untitled_article"
    underscored = collapsed.replace(" ", "_")
    return underscored[:150]

def _get_ncbi_params(extra=None):
    """Assemble the parameter dict sent with every NCBI E-utilities request.

    ``tool`` is always present. ``email`` and ``api_key`` are added only when
    ``CROSSREF_EMAIL`` / ``NCBI_API_KEY`` are set to something other than
    their placeholder values. Entries from *extra* are merged last and
    therefore override the defaults.
    """
    has_email = bool(CROSSREF_EMAIL) and CROSSREF_EMAIL != "your_email@example.com"
    has_api_key = bool(NCBI_API_KEY) and NCBI_API_KEY != "YOUR_API_KEY_HERE"

    request_params = {"tool": "PDFFetcherV12"}
    if has_email:
        request_params["email"] = CROSSREF_EMAIL
    if has_api_key:
        request_params["api_key"] = NCBI_API_KEY
    if extra:
        request_params.update(extra)
    return request_params

# === PROOF-OF-WORK (PoW) SOLVING LOGIC for PMC ===
def extract_pow_params_from_html(html_content: str) -> tuple[str, int, str, str, str] | None:
    """Pull the Proof-of-Work constants out of a PMC challenge page.

    The page is searched for JavaScript declarations of the form
    ``const NAME = "value"``. Challenge, difficulty, cookie name and cookie
    path are required; the cookie expiration is optional and falls back to
    ``"0.208333"``.

    Returns:
        ``(challenge, difficulty, cookie_name, cookie_expiration, cookie_path)``
        with the difficulty converted to ``int``, or ``None`` when a required
        constant is missing or the difficulty is not an integer.
    """
    def js_const(name):
        # Match object for `const <name> = "<value>"`, or None.
        return re.search(r'const\s+' + name + r'\s*=\s*"(.*?)"', html_content)

    challenge_m = js_const("POW_CHALLENGE")
    difficulty_m = js_const("POW_DIFFICULTY")
    name_m = js_const("POW_COOKIE_NAME")
    expiration_m = js_const("POW_COOKIE_EXPIRATION")
    path_m = js_const("POW_COOKIE_PATH")

    if not (challenge_m and difficulty_m and name_m and path_m):
        logger.warning("Could not find all required PoW parameters in HTML content.")
        return None

    challenge_string = challenge_m.group(1)
    cookie_name = name_m.group(1)
    cookie_exp_str = "0.208333" if expiration_m is None else expiration_m.group(1)
    cookie_path = path_m.group(1)

    try:
        difficulty = int(difficulty_m.group(1))
    except ValueError:
        logger.error(f"Could not parse PoW difficulty as int: '{difficulty_m.group(1)}'")
        return None

    logger.info(f"Extracted PoW params: Challenge='{challenge_string[:20]}...', Diff={difficulty}, Name='{cookie_name}'")
    return challenge_string, difficulty, cookie_name, cookie_exp_str, cookie_path

def solve_pmc_pow(challenge_string: str, difficulty: int) -> tuple[int, str] | None:
    """Brute-force the nonce for a PMC Proof-of-Work challenge.

    Nonces are tried from 0 upward until the SHA-256 hex digest of
    ``challenge_string + str(nonce)`` starts with *difficulty* zeros.

    Returns:
        ``(nonce, hex_digest)`` for the first matching nonce, or ``None``
        when the search limit for the given difficulty is exhausted.
    """
    logger.info(f"Solving PoW: challenge='{challenge_string[:20]}...', difficulty={difficulty}")

    # Per-difficulty search limits; any other difficulty gets 100 million.
    search_limits = {4: 2_000_000, 5: 35_000_000, 6: 500_000_000}
    max_nonce = search_limits.get(difficulty, 100_000_000)
    wanted_prefix = "0" * difficulty

    started = time.time()
    for nonce in range(max_nonce + 1):
        candidate = (challenge_string + str(nonce)).encode('utf-8')
        hex_digest = hashlib.sha256(candidate).hexdigest()
        if hex_digest.startswith(wanted_prefix):
            duration = time.time() - started
            logger.info(f"PoW SOLVED! Nonce: {nonce}, Hash: {hex_digest[:10]}..., Time: {duration:.4f}s")
            return nonce, hex_digest
        if nonce > 0 and nonce % 1_000_000 == 0:
            logger.debug(f"PoW progress: nonce {nonce}...")

    duration = time.time() - started
    logger.error(f"PoW FAILED to solve (max_nonce {max_nonce} reached for difficulty {difficulty}). Time: {duration:.2f}s")
    return None

# === PDF VALIDATION (returns failure reason string or None for success) ===
def validate_downloaded_pdf(pdf_path: str, pmid_for_log: str) -> str | None:
    """Check that the file at *pdf_path* looks like a real article PDF.

    Checks, in order: file size against ``MIN_PDF_SIZE_KB``, that the file
    can be opened by the PDF reader, page count against ``MIN_PDF_PAGES``,
    and the amount of text extractable from the first (up to three) pages
    against ``MIN_TEXT_LENGTH_CHARS``.

    Returns:
        ``None`` when every check passes, otherwise a short description of
        the first check that failed. Nothing is raised; unexpected errors
        are logged and reported through the return value as well.
    """
    try:
        size_kb = os.path.getsize(pdf_path) / 1024
        shown_name = os.path.basename(pdf_path)

        # 1) Size.
        if size_kb < MIN_PDF_SIZE_KB:
            reason = f"File size {size_kb:.2f} KB < threshold {MIN_PDF_SIZE_KB} KB"
            logger.warning(f"PDF Validation ✗ {pmid_for_log}: {reason} for '{shown_name}'.")
            return reason

        # 2) Parseable at all?
        try:
            pdf = PdfReader(pdf_path)
            page_total = len(pdf.pages)
        except PdfReadError as e:
            reason = f"pypdf PdfReadError: {e}"
            logger.warning(f"PDF Validation ✗ {pmid_for_log}: {reason} for '{shown_name}' (Size: {size_kb:.2f} KB).")
            return reason
        except Exception as e_open:
            reason = f"pypdf unexpected open error: {e_open}"
            logger.warning(f"PDF Validation ✗ {pmid_for_log}: {reason} for '{shown_name}' (Size: {size_kb:.2f} KB).")
            return reason

        # 3) Page count.
        if page_total < MIN_PDF_PAGES:
            reason = f"Page count {page_total} < threshold {MIN_PDF_PAGES}"
            logger.warning(f"PDF Validation ✗ {pmid_for_log}: {reason} for '{shown_name}' (Size: {size_kb:.2f} KB).")
            return reason

        # 4) Extractable text on the leading pages. A page that cannot be
        #    read is logged and simply contributes no characters.
        pages_sampled = min(3, page_total)
        text_chars = 0
        if pdf:
            for page_no in range(1, pages_sampled + 1):
                try:
                    text = pdf.pages[page_no - 1].extract_text()
                    if text:
                        text_chars += len(text)
                except Exception as e_text_extract:
                    logger.warning(f"PDF Validation ? {pmid_for_log}: Error extracting text from page {page_no} of '{shown_name}': {e_text_extract}. Continuing.")

        if text_chars < MIN_TEXT_LENGTH_CHARS:
            reason = f"Insufficient text ({text_chars} chars from first {pages_sampled} page(s) < threshold {MIN_TEXT_LENGTH_CHARS})"
            logger.warning(f"PDF Validation ✗ {pmid_for_log}: {reason} for '{shown_name}' (Size: {size_kb:.2f} KB).")
            return reason

        logger.info(f"PDF Validation ✓ {pmid_for_log}: File '{shown_name}' (Size: {size_kb:.2f}KB, Pages: {page_total}, TextLen: {text_chars} from first {pages_sampled} page(s)) passed validation.")
        return None  # all checks passed

    except FileNotFoundError:
        logger.error(f"PDF Validation ✗ {pmid_for_log}: File not found at {pdf_path} for validation.")
        return "File not found for validation."
    except Exception as e:
        reason = f"Unexpected error during validation setup: {e}"
        logger.error(f"PDF Validation ✗ {pmid_for_log}: {reason} for {pdf_path}", exc_info=True)
        return reason

# === ENHANCED LOGGING FOR FAILURES/SUSPICIOUS FILES ===
def log_enhanced_failure_details(log_filename: str, pmid: str, metadata: dict, 
                                 failure_type: str, details: str | None = None, 
                                 file_path: str | None = None, logged_set: set | None = None):
    """Append a structured record about a failed or suspicious PMID to a log file.

    Args:
        log_filename: Name of the log file, relative to ``OUTPUT_PDF_DIR``.
        pmid: PubMed ID the record is about.
        metadata: Article metadata (keys ``doi``, ``year``, ``author``,
            ``title``, ``abstract``, ``mesh_terms``). ``None`` is accepted
            and treated as an empty dict.
        failure_type: Short label describing what went wrong.
        details: Optional free-text explanation.
        file_path: Optional path of a related file; written as an absolute path.
        logged_set: Optional set of PMIDs already recorded. When given, a
            PMID found in it is skipped, and a PMID is added to it after its
            record has been written successfully.

    Returns:
        None. Failures to write the log are reported through ``logger`` and
        never raised.
    """
    if logged_set is not None and pmid in logged_set:
        logger.debug(f"PMID {pmid} already logged in {log_filename}. Skipping duplicate entry.")
        return

    if metadata is None:
        metadata = {}
        logger.warning(f"No metadata provided for PMID {pmid} during detailed logging to {log_filename}.")

    def field(key):
        # A key can be present with value None (a missing DOI is stored that
        # way), in which case dict.get's default would not apply and the log
        # would read "None". Treat None/empty like an absent key.
        value = metadata.get(key)
        return 'N/A' if value is None or value == "" else value

    entry_lines = [
        "----------------------------------------",
        f"PMID: {pmid}",
        f"Failure Type: {failure_type}"
    ]
    if details:
        entry_lines.append(f"Details: {details}")
    if file_path:
        entry_lines.append(f"File Path: {os.path.abspath(file_path)}")

    entry_lines.extend([
        f"DOI: {field('doi')}",
        f"Year: {field('year')}",
        f"Author: {field('author')}",
        f"Title: {field('title')}",
        f"Abstract:\n{field('abstract')}",
        f"MeSH Terms: {field('mesh_terms')}",
        "----------------------------------------\n"
    ])

    log_file_full_path = os.path.join(OUTPUT_PDF_DIR, log_filename)
    try:
        # Do not depend on the caller having created the output directory.
        os.makedirs(os.path.dirname(log_file_full_path) or ".", exist_ok=True)
        with open(log_file_full_path, "a", encoding="utf-8") as f_log:
            f_log.write("\n".join(entry_lines))
        # Mark as logged only after the record actually reached the file.
        if logged_set is not None:
            logged_set.add(pmid)
    except IOError as e:
        logger.error(f"Could not write to enhanced log file {log_file_full_path}: {e}")


# === CENTRALIZED PDF DOWNLOADER (handles PoW for PMC) ===
def download_and_save_pdf(
    session: requests.Session, 
    pdf_url: str, 
    output_path: str, 
    pmid_for_log: str, 
    source_name: str,
    article_metadata: dict, 
    referer: str | None = None
    ) -> bool:
    """Download *pdf_url*, validate the result and store it at *output_path*.

    The body is streamed to ``output_path + ".tmp"`` and only moved into
    place once ``validate_downloaded_pdf`` accepts it. When a PMC host
    answers with HTML, the page is treated as a Proof-of-Work challenge: it
    is saved for debugging, solved, the resulting cookie is stored on
    *session*, and the request is repeated once.

    Files that fail validation and non-PDF responses are kept under
    ``OUTPUT_PDF_DIR/SUSPICIOUS_PDF_SUBDIR`` and recorded in
    ``suspicious_articles_details.log``.

    Args:
        session: Session used for all requests; may gain a PoW cookie.
        pdf_url: URL expected to deliver the PDF.
        output_path: Final destination of the validated PDF.
        pmid_for_log: PMID used in log messages and debug file names.
        source_name: Label of the source (e.g. ``PMC(PMC123)``) for logs and
            debug file names.
        article_metadata: Metadata written to the detail log on failure.
        referer: Optional ``Referer`` header value.

    Returns:
        True only when a validated PDF now exists at *output_path*. Request
        errors and unexpected exceptions are logged and reported as False.
    """
    logger.info(f"{source_name} → PMID {pmid_for_log}: Attempting download from {pdf_url}")

    current_headers = session.headers.copy()
    current_headers.update(BROWSER_LIKE_HEADERS)
    current_headers['Accept'] = 'application/pdf,text/html;q=0.9,application/xhtml+xml,application/xml;q=0.8,*/*;q=0.5'
    if referer:
        current_headers['Referer'] = referer

    parsed_pdf_url = urlparse(pdf_url)
    if 'pmc.ncbi.nlm.nih.gov' in parsed_pdf_url.netloc:
        is_same_origin_pmc = referer and 'pmc.ncbi.nlm.nih.gov' in urlparse(referer).netloc
        current_headers['Sec-Fetch-Site'] = 'same-origin' if is_same_origin_pmc else 'cross-site'

    temp_pdf_path = output_path + ".tmp"
    r = None  # current streamed response; always closed in `finally`

    try:
        r = session.get(pdf_url, headers=current_headers, stream=True, timeout=(15, 60), allow_redirects=True)
        r.raise_for_status()
        final_url_after_redirects = r.url
        content_type = r.headers.get('Content-Type', '').lower()
        logger.debug(f"{source_name} → PMID {pmid_for_log}: Initial GET to {pdf_url} (final: {final_url_after_redirects}), Content-Type: {content_type}")

        if "pmc.ncbi.nlm.nih.gov" in urlparse(final_url_after_redirects).netloc and 'text/html' in content_type:
            logger.info(f"{source_name} → PMID {pmid_for_log}: Received HTML from PMC URL, attempting PoW solve.")
            html_content_str = r.content.decode('utf-8', errors='replace')

            # Keep the challenge page for later inspection.
            sanitized_source_detail_for_html = sanitize_filename(source_name)
            debug_html_path = os.path.join(OUTPUT_PDF_DIR, SUSPICIOUS_PDF_SUBDIR, f"{pmid_for_log}_{sanitized_source_detail_for_html}_challenge.html")
            os.makedirs(os.path.dirname(debug_html_path), exist_ok=True)
            with open(debug_html_path, "w", encoding="utf-8") as f_debug:
                f_debug.write(html_content_str)
            logger.info(f"Saved PMC challenge HTML to {debug_html_path}")

            pow_params = extract_pow_params_from_html(html_content_str)
            if not pow_params:
                logger.error(f"{source_name} → PMID {pmid_for_log}: Failed to extract PoW params from PMC HTML.")
                return False

            challenge_str, difficulty_val, cookie_name, _, cookie_path = pow_params
            solution = solve_pmc_pow(challenge_str, difficulty_val)
            if not solution:
                logger.error(f"{source_name} → PMID {pmid_for_log}: Failed to solve PMC PoW.")
                return False

            nonce_found, _ = solution
            pow_cookie_value = f"{challenge_str},{nonce_found}"

            parsed_uri = urlparse(final_url_after_redirects)
            session.cookies.set(name=cookie_name, value=pow_cookie_value, domain=parsed_uri.hostname, path=cookie_path)
            logger.info(f"Set PoW cookie '{cookie_name}' in session for {parsed_uri.hostname}.")

            logger.info(f"{source_name} → PMID {pmid_for_log}: Re-attempting GET to {final_url_after_redirects} WITH PoW cookie.")
            current_headers['Accept'] = 'application/pdf,application/octet-stream,*/*;q=0.8'
            current_headers['Referer'] = final_url_after_redirects
            current_headers['Sec-Fetch-Site'] = 'same-origin'

            # The challenge response has been read in full; release it before
            # issuing the follow-up request.
            r.close()
            r = session.get(final_url_after_redirects, headers=current_headers, stream=True, timeout=(15, 60))
            r.raise_for_status()
            content_type = r.headers.get('Content-Type', '').lower()
            logger.debug(f"{source_name} → PMID {pmid_for_log}: Second GET (post-PoW), Content-Type: {content_type}")

        if 'application/pdf' in content_type or final_url_after_redirects.lower().endswith(".pdf"):
            with open(temp_pdf_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=81920):
                    f.write(chunk)

            validation_failure_reason = validate_downloaded_pdf(temp_pdf_path, pmid_for_log)
            if not validation_failure_reason:
                # os.replace (unlike os.rename) also succeeds on Windows when
                # the destination already exists.
                os.replace(temp_pdf_path, output_path)
                logger.info(f"{source_name} ✓ PMID {pmid_for_log}: Successfully downloaded and validated PDF to {output_path}")
                return True

            logger.warning(f"{source_name} ✗ PMID {pmid_for_log}: PDF from {final_url_after_redirects} failed validation: {validation_failure_reason}")
            suspicious_dir = os.path.join(OUTPUT_PDF_DIR, SUSPICIOUS_PDF_SUBDIR)
            os.makedirs(suspicious_dir, exist_ok=True)

            base_output_filename = sanitize_filename(os.path.basename(output_path).replace(".pdf", ""))
            sanitized_source_detail = sanitize_filename(source_name)
            suspicious_filename = f"{base_output_filename}.{sanitized_source_detail}.validation_failed.pdf"
            # Shorten the name when the full path would get too long.
            if len(os.path.join(suspicious_dir, suspicious_filename)) > 250:
                suspicious_filename = f"{base_output_filename[:100]}.{sanitized_source_detail[:50]}.validation_failed.pdf"

            suspicious_path = os.path.join(suspicious_dir, suspicious_filename)
            try:
                if os.path.exists(temp_pdf_path):
                    # A file of the same name left by an earlier run is
                    # overwritten instead of aborting the move (and with it
                    # the detail-log entry) on Windows.
                    os.replace(temp_pdf_path, suspicious_path)
                    logger.info(f"Moved suspicious PDF to {suspicious_path}")
                    log_enhanced_failure_details(
                        "suspicious_articles_details.log",
                        pmid_for_log,
                        article_metadata,
                        "Suspicious PDF (Validation Failed)",
                        details=validation_failure_reason,
                        file_path=suspicious_path,
                        logged_set=LOGGED_SUSPICIOUS_PMIDS
                    )
                else:
                    logger.warning(f"Temporary PDF {temp_pdf_path} not found for moving to suspicious.")
            except OSError as e_rename:
                logger.error(f"OSError moving suspicious PDF {temp_pdf_path} to {suspicious_path}: {e_rename}")
                if os.path.exists(temp_pdf_path):
                    os.remove(temp_pdf_path)
            return False

        # Anything else is not a PDF: keep the body for diagnosis.
        logger.warning(f"{source_name} ✗ PMID {pmid_for_log}: Non-PDF content from {final_url_after_redirects}. Content-Type: {content_type}")

        sanitized_source_detail_for_debug = sanitize_filename(source_name)
        debug_content_filename = f"{pmid_for_log}_{sanitized_source_detail_for_debug}_unexpected_content.dat"
        debug_content_path = os.path.join(OUTPUT_PDF_DIR, SUSPICIOUS_PDF_SUBDIR, debug_content_filename)
        os.makedirs(os.path.dirname(debug_content_path), exist_ok=True)
        try:
            # Save the body of the response that was actually rejected. After
            # a PoW retry that is the second response; the challenge page
            # itself is already stored in the *_challenge.html file.
            content_to_save = r.content
            with open(debug_content_path, 'wb') as f_debug:
                f_debug.write(content_to_save)
            logger.info(f"Saved unexpected content ({len(content_to_save)} bytes) to {debug_content_path}")
            log_enhanced_failure_details(
                "suspicious_articles_details.log",
                pmid_for_log,
                article_metadata,
                "Non-PDF Content Received",
                details=f"Content-Type: {content_type}, URL: {final_url_after_redirects}",
                file_path=debug_content_path,
                logged_set=LOGGED_SUSPICIOUS_PMIDS
            )
        except Exception as e_save_debug:
            logger.error(f"Error saving unexpected content for PMID {pmid_for_log}: {e_save_debug}")
        return False

    except requests.exceptions.RequestException as e:
        logger.warning(f"{source_name} ✗ PMID {pmid_for_log}: RequestException for {pdf_url}: {e}")
    except Exception as e_main:
        logger.error(f"{source_name} ✗ PMID {pmid_for_log}: Unexpected error during download from {pdf_url}: {e_main}", exc_info=True)
    finally:
        # With stream=True a connection is only handed back to the pool once
        # the body is consumed or the response is closed; make sure that
        # happens on every exit path.
        if r is not None:
            r.close()

    # Only reached after an exception: drop a partially written temp file.
    if os.path.exists(temp_pdf_path):
        try:
            os.remove(temp_pdf_path)
        except OSError as e_remove:
            logger.warning(f"Could not remove temp PDF {temp_pdf_path}: {e_remove}")
    return False

# === STEP 1: FETCH METADATA FROM PUBMED ===
def _find_first(node, *paths):
    """Return the first element matched by any of *paths*, or None.

    Matches are tested with ``is not None`` on purpose: an Element without
    child elements is falsy, so ``node.find(a) or node.find(b)`` would throw
    away a perfectly good leaf match for ``a``.
    """
    for path in paths:
        found = node.find(path)
        if found is not None:
            return found
    return None


def _leaf_text(el):
    """Return the stripped ``.text`` of *el*, or None if absent or empty."""
    if el is None or not el.text:
        return None
    return el.text.strip()


def _full_text(el):
    """Return the stripped text of *el* including text inside inline child tags."""
    if el is None:
        return ""
    return "".join(el.itertext()).strip()


def _parse_pubmed_article(art_node):
    """Convert one <PubmedArticle> element into ``(pmid, metadata_dict)``.

    Returns None when the article carries no PMID.
    """
    pmid = _leaf_text(art_node.find(".//PMID"))
    if not pmid:
        return None

    # Take the article's own DOI only. A bare ".//ArticleId" would also match
    # the ArticleId entries of cited papers inside <ReferenceList>.
    doi = _leaf_text(_find_first(
        art_node,
        "./PubmedData/ArticleIdList/ArticleId[@IdType='doi']",
        ".//ELocationID[@EIdType='doi'][@ValidYN='Y']",
    ))

    year = "UnknownYear"
    year_text = _leaf_text(_find_first(
        art_node,
        ".//PubDate/Year",
        ".//Journal/JournalIssue/PubDate/Year",
    ))
    if year_text and year_text.isdigit() and len(year_text) == 4:
        year = year_text
    else:
        # Fall back to the leading four digits of a free-form MedlineDate.
        medline_date = _leaf_text(_find_first(
            art_node,
            ".//PubDate/MedlineDate",
            ".//Article/Journal/JournalIssue/PubDate/MedlineDate",
        ))
        if medline_date:
            year_match = re.match(r"^\d{4}", medline_date)
            if year_match:
                year = year_match.group(0)

    author = _leaf_text(art_node.find(".//AuthorList/Author[1]/LastName")) or "UnknownAuthor"

    title = _full_text(art_node.find(".//ArticleTitle")) or f"NoTitle_{pmid}"

    # itertext() keeps text that follows inline markup (<i>, <sup>, ...),
    # which reading only `.text` would cut off.
    abstract_parts = []
    for abstract_text_node in art_node.findall(".//Abstract/AbstractText"):
        section_text = _full_text(abstract_text_node)
        if not section_text:
            continue
        label = abstract_text_node.get("Label")
        abstract_parts.append(f"[{label.upper()}] {section_text}" if label else section_text)
    abstract = "\n".join(abstract_parts) if abstract_parts else "N/A"

    mesh_terms = [
        descriptor.text.strip()
        for descriptor in (
            heading.find("./DescriptorName")
            for heading in art_node.findall(".//MeshHeadingList/MeshHeading")
        )
        if descriptor is not None and descriptor.text
    ]
    mesh_terms_str = "; ".join(mesh_terms) if mesh_terms else "N/A"

    return pmid, {
        'doi': doi, 'year': year, 'author': author, 'title': title,
        'abstract': abstract, 'mesh_terms': mesh_terms_str
    }


def fetch_metadata(pmids):
    """Fetch PubMed metadata for *pmids* through NCBI EFetch.

    PMIDs are requested in batches of ``EFETCH_BATCH_SIZE`` with a pause of
    ``DELAY_NCBI`` seconds between batches. A batch that fails (request
    error, malformed XML, anything unexpected) is logged and skipped.

    Args:
        pmids: Sequence of PubMed IDs.

    Returns:
        Dict mapping each PMID found in the response (as a string) to a dict
        with the keys ``doi`` (``None`` if unknown), ``year``, ``author``,
        ``title``, ``abstract`` and ``mesh_terms``. PMIDs for which nothing
        came back are absent and are reported in a warning.
    """
    meta = {}
    batches = [pmids[i:i+EFETCH_BATCH_SIZE] for i in range(0, len(pmids), EFETCH_BATCH_SIZE)]
    logger.info(f"EFetch PubMed metadata for {len(pmids)} PMIDs in {len(batches)} batch(es)...")

    for i_batch, batch in enumerate(batches):
        efetch_payload = _get_ncbi_params({
            "db": "pubmed",
            "retmode": "xml",
            "id": ",".join(map(str, batch))
        })
        logger.info(f"NCBI EFetch POST (batch {i_batch+1}/{len(batches)}) → IDs={','.join(map(str,batch[:3]))}...")
        resp = None
        try:
            resp = session_ncbi.post(
                "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
                data=efetch_payload,
                timeout=60
            )
            resp.raise_for_status()
            root = ET.fromstring(resp.content)

            for art_node in root.findall(".//PubmedArticle"):
                parsed = _parse_pubmed_article(art_node)
                if parsed is not None:
                    pmid, record = parsed
                    meta[pmid] = record
        except requests.exceptions.RequestException as e_req:
            logger.warning(f"NCBI EFetch batch {i_batch+1} RequestException: {e_req}")
        except ET.ParseError as e_xml:
            response_text_snippet = resp.text[:200] if resp is not None else "N/A"
            logger.warning(f"NCBI EFetch batch {i_batch+1} XML ParseError: {e_xml}. Content: {response_text_snippet}")
        except Exception as e_generic:
            logger.error(f"NCBI EFetch batch {i_batch+1} unexpected error: {e_generic}", exc_info=True)

        # Pause between batches, but not after the last one.
        if i_batch < len(batches) - 1:
            time.sleep(DELAY_NCBI)

    missing_meta_pmids = [p for p in pmids if p not in meta]
    if missing_meta_pmids:
        logger.warning(f"Metadata missing for {len(missing_meta_pmids)} PMIDs: {missing_meta_pmids[:10]}...")
    return meta

# === STEP 2: OPEN ACCESS (Unpaywall, PMC with PoW) ===
def unpaywall_get_pdf_url(doi: str) -> str | None:
    """Ask the Unpaywall API for a direct PDF link for *doi*.

    The ``best_oa_location`` entry is preferred; otherwise the first entry
    of ``oa_locations`` that carries a ``url_for_pdf`` is used.

    Returns:
        The PDF URL, or ``None`` when *doi* is empty, ``UNPAYWALL_EMAIL`` is
        not configured, the article is not Open Access, no PDF link is
        listed, or the request fails (failures are logged, not raised).
    """
    if not doi:
        return None
    if not UNPAYWALL_EMAIL or UNPAYWALL_EMAIL == "your_email@example.com":
        logger.debug(f"Unpaywall API skipped for DOI {doi}: UNPAYWALL_EMAIL not configured.")
        return None

    api_url = f"https://api.unpaywall.org/v2/{quote_plus(doi)}?email={UNPAYWALL_EMAIL}"
    logger.info(f"Unpaywall API GET → DOI {doi}")
    try:
        r = session_oa.get(api_url, timeout=20)
        r.raise_for_status()
        data = r.json()

        if not data.get("is_oa"):
            logger.info(f"Unpaywall API ~ DOI {doi}: Not OA according to Unpaywall.")
            return None

        best_loc = data.get("best_oa_location")
        pdf_url = best_loc.get("url_for_pdf") if best_loc else None

        if not pdf_url:
            # Fall back to the first listed location that has a PDF link.
            pdf_url = next(
                (loc.get("url_for_pdf") for loc in data.get("oa_locations", []) if loc.get("url_for_pdf")),
                None,
            )
            if pdf_url:
                logger.debug(f"Unpaywall API: Found PDF URL in other oa_locations: {pdf_url}")

        if not pdf_url:
            logger.info(f"Unpaywall API ? DOI {doi}: Article is OA, but no direct PDF URL in Unpaywall response.")
            return None

        logger.info(f"Unpaywall API ✓ DOI {doi}: Found PDF URL: {pdf_url.split('?')[0]}...")
        return pdf_url

    except requests.exceptions.RequestException as e_req:
        logger.warning(f"Unpaywall API ✗ DOI {doi}: RequestException: {e_req}")
    except ValueError as e_json:
        logger.warning(f"Unpaywall API ✗ DOI {doi}: JSON Decode Error: {e_json}. Response: {r.text[:200] if 'r' in locals() else 'N/A'}")
    except Exception as e_generic:
        logger.error(f"Unpaywall API ✗ DOI {doi}: Unexpected error: {e_generic}", exc_info=True)
    return None


def pmc_id_for_pmid(pmid: str, article_metadata: dict) -> str | None:
    """Look up the PMCID that belongs to *pmid* via NCBI ELink.

    Only the ``pubmed_pmc`` link set is consulted and its first ID is used.
    *article_metadata* is accepted but not read.

    Returns:
        The ID prefixed with ``PMC`` (the prefix is added when the service
        returns a bare number), or ``None`` when no link exists, the ID has
        an unexpected format, or the request/parsing fails.
    """
    linkname_to_try = "pubmed_pmc"
    elink_query = _get_ncbi_params({
        "dbfrom": "pubmed",
        "db": "pmc",
        "id": pmid,
        "cmd": "neighbor_score",
        "linkname": linkname_to_try,
    })

    logger.info(f"PMC ID ELink → PMID {pmid}: Querying with linkname '{linkname_to_try}'.")
    try:
        r = session_ncbi.post(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi",
            data=elink_query,
            timeout=20,
        )
        r.raise_for_status()
        root = ET.fromstring(r.content)

        link_set = root.find(f".//LinkSetDb[LinkName='{linkname_to_try}']")
        if link_set is None:
            # No link set at all: surface whatever explanation NCBI sent.
            general_info = root.find(".//Info")
            if general_info is not None and general_info.text:
                logger.info(f"PMC ID ELink ~ PMID {pmid}: NCBI Info: {general_info.text.strip()}")
            else:
                logger.debug(f"PMC ID ELink ~ PMID {pmid}: No <LinkSetDb> for '{linkname_to_try}'. XML: {r.text[:250]}")
            return None

        linked_ids = [id_el.text.strip() for id_el in link_set.findall("./Link/Id") if id_el.text]
        if not linked_ids:
            set_info = link_set.find("./Info")
            if set_info is not None and set_info.text:
                logger.info(f"PMC ID ELink ~ PMID {pmid}: NCBI Info for '{linkname_to_try}': {set_info.text.strip()}")
            else:
                logger.info(f"PMC ID ELink ~ PMID {pmid}: No PMCID <Id> elements for '{linkname_to_try}'.")
            return None

        candidate = linked_ids[0]
        if candidate.upper().startswith("PMC"):
            returned_pmcid = candidate
        elif candidate.isdigit():
            returned_pmcid = "PMC" + candidate
        else:
            logger.warning(f"PMC ID ELink ? PMID {pmid}: Non-standard ID '{candidate}' from '{linkname_to_try}'.")
            return None

        logger.info(f"PMC ID ELink ✓ PMID {pmid}: Found PMCID {returned_pmcid} via '{linkname_to_try}'.")
        return returned_pmcid

    except requests.exceptions.RequestException as e_req:
        logger.warning(f"PMC ID ELink ✗ PMID {pmid} ('{linkname_to_try}'): RequestException: {e_req}")
    except ET.ParseError as e_xml:
        response_text_snippet = r.text[:250] if 'r' in locals() and hasattr(r, 'text') else "N/A"
        logger.warning(f"PMC ID ELink ✗ PMID {pmid} ('{linkname_to_try}'): XML ParseError: {e_xml}. Content: {response_text_snippet}")
    except Exception as e_generic:
        logger.error(f"PMC ID ELink ✗ PMID {pmid} ('{linkname_to_try}'): Unexpected error: {e_generic}", exc_info=True)

    # Only reached after one of the handled exceptions above.
    logger.warning(f"PMC ID ELink ✗ PMID {pmid}: No PMCID from '{linkname_to_try}' after full attempt.")
    return None


def pmc_attempt_download(pmcid: str, pmid: str, md: dict) -> bool:
    """Try to fetch the PDF of *pmcid* from PMC and store it for *pmid*.

    A HEAD request on the article's ``/pdf/`` landing URL (following
    redirects) resolves the final URL; if that request fails with a request
    error the landing URL itself is used. The download, including any
    Proof-of-Work handling, is delegated to ``download_and_save_pdf``.

    Returns:
        True when the PDF already exists or was downloaded and validated,
        False otherwise.
    """
    fname_base = sanitize_filename(
        f"{md.get('year', 'UnknownYear')}-{pmid}-{md.get('author', 'UnknownAuthor')}-{md.get('title', f'NoTitle_{pmid}')}"
    )
    output_pdf_path = os.path.join(OUTPUT_PDF_DIR, fname_base + ".pdf")

    if os.path.exists(output_pdf_path):
        logger.info(f"PMC ✓ {pmid} ({pmcid}): PDF already exists at {output_pdf_path} (checked in pmc_attempt_download).")
        return True

    pmc_article_pdf_landing_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/pdf/"
    resolved_url = None

    logger.info(f"PMC HEAD → PMID {pmid} ({pmcid}): Probing {pmc_article_pdf_landing_url} for final PDF URL.")
    probe_headers = session_oa.headers.copy()
    probe_headers.update(BROWSER_LIKE_HEADERS)
    probe_headers['Accept'] = 'application/pdf, text/html;q=0.9, */*;q=0.8'
    probe_headers['Sec-Fetch-Site'] = 'cross-site'

    try:
        head_resp = session_oa.head(
            pmc_article_pdf_landing_url,
            headers=probe_headers,
            timeout=(10, 25),
            allow_redirects=True,
        )
        head_resp.raise_for_status()
        resolved_url = head_resp.url

        lowered = resolved_url.lower()
        looks_like_pdf = lowered.endswith(".pdf") or "format=pdf" in lowered or "/pdf/" in lowered
        if looks_like_pdf:
            logger.info(f"PMC HEAD ✓ PMID {pmid} ({pmcid}): Resolved potential PDF URL: {resolved_url}")
        else:
            logger.warning(f"PMC HEAD ? PMID {pmid} ({pmcid}): Resolved URL {resolved_url} doesn't strongly indicate PDF. Proceeding cautiously.")

    except requests.exceptions.RequestException as e:
        # The probe failing is not fatal: try a plain GET on the landing URL.
        logger.warning(f"PMC HEAD ✗ PMID {pmid} ({pmcid}) for {pmc_article_pdf_landing_url}: {e}. Will attempt GET on landing URL.")
        resolved_url = pmc_article_pdf_landing_url
    except Exception as e_head_generic:
        logger.error(f"PMC HEAD ✗ PMID {pmid} ({pmcid}): Unexpected error during HEAD request: {e_head_generic}", exc_info=True)
        return False

    if not resolved_url:
        logger.error(f"PMC ✗ PMID {pmid} ({pmcid}): No URL determined for download attempt after HEAD request.")
        return False

    return download_and_save_pdf(
        session_oa,
        resolved_url,
        output_pdf_path,
        pmid,
        source_name=f"PMC({pmcid})",
        article_metadata=md,
        referer=pmc_article_pdf_landing_url,
    )


def oa_worker(pmid: str, md: dict) -> tuple[str, bool]:
    """Try to obtain the PDF for one PMID from Open Access sources.

    Sources are tried in order: Unpaywall (only when the metadata carries a
    DOI), then PMC. If a file already exists at the target path the function
    reports success before contacting either source.

    Args:
        pmid: PubMed ID of the article.
        md: Metadata for the article. The keys 'doi', 'year', 'author' and
            'title' are read here; the dict is also forwarded to the helpers.

    Returns:
        ``(pmid, True)`` if a PDF exists or was saved, ``(pmid, False)``
        otherwise. Unexpected exceptions are logged and reported as failure
        rather than raised.
    """
    try:
        # Target name follows `{year}-{pmid}-{author}-{title}.pdf`.
        name_parts = (
            md.get('year', 'UnknownYear'),
            pmid,
            md.get('author', 'UnknownAuthor'),
            md.get('title', f'NoTitle_{pmid}'),
        )
        pdf_name = sanitize_filename("{}-{}-{}-{}".format(*name_parts)) + ".pdf"
        target_path = os.path.join(OUTPUT_PDF_DIR, pdf_name)

        if os.path.exists(target_path):
            logger.info(f"OA ✓ {pmid}: PDF already exists at {target_path}")
            return pmid, True

        # --- Unpaywall (needs a DOI) ---
        doi = md.get('doi')
        if not doi:
            logger.info(f"OA: No DOI for PMID {pmid}. Skipping Unpaywall, trying PMC.")
        else:
            candidate_url = unpaywall_get_pdf_url(doi)
            if not candidate_url:
                logger.info(f"OA: No PDF URL from Unpaywall for PMID {pmid} (DOI {doi}). Trying PMC.")
            elif download_and_save_pdf(
                session_oa,
                candidate_url,
                target_path,
                pmid,
                source_name=f"Unpaywall(DOI:{doi})",
                article_metadata=md,
                referer=f"https://doi.org/{quote_plus(doi)}",
            ):
                return pmid, True
            else:
                logger.info(f"OA: Unpaywall attempt for PMID {pmid} (DOI {doi}) failed download/validation. Trying PMC.")

        # --- PMC fallback ---
        pmcid = pmc_id_for_pmid(pmid, md)
        if pmcid and pmc_attempt_download(pmcid, pmid, md):
            return pmid, True

        logger.warning(f"OA ✗ {pmid}: No PDF found via Unpaywall or PMC.")
        return pmid, False

    except Exception as worker_error:
        logger.error(f"OA Worker UNHANDLED EXCEPTION for PMID {pmid}: {worker_error}", exc_info=True)
        return pmid, False

# === STEP 3: SCI-HUB ===
def test_scihub_domain(domain: str) -> bool:
    """Report whether a Sci-Hub mirror answers a probe request.

    A mirror counts as responsive when the probe returns HTTP 404, or HTTP 200
    with either an HTML Content-Type or a body containing one of a few
    Sci-Hub-looking keywords. Any other status, a timeout, or a request error
    is logged and treated as not responsive.

    Args:
        domain: Base URL of the mirror, with or without a trailing slash.

    Returns:
        True if the mirror looks usable, False otherwise.
    """
    test_doi = "10.1000/182"
    probe_url = f"{domain.rstrip('/')}/{test_doi}"
    logger.debug(f"Sci-Hub TEST GET → {probe_url}")
    try:
        resp = session_scihub.get(probe_url, timeout=10, headers=BROWSER_LIKE_HEADERS)
        status = resp.status_code

        if status == 404:
            logger.info(f"Sci-Hub TEST ✓ {domain} is responsive (status 404, expected for non-existent test DOI).")
            return True

        if status == 200:
            usable = 'html' in resp.headers.get('Content-Type', '').lower()
            if not usable:
                # Only inspect the body when the Content-Type was inconclusive.
                body_lower = resp.text.lower()
                markers = ('sci-hub', 'save', 'download', '<button id="download">')
                usable = any(marker in body_lower for marker in markers)
            if usable:
                logger.info(f"Sci-Hub TEST ✓ {domain} is responsive (status {status}).")
                return True

        logger.warning(f"Sci-Hub TEST ? {domain} responded status {status}, CT: {resp.headers.get('Content-Type','')}. Text snippet: {resp.text[:100]}")
    except requests.exceptions.Timeout:
        logger.warning(f"Sci-Hub TEST ✗ {domain} timed out.")
    except requests.exceptions.RequestException as request_error:
        logger.warning(f"Sci-Hub TEST ✗ {domain} error: {request_error}")
    except Exception as unexpected_error:
        logger.error(f"Sci-Hub TEST ✗ {domain} unexpected error: {unexpected_error}", exc_info=True)
    return False

def init_scihub_domains() -> list[str]:
    """Probe the configured Sci-Hub mirrors and return the responsive ones.

    Every entry of SCI_HUB_DOMAINS is checked with test_scihub_domain on a
    small thread pool (at most 3 workers). Mirrors are collected in the order
    their probes complete; if "https://sci-hub.se" is among them it is moved
    to the front of the list.

    Returns:
        The responsive mirror URLs. The list is empty when no mirror responded
        or when SCI_HUB_DOMAINS itself is empty.
    """
    logger.info("Probing Sci-Hub mirrors for availability...")
    working_domains = []

    # ThreadPoolExecutor raises ValueError for max_workers=0, so an empty
    # mirror list must bypass the pool and fall through to the error log below.
    if SCI_HUB_DOMAINS:
        with ThreadPoolExecutor(max_workers=min(len(SCI_HUB_DOMAINS), 3), thread_name_prefix="SciHub_Domain_Test") as executor:
            future_to_domain = {executor.submit(test_scihub_domain, d): d for d in SCI_HUB_DOMAINS}
            for future in as_completed(future_to_domain):
                domain = future_to_domain[future]
                try:
                    if future.result():
                        working_domains.append(domain)
                except Exception as exc:
                    logger.error(
                        f"Sci-Hub domain test for {domain} generated an exception during result retrieval: {exc}",
                        exc_info=True,
                    )

    if not working_domains:
        logger.error("CRITICAL: No working Sci-Hub domains found after testing!")
    else:
        # Prefer sci-hub.se when it is available.
        preferred = "https://sci-hub.se"
        if preferred in working_domains:
            working_domains.remove(preferred)
            working_domains.insert(0, preferred)
        logger.info(f"Using Sci-Hub domains: {working_domains}")
    return working_domains


def find_scihub_pdf_in_html(html_content: bytes, base_page_url: str) -> str | None:
    """Locate the direct PDF URL inside a Sci-Hub article page.

    Two strategies are tried in order:
      1. A fixed list of CSS selectors (iframes, embeds, download anchors);
         for each selector only the first matching element is inspected.
      2. `onclick="location.href='...pdf...'"` handlers on buttons/anchors.

    Links are resolved against the scheme and host of `base_page_url` (not its
    path). Protocol-relative links (`//host/...`) get the page's scheme.

    Args:
        html_content: Raw HTML of the Sci-Hub page.
        base_page_url: URL the HTML was fetched from.

    Returns:
        The resolved PDF URL, or None if nothing suitable was found.
    """
    page = BeautifulSoup(html_content, 'html.parser')
    base_parts = urlparse(base_page_url)
    scheme = base_parts.scheme
    site_root = f"{scheme}://{base_parts.netloc}"

    # (CSS selector, attribute carrying the link), tried top to bottom.
    candidates = (
        ('iframe#pdf', 'src'),
        ('iframe#article', 'src'),
        ('embed[type="application/pdf"]', 'src'),
        ('iframe[src*=".pdf"]', 'src'),
        ('a#download', 'href'),
        ('div.buttons > a[href*=".pdf"]', 'href'),
        ('div#buttons > a[href*=".pdf"]', 'href'),
        ('a[href*=".pdf"]', 'href'),
    )

    for css, attr_name in candidates:
        node = page.select_one(css)
        if not node:
            continue
        raw_link = node.get(attr_name)
        if not raw_link:
            continue

        if raw_link.startswith("//"):
            raw_link = f"{scheme}:{raw_link}"
        # Inline data and script pseudo-URLs can never be a downloadable PDF.
        if raw_link.lower().startswith(('data:', 'javascript:')):
            continue

        full_url = urljoin(site_root, raw_link)
        # Accept the link if it mentions ".pdf" or points at a known mirror host.
        if ".pdf" in full_url.lower() or any(
            urlparse(mirror).netloc in full_url for mirror in SCI_HUB_DOMAINS
        ):
            logger.debug(f"Sci-Hub HTML Parse: Found PDF link '{full_url}' using selector '{css}'")
            return full_url

    href_in_onclick = re.compile(r"location\.href\s*=\s*['\"]([^'\"]+\.pdf[^'\"]*)['\"]", re.IGNORECASE)
    for clickable in page.select('button[onclick*="location.href"], a[onclick*="location.href"]'):
        found = href_in_onclick.search(clickable.get('onclick', ''))
        if not found:
            continue
        target = found.group(1).strip()
        if target.startswith("//"):
            target = f"{scheme}:{target}"
        full_url = urljoin(site_root, target)
        logger.debug(f"Sci-Hub HTML Parse: Found PDF link '{full_url}' from onclick attribute.")
        return full_url

    logger.debug(f"Sci-Hub HTML Parse: No obvious PDF link found in HTML from {base_page_url}")
    return None

def scihub_worker(identifier: str, pmid: str, md: dict, active_domains: list) -> tuple[str, bool]:
    """Try to obtain the PDF for one article from the given Sci-Hub mirrors.

    Mirrors are tried in list order until one yields a PDF that
    download_and_save_pdf accepts. A mirror may either serve the PDF directly
    or serve an HTML page from which the PDF link is extracted.

    Args:
        identifier: What to look up on Sci-Hub (the caller passes a DOI when
            available, otherwise the PMID).
        pmid: PubMed ID, used for the output filename and log messages.
        md: Article metadata; 'year', 'author' and 'title' are read here and
            the dict is forwarded to download_and_save_pdf.
        active_domains: Base URLs of the mirrors to try.

    Returns:
        ``(pmid, True)`` if the PDF already exists or was saved,
        ``(pmid, False)`` otherwise.
    """
    name_parts = (
        md.get('year', 'UnknownYear'),
        pmid,
        md.get('author', 'UnknownAuthor'),
        md.get('title', f'NoTitle_{pmid}'),
    )
    pdf_name = sanitize_filename("{}-{}-{}-{}".format(*name_parts)) + ".pdf"
    target_path = os.path.join(OUTPUT_PDF_DIR, pdf_name)

    if os.path.exists(target_path):
        logger.info(f"Sci-Hub ✓ {pmid}: PDF already exists at {target_path} (checked in scihub_worker).")
        return pmid, True

    if not active_domains:
        logger.error(f"Sci-Hub ✗ {pmid}: No active Sci-Hub domains to try for identifier '{identifier}'.")
        return pmid, False

    mirror_count = len(active_domains)
    for attempt_no, mirror in enumerate(active_domains, start=1):
        article_url = f"{mirror.rstrip('/')}/{quote_plus(identifier)}"
        logger.info(f"Sci-Hub HTML GET → PMID {pmid} from {article_url} (Attempt {attempt_no}/{mirror_count})")

        request_headers = session_scihub.headers.copy()
        request_headers.update(BROWSER_LIKE_HEADERS)
        request_headers['Sec-Fetch-Site'] = 'none'

        try:
            page_resp = session_scihub.get(article_url, headers=request_headers, timeout=30)
            page_resp.raise_for_status()
            content_type = page_resp.headers.get('Content-Type', '').lower()

            if 'application/pdf' in content_type:
                # The mirror answered with the PDF itself; fetch it through the
                # shared download/validation helper.
                logger.info(f"Sci-Hub ? PMID {pmid}: URL {article_url} served PDF directly. Attempting download...")
                saved = download_and_save_pdf(
                    session_scihub,
                    page_resp.url,
                    target_path,
                    pmid,
                    source_name=f"SciHub_Direct({mirror})",
                    article_metadata=md,
                    referer=mirror,
                )
                if saved:
                    return pmid, True
                logger.warning(f"Sci-Hub ✗ PMID {pmid}: Direct PDF from {article_url} failed validation or download.")
                # Moves straight on to the next mirror (no delay on this path).
                continue

            # Fall back to sniffing the body when the Content-Type is not HTML.
            looks_like_html = 'html' in content_type or page_resp.content[:100].strip().lower().startswith(
                (b'<!doctype html', b'<html')
            )
            if not looks_like_html:
                logger.warning(f"Sci-Hub ✗ {pmid} via {mirror}: Unexpected Content-Type '{content_type}' from {article_url}. Snippet: {page_resp.text[:100]}")
            else:
                pdf_link = find_scihub_pdf_in_html(page_resp.content, page_resp.url)
                if not pdf_link:
                    logger.warning(f"Sci-Hub HTML ✗ {pmid} via {mirror}: No PDF link found within HTML from {article_url}")
                else:
                    logger.info(f"Sci-Hub HTML ✓ PMID {pmid}: Found potential PDF link: {pdf_link.split('?')[0]}...")
                    if download_and_save_pdf(
                        session_scihub,
                        pdf_link,
                        target_path,
                        pmid,
                        source_name=f"SciHub_Extracted({mirror})",
                        article_metadata=md,
                        referer=article_url,
                    ):
                        return pmid, True

        except requests.exceptions.RequestException as request_error:
            logger.warning(f"Sci-Hub ✗ {pmid} via {mirror} ({article_url}): RequestException: {request_error}")
        except Exception as mirror_error:
            logger.error(f"Sci-Hub ✗ {pmid} via {mirror} ({article_url}): General error {mirror_error.__class__.__name__}: {mirror_error}", exc_info=True)

        # Pause between mirrors, but not after the last one.
        if attempt_no < mirror_count:
            time.sleep(DELAY_SCIHUB)

    logger.error(f"Sci-Hub ✗ {pmid}: Failed for identifier '{identifier}' after trying all active domains.")
    return pmid, False


# === MAIN SCRIPT EXECUTION ===
def main():
    """Run the whole fetch pipeline and log a summary.

    Stages, in order:
      1. Read the 'PMID' column of EXCEL_FILE_PATH and normalise the values to
         unique integer strings (first occurrence wins, order preserved).
      2. Fetch metadata with fetch_metadata(); PMIDs without metadata are
         excluded from the download phases.
      3. Run oa_worker for every remaining PMID on a thread pool.
      4. Run scihub_worker for every PMID the Open Access phase did not fetch,
         using the mirrors returned by init_scihub_domains().
      5. Log per-phase and overall counts, and pass every PMID that was never
         fetched to log_enhanced_failure_details().

    Returns None. The function returns early, after logging an error, when the
    Excel file cannot be used, when no valid PMIDs or no metadata are found,
    or when the output directories cannot be created.
    """
    t_start = time.time()
    logger.info(f"=== PDF Fetcher v12-pow started at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")

    # Optional File Logging Setup. The handler lines are intentionally left
    # commented out; uncomment them to also write the run log to a file.
    log_file_handler_path = os.path.join(OUTPUT_PDF_DIR, "pdf_fetcher_v12_run.log")
    try:
        os.makedirs(OUTPUT_PDF_DIR, exist_ok=True)
        # fh = logging.FileHandler(log_file_handler_path, mode='a')
        # fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - [%(threadName)s] - %(funcName)s - %(message)s'))
        # logger.addHandler(fh) # Uncomment to enable file logging
        # logger.info(f"Detailed run logging to file: {os.path.abspath(log_file_handler_path)}")
    except Exception as e_log_file:
        logger.error(f"Could not set up file logging to {log_file_handler_path}: {e_log_file}")

    try:
        df = pd.read_excel(EXCEL_FILE_PATH)
        if 'PMID' not in df.columns:
            logger.error(f"Excel file {EXCEL_FILE_PATH} must contain a 'PMID' column.")
            return
    except FileNotFoundError:
        logger.error(f"Excel file not found: {EXCEL_FILE_PATH}")
        return
    except Exception as e_excel:
        logger.error(f"Cannot read Excel file {EXCEL_FILE_PATH}: {e_excel}", exc_info=True)
        return

    # Series.unique() compares the raw cell values, so 123 and "123" both
    # survive it. De-duplicate again AFTER normalisation; otherwise the same
    # PMID would be submitted twice and two threads would write the same file.
    pmids = []
    seen_pmids = set()
    for p_raw in df['PMID'].dropna().unique():
        try:
            # Going through float() accepts values such as 12345.0 or "12345.0".
            pmid_str = str(int(float(str(p_raw))))
        except (ValueError, OverflowError):
            # ValueError: non-numeric text or NaN; OverflowError: +/- infinity.
            logger.warning(f"Skipping invalid PMID format in Excel: '{p_raw}'")
            continue
        if pmid_str not in seen_pmids:
            seen_pmids.add(pmid_str)
            pmids.append(pmid_str)

    if not pmids:
        logger.error("No valid PMIDs found in the Excel file.")
        return
    logger.info(f"Loaded {len(pmids)} unique, valid PMIDs from {EXCEL_FILE_PATH}")

    metadata_dict = fetch_metadata(pmids)
    # Only PMIDs with a non-empty metadata entry take part in the downloads.
    valid_pmids_with_meta = [p for p in pmids if p in metadata_dict and metadata_dict[p]]

    logger.info(f"Successfully fetched metadata for {len(valid_pmids_with_meta)} PMIDs.")
    if not valid_pmids_with_meta:
        logger.error("No metadata could be fetched for any valid PMIDs. Cannot proceed.")
        return

    try:
        os.makedirs(OUTPUT_PDF_DIR, exist_ok=True)
        os.makedirs(os.path.join(OUTPUT_PDF_DIR, SUSPICIOUS_PDF_SUBDIR), exist_ok=True)
        logger.info(f"PDFs will be saved to: {os.path.abspath(OUTPUT_PDF_DIR)}")
        logger.info(f"Suspicious/failed validation files will be in: {os.path.abspath(os.path.join(OUTPUT_PDF_DIR, SUSPICIOUS_PDF_SUBDIR))}")
    except OSError as e_mkdir:
        logger.error(f"Could not create output directories: {e_mkdir}. Exiting.")
        return

    logger.info("--- Starting Open Access Download Phase ---")
    oa_succeeded_pmids, oa_failed_pmids = [], []
    with ThreadPoolExecutor(max_workers=MAX_THREADS, thread_name_prefix="OA_Worker") as executor:
        future_to_pmid_oa = {
            executor.submit(oa_worker, pmid, metadata_dict[pmid]): pmid
            for pmid in valid_pmids_with_meta
        }

        for future in as_completed(future_to_pmid_oa):
            pmid_processed = future_to_pmid_oa[future]
            try:
                worker_result = future.result()

                # Defensive: anything other than a (pmid, bool) pair is a failure.
                if worker_result is None:
                    logger.error(f"OA Thread ✗ For PMID {pmid_processed}, future.result() was None. Treating as failure.")
                    oa_failed_pmids.append(pmid_processed)
                elif isinstance(worker_result, tuple) and len(worker_result) == 2:
                    returned_pmid, success_status = worker_result
                    if returned_pmid != pmid_processed:
                        logger.warning(f"OA Thread ? Mismatch in returned PMID: expected {pmid_processed}, got {returned_pmid}. Processing with {pmid_processed}.")

                    if success_status:
                        oa_succeeded_pmids.append(pmid_processed)
                    else:
                        oa_failed_pmids.append(pmid_processed)
                else:
                    logger.error(f"OA Thread ✗ For PMID {pmid_processed}, worker returned unexpected result: {worker_result}. Treating as failure.")
                    oa_failed_pmids.append(pmid_processed)

            except Exception as e_thread:
                logger.error(f"OA Thread ✗ Exception processing PMID {pmid_processed}: {e_thread}", exc_info=True)
                oa_failed_pmids.append(pmid_processed)

    logger.info(f"Open Access Phase Summary: {len(oa_succeeded_pmids)} PDFs successfully downloaded and validated.")
    if oa_failed_pmids:
        logger.info(f"{len(oa_failed_pmids)} PMIDs not fetched via OA or failed validation: {sorted(oa_failed_pmids)[:10]}...")

    sci_hub_succeeded_pmids = []
    # Everything the OA phase could not fetch is handed to Sci-Hub.
    pmids_for_scihub_attempt = list(oa_failed_pmids)
    # Starts empty and collects the PMIDs that also fail (or skip) the Sci-Hub
    # phase; it is only used for the Sci-Hub phase summary below.
    final_still_failed_pmids = []

    if pmids_for_scihub_attempt:
        logger.info("--- Starting Sci-Hub Download Phase for Remaining PMIDs ---")
        active_scihub_domains = init_scihub_domains()

        if not active_scihub_domains:
            logger.error("Sci-Hub phase skipped: No active Sci-Hub domains found.")
            final_still_failed_pmids = list(pmids_for_scihub_attempt) # All OA failures are now final failures
        else:
            # Use about half the OA thread count for Sci-Hub, but at least one.
            scihub_max_workers = max(1, MAX_THREADS // 2)
            with ThreadPoolExecutor(max_workers=scihub_max_workers, thread_name_prefix="SciHub_Worker") as executor_sh:
                future_to_pmid_scihub = {}
                for pmid_to_try_scihub in pmids_for_scihub_attempt:
                    if pmid_to_try_scihub not in metadata_dict:
                        logger.warning(f"Sci-Hub: Metadata missing for PMID {pmid_to_try_scihub}, skipping.")
                        final_still_failed_pmids.append(pmid_to_try_scihub)
                        continue

                    meta_for_pmid = metadata_dict[pmid_to_try_scihub]
                    # Look the article up by DOI when one is known, else by PMID.
                    identifier_for_scihub = meta_for_pmid.get('doi') if meta_for_pmid.get('doi') else pmid_to_try_scihub

                    future_to_pmid_scihub[executor_sh.submit(
                        scihub_worker,
                        identifier_for_scihub,
                        pmid_to_try_scihub,
                        meta_for_pmid,
                        active_scihub_domains
                    )] = pmid_to_try_scihub

                for future_sh in as_completed(future_to_pmid_scihub):
                    pmid_processed_scihub = future_to_pmid_scihub[future_sh]
                    try:
                        sh_worker_result = future_sh.result()
                        if sh_worker_result is None:
                            logger.error(f"Sci-Hub Thread ✗ For PMID {pmid_processed_scihub}, future.result() was None. Treating as failure.")
                            final_still_failed_pmids.append(pmid_processed_scihub)
                        elif isinstance(sh_worker_result, tuple) and len(sh_worker_result) == 2:
                            _, success_status_scihub = sh_worker_result
                            if success_status_scihub:
                                sci_hub_succeeded_pmids.append(pmid_processed_scihub)
                            else:
                                final_still_failed_pmids.append(pmid_processed_scihub)
                        else:
                            logger.error(f"Sci-Hub Thread ✗ For PMID {pmid_processed_scihub}, worker returned unexpected result: {sh_worker_result}. Treating as failure.")
                            final_still_failed_pmids.append(pmid_processed_scihub)
                    except Exception as e_sh_thread:
                        logger.error(f"Sci-Hub Thread ✗ Exception processing PMID {pmid_processed_scihub}: {e_sh_thread}", exc_info=True)
                        final_still_failed_pmids.append(pmid_processed_scihub)

            logger.info(f"Sci-Hub Phase Summary: {len(sci_hub_succeeded_pmids)} PDFs successfully downloaded and validated.")
            # final_still_failed_pmids now contains those that failed both OA and SciHub
            if final_still_failed_pmids:
                logger.info(f"{len(final_still_failed_pmids)} PMIDs still missing after Sci-Hub attempts or validation: {sorted(final_still_failed_pmids)[:10]}...")
    else:
        logger.info("--- Sci-Hub Download Phase Skipped: No PMIDs failed the Open Access phase. ---")

    total_succeeded = len(oa_succeeded_pmids) + len(sci_hub_succeeded_pmids)
    total_time_taken = time.time() - t_start
    logger.info("--- Overall Summary ---")
    logger.info(f"Processed {len(pmids)} unique input PMIDs.")
    logger.info(f"Attempted downloads for {len(valid_pmids_with_meta)} PMIDs (those with metadata).")
    logger.info(f"Total PDFs successfully downloaded & validated: {total_succeeded} / {len(valid_pmids_with_meta)}.")
    logger.info(f"  - Via Open Access (Unpaywall/PMC): {len(oa_succeeded_pmids)}")
    logger.info(f"  - Via Sci-Hub: {len(sci_hub_succeeded_pmids)}")

    all_attempted_pmids = set(valid_pmids_with_meta)
    all_succeeded_pmids = set(oa_succeeded_pmids) | set(sci_hub_succeeded_pmids)
    # PMIDs that were attempted but are not in any succeeded list are the true failures
    final_truly_failed_pmids_for_log = sorted(all_attempted_pmids - all_succeeded_pmids)

    if final_truly_failed_pmids_for_log:
        logger.info(f"Total PMIDs ultimately NOT downloaded or failed validation: {len(final_truly_failed_pmids_for_log)}")

        failed_log_filename = "failed_articles_details.log"
        logger.info(f"Logging details of {len(final_truly_failed_pmids_for_log)} failed PMIDs to {os.path.join(OUTPUT_PDF_DIR, failed_log_filename)}")
        for pmid_fail in final_truly_failed_pmids_for_log:
            # The module-level LOGGED_FAILED_PMIDS set is passed as `logged_set`;
            # the original author's intent is to avoid re-logging a PMID when
            # main() runs more than once in the same session.
            log_enhanced_failure_details(
                failed_log_filename,
                pmid_fail,
                metadata_dict.get(pmid_fail, {}), # Use .get for safety
                "Full-text Retrieval Failed",
                details="Failed all download attempts (Open Access and Sci-Hub, or PDF validation failed for all sources).",
                logged_set=LOGGED_FAILED_PMIDS
            )
    else: # No truly failed PMIDs
        if total_succeeded == len(valid_pmids_with_meta) and len(valid_pmids_with_meta) > 0:
            logger.info("All requested PDFs (with metadata) successfully downloaded and validated!")
        elif len(valid_pmids_with_meta) == 0:
            logger.info("No PMIDs with metadata were available to attempt download.")
        # Remaining case: fewer successes than attempts, yet nothing flagged as
        # failed. That would mean some PMID was neither counted as succeeded
        # nor as failed, which should not happen; warn so it gets investigated.
        elif total_succeeded < len(valid_pmids_with_meta):
            logger.warning("Processing complete, but count of succeeded PDFs is less than attempted PMIDs with metadata, yet no specific PMIDs were flagged as finally failed. Please review logs for anomalies.")


    logger.info(f"Total execution time: {total_time_taken:.2f} seconds.")
    logger.info(f"=== PDF Fetcher v12-pow completed at {time.strftime('%Y-%m-%d %H:%M:%S')} ===")

if __name__ == "__main__":
    # These global sets outlive a single call to main() when the code is
    # re-run in the same interpreter session (e.g. a Jupyter kernel). Empty
    # them first so entries recorded by an earlier run do not carry over.
    # main() passes LOGGED_FAILED_PMIDS to log_enhanced_failure_details as
    # its `logged_set` argument.
    LOGGED_SUSPICIOUS_PMIDS.clear()
    LOGGED_FAILED_PMIDS.clear()
    main()

2025-05-22 13:41:52,044 - INFO - [MainThread] - main - === PDF Fetcher v12-pow started at 2025-05-22 13:41:52 ===
2025-05-22 13:41:52,257 - INFO - [MainThread] - main - Loaded 32 unique, valid PMIDs from C:\Users\Galaxy\Downloads\screening_ERAS.xlsx
2025-05-22 13:41:52,257 - INFO - [MainThread] - fetch_metadata - EFetch PubMed metadata for 32 PMIDs in 1 batch(es)...
2025-05-22 13:41:52,258 - INFO - [MainThread] - fetch_metadata - NCBI EFetch POST (batch 1/1) → IDs=39955421,40340819,39068053...
2025-05-22 13:41:53,152 - INFO - [MainThread] - main - Successfully fetched metadata for 32 PMIDs.
2025-05-22 13:41:53,154 - INFO - [MainThread] - main - PDFs will be saved to: c:\Users\Galaxy\LEVI\jupyter\litscape\downloaded_pdfs_v12_pow
2025-05-22 13:41:53,154 - INFO - [MainThread] - main - Suspicious/failed validation files will be in: c:\Users\Galaxy\LEVI\jupyter\litscape\downloaded_pdfs_v12_pow\suspicious_pdfs
2025-05-22 13:41:53,155 - INFO - [MainThread] - main - --- Starting Open Access Do