In [None]:
"""
Function: download_from_scihub

Description:
    This script uses the Sci-Hub method to download PDFs, mainly targeting articles published
    before 2021. Its logic is consistent with the way Zotero retrieves full texts.

Structure:
    1. The first part uses Selenium to open the Sci-Hub page of each article and automatically
       download its PDF.
       - Input:  a CSV file listing the articles (PMID and DOI columns); rows without a DOI
                 are skipped.
       - Output: the downloaded PDFs, saved as <PMID>.pdf in the "sci-hub" directory, which
                 the second part reads as src_dir and filters into dst_dir.

    2. The second part performs filtering based on different research needs:
       (1) Some copies served by Sci-Hub are preprint/postprint versions with watermarks, or
           otherwise outdated files. These can be filtered out for later manual updating.
       (2) Since a separate ScienceDirect supplementary downloader is prepared,
           papers from ScienceDirect can be filtered out for independent processing.
       (3) Papers confirmed to have no "Supplementary" or "Supporting Information" sections
           can be marked as completed.

Notes:
    This workflow ensures that older or incomplete Sci-Hub downloads are properly
    categorized and updated, maintaining data quality for further literature analysis.
"""


In [None]:
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import requests

# Base URL of the Sci-Hub mirror; the DOI is appended directly to it.
SCIHUB_URL = "https://sci-hub.ren/"

# Headers sent with the direct PDF request.
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Folder that receives the downloaded PDFs; created up front if it is missing.
SAVE_DIR = "sci-hub"
os.makedirs(SAVE_DIR, exist_ok=True)

def init_driver():
    """Build and return a Chrome WebDriver for the download run.

    Returns:
        A ``webdriver.Chrome`` instance started with the flags listed below.
    """
    opts = Options()
    # "--headless" is not passed, so a visible browser window opens.
    for flag in ("--disable-gpu", "--no-sandbox", "--log-level=3"):
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)


def download_from_doi(doi, pmid, driver):
    """Download one article's PDF from Sci-Hub and save it as ``<pmid>.pdf``.

    The Sci-Hub page for ``doi`` is loaded in ``driver``. The PDF location is
    read from the ``src`` of the page's first ``<iframe>`` (preferred) or,
    failing that, its first ``<embed>``, and is then fetched with ``requests``.

    Args:
        doi: DOI appended to ``SCIHUB_URL`` to build the page URL.
        pmid: Identifier used for the output file name and in log messages.
        driver: Selenium WebDriver used to load the page.

    Returns:
        True if a PDF was written into ``SAVE_DIR``, False on any failure.
        Failures are appended to ``error.log`` and printed; they are never
        raised to the caller.
    """
    # Function-scope import so the block does not depend on the file header.
    from urllib.parse import urljoin

    try:
        url = f"{SCIHUB_URL}{doi}#"
        driver.get(url)
        # Fixed pause after navigation, presumably so the viewer element is
        # present in page_source before it is parsed.
        time.sleep(3)

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Prefer the iframe; fall back to an embed element.
        viewer = soup.iframe if soup.iframe is not None else soup.embed
        if viewer is None:
            raise ValueError("No PDF iframe/embed found")

        src = (viewer.get("src") or "").strip()
        if not src:
            raise ValueError("PDF iframe/embed has no src attribute")

        # urljoin resolves protocol-relative ("//host/x"), root-relative
        # ("/x") and plain relative links against the mirror URL, and leaves
        # absolute URLs untouched.
        download_url = urljoin(SCIHUB_URL, src)

        r = requests.get(download_url, headers=HEADERS, timeout=60)
        r.raise_for_status()

        # A successful status does not guarantee a PDF body (it can be an
        # HTML page). Only accept content carrying the PDF header marker.
        if b"%PDF-" not in r.content[:1024]:
            raise ValueError("Downloaded content is not a PDF")

        fname = f"{pmid}.pdf"
        with open(os.path.join(SAVE_DIR, fname), "wb") as f:
            f.write(r.content)
        print(f"Downloaded {pmid}")
        return True

    except Exception as e:
        # Best-effort batch job: record the failure and let the caller go on.
        with open("error.log", "a+", encoding="utf-8") as error:
            error.write(f"{pmid}: {doi}: {e}\n")
        print(f" Failed {pmid}: {e}")
        return False


if __name__ == "__main__":
    import pandas as pd
    from tqdm import tqdm

    # Read every column as text, then keep only rows with a non-blank DOI.
    df = pd.read_csv("last_literature list - pubmed.csv", dtype=str)
    df = df[df["DOI"].notna() & (df["DOI"].str.strip() != "")]

    failed = []
    driver = init_driver()
    try:
        for _, row in tqdm(df.iterrows(), total=len(df)):
            pmid, doi = str(row["PMID"]).strip(), str(row["DOI"]).strip()
            success = download_from_doi(doi, pmid, driver)
            if not success:
                failed.append(pmid)
            # Pause between articles.
            time.sleep(2)
    finally:
        # Always close the browser, even if the loop is interrupted or raises,
        # so no Chrome process is left behind.
        driver.quit()

    # One failed PMID per line; the file is overwritten on every run.
    with open("failed_pmids.txt", "w", encoding="utf-8") as f:
        for pmid in failed:
            f.write(pmid + "\n")

    print(f"finish: {len(df)-len(failed)}, false: {len(failed)} ")


In [None]:
import os
import shutil
from PyPDF2 import PdfReader

# Folder scanned for PDFs.
src_dir = "sci-hub"
# Folder that receives the PDFs in which none of the keywords was found.
dst_dir = "nos_sci"
os.makedirs(dst_dir, exist_ok=True)

# Phrases searched for (case-insensitively) in the extracted PDF text.
# Each entry must be its own string with the separating comma outside the
# quotes; a comma inside the literal becomes part of the search phrase.
keywords = [
    # "supplementary", "supporting information",
    "uncorrected author proof",
    "accepted manuscript",
    "preprint",
    "postprint",
    # "elsevier"
]

# Lower-case the keywords once instead of on every comparison.
keywords_lower = [kw.lower() for kw in keywords]

# Move every PDF that contains none of the keywords from src_dir to dst_dir;
# PDFs that contain a keyword stay in src_dir.
for fname in os.listdir(src_dir):
    if not fname.lower().endswith(".pdf"):
        continue

    fpath = os.path.join(src_dir, fname)
    try:
        reader = PdfReader(fpath)
        # Join all page texts in one pass and lower-case the result once;
        # `or ""` covers pages for which extract_text() yields nothing.
        text = "".join(page.extract_text() or "" for page in reader.pages).lower()

        found = any(kw in text for kw in keywords_lower)

        if not found:
            shutil.copy(fpath, os.path.join(dst_dir, fname))
            os.remove(fpath)

    except Exception as e:
        # Best-effort: report the file and continue with the next one.
        print(f"⚠️ Error reading {fname}: {e}")
