In [None]:
"""
Function: download_from_unpaywall

Description:
    This notebook retrieves open-access article PDFs through the Unpaywall service.
    For each DOI extracted from the input CSV file, it queries the Unpaywall API for an
    open-access PDF location and attempts to download the corresponding full-text PDF.

Inputs:
    csv_path : str
        Path to the input CSV file (e.g., "literature list - pubmed.csv") containing a 'DOI' column.
    out_dir : str
        Directory where downloaded PDF files will be saved (e.g., "papers").
    email : str
        Contact email address (must end with ".edu") required by the Unpaywall API for access.

Process:
    - Extract DOI values from the CSV file.
    - Build Unpaywall URLs using the DOI and email.
    - Attempt to download each article’s PDF and save it to the output directory.

Notes:
    - Some PDFs may be downloaded incorrectly or be unreadable.
    - Check and remove any leftover ".part" files or invalid PDF files.
"""


In [None]:
import os
import re
import time
from pathlib import Path
from urllib.parse import quote

import pandas as pd
import requests
from tqdm import tqdm

In [None]:
def normalize_col(col: str) -> str:
    """Turn a column header into a lowercase lookup key.

    Surrounding whitespace is stripped, the text is lowercased, and every run
    of characters outside ``a-z0-9`` is collapsed into a single underscore.
    """
    lowered = col.strip().lower()
    return re.sub(r"[^a-z0-9]+", "_", lowered)


def find_col(cols, candidates):
    """Return the original column name matching the first usable candidate.

    Both the column names and the candidates are compared through
    ``normalize_col``. Candidates are tried in order; ``None`` is returned
    when none of them matches. If several columns normalize to the same key,
    the last one in ``cols`` wins.
    """
    lookup = {}
    for original in cols:
        lookup[normalize_col(original)] = original

    wanted_keys = map(normalize_col, candidates)
    return next((lookup[k] for k in wanted_keys if k in lookup), None)

def safe_filename(name: str, maxlen: int = 150) -> str:
    """Build a ``.pdf`` file name from arbitrary text.

    Runs of the characters ``\\ / : * ? " < > |`` become a single underscore,
    whitespace runs become a single space, and the result is trimmed and cut
    to ``maxlen`` characters. If nothing is left, ``"paper"`` is used as the stem.
    """
    cleaned = re.sub(r"[\\/:*?\"<>|]+", "_", name)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    stem = cleaned[:maxlen].rstrip()
    if not stem:
        stem = "paper"
    return stem + ".pdf"



In [None]:
def get_pdf_url_from_unpaywall(doi: str, email: str, session: "requests.Session"):
    """Ask the Unpaywall v2 API for an open-access PDF URL for ``doi``.

    Args:
        doi: Bare DOI (no ``https://doi.org/`` prefix).
        email: Contact address sent as the ``email`` query parameter.
        session: Session (or any object with a compatible ``get``) used for the request.

    Returns:
        A ``(pdf_url, reason)`` tuple. On success ``pdf_url`` is the URL and
        ``reason`` is ``""``. Otherwise ``pdf_url`` is ``None`` and ``reason`` is
        ``"unpaywall_status_<code>"`` for a non-200 reply or ``"no_oa_pdf"`` when
        no location carries a ``url_for_pdf``.

    Raises:
        Whatever ``session.get`` or ``response.json()`` raise; nothing is caught here.
    """
    # Percent-encode the DOI so characters such as "#", "?" or "%" cannot be
    # misread as URL delimiters. Characters that are legal inside a path are
    # left alone, so ordinary DOIs produce the same URL as before.
    encoded_doi = quote(doi, safe="/:@!$&'()*+,;=")
    endpoint = f"https://api.unpaywall.org/v2/{encoded_doi}"

    # Passing the email through ``params`` lets the HTTP client encode it
    # (e.g. "+" in an address would otherwise be decoded as a space).
    r = session.get(endpoint, params={"email": email}, timeout=30)
    if r.status_code != 200:
        return None, f"unpaywall_status_{r.status_code}"

    data = r.json()

    # Same priority as before: the best location first, then every listed location.
    candidates = [data.get("best_oa_location")]
    candidates.extend(data.get("oa_locations") or [])
    for loc in candidates:
        if isinstance(loc, dict) and loc.get("url_for_pdf"):
            return loc["url_for_pdf"], ""
    return None, "no_oa_pdf"

In [None]:
def stream_download(url: str, out_path: Path, session: requests.Session,
                    chunk=1024 * 256, timeout=60):
    """Stream ``url`` to ``out_path`` with a progress bar.

    The body is first written to ``<out_path>.part`` and only renamed to
    ``out_path`` once the whole response has been received, so an interrupted
    or failed transfer never leaves a truncated file under the final name.

    Args:
        url: Address to fetch (redirects are followed).
        out_path: Final destination; parent directories are created as needed.
        session: Session used for the GET request.
        chunk: Chunk size in bytes passed to ``iter_content``.
        timeout: Timeout in seconds passed to ``session.get``.

    Raises:
        requests.HTTPError: via ``raise_for_status`` on a 4xx/5xx reply.
        Any other exception raised by the request or by file I/O.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    part_path = out_path.with_name(out_path.name + ".part")

    try:
        with session.get(url, stream=True, timeout=timeout, allow_redirects=True) as r:
            r.raise_for_status()
            try:
                total = int(r.headers.get("Content-Length", "0")) or None
            except ValueError:
                # A malformed header only costs us the progress total.
                total = None
            with open(part_path, "wb") as f, tqdm(total=total, unit="B", unit_scale=True,
                                                   desc=out_path.name) as pbar:
                for chunk_data in r.iter_content(chunk_size=chunk):
                    if chunk_data:
                        f.write(chunk_data)
                        pbar.update(len(chunk_data))
        # Promote the finished file; replace() overwrites an existing target.
        part_path.replace(out_path)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the partial file, then re-raise.
        try:
            if part_path.exists():
                part_path.unlink()
        except OSError:
            pass
        raise

In [None]:
# Downloads at or below this size are treated as failed (same threshold as before).
_MIN_PDF_BYTES = 1024


def _cell_text(row, col) -> str:
    """Return the cell ``row[col]`` as stripped text, or ``""`` when missing/NaN."""
    if col is None:
        return ""
    value = row.get(col, "")
    if pd.isna(value):
        # Empty CSV cells arrive as NaN; str(NaN) would give the literal "nan".
        return ""
    if isinstance(value, float) and value.is_integer():
        # Numeric IDs in a column with gaps are read as floats (12345.0).
        value = int(value)
    return str(value).strip()


def _clean_doi(doi: str) -> str:
    """Strip a leading ``doi:`` or ``http(s)://(dx.)doi.org/`` prefix from ``doi``."""
    return re.sub(r"^(?:https?://(?:dx\.)?doi\.org/|doi:\s*)", "", doi,
                  flags=re.IGNORECASE).strip()


def _discard(path: Path) -> None:
    """Best-effort removal of ``path``; a missing file or OS error is ignored."""
    try:
        if path.exists():
            path.unlink()
    except OSError:
        pass


def download_from_unpaywall(csv_path: str, out_dir: str, email: str,
                            sleep: float = 1.0, overwrite: bool = False):
    """Download open-access PDFs for every DOI listed in a CSV file.

    For each row the DOI is looked up with ``get_pdf_url_from_unpaywall`` and,
    if a PDF URL is found, fetched with ``stream_download``. The file is named
    after the PMID when present, otherwise after the DOI. Each download is
    staged as ``<name>.pdf.part`` and only moved to its final name when it is
    larger than 1024 bytes, so failed attempts are retried on the next run
    instead of being reported as "exists".

    Args:
        csv_path: CSV file with a ``DOI`` column (``Title`` and ``PMID`` are optional).
        out_dir: Output directory; created if needed.
        email: Contact email passed to the Unpaywall lookup.
        sleep: Seconds to wait after each lookup/download attempt.
        overwrite: When ``False``, rows whose target file already exists are skipped.

    Writes to ``out_dir``:
        ``download_report.csv`` (one row per CSV row), ``failed_no_oa.txt``,
        ``failed_errors.txt`` and ``successes.txt``.

    Raises:
        SystemExit: if the CSV has no DOI column.
    """
    out_root = Path(out_dir)
    out_root.mkdir(parents=True, exist_ok=True)

    report_rows = []
    failed_no_oa, failed_errors, successes = [], [], []

    df = pd.read_csv(csv_path)
    col_doi = find_col(df.columns, ["DOI"])
    col_title = find_col(df.columns, ["Title"])
    col_pmid = find_col(df.columns, ["PMID"])

    if not col_doi:
        raise SystemExit("No DOI")

    # The context manager closes pooled connections when the run ends.
    with requests.Session() as session:
        session.headers.update({"User-Agent": "UnpaywallBulkDownloader/1.0"})

        for idx, row in df.iterrows():
            doi = _clean_doi(_cell_text(row, col_doi))
            title = _cell_text(row, col_title)
            pmid = _cell_text(row, col_pmid)

            if not doi:
                report_rows.append(dict(idx=idx, doi="", title=title,
                                        status="skip", url="", filepath="", reason="missing_doi"))
                continue

            fname = safe_filename(pmid if pmid else doi.replace("/", "_"))
            dst = out_root / fname
            if dst.exists() and not overwrite:
                report_rows.append(dict(idx=idx, doi=doi, title=title,
                                        status="exists", url="", filepath=str(dst.resolve()), reason=""))
                continue

            try:
                pdf_url, reason = get_pdf_url_from_unpaywall(doi, email, session)
            except Exception as e:
                failed_errors.append(doi)
                report_rows.append(dict(idx=idx, doi=doi, title=title,
                                        status="error", url="", filepath="", reason=f"unpaywall_error:{e}"))
                time.sleep(sleep)
                continue

            if not pdf_url:
                failed_no_oa.append(doi)
                report_rows.append(dict(idx=idx, doi=doi, title=title,
                                        status="no_oa", url="", filepath="", reason=reason))
                time.sleep(sleep)
                continue

            # Stage the download next to the target; promote it only if it looks complete.
            staged = dst.with_name(dst.name + ".part")
            try:
                stream_download(pdf_url, staged, session)
                if staged.exists() and staged.stat().st_size > _MIN_PDF_BYTES:
                    staged.replace(dst)
                    successes.append(str(dst.resolve()))
                    report_rows.append(dict(idx=idx, doi=doi, title=title,
                                            status="ok", url=pdf_url, filepath=str(dst.resolve()), reason=""))
                else:
                    _discard(staged)
                    failed_errors.append(doi)
                    report_rows.append(dict(idx=idx, doi=doi, title=title,
                                            status="error", url=pdf_url, filepath=str(dst.resolve()), reason="empty_file"))
            except Exception as e:
                _discard(staged)
                failed_errors.append(doi)
                report_rows.append(dict(idx=idx, doi=doi, title=title,
                                        status="error", url=pdf_url, filepath=str(dst.resolve()), reason=f"download_error:{e}"))

            time.sleep(sleep)

    pd.DataFrame(report_rows).to_csv(out_root / "download_report.csv", index=False)
    (out_root / "failed_no_oa.txt").write_text("\n".join(failed_no_oa), encoding="utf-8")
    (out_root / "failed_errors.txt").write_text("\n".join(failed_errors), encoding="utf-8")
    (out_root / "successes.txt").write_text("\n".join(successes), encoding="utf-8")

    print(f"""
  report: {out_root / 'download_report.csv'}
  no OA: {out_root / 'failed_no_oa.txt'}
  failed: {out_root / 'failed_errors.txt'}
  successes: {out_root / 'successes.txt'}
""")


In [None]:
# Run configuration for the bulk download.
csv_path = "literature list - pubmed.csv"  # input CSV; the run aborts if it has no DOI column
out_dir = "papers"  # PDFs and the report/summary files are written here
# NOTE(review): ".edu" looks like a placeholder rather than a complete address --
# presumably it has to be replaced with a real contact email before running; confirm.
email = ".edu"

# Pause one second after each DOI; files already present in out_dir are kept (overwrite=False).
download_from_unpaywall(csv_path, out_dir, email, sleep=1.0, overwrite=False)

In [None]:
import os

# Remove leftover partial downloads (*.part) from the output folder.
# os.path.join keeps the path portable: the former ".\papers" literal contained
# an invalid "\p" escape sequence and only resolved on Windows.
folder_path = os.path.join(".", "papers")

# Nothing to clean if the folder was never created.
if os.path.isdir(folder_path):
    for filename in os.listdir(folder_path):
        if filename.endswith(".part"):
            file_path = os.path.join(folder_path, filename)
            try:
                os.remove(file_path)
            except OSError as e:
                # Best effort: report the failure and continue with the remaining files.
                print(e)

print("delete finished")
