## Attempt to download all PDFs found in the list of URLs.
### Only about 15% are downloaded successfully. This is because the parser is unable to find the PDF link on most pages.


In [None]:

import os
import time
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

# === CONFIGURATION ===
# The path is assembled with os.path.join so it contains no backslash escape
# sequences (no SyntaxWarning) and resolves correctly on any operating system.
input_csv = os.path.join("datasets", "open_access_enriched.csv")  # your enriched file
url_column = "Open Access URL"              # name of column with links
doi_column = "DOI"                          # name of DOI column (for filename)
pdf_folder = "papers_1"                     # folder the downloaded PDFs are written to

# === SETUP ===
os.makedirs(pdf_folder, exist_ok=True)      # create the output folder if it is missing
df = pd.read_csv(input_csv)
success_count = 0                           # running total of rows that produced a PDF
fail_count = 0                              # running total of rows that did not

# Characters that cannot appear in a Windows file name (plus "/"); each one is
# replaced by "_" when a DOI is turned into a file name.
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})


def _looks_like_pdf(content):
    """Return True if the bytes *content* carry a PDF header near their start."""
    return b"%PDF-" in content[:1024]


def _save_if_pdf(response, filename):
    """Write the body of *response* to *filename* if it is a 200 reply holding a PDF.

    Returns True when the file was written, False otherwise.
    """
    if response.status_code != 200 or not _looks_like_pdf(response.content):
        return False
    with open(filename, "wb") as f:
        f.write(response.content)
    return True


def _candidate_pdf_links(html, base_url):
    """Return de-duplicated absolute http(s) URLs in *html* that may point to a PDF.

    Links from ``<meta name="citation_pdf_url">`` come first, followed by every
    ``<a href>`` whose target contains "pdf" (case-insensitive).
    """
    soup = BeautifulSoup(html, "html.parser")
    raw_links = [
        meta["content"]
        for meta in soup.find_all("meta", attrs={"name": "citation_pdf_url"})
        if meta.get("content")
    ]
    raw_links += [a["href"] for a in soup.find_all("a", href=True) if "pdf" in a["href"].lower()]
    # urljoin resolves absolute, root-relative, path-relative and "//host" links.
    absolute = (urljoin(base_url, link) for link in raw_links)
    # Drop javascript:/mailto: style targets that cannot be fetched over HTTP.
    return list(dict.fromkeys(u for u in absolute if u.startswith(("http://", "https://"))))


def try_download_pdf(doi, url, timeout=10):
    """Try to download the PDF behind *url* and save it under a name derived from *doi*.

    The URL is fetched once. If the reply is itself a PDF it is saved directly;
    otherwise the reply is parsed as HTML and each candidate PDF link is tried
    in turn until one returns a real PDF.

    Args:
        doi: DOI of the paper; characters unsafe in file names are replaced by "_".
        url: Landing-page or direct PDF URL. Missing values (None, NaN, empty
            string, or any non-string) make the function return False.
        timeout: Seconds passed to every HTTP request as its timeout.

    Returns:
        True if a PDF was written to ``pdf_folder``, False otherwise. Network,
        parsing and file errors are reported as False rather than raised.
    """
    # Reject missing / non-text URLs up front (covers None and pandas NaN floats).
    if not isinstance(url, str) or not url.strip():
        return False
    url = url.strip()

    filename = os.path.join(pdf_folder, str(doi).translate(_UNSAFE_FILENAME_CHARS) + ".pdf")
    headers = {
        "User-Agent": "Mozilla/5.0"
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Direct PDF (with or without a ".pdf" suffix in the URL)
        if _save_if_pdf(response, filename):
            return True

        # Scrape page for PDF links; response.url is the address after redirects,
        # which is the correct base for resolving relative links.
        for link in _candidate_pdf_links(response.text, response.url):
            try:
                candidate = requests.get(link, headers=headers, timeout=timeout)
            except requests.RequestException:
                # One unreachable candidate must not stop the remaining ones.
                continue
            if _save_if_pdf(candidate, filename):
                return True
    except Exception:
        # Deliberate best-effort: any failure for this paper counts as "not downloaded".
        return False

    return False

# === MAIN LOOP ===
# Walk the DOI and URL columns in lockstep. `position` is a 1-based row counter,
# so the progress report stays correct even if the DataFrame index is not 0..n-1.
total = len(df)
for position, (raw_doi, url) in enumerate(zip(df[doi_column], df[url_column]), start=1):
    # A missing DOI would otherwise turn into the literal name "nan", making every
    # such row overwrite the same file; give each one its own placeholder name.
    doi = f"missing_doi_row_{position}" if pd.isna(raw_doi) else str(raw_doi)

    if try_download_pdf(doi, url):
        success_count += 1
    else:
        fail_count += 1

    # Report progress every 10 rows.
    if position % 10 == 0:
        print(f"Processed {position}/{total} — ✅ {success_count} downloaded | ❌ {fail_count} failed")

print(f"\n🎉 Done! PDFs saved in folder: {pdf_folder}")
print(f"✅ Total successful: {success_count}")
print(f"❌ Total failed: {fail_count}")


  input_csv = "datasets\open_access_enriched.csv"      # your enriched file


Processed 10/1499 — ✅ 1 downloaded | ❌ 9 failed
Processed 20/1499 — ✅ 2 downloaded | ❌ 18 failed
Processed 30/1499 — ✅ 3 downloaded | ❌ 27 failed
Processed 40/1499 — ✅ 3 downloaded | ❌ 37 failed
Processed 50/1499 — ✅ 5 downloaded | ❌ 45 failed
Processed 60/1499 — ✅ 5 downloaded | ❌ 55 failed
Processed 70/1499 — ✅ 7 downloaded | ❌ 63 failed
Processed 80/1499 — ✅ 8 downloaded | ❌ 72 failed
Processed 90/1499 — ✅ 11 downloaded | ❌ 79 failed
Processed 100/1499 — ✅ 13 downloaded | ❌ 87 failed
Processed 110/1499 — ✅ 15 downloaded | ❌ 95 failed
Processed 120/1499 — ✅ 17 downloaded | ❌ 103 failed
Processed 130/1499 — ✅ 18 downloaded | ❌ 112 failed
Processed 140/1499 — ✅ 18 downloaded | ❌ 122 failed
Processed 150/1499 — ✅ 18 downloaded | ❌ 132 failed
Processed 160/1499 — ✅ 19 downloaded | ❌ 141 failed
Processed 170/1499 — ✅ 21 downloaded | ❌ 149 failed
Processed 180/1499 — ✅ 21 downloaded | ❌ 159 failed
Processed 190/1499 — ✅ 24 downloaded | ❌ 166 failed
Processed 200/1499 — ✅ 27 downloaded | ❌ 