### This file downloads papers to a desired location given their DOIs. It should be used right after obtaining a list of DOIs of papers similar to a seed paper (see get_similar_papers.ipynb).

In [1]:
import time
import bibtexparser
import requests
import urllib
import os
import urllib.request

from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium import webdriver
from urllib.parse import quote


In [3]:
# Build the Chrome options used by the shared Selenium driver.
options = webdriver.ChromeOptions()

# Run without a visible window and present a desktop Chrome user agent.
options.add_argument('--headless')
options.add_argument(
    'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
)

# Remaining browser switches, passed to Chrome in this order.
options.add_argument("enable-automation")
options.add_argument("--window-size=1920,1080")
options.add_argument("--no-sandbox")
options.add_argument("--disable-extensions")
options.add_argument("--dns-prefetch-disable")
options.add_argument("--disable-gpu")

# Download preferences: papers are saved under "download.default_directory"
# without a save prompt; "plugins.always_open_pdf_externally" is set so PDFs
# are handed to the downloader (presumably instead of the built-in viewer).
options.add_experimental_option(
    'prefs',
    {
        "download.default_directory": "/home/bowenyi/side project/get seedpapers/seed paper 3",  # Papers will be downloaded at this address
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "plugins.always_open_pdf_externally": True,
    },
)

# Single browser instance reused by every download below.
driver = webdriver.Chrome(options=options)

In [4]:
# Replace with your ScienceDirect (Elsevier) API key and insttoken.
# The insttoken has to be requested from sciencedirect.com.
# Both values are used as default arguments of download_science_direct below.
science_direct_api_key = ""  
science_direct_insttoken = ""


In [5]:
def download_science_direct(doi, api_key=science_direct_api_key, insttoken=science_direct_insttoken,
                            output_dir="seed paper 3"):
    """Download a paper's PDF through the Elsevier article API and save it to disk.

    Args:
        doi: DOI of the article; it is inserted into the request URL as given.
        api_key: Elsevier API key sent as the ``apiKey`` query parameter.
        insttoken: Institutional token sent as the ``insttoken`` query parameter.
        output_dir: Directory the PDF is written to. It is created if missing.
            Defaults to the folder the notebook originally hard-coded.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for non-2xx
            responses); both are subclasses of ``OSError``.
        OSError: If the output file or directory cannot be written.
    """
    url = f"https://api.elsevier.com/content/article/doi/{doi}?apiKey={api_key}&httpAccept=application%2Fpdf&insttoken={insttoken}"

    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(url) as response:
        pdf_content = response.read()

    os.makedirs(output_dir, exist_ok=True)
    # "/" in the DOI would otherwise be read as a sub-directory separator.
    filename = os.path.join(output_dir, f"{doi.replace('/', '_')}.pdf")
    with open(filename, 'wb') as f:
        f.write(pdf_content)


In [7]:
# Module-level result logs filled in by download_pdf: DOIs that could not be
# fetched and DOIs for which a download was triggered.
not_retrieved, retrieved = [], []

In [8]:
def _open_pdf_link(driver, href, doi, wait=1):
    """Navigate *driver* to *href* and, if that did not raise, log *doi* as retrieved.

    Returns True when the navigation succeeded, False when *href* is empty or
    ``driver.get`` raised. "Retrieved" only means the navigation was issued
    without error; the file itself is not checked.
    """
    if not href:
        return False
    try:
        driver.get(href)
    except Exception:
        # Best effort: a failing link must not abort the remaining strategies.
        # Exception (not a bare except) so Ctrl-C still stops a long run.
        return False
    if wait:
        time.sleep(wait)
    retrieved.append(doi)
    return True


def _open_first_css_link(driver, css_selector, doi):
    """Open the first <a> matching *css_selector* that has a usable href.

    Returns True when a link was opened and *doi* recorded, False otherwise
    (no match, lookup error, or every candidate failed to load).
    """
    try:
        anchors = driver.find_elements(By.CSS_SELECTOR, css_selector)
        # Read hrefs up front: the elements go stale once the driver navigates.
        hrefs = [anchor.get_attribute('href') for anchor in anchors]
    except Exception:
        return False
    for href in hrefs:
        if _open_pdf_link(driver, href, doi):
            return True
    return False


def download_pdf(driver, doi):
    """Try to download the paper identified by *doi* with the given Selenium driver.

    The DOI is resolved through https://doi.org/, then publisher-specific
    strategies are tried based on the landing URL, followed by generic
    link/form heuristics. The download location is the one configured on the
    driver ("download.default_directory" in the Chrome prefs).

    Args:
        driver: Selenium WebDriver used for all navigation.
        doi: DOI string. It may be percent-encoded (e.g. ``10%2E1016%2F...``);
            it is decoded for URL building and link matching, while the
            original string is what gets logged.

    Returns:
        None. *doi* is appended to the module-level ``retrieved`` list when a
        strategy succeeds and to ``not_retrieved`` otherwise.
    """
    # Hrefs on publisher pages carry the plain DOI, so match against that form.
    plain_doi = urllib.parse.unquote(doi)

    driver.get("https://doi.org/" + plain_doi)
    time.sleep(1)
    current_url = driver.current_url

    if "sciencedirect" in current_url:
        try:
            download_science_direct(doi=doi)
        except OSError as err:
            # HTTP/network/file errors: fall through to the generic strategies
            # instead of aborting the whole batch.
            print(f"ScienceDirect API download failed for {doi}: {err}")
        else:
            retrieved.append(doi)
            return

    # Last path segment of the landing URL, without any query string.
    url_find = current_url.split("/")[-1].split("?")[0]
    doi_suffix = plain_doi.split("/")[-1]

    # journals.sagepub.com
    if "journals.sagepub.com" in current_url:
        try:
            driver.get(f"https://journals.sagepub.com/doi/reader/{plain_doi}")
            time.sleep(1)
        except Exception:
            pass
        else:
            if _open_first_css_link(
                    driver, "a[class*='btn--light btn format-download-btn download']", doi):
                return

    # Emerald
    if "www.emerald.com" in current_url:
        # NOTE(review): the original branch was unfinished; this assumes the PDF
        # lives at the same path with ".../full/html" -> ".../full/pdf" - confirm.
        href = current_url.replace("/full/html", "/full/pdf")
        if href != current_url and _open_pdf_link(driver, href, doi):
            return

    # Taylor & Francis
    if "www.tandfonline.com" in current_url:
        parts = current_url.split("full/")
        if len(parts) > 1:
            # DOI as it appears in the landing URL; kept separate from *doi* so
            # the logged identifier stays the one the caller passed in.
            url_doi = parts[1].split("#")[0]
            if _open_pdf_link(
                    driver, f"https://www.tandfonline.com/doi/pdf/{url_doi}/?download=true", doi):
                return

    # ieee
    if "ieeexplore.ieee.org" in current_url:
        doc_number = current_url.split("/")[-1]
        pdf_link = f"https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber={doc_number}"
        print(pdf_link)
        if _open_pdf_link(driver, pdf_link, doi):
            return

    # plos:
    if "journals.plos.org" in current_url:
        parts = current_url.split("?")
        prefix, query = parts[0], parts[-1]
        if _open_pdf_link(driver, f"{prefix}/file?{query}&type=printable", doi):
            return

    # JSTOR: (can't bypass the wall)
    if "www.jstor.org" in current_url:
        not_retrieved.append(doi)
        return

    # agupubs.onlinelibrary.wiley.com
    if "agupubs.onlinelibrary.wiley.com" in current_url:
        href = f"https://agupubs.onlinelibrary.wiley.com/doi/pdfdirect/{plain_doi}?download=true"
        if _open_pdf_link(driver, href, doi):
            return

    # www.mdpi.com
    if "www.mdpi.com" in current_url:
        if _open_first_css_link(driver, "a[class*='UD_ArticlePDF']", doi):
            return

    # AMS Publications
    if "journals.ametsoc.org" in current_url:
        href = current_url.replace("view", "downloadpdf/view").replace(".xml", ".pdf")
        if _open_pdf_link(driver, href, doi):
            return

    # Now publisher
    if "nowpublishers" in current_url:
        if _open_pdf_link(
                driver, f"https://www.nowpublishers.com/article/Download/{url_find}", doi):
            return

    # a general case for many paper websites
    if _open_first_css_link(driver, "a[class*='article-pdfLink']", doi):
        return

    # www.degruyter.com
    if "degruyter.com" in current_url:
        if _open_first_css_link(
                driver,
                "a[class*='ga_download_button_pdf_article downloadCompletePdfArticle downloadPdf btn btn-primary fw-bold py-2 w-100 vgwort-click']",
                doi):
            return

    # academic.oup.com
    if "academic.oup.com" in current_url:
        if _open_first_css_link(driver, "a.al-link.pdf.article-pdfLink", doi):
            return

    # Href fragments that commonly identify a direct PDF link.
    href_conditions = [
        doi_suffix + ".pdf",
        doi_suffix + "-pdf",
        doi_suffix + "/pdf",
        "epdf/" + plain_doi,
        "reader/" + plain_doi,
        url_find + "/pdf",
        url_find + "/pdfft",
    ]

    # Search for href-based downloads
    for condition in href_conditions:
        try:
            # Find elements where the href contains the condition
            elements = driver.find_elements(By.XPATH, f"//a[contains(@href, '{condition}')]")
            hrefs = [element.get_attribute('href') for element in elements]
        except Exception:
            # e.g. a quote in the condition makes the XPath invalid; try the next one.
            continue
        for href in hrefs:
            if _open_pdf_link(driver, href, doi, wait=0):
                return

    # Additional check for a condition that occurs in ScienceDirect papers:
    # doi and "pdf" present without concatenation
    try:
        elements = driver.find_elements(By.XPATH, "//a[contains(@href, 'pdf')]")
        hrefs = [element.get_attribute('href') for element in elements]
    except Exception:
        hrefs = []
    for href in hrefs:
        if href and "pdf" in href and (plain_doi in href or url_find in href):
            if _open_pdf_link(driver, href, doi, wait=0):
                return

    # Check if there is a form for PDF download
    try:
        form = driver.find_element(By.XPATH, "//form[contains(@class, 'ft-download-content__form ft-download-content__form--pdf js-ft-download-form')]")
        form.submit()
    except Exception:
        pass
    else:
        time.sleep(1)
        retrieved.append(doi)
        return

    not_retrieved.append(doi)
    

#### Sample usage:

In [9]:
import json

# Load the filtered seed-paper records; each record supplies its own DOI and
# the DOIs of its references.
with open("filtered_papers/filtered_paper_3.json", encoding="utf-8") as file:
    papers = json.load(file)

# Collect every DOI in file order, dropping the literal "DOI " marker that
# some entries carry.
doi_list = []
for paper in papers:
    doi = paper["doi"]
    doi_list.append(doi.replace("DOI ", ""))

    for ref in paper['references']:
        doi_list.append(ref['doi'].replace("DOI ", ""))

# The first seven DOIs are skipped (presumably handled in an earlier run -
# adjust the slice as needed).
try:
    for doi in doi_list[7:]:
        download_pdf(driver, doi)
finally:
    # Always close the browser, even if a download raises part-way through.
    driver.quit()


In [10]:
# Report how many DOIs ended up in each result list.
print(
    f"Retrieved {len(retrieved)} papers. {len(not_retrieved)} papers unretrieved."
)

Retrieved 91 papers. 6 papers unretrieved.


In [12]:
# Display the DOIs that could not be downloaded (the cell output follows).
not_retrieved

['10%2E1016%2Fj%2Eaogh%2E2015%2E08%2E008',
 '10%2E1016%2Fj%2Eaogh%2E2014%2E09%2E007',
 '10%2E1108%2FDPM%2D09%2D2013%2D0152',
 '10%2E12854%2Ferde%2D147%2D7',
 '10%2E12854%2Ferde%2D147%2D10',
 '10%2E3167%2Fnc%2E2010%2E050201']