In [1]:
import os
import re
import shutil
import time
import numpy as np
import pandas as pd
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

from progressbar import ProgressBar

In [22]:
# Configuration: URL template used to resolve a DOI, and working folders.
# Alternative resolver: "https://www.doi.org/{doi:s}"
doi_url_pattern = "https://sci-hub.ru/{doi:s}"
working_dir = os.getcwd()
download_path = os.path.join(working_dir, "download")  # where the browser saves files
save_path = os.path.join(working_dir, "pdf")           # where renamed PDFs end up

# Load the progress table and keep only the columns used by this notebook.
# `.loc[:, cols]` is used on purpose: it yields an independent frame that
# can be assigned into later without pandas copy warnings.
keep_columns = ["id", "title", "doi", "pdf"]
df = pd.read_csv("./progress.csv").loc[:, keep_columns]
print(df.shape[0])
df.head(2)

419


Unnamed: 0,id,title,doi,pdf
0,5,3D near infrared and ultrasound imaging of per...,10.1007/978-3-319-46726-9_45,True
1,12,A Brain-Computer Interface Based on a Few-Chan...,10.1109/ACCESS.2016.2637409,True


In [23]:
# Launch a Chrome instance that saves downloads into `download_path`.
chrome_prefs = {
    # Popup content setting passed through to Chrome as-is.
    "profile.default_content_settings.popups": 0,
    # Send every download to the notebook's download folder.
    "download.default_directory": download_path,
}
options = webdriver.ChromeOptions()
options.add_experimental_option("prefs", chrome_prefs)

# The chromedriver binary is expected next to the notebook.
driver = webdriver.Chrome("./chromedriver.exe", options=options)

In [4]:
def download_wait(directory, timeout=30, nfiles=None):
    """
    Poll a folder once per second until no download is in progress.

    A download counts as "in progress" while any file in the folder ends
    with ``.crdownload``. When ``nfiles`` is given (and non-zero), the
    folder must additionally contain exactly that many entries.

    Args
    ----
    directory : str
        Folder that receives the downloaded files.
    timeout : int
        Maximum number of one-second polls before giving up.
    nfiles : int, defaults to None
        Expected number of entries in the folder; ignored when falsy.

    Returns
    -------
    int
        Number of polls performed (roughly the seconds waited).

    """
    elapsed = 0
    pending = True
    while pending and elapsed < timeout:
        # Sleep first so the browser has a moment to create its partial file.
        time.sleep(1)
        names = os.listdir(directory)

        wrong_count = bool(nfiles) and len(names) != nfiles
        still_downloading = any(name.endswith('.crdownload') for name in names)
        pending = wrong_count or still_downloading

        elapsed += 1
    return elapsed

In [None]:
# Iterate through the papers and download every PDF not yet marked as done.
# Raw string so that "\." reaches the regex engine as an escaped dot instead
# of being parsed as an (invalid) Python string escape.
re_pdf = re.compile(r".*/(?P<pdf_name>.*?\.pdf).*")
pbar = ProgressBar(max_value=len(df))

# Both folders must exist: download_wait() lists the first one and
# shutil.move() writes into the second one.
os.makedirs(download_path, exist_ok=True)
os.makedirs(save_path, exist_ok=True)

for index, row in df.iterrows():

    # Skip papers that already have a PDF.
    if row.pdf:
        pbar.update(index+1)
        continue

    # Bound outside the try block so the error report below always refers
    # to the current row.
    doi = row.doi

    try:
        url = doi_url_pattern.format(doi=doi)
        driver.get(url)

        # find_element(By.XPATH, ...) works on both Selenium 3 and 4; the
        # older find_element_by_xpath helper was removed in Selenium 4.3.
        button = driver.find_element(By.XPATH, "//div[@id='buttons']//button")

        # Read the target file name before clicking: afterwards the element
        # may have gone stale, and a missing link should not start a download.
        download_href = button.get_attribute("onclick")
        match = re_pdf.match(download_href or "")
        if match is None:
            raise ValueError(
                "no pdf link found in button onclick: {!r}".format(download_href))
        pdf_name = match.group("pdf_name")

        button.click()

        # Wait until downloaded.
        download_wait(download_path)

        # Move the file to the save folder, named after the DOI
        # ("/" is not allowed in file names, so it becomes "__").
        shutil.move(os.path.join(download_path, pdf_name),
                    os.path.join(save_path, "{}.pdf".format(doi.replace("/", "__"))))

        # Record.
        df.loc[index, "pdf"] = True

    except Exception as e:
        # Best effort: report the failing paper and carry on with the next.
        print(index, end=":")
        print(doi, end=" ")
        print(str(e))

    # Update progress.
    pbar.update(index+1)

    # Random 5-9 second pause between requests.
    time.sleep(np.random.randint(5, 10))

In [25]:
# Persist the updated progress table without the row index.
# `index` is documented as a bool, so pass False rather than relying on
# None being treated as falsy.
df.to_csv("progress.csv", index=False)

In [26]:
# Preview the first five rows of the progress table.
df.head(5)

Unnamed: 0,id,title,doi,pdf
0,5,3D near infrared and ultrasound imaging of per...,10.1007/978-3-319-46726-9_45,True
1,12,A Brain-Computer Interface Based on a Few-Chan...,10.1109/ACCESS.2016.2637409,True
2,16,A Compact Continuous non-Invasive Glucose Moni...,10.1109/BIOCAS.2018.8584693,True
3,17,A compact multispectral image capture unit for...,10.1109/I2MTC.2016.7520445,True
4,18,A compact NIR fluorescence imaging system with...,10.1109/ISCAS.2015.7168960,True


In [27]:
# Sum of the `pdf` flags, i.e. how many papers are marked as having a PDF.
df["pdf"].sum()

290