In [7]:
import os
import re
import shutil
import time
import numpy as np
import pandas as pd
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

from progressbar import ProgressBar

In [5]:
# Configurations.
# doi_url_pattern = "https://www.doi.org/{doi:s}"
doi_url_pattern = "https://sci-hub.ru/{doi:s}"
download_path = os.path.join(os.getcwd(), "download")
save_path = os.path.join(os.getcwd(), "pdf")

# Load paper list requiring abstracts.
df = pd.read_csv("./progress.csv")
df = df.loc[:, ["id", "title", "doi", "pdf"]]
print(len(df))
df.head(2)

72


Unnamed: 0,id,title,doi,pdf
0,5,A Method to Evaluate Spectral Analysis by Spec...,10.3390/s22155638,
1,6,A Miniaturized Flexible Functional Near-infrar...,10.1109/ISCAS48785.2022.9937752,


In [8]:
# Start a webdriver instance.
service = Service(executable_path="./chromedriver.exe")
options = webdriver.ChromeOptions()
prefs = {}
prefs["profile.default_content_settings.popups"] = 0
prefs["download.default_directory"] = download_path
options.add_experimental_option("prefs", prefs)

driver = webdriver.Chrome(service=service, options=options)

In [9]:
def download_wait(directory, timeout=30, nfiles=None):
    """
    Wait for downloads to finish with a specified timeout.

    Args
    ----
    directory : str
        The path to the folder where the files will be downloaded.
    timeout : int
        How many seconds to wait until timing out.
    nfiles : int, defaults to None
        If provided, also wait for the expected number of files.

    """
    seconds = 0
    dl_wait = True
    while dl_wait and seconds < timeout:
        time.sleep(1)
        dl_wait = False
        files = os.listdir(directory)
        if nfiles and len(files) != nfiles:
            dl_wait = True

        for fname in files:
            if fname.endswith('.crdownload'):
                dl_wait = True

        seconds += 1
    return seconds

In [14]:
# Iterating through papers.
re_pdf = re.compile(".*/(?P<pdf_name>.*?\.pdf).*")
pbar = ProgressBar(max_value=len(df))

for index, row in df.iterrows():
    
    if row.pdf == True:
        pbar.update(index+1)
        continue
    
    # Check if there is pdf.
    try:
        doi = row.doi
        url = doi_url_pattern.format( doi=doi)
        # print(url, end=" ")
        driver.get(url)

        button = driver.find_element_by_xpath("//div[@id='buttons']//button")
        button.click()

        # Wait until downloaded. 
        download_wait(download_path)

        # Move file. 
        download_href = button.get_attribute("onclick")
        pdf_name = re_pdf.match(download_href).group("pdf_name")
        shutil.move(os.path.join(download_path, pdf_name),
                    os.path.join(save_path, "{}.pdf".format(doi.replace("/", "__"))))
        
        # Record.
        df.loc[index, "pdf"] = True
        
        # print(" pdf downloaded")

    except Exception as e:
        print(index, end=":")
        print(doi, end=" ")
        print(str(e))
    
    # Update progress.
    pbar.update(index+1)
    
    time.sleep(np.random.randint(5, 10))

  0% (0 of 72) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--

0:10.3390/s22155638 'WebDriver' object has no attribute 'find_element_by_xpath'


  2% (2 of 72) |                         | Elapsed Time: 0:00:07 ETA:   0:04:05

1:10.1109/ISCAS48785.2022.9937752 'WebDriver' object has no attribute 'find_element_by_xpath'


  4% (3 of 72) |#                        | Elapsed Time: 0:00:16 ETA:   0:10:21

2:10.1109/ICOIACT55506.2022.9972007 'WebDriver' object has no attribute 'find_element_by_xpath'


  5% (4 of 72) |#                        | Elapsed Time: 0:00:21 ETA:   0:05:40

3:10.3390/electronics11131971 'WebDriver' object has no attribute 'find_element_by_xpath'


  6% (5 of 72) |#                        | Elapsed Time: 0:00:28 ETA:   0:08:45

4:10.3390/s22072749 'WebDriver' object has no attribute 'find_element_by_xpath'


  8% (6 of 72) |##                       | Elapsed Time: 0:00:35 ETA:   0:07:17

5:10.1016/j.compag.2022.106872 'WebDriver' object has no attribute 'find_element_by_xpath'


  9% (7 of 72) |##                       | Elapsed Time: 0:00:45 ETA:   0:10:46

6:10.3390/s22051915 'WebDriver' object has no attribute 'find_element_by_xpath'


 11% (8 of 72) |##                       | Elapsed Time: 0:00:55 ETA:   0:10:13

7:10.1109/UEMCON54665.2022.9965628 'WebDriver' object has no attribute 'find_element_by_xpath'


 12% (9 of 72) |###                      | Elapsed Time: 0:01:04 ETA:   0:09:30

8:10.1109/TBCAS.2022.3149766 'WebDriver' object has no attribute 'find_element_by_xpath'


 13% (10 of 72) |###                     | Elapsed Time: 0:01:10 ETA:   0:06:09

9:10.1016/j.culher.2022.04.003 'WebDriver' object has no attribute 'find_element_by_xpath'


 15% (11 of 72) |###                     | Elapsed Time: 0:01:17 ETA:   0:07:45

10:10.1117/12.2642271 'WebDriver' object has no attribute 'find_element_by_xpath'


 16% (12 of 72) |####                    | Elapsed Time: 0:01:24 ETA:   0:06:56

11:10.1117/12.2638467 'WebDriver' object has no attribute 'find_element_by_xpath'


 18% (13 of 72) |####                    | Elapsed Time: 0:01:33 ETA:   0:08:16

12:10.1007/978-3-030-96641-6\_11 'WebDriver' object has no attribute 'find_element_by_xpath'


 19% (14 of 72) |####                    | Elapsed Time: 0:01:42 ETA:   0:09:15

13:10.1109/MetroAgriFor55389.2022.9964732 'WebDriver' object has no attribute 'find_element_by_xpath'


 20% (15 of 72) |#####                   | Elapsed Time: 0:01:50 ETA:   0:07:10

14:10.3390/s22114010 'WebDriver' object has no attribute 'find_element_by_xpath'


 22% (16 of 72) |#####                   | Elapsed Time: 0:01:55 ETA:   0:05:14

15:10.3390/s22010249 'WebDriver' object has no attribute 'find_element_by_xpath'


 23% (17 of 72) |#####                   | Elapsed Time: 0:02:03 ETA:   0:07:00

16:10.1109/ICHMS56717.2022.9980752 'WebDriver' object has no attribute 'find_element_by_xpath'


 25% (18 of 72) |######                  | Elapsed Time: 0:02:10 ETA:   0:06:42

17:10.1016/j.chemolab.2021.104459 'WebDriver' object has no attribute 'find_element_by_xpath'


KeyboardInterrupt: 

In [25]:
df.to_csv("progress.csv", index=None)

In [27]:
df.pdf.sum()

290