In [3]:
from pprint import pprint
import urllib
from bs4 import BeautifulSoup
from webdriver_manager.firefox import GeckoDriverManager
import time
import sys
import datetime

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

In [4]:
# Given a URL string, opens the URL in a headless Firefox instance and returns a reference to the webdriver object

def obtain_driver(url):
    buttons = []
    driver_options = Options()
    driver_options.headless = True
    
    print("\n\n*****" + str(datetime.datetime.now()) + "*****")
    
    driver = webdriver.Firefox(options=driver_options, executable_path=GeckoDriverManager().install())
    
    # WARNING, THIS IS SPECIFIC TO THIS MACHINE (retrieves the installed uBlock Origin from the Firefox on this computer.) Need a computer-agnostic way.
    driver.install_addon("~/.mozilla/firefox/f4t6w0s5.default-release/extensions/uBlock0@raymondhill.net.xpi", temporary=True)

    driver.get(url)
    
    time.sleep(.5)
    
    return driver

In [10]:
# Clicks on element and waits for a page load. driver == webdriver, element == string (class or css_selector), find_elements_by == integer
# find_elements_by: 0 for css_selector, 1 for class_name

def click_and_wait(driver, element, find_elements_by):
    old_driver = driver.find_element_by_tag_name('html')
    
    WebDriverWait(driver, 10).until(EC.staleness_of(old_driver))
    
    try:
        if find_elements_by == 0:
            button = driver.find_element_by_css_selector(element)
        else:
            button = driver.find_element_by_class_name(element)
            
        driver.execute_script("arguments[0].click()", button)
    except:
        print("No button")

In [6]:
# Given a DOI, uses obtain_driver to open up the search site and search the DOI to open up the paper in the page. Returns the webdriver associated with it.

def search_paper(doi, driver):
    if "www.researchgate.net/search" in driver.current_url:
        search_input = "search-form__input"
        search_button = "search-form__left-button"
    else:
        search_input = "lite-page__header-search-input"
        search_button = "lite-page__header-search-button"
    
    print("\n---" + "New search " + str(datetime.datetime.now()) + "---")
    print("DOI: " + doi)
    
    
    search_field_list = driver.find_element_by_class_name(search_input)
    time.sleep(.25) # added for stability
    search_field = search_field_list
    time.sleep(.25)
    search_field.clear()
    time.sleep(.25)
    search_field.send_keys(doi)
    
    enter_field = driver.find_element_by_class_name(search_button)
    driver.execute_script("arguments[0].click()", enter_field)
  
    click_and_wait(driver, "a[href*='search/data']", 0)

    time.sleep(.5)
    
    if "www.researchgate.net/search" in driver.current_url:
        print("Paper not found (search)")
        
        return None
    else: 
        print("Success")
        return driver

In [7]:
# Given the webdriver, shows more authors. Returns the modified webdriver.

def show_authors(driver): 
    
    time.sleep(1) # Change to 1.5 if too unstable
    
    if not driver:
        print("Exiting: No driver")
        return None
    
    first_start = time.time()
    
    buttons = driver.find_elements_by_class_name("show-more-authors")
    
    print("show_more_authors button: " + str(buttons))
    
    if len(buttons) == 1:
        click_and_wait(driver, "show-more-authors", 1)
#         time.sleep(2)
#         print("click")
#         driver.execute_script("arguments[0].click();", buttons[0])
# #         WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, "show-less-authors")))
#         time.sleep(1)    # Change to 1.5 if too unstable
    
    first_end = time.time()

    print("DONE - " + str(first_end-first_start))

    return driver
    
# driver.quit()


In [8]:
# Given the webdriver, parses its source for author URLs.

def soup_it(driver, author_tokens):
    
#     time.sleep(2)
    if driver is None:
        print("No soup for you")
        return 0
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    thing = soup.find_all('div', {"class": "nova-e-text nova-e-text--size-m nova-e-text--family-sans-serif nova-e-text--spacing-none nova-e-text--color-inherit nova-e-text--clamp nova-v-person-list-item__title"})
    for stuff in thing:
        author_url = stuff.find('a').get("href")
        print(author_url)
        if len(author_tokens) == 1:
            if author_tokens[0] in author_url:
                print(str(author_url) + "<-------------")
        elif len(author_tokens) == 2:
            if author_tokens[0][0] in author_url and author_tokens[1] in author_url:
                print(str(author_url) + "<-------------")

    return 1

In [6]:
# original_file_descriptor = sys.stdout

In [7]:
# sys.stdout = open("./scraping_log.txt", "a")

In [8]:
# sys.stdout = original_file_descriptor

In [9]:
overall_first_time = time.time()

driver = obtain_driver("https://www.researchgate.net/search")
time.sleep(1)


# doi_list = ["10.1021/acs.iecr.5b03509", "10.1007/s10450-014-9639-3", "10.1002/adfm.200500563", "10.1002/adfm.200500561", "10.1002/adfm.200500563"]
# source = search_paper("10.1021/acs.iecr.5b03509", driver)   # 6 authors
# source = search_paper("10.1007/s10450-014-9639-3", driver)  # 9 authors
# source = search_paper("10.1002/adfm.200500561", driver)     # 4 authors
# source = search_paper("10.1002/adfm.200500563")     # doesn't exist
doi_list = ['10.1016/j.carbon.2009.06.046', '10.1016/j.coal.2004.05.002', '10.1016/j.coal.2005.07.003', '10.1016/j.coal.2007.01.005', '10.1016/j.coal.2010.08.013', '10.1016/s0166-5162(02)00078-2', '10.1016/s0375-6742(03)00122-5']

for doi in doi_list:
    soup_it(show_authors(search_paper(doi, driver)), "Gensterblum")
    time.sleep(.8)

driver.quit()

final_time = time.time() - overall_first_time
print(final_time)



*****2019-06-24 11:27:44.134679*****

Checking for linux64 geckodriver:v0.24.0 in cache
Driver found in /home/local/NIST/jfl2/.wdm/geckodriver/v0.24.0/linux64/geckodriver

---New search 2019-06-24 11:27:55.127994---
DOI: 10.1016/j.carbon.2009.06.046
Success
show_more_authors button: [<selenium.webdriver.firefox.webelement.FirefoxWebElement (session="ff24cc89-9f8c-4dec-8139-ac56dcbc0bf8", element="6d3b95a6-16c1-4e7b-b855-a9ed480600c3")>]


TimeoutException: Message: 
