In [1]:
import datetime
import os
import sys
import time
import urllib
from pprint import pprint

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.firefox import GeckoDriverManager

In [2]:
def obtain_driver(url, addon_path=None, headless=False):
    """
    Given a URL string, starts Firefox, installs the uBlock Origin add-on and opens the URL.

    url: string, the page to load once the browser is running.
    addon_path: optional path to the uBlock Origin .xpi file. When omitted, the
        copy inside this machine's Firefox profile is used (machine-specific
        default, kept so existing calls behave as before). A leading "~" is
        expanded to the user's home directory.
    headless: when True, Firefox is started in headless mode. Defaults to
        False, which matches the previous behaviour (visible browser window).

    Returns a reference to the webdriver object. The caller owns it and must
    call driver.quit() when done.

    If installing the add-on or loading the page raises, the browser is shut
    down before the exception is re-raised, so no Firefox process is left behind.
    """

    if addon_path is None:
        # WARNING, THIS IS SPECIFIC TO THIS MACHINE (retrieves the installed uBlock Origin
        # from the Firefox on this computer.) Pass addon_path explicitly on other machines.
        addon_path = "~/.mozilla/firefox/f4t6w0s5.default-release/extensions/uBlock0@raymondhill.net.xpi"

    driver_options = Options()
    if headless:
        driver_options.headless = True

    print("\n\n*****" + str(datetime.datetime.now()) + "*****")

    driver = webdriver.Firefox(options=driver_options, executable_path=GeckoDriverManager().install())

    try:
        # Expand "~" here instead of depending on the browser/driver to do it.
        driver.install_addon(os.path.expanduser(addon_path), temporary=True)

        driver.get(url)
    except BaseException:
        # Cleanup only: the exception is re-raised unchanged after the browser is closed.
        driver.quit()
        raise

    time.sleep(.5)

    return driver

In [3]:
def click_and_wait(driver, element, find_elements_by):
    """ 
    Clicks on element and waits for a page load. 
    
    driver == webdriver, element == string (class or css_selector), find_elements_by == integer
    find_elements_by: 0 for css_selector, anything else for class_name

    If the element cannot be located or clicked, "No button" is printed and the
    function returns right away: nothing was clicked, so there is no page load
    to wait for.

    Returns None. Raises whatever WebDriverWait raises when the click happened
    but the current page is not replaced within 10 seconds.
    """

    # Grab the current <html> element BEFORE clicking. If it were fetched after
    # the click, a fast navigation could hand back the new page's element, which
    # never goes stale, and the wait below would time out.
    old_page = driver.find_element_by_tag_name('html')

    try:
        if find_elements_by == 0:
            button = driver.find_element_by_css_selector(element)
        else:
            button = driver.find_element_by_class_name(element)

        driver.execute_script("arguments[0].click()", button)
    except Exception:
        # Best effort, as before, but without swallowing KeyboardInterrupt/SystemExit.
        print("No button")
        return

    WebDriverWait(driver, 10).until(EC.staleness_of(old_page))

In [4]:
def search_paper(doi, driver):
    """
    Given a DOI and an already-open webdriver, types the DOI into the page's search
    box, submits the search and then clicks through to the paper via click_and_wait.

    doi: string, the DOI to search for.
    driver: webdriver currently showing either the search page or a page with the
        "lite" header search box; the CSS class names used depend on which one it is.

    Returns the same webdriver when the browser ended up on a non-search page,
    or None when it is still on the search page (paper not found).
    """

    if "www.researchgate.net/search" in driver.current_url:
        search_input = "search-form__input"
        search_button = "search-form__left-button"
    else:
        search_input = "lite-page__header-search-input"
        search_button = "lite-page__header-search-button"
    
    print("\n---" + "New search " + str(datetime.datetime.now()) + "---")
    print("DOI: " + doi)
    
    search_field = driver.find_element_by_class_name(search_input)
    time.sleep(.5) # added for stability (previously two .25 s pauses back to back)
    search_field.clear()
    time.sleep(.25)
    search_field.send_keys(doi)
    
    enter_field = driver.find_element_by_class_name(search_button)

    # Take the reference to the current page before clicking; fetching it after
    # the click can return the new page's <html>, which never becomes stale.
    old_page = driver.find_element_by_tag_name('html')
    driver.execute_script("arguments[0].click()", enter_field)
    WebDriverWait(driver, 10).until(EC.staleness_of(old_page))
  
    click_and_wait(driver, "a[href*='search/data']", 0)
    
    # Still on the search page after the click-through attempt means no paper page was opened.
    if "www.researchgate.net/search" in driver.current_url:
        print("Paper not found (search)")
        return None
    else: 
        print("Success")
        return driver

In [5]:
def show_authors(driver): 
    """
    Given the webdriver, clicks the "show-more-authors" element if it is present.

    driver: webdriver showing a paper page, or None/falsy when the search failed.

    Returns the same webdriver object, or None when no driver was given.
    Prints "No button" when the element could not be found or clicked, and the
    time spent on the attempt.
    """
    
    if not driver:
        print("Exiting: No driver")
        return None
    
    first_start = time.time()
    
    try:
        button = driver.find_element_by_class_name("show-more-authors")
        driver.execute_script("arguments[0].click()", button)
    except Exception:
        # Best effort: a page without the button is normal. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit stop the run.
        print("No button")
    
    first_end = time.time()

    print("DONE - " + str(first_end-first_start))

    return driver

In [18]:
def _author_url_matches(author_tokens, author_url):
    """Returns True when author_url contains both the first-name token (or its initial) and the last-name token."""
    # Indexes 0 and 2 are read below; with fewer tokens there is nothing to match on.
    if len(author_tokens) < 3:
        return False

    first_token, last_token = author_tokens[0], author_tokens[2]

    if len(first_token) >= 2 and "." not in first_token:
        # Full first name: it must appear in the URL as written.
        first_matches = first_token in author_url
    elif (len(first_token) == 2 and "." in first_token) or len(first_token) == 1:
        # Initial such as "Y" or "Y.": only the first character is compared.
        first_matches = first_token[0] in author_url
    else:
        first_matches = False

    return first_matches and last_token in author_url


def soup_it(driver, author_tokens, pairing_dict, author_id):
    """
    Given the webdriver, parses its source for author URLs.

    driver: webdriver showing a paper page, or None when the search failed.
    author_tokens: sequence of name tokens; index 0 is compared as the first name
        (or initial) and index 2 as the last name.
    pairing_dict: dict updated in place; pairing_dict[author_id] is set to each
        matching author URL (a later match overwrites an earlier one).
    author_id: key to store the matching URL under.

    Every author URL found is printed; matching ones are marked with an arrow.
    Entries without a link or without an href are skipped.

    Returns 0 when driver is None, otherwise 1.
    """

    # Check for a missing driver first so a failed search does not cost a 1 s pause.
    if driver is None:
        print("No soup for you")
        return 0

    time.sleep(1) # Increased from .5 to 1 for stability

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    title_divs = soup.find_all('div', {"class": "nova-e-text nova-e-text--size-m nova-e-text--family-sans-serif nova-e-text--spacing-none nova-e-text--color-inherit nova-e-text--clamp nova-v-person-list-item__title"})
    for title_div in title_divs:
        link = title_div.find('a')
        author_url = link.get("href") if link is not None else None
        if not author_url:
            # No anchor or no href: nothing to compare or record for this entry.
            continue

        if _author_url_matches(author_tokens, author_url):
            print(str(author_url) + " <-------------")
            pairing_dict[author_id] = author_url
        else:
            print(author_url)

    return 1

In [7]:
# Remember the current stdout stream so it can be put back after output has been
# redirected. Despite the name, this holds the stream object itself, not an
# OS-level file descriptor number.
original_file_descriptor = sys.stdout

In [20]:
# Redirect every subsequent print() to the scraping log (append mode, so earlier
# runs are kept). The encoding is set explicitly, like the other files opened in
# this notebook, so the log does not depend on the platform default.
sys.stdout = open("./scraping_log.txt", "a", encoding="utf8")

In [9]:
# Load the stored author -> papers mapping. Later cells iterate over its keys and
# pass authors_and_papers[author][0] to search_paper as a DOI.
# NOTE(review): eval() executes whatever expression the file contains. Only use it
# on files you produced yourself; if the file holds a plain literal,
# ast.literal_eval would be a safer drop-in — TODO confirm the file format.
with open("./stored_authors/authors_and_papers.txt", encoding="utf8") as papers_file:
    authors_and_papers = eval(papers_file.read())

In [10]:
# Load the stored per-author data; authors_ids[author] is what later cells pass
# to soup_it as its author_tokens argument.
# NOTE(review): eval() executes whatever expression the file contains. Only use it
# on files you produced yourself; if the file holds a plain literal,
# ast.literal_eval would be a safer drop-in — TODO confirm the file format.
with open("./stored_authors/authors_ids.txt", encoding="utf8") as authors_file:
    authors_ids = eval(authors_file.read())

In [11]:
# One entry per known author, with no URL paired yet (None) until the scraping loop fills it in.
authors_urls = dict.fromkeys(authors_and_papers)

In [21]:
# Search each author's first stored paper and record the matching author URL.
driver = obtain_driver("https://www.researchgate.net/search")

try:
    time.sleep(1)

    for author in authors_urls:
        soup_it(show_authors(search_paper(authors_and_papers[author][0], driver)), authors_ids[author], authors_urls, author)
        print(author)
        sys.stdout.flush()
        time.sleep(1) # To avoid captcha
finally:
    # Always close the browser, including when a search raises part-way through
    # the loop; otherwise the Firefox process is left running.
    driver.quit()

WebDriverException: Message: Failed to decode response from marionette


In [19]:
# Persist the author -> URL pairings as a pretty-printed dict. The encoding is
# given explicitly so the file is written the same way the stored_authors files
# in this notebook are read.
with open("./stored_authors/author_url_pairings.txt", "w", encoding="utf8") as pairings_file:
    pprint(authors_urls, stream=pairings_file)

In [15]:
# Put the saved stream back first, then close the log file that stdout was
# redirected to, so buffered log output is written and the handle is released.
redirected_stream = sys.stdout
sys.stdout = original_file_descriptor
if redirected_stream is not original_file_descriptor:
    redirected_stream.close()

In [19]:
# Testing: run the whole pipeline for one known author and one DOI.

driver = obtain_driver("https://www.researchgate.net/search")

try:
    time.sleep(1)

    author = "0000-0001-8155-6489"

    soup_it(show_authors(search_paper("10.1007/s10450-018-9958-x", driver)), authors_ids[author], authors_urls, author)
finally:
    # Close the browser even when the search or parsing raises.
    driver.quit()



*****2019-06-24 15:34:33.362725*****

Checking for linux64 geckodriver:v0.24.0 in cache
Driver found in /home/local/NIST/jfl2/.wdm/geckodriver/v0.24.0/linux64/geckodriver

---New search 2019-06-24 15:34:44.468373---
DOI: 10.1007/s10450-018-9958-x
Success
DONE - 0.043244123458862305
https://www.researchgate.net/profile/Huong_Giang_Nguyen
https://www.researchgate.net/profile/Laura_Espinal
https://www.researchgate.net/scientific-contributions/2133349107_R_D_van_Zee
https://www.researchgate.net/scientific-contributions/11437659_M_Thommes
https://www.researchgate.net/profile/Blaza_Toman
https://www.researchgate.net/profile/M_Sterlin_Hudson <-------------
https://www.researchgate.net/profile/Enzo_Mangano
https://www.researchgate.net/profile/Stefano_Brandani
https://www.researchgate.net/profile/Darren_Broom
https://www.researchgate.net/scientific-contributions/2145380044_M_J_Benham
https://www.researchgate.net/scientific-contributions/57866576_K_Cychosz
https://www.researchgate.net/profile/

In [56]:
# Testing: show the first four author ids (in sorted order), each followed by its paired URL.

for author in sorted(authors_urls)[:4]:
    print(author, authors_urls[author], sep="\n")
    

0000-0001-6082-5862
https://www.researchgate.net/profile/Y_Gensterblum
0000-0001-7586-9841
https://www.researchgate.net/profile/Tao_Li73
0000-0001-7792-4322
https://www.researchgate.net/scientific-contributions/2052162429_Jing_Li
0000-0001-7998-4492
https://www.researchgate.net/profile/Pravas_Deria


In [49]:
# Sanity check: number of author entries in the pairing dict (one per key, paired or not).
print(len(authors_urls))

997


In [21]:
# Spot-check: print the stored entry for a single author key.
# NOTE(review): this key is a 40-character hex string, unlike the ORCID-style keys
# printed by the earlier spot-check; a KeyError here would mean the key is not in
# authors_ids — confirm which id scheme the file uses.
print(authors_ids["040cebbec1f902a6fa063eef81989e4a82ce7515"])

In [None]:
# Testing cell: time a run over several DOIs for one author.

overall_first_time = time.time()

driver = obtain_driver("https://www.researchgate.net/search")

# DOIs used in earlier manual checks, kept for reference:
#   10.1021/acs.iecr.5b03509   (6 authors)
#   10.1007/s10450-014-9639-3  (9 authors)
#   10.1002/adfm.200500561     (4 authors)
#   10.1002/adfm.200500563     (doesn't exist)
doi_list = ['10.1016/j.carbon.2009.06.046', '10.1016/j.coal.2004.05.002', '10.1016/j.coal.2005.07.003', '10.1016/j.coal.2007.01.005', '10.1016/j.coal.2010.08.013', '10.1016/s0166-5162(02)00078-2', '10.1016/s0375-6742(03)00122-5']

# soup_it takes (driver, author_tokens, pairing_dict, author_id); the old call
# passed only a name string and raised TypeError. soup_it reads token 0 as the
# first name/initial and token 2 as the last name.
# NOTE(review): the token values below are an assumption for this author — confirm
# against the matching entry in authors_ids.
test_author_tokens = ["Y", "", "Gensterblum"]
test_pairings = {}  # throwaway dict so this test does not touch authors_urls

try:
    time.sleep(1)

    for doi in doi_list:
        soup_it(show_authors(search_paper(doi, driver)), test_author_tokens, test_pairings, "Gensterblum")
        time.sleep(.8)
finally:
    # Close the browser even when one of the searches raises.
    driver.quit()

final_time = time.time() - overall_first_time
print(final_time)