In [31]:
import ast
import datetime
import os
import random
import sys
import time
import urllib
from pprint import pprint

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from unidecode import unidecode
from webdriver_manager.firefox import GeckoDriverManager

In [20]:
def obtain_driver(url):
    """
    Given a URL string, opens the URL in a new Firefox instance
    (headless mode is currently switched off, see the commented-out option below).

    Parameters:
    url == URL string to load once the browser has started

    Returns a reference to the webdriver object.

    If installing the add-on or loading the URL raises, the browser that was just
    started is closed before the exception is re-raised, so no Firefox process is
    left behind.
    """

    driver_options = Options()
    # driver_options.headless = True

    print("\n\n*****" + str(datetime.datetime.now()) + "*****")

    driver = webdriver.Firefox(options=driver_options, executable_path=GeckoDriverManager().install())

    try:
        # WARNING, THIS IS SPECIFIC TO THIS MACHINE (retrieves the installed uBlock Origin from the Firefox on this computer.) Need a computer-agnostic way.
        # "~" is not expanded when a path is opened, so resolve it to the home directory explicitly.
        addon_path = os.path.expanduser("~/.mozilla/firefox/f4t6w0s5.default-release/extensions/uBlock0@raymondhill.net.xpi")
        driver.install_addon(addon_path, temporary=True)

        driver.get(url)
    except Exception:
        driver.quit()
        raise

    time.sleep(.5)

    return driver

In [21]:
# def compare_tokens(tokens1, tokens2):
#     """Compares two lists of string tokens for equality as used in soup_it."""
    
#     # Iterates through each token
#     for index in range(len(tokens1)):
#         # 
#         if index >= len(tokens2):
#             return False
    
#         comparison1 = tokens1[index]
        
        
#         comparison2 = tokens2[index]
        
#         # Case of initials
#         if "." in tokens1[index] or "." in tokens2:
            
#     return True

In [22]:
def click_and_wait(driver, element, find_elements_by):
    """
    Clicks on element and waits for a page load.

    Parameters:
    driver == webdriver
    element == string to find (class or css_selector)
    find_elements_by == integer: 0 for css_selector, anything else for class_name.

    Returns True once the old page has been replaced, or False if the element
    could not be found/clicked (in which case nothing is waited for).

    Only works on an actual page load (the wait raises a TimeoutException after
    10 seconds if the click only runs dynamic JS on the same webpage).
    """

    # Take the reference to the current page BEFORE clicking. If it is taken after
    # the click and the navigation already happened, it points at the new page,
    # which never goes stale, and the wait below times out.
    old_page = driver.find_element_by_tag_name('html')

    try:
        if find_elements_by == 0:
            button = driver.find_element_by_css_selector(element)
        else:
            button = driver.find_element_by_class_name(element)

        driver.execute_script("arguments[0].click()", button)
    except Exception:
        # Nothing was clicked, so no navigation can follow; waiting would only time out.
        print("No button")
        return False

    WebDriverWait(driver, 10).until(EC.staleness_of(old_page))
    return True

In [23]:
# def search_paper(doi, driver):
#     """
#     Given a DOI and a given driver (on search) page, searches the DOI to open up the paper in the page. 
    
#     Returns the webdriver associated with it.
#     """

#     if "www.researchgate.net/search" in driver.current_url:
#         search_input = "search-form__input"
#         search_button = "search-form__left-button"
#     else:
#         search_input = "lite-page__header-search-input"
#         search_button = "lite-page__header-search-button"
    
#     print("\n---" + "New search " + str(datetime.datetime.now()) + "---")
#     print("DOI: " + doi)
    
    
#     search_field_list = driver.find_element_by_class_name(search_input)
#     time.sleep(.25) # added for stability
#     search_field = search_field_list
#     time.sleep(.25)
#     search_field.clear()
#     time.sleep(.25)
#     search_field.send_keys(doi)
    
#     enter_field = driver.find_element_by_class_name(search_button)
#     driver.execute_script("arguments[0].click()", enter_field)
    
#     old_driver = driver.find_element_by_tag_name('html')
#     WebDriverWait(driver, 10).until(EC.staleness_of(old_driver))
  
#     click_and_wait(driver, "a[href*='search/data']", 0)
    
#     if "www.researchgate.net/search" in driver.current_url:
#         print("Paper not found (search)")
#         return None
#     else: 
#         print("Success")
#         return driver

In [38]:
def search_paper(doi, driver):
    """
    Given a DOI and an existing driver, opens google to search for the paper.

    Parameters:
    doi == DOI string of the paper (leading/trailing whitespace is ignored)
    driver == webdriver that is reused for the search

    Returns the same webdriver if the "I'm Feeling Lucky" click ended on a page
    outside google.com. Returns None if Google showed its captcha ("/sorry/") page,
    if the browser stayed on google.com, or if no new page loaded within 10 seconds.
    """

    # A stray space would otherwise end up inside the quoted search term
    doi = doi.strip()

    print("\n---" + "New search " + str(datetime.datetime.now()) + "---")
    print("DOI: " + doi)

    driver.get("https://www.google.com")

    # Constructs the string being searched, randomizes the term order so it's not so robotic
    terms = ['"' + doi + '"', '"researchgate.net"']
    if random.randint(0, 1) != 0:
        terms.reverse()
    query_string = ' '.join(terms)

    # Enters search string into searchbox. The box is located by its form field name
    # instead of an absolute XPath, which breaks on any change to the page layout.
    search_box = driver.find_element_by_name("q")
    search_box.send_keys(query_string)

    time.sleep(random.randint(2, 6)) # to respect crawling

    # Reference to the current page, taken BEFORE the click: taken afterwards it may
    # already point at the new page, which never goes stale.
    old_page = driver.find_element_by_tag_name('html')

    # Clicks on "I'm Feeling Lucky" button
    button = driver.find_element_by_xpath('//*[@id="gbqfbb"]')
    driver.execute_script("arguments[0].click()", button)

    # Waits for page to load
    try:
        WebDriverWait(driver, 10).until(EC.staleness_of(old_page))
    except TimeoutException:
        print("Paper not found (no page load within 10 seconds)")
        return None
    time.sleep(1) # increased from .5 to 1 to respect crawling

    # Checks if paper was found
    if "/sorry/" in driver.current_url:
        print("Captcha'd")
        # NOTE(review): this selector looks like a ResearchGate one, yet it is evaluated
        # on Google's captcha page - confirm whether this check belongs in the success branch.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        print("Confirming lack of DOI: " + str(soup.select("div.nova-e-text--color-grey-500:nth-child(2) > a:nth-child(1)")))
        return None

    if "google.com" in driver.current_url:
        print("Paper not found")
        return None

    # TODO: compare the DOI shown on the page with the searched DOI to confirm it's the correct paper.
    print("Success: found at " + str(driver.current_url))
    return driver

In [25]:
def show_authors(driver):
    """
    Given the webdriver, clicks the "show-more-authors" element if the page has one.

    Parameters:
    driver == webdriver on the paper's page, or None if the search failed

    Returns the same webdriver, or None if no driver was given.
    """

    if not driver:
        print("Exiting: No driver")
        return None

    first_start = time.time()

    time.sleep(.5) # testing for stability

    try:
        button = driver.find_element_by_class_name("show-more-authors")
        driver.execute_script("arguments[0].click()", button)
    except Exception:
        # Best effort: a page without the button is fine. Catching Exception instead of
        # using a bare except lets Ctrl-C (KeyboardInterrupt) still stop a long scraping run.
        print("No button")

    first_end = time.time()

    print("DONE - " + str(first_end-first_start))

    return driver

In [26]:
def _first_name_matches(first, raw_first, site_first):
    """
    Returns True if the searched first name is compatible with the first word of the
    name shown on the page.

    first == first word of the searched first name (already reduced to "X." when the page shows an initial)
    raw_first == the unsplit first-name token as stored for the searched author
    site_first == first word of the name shown on the page (non-empty)
    """

    # Full name
    full_name = len(first) >= 2 and "." not in first and first.casefold() == site_first.casefold()

    # First initial (slices instead of [0] so an empty first name cannot raise)
    initial = ((len(first) == 2 and "." in first) or len(raw_first) == 1) \
        and first[:1].casefold() == site_first[:1].casefold()

    # No first name: the last name alone decides
    return full_name or initial or first == ""


def soup_it(driver, author_tokens, pairing_dict, author_id):
    """
    Given the webdriver, parses its source for author URLs.

    Parameters:
    driver == webdriver (None if the paper could not be opened)
    author_tokens == list containing the searched author's name in tokens; index 0 is used as the first name(s) and index 2 as the last name(s), each delimited by spaces
    pairing_dict == Dictionary to store the URLs into (for the respective author indicated by author_id)
    author_id == The searched author's id for usage in pairing_dict

    Returns 1 if a listed author matched (pairing_dict[author_id] is the href of the last match),
    0 if none matched (pairing_dict[author_id] is "NOT_FOUND"),
    -1 if driver is None (pairing_dict[author_id] is "NO_PAPER").
    """

    # Checked before sleeping: without a page there is nothing to wait for
    if driver is None:
        print("No soup for you")
        pairing_dict[author_id] = "NO_PAPER"
        return -1

    time.sleep(1.5) # Increased from .5 to 1.5 for stability

    success = 0

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    for entry in soup.find_all('div', {"class": "nova-v-person-list-item__title"}):

        link = entry.find('a')
        if link is None:
            continue # entry without a link carries neither a name nor a URL

        author_name = link.get_text()
        author_url = link.get("href")

        site_tokens = author_name.split()
        if not site_tokens:
            continue # blank name, nothing to compare against

        # Determines if the website's author has ASCII compatible characters or not (tests whole name.
        # Still doesn't address if one half of name uses UTF-8 only characters and the other half doesn't)
        if unidecode(author_name) == author_name:
            author_first = unidecode(author_tokens[0]).split()
            author_last = unidecode(author_tokens[2]).split()
        else:
            author_first = author_tokens[0].split()
            author_last = author_tokens[2].split()

        # Checks first name on researchgate is initial (e.g. if author only has a "scientific-contributions" page), if so then compare first initials
        if author_first and "." in site_tokens[0]:
            author_first[0] = str(author_first[0][0]) + "."

        print(str(author_first) + " " + str(author_last) + " and " + author_name)

        # An empty first name splits to [], so fall back to "" instead of indexing it
        first = author_first[0] if author_first else ""

        # Only the final word of each last name is compared; an empty last name never matches
        last_matches = bool(author_last) and author_last[-1].casefold() == site_tokens[-1].casefold()

        if last_matches and _first_name_matches(first, author_tokens[0], site_tokens[0]):
            print(str(author_url) + " <-------------")
            pairing_dict[author_id] = author_url

            success = 1
        else:
            print(author_url)

    if success == 0:
        pairing_dict[author_id] = "NOT_FOUND"

    return success

In [9]:
# Redirect everything printed from here on into the scraping log.
# The notebook's real stdout is remembered only the first time this cell runs;
# re-running the cell must not replace it with the log file handle, otherwise
# the output could never be restored.
if "original_file_descriptor" not in globals():
    original_file_descriptor = sys.stdout
elif sys.stdout is not original_file_descriptor:
    # Still redirected from an earlier run of this cell: close that handle first
    sys.stdout.close()

# Explicit UTF-8 so author names with non-ASCII characters can always be logged
sys.stdout = open("./scraping_log.txt", "a", encoding="utf8")

In [10]:
# Loads the two stored lookups used by the scraping loop:
#   authors_and_papers: author id -> sequence of DOIs (the loop searches element [0])
#   authors_ids:        author id -> name tokens handed to soup_it
# NOTE(review): eval() executes whatever the files contain, so only run this on files
# you wrote yourself. If they hold plain literals, ast.literal_eval would be the safer
# choice - confirm the file format before switching.
with open("./stored_authors/authors_and_papers.txt", encoding="utf8") as papers_file:
    papers_text = papers_file.read()
authors_and_papers = eval(papers_text)

with open("./stored_authors/authors_ids.txt", encoding="utf8") as authors_file:
    ids_text = authors_file.read()
authors_ids = eval(ids_text)

In [40]:
# Starting point for a run without stored pairings (every author unresolved):
# authors_urls = {}

# for author in authors_and_papers:
#     authors_urls[author] = None

# Resume from the pairings saved by an earlier run. The file is the pprint() of a
# plain dict written further down in this notebook, so it is parsed as a literal
# instead of being executed with eval().
with open("./stored_authors/author_url_pairings.txt", encoding="utf8") as author_pair_file:
    authors_urls = ast.literal_eval(author_pair_file.read())

In [None]:
# Main scraping loop: looks up every author that has no stored result yet.
driver = obtain_driver("https://www.google.com")
time.sleep(2)

try:
    for author in authors_urls:
        # Authors resolved in an earlier run are skipped
        if authors_urls[author] is not None:
            continue

        # Author has no associated paper
        if len(authors_and_papers[author]) == 0:
            print("AUTHOR HAS NO PAPER")
            authors_urls[author] = "AUTHOR_HAS_NO_PAPER"
            # No request was sent, so neither a browser restart nor a crawl delay is needed
            continue

        # Only the author's first DOI is searched
        paper_page = search_paper(authors_and_papers[author][0], driver)
        success = soup_it(show_authors(paper_page), authors_ids[author], authors_urls, author)
        print(author)
        sys.stdout.flush()

        # -1 means the paper page could not be opened: start over with a fresh browser
        if success == -1:
            driver.quit()
            time.sleep(1)
            driver = obtain_driver("https://www.google.com")
            time.sleep(2)

        time.sleep(random.randint(6, 12)) # Increased from 1 to 6-12 to respect crawling
finally:
    # Close the browser even when a lookup raises or the run is interrupted
    driver.quit()

In [42]:
# Saves the pairings as a pretty-printed dict. The file is read back with
# encoding="utf8", so it is written with the same encoding instead of the
# platform default.
with open("./stored_authors/author_url_pairings.txt", "w", encoding="utf8") as dup_file:
    pprint(authors_urls, stream = dup_file)

In [12]:
# Restore the notebook's stdout and close the log file that replaced it, so the
# handle is released and buffered log lines are flushed to disk.
log_stream = sys.stdout
sys.stdout = original_file_descriptor
if log_stream is not original_file_descriptor:
    log_stream.close()

In [48]:
# Number of authors tracked in the pairing dictionary
author_total = len(authors_urls)
print(author_total)

997


In [21]:
# Testing Khaled (multiple words in first name)
# NOTE: soup_it writes its result into authors_urls[author].

driver = obtain_driver("https://www.researchgate.net/search")
try:
    time.sleep(2)

    author = "1d13bd887854f23bc64bab2b7d388d0bbcd1b333"

    soup_it(show_authors(search_paper("10.1016/j.coal.2013.11.009", driver)), authors_ids[author], authors_urls, author)
finally:
    # Close the browser even if the lookup raises
    driver.quit()



*****2019-06-25 16:10:43.533727*****

Checking for linux64 geckodriver:v0.24.0 in cache
Driver found in /home/local/NIST/jfl2/.wdm/geckodriver/v0.24.0/linux64/geckodriver

---New search 2019-06-25 16:10:56.303352---
DOI: 10.1016/j.coal.2013.11.009
Success
No button
DONE - 0.5094668865203857
['Khaled', 'A.'] ['Gasem'] and Pongtorn Chareonsuppanimit
https://www.researchgate.net/profile/Pongtorn_Chareonsuppanimit
['Khaled', 'A.'] ['Gasem'] and Sayeed A. Mohammad
https://www.researchgate.net/scientific-contributions/85514326_Sayeed_A_Mohammad
['Khaled', 'A.'] ['Gasem'] and Robert L. Robinson Jr
https://www.researchgate.net/profile/Robert_Jr2
['Khaled', 'A.'] ['Gasem'] and Khaled Gasem
https://www.researchgate.net/profile/Khaled_Gasem <-------------


In [77]:
# Testing M Hudson
# NOTE: soup_it writes its result into authors_urls[author].

driver = obtain_driver("https://www.researchgate.net/search")
try:
    time.sleep(2)

    author = "0000-0001-8155-6489"

    soup_it(show_authors(search_paper("10.1007/s10450-018-9958-x", driver)), authors_ids[author], authors_urls, author)
finally:
    # Close the browser even if the lookup raises
    driver.quit()



*****2019-06-25 14:13:52.063208*****

Checking for linux64 geckodriver:v0.24.0 in cache
Driver found in /home/local/NIST/jfl2/.wdm/geckodriver/v0.24.0/linux64/geckodriver

---New search 2019-06-25 14:14:04.935616---
DOI: 10.1007/s10450-018-9958-x
Success
DONE - 0.543921947479248
['M'] ['Hudson'] and Huong Giang Nguyen
https://www.researchgate.net/profile/Huong_Giang_Nguyen
['M'] ['Hudson'] and Laura Espinal
https://www.researchgate.net/profile/Laura_Espinal
['M.'] ['Hudson'] and R. D. van Zee
https://www.researchgate.net/scientific-contributions/2133349107_R_D_van_Zee
['M.'] ['Hudson'] and M. Thommes
https://www.researchgate.net/scientific-contributions/11437659_M_Thommes
['M'] ['Hudson'] and Blaza Toman
https://www.researchgate.net/profile/Blaza_Toman
['M.'] ['Hudson'] and M. Sterlin Leo Hudson
https://www.researchgate.net/profile/M_Sterlin_Hudson <-------------
['M'] ['Hudson'] and Enzo Mangano
https://www.researchgate.net/profile/Enzo_Mangano
['M'] ['Hudson'] and Stefano Brandani


In [36]:
# Testing Carlos Otero Arean (last name is two names)
# NOTE: soup_it writes its result into authors_urls[author].

driver = obtain_driver("https://www.researchgate.net/search")
try:
    time.sleep(1)

    author = "0000-0002-2980-7997"

    soup_it(show_authors(search_paper("10.1016/j.cplett.2011.11.054", driver)), authors_ids[author], authors_urls, author)
finally:
    # Close the browser even if the lookup raises
    driver.quit()



*****2019-06-25 13:08:05.523798*****

Checking for linux64 geckodriver:v0.24.0 in cache
Driver found in /home/local/NIST/jfl2/.wdm/geckodriver/v0.24.0/linux64/geckodriver

---New search 2019-06-25 13:08:17.251158---
DOI: 10.1016/j.cplett.2011.11.054
Success
No button
DONE - 0.012266874313354492
['Carlos'] ['Otero', 'Areán'] and Carlos Otero Areán
https://www.researchgate.net/profile/Carlos_Arean <-------------
['Carlos'] ['Otero', 'Arean'] and Carlos Palomino Cabello
https://www.researchgate.net/profile/Carlos_Palomino_Cabello
['Carlos'] ['Otero', 'Arean'] and Gemma Turnes Palomino
https://www.researchgate.net/profile/Gemma_Turnes_Palomino


In [37]:
# Testing Tao li (case matching)
# NOTE: soup_it writes its result into authors_urls[author].

driver = obtain_driver("https://www.researchgate.net/search")
try:
    time.sleep(1)

    author = "fb51c9b3b4998ec6cca5977372fc110212d1a8c6"

    # DOI literal without the stray trailing space it used to carry
    soup_it(show_authors(search_paper("10.1039/C4TA05225K", driver)), authors_ids[author], authors_urls, author)
finally:
    # Close the browser even if the lookup raises
    driver.quit()



*****2019-06-25 13:09:56.553568*****

Checking for linux64 geckodriver:v0.24.0 in cache
Driver found in /home/local/NIST/jfl2/.wdm/geckodriver/v0.24.0/linux64/geckodriver

---New search 2019-06-25 13:10:08.107685---
DOI: 10.1039/C4TA05225K 
Success
DONE - 0.05252337455749512
['Tao'] ['Li'] and surendar reddy Venna
https://www.researchgate.net/profile/Surendar_reddy_Venna
['Tao'] ['Li'] and Michael Lartey
https://www.researchgate.net/profile/Michael_Lartey6
['Tao'] ['Li'] and Tao li
https://www.researchgate.net/profile/Tao_Li73 <-------------
['Tao'] ['Li'] and Alex Spore
https://www.researchgate.net/scientific-contributions/2065445495_Alex_Spore
['Tao'] ['Li'] and Santosh Kumar
https://www.researchgate.net/scientific-contributions/2065452147_Santosh_Kumar
['Tao'] ['Li'] and Hunaid B. Nulwala
https://www.researchgate.net/profile/Hunaid_Nulwala
['Tao'] ['Li'] and David Richard Luebke
https://www.researchgate.net/scientific-contributions/2007995838_David_Richard_Luebke
['Tao'] ['Li'] an

In [38]:
# Testing Froeba (matching special characters)
# NOTE: soup_it writes its result into authors_urls[author].

driver = obtain_driver("https://www.researchgate.net/search")
try:
    time.sleep(1)

    author = "13a7a901aac8693e7b652cdf66ec4eeeace36f3f"

    # DOI literal without the stray trailing space it used to carry
    soup_it(show_authors(search_paper("10.1021/ic201596x", driver)), authors_ids[author], authors_urls, author)
finally:
    # Close the browser even if the lookup raises
    driver.quit()



*****2019-06-25 13:11:08.667032*****

Checking for linux64 geckodriver:v0.24.0 in cache
Driver found in /home/local/NIST/jfl2/.wdm/geckodriver/v0.24.0/linux64/geckodriver

---New search 2019-06-25 13:11:20.819320---
DOI: 10.1021/ic201596x 
Success
No button
DONE - 0.006753444671630859
['Michael'] ['Froba'] and Daniela Frahm
https://www.researchgate.net/scientific-contributions/58927593_Daniela_Frahm
['Michael'] ['Froba'] and Michael Fischer
https://www.researchgate.net/profile/Michael_Fischer3
['Michael'] ['Froba'] and Frank Hoffmann
https://www.researchgate.net/profile/Frank_Hoffmann2
['Michael'] ['Fröba'] and Michael Fröba
https://www.researchgate.net/profile/Michael_Froeba <-------------


In [39]:
# Testing Rebecca Siegelman (scientific-contributions case)
# NOTE: soup_it writes its result into authors_urls[author].

driver = obtain_driver("https://www.researchgate.net/search")
try:
    time.sleep(1)

    author = "0000-0002-4249-6118"

    soup_it(show_authors(search_paper("10.1007/s10450-018-9958-x", driver)), authors_ids[author], authors_urls, author)
finally:
    # Close the browser even if the lookup raises
    driver.quit()



*****2019-06-25 13:11:34.401229*****

Checking for linux64 geckodriver:v0.24.0 in cache
Driver found in /home/local/NIST/jfl2/.wdm/geckodriver/v0.24.0/linux64/geckodriver

---New search 2019-06-25 13:11:45.878041---
DOI: 10.1007/s10450-018-9958-x
Success
DONE - 0.18370318412780762
['Rebecca'] ['Siegelman'] and Huong Giang Nguyen
https://www.researchgate.net/profile/Huong_Giang_Nguyen
['Rebecca'] ['Siegelman'] and Laura Espinal
https://www.researchgate.net/profile/Laura_Espinal
['R.'] ['Siegelman'] and R. D. van Zee
https://www.researchgate.net/scientific-contributions/2133349107_R_D_van_Zee
['R.'] ['Siegelman'] and M. Thommes
https://www.researchgate.net/scientific-contributions/11437659_M_Thommes
['Rebecca'] ['Siegelman'] and Blaza Toman
https://www.researchgate.net/profile/Blaza_Toman
['R.'] ['Siegelman'] and M. Sterlin Leo Hudson
https://www.researchgate.net/profile/M_Sterlin_Hudson
['Rebecca'] ['Siegelman'] and Enzo Mangano
https://www.researchgate.net/profile/Enzo_Mangano
['Rebe

In [40]:
# Testing Francois (special character in database but not on researchgate)
# NOTE: soup_it writes its result into authors_urls[author].

driver = obtain_driver("https://www.researchgate.net/search")
try:
    time.sleep(1)

    author = "00825555e861a984384ccc94310829a3ff525a07"

    soup_it(show_authors(search_paper("10.1039/B822834e", driver)), authors_ids[author], authors_urls, author)
finally:
    # Close the browser even if the lookup raises
    driver.quit()



*****2019-06-25 13:13:30.508967*****

Checking for linux64 geckodriver:v0.24.0 in cache
Driver found in /home/local/NIST/jfl2/.wdm/geckodriver/v0.24.0/linux64/geckodriver

---New search 2019-06-25 13:13:42.081979---
DOI: 10.1039/B822834e
Success
DONE - 0.0194244384765625
['Francois'] ['Henn'] and Sabine Devautour-Vinot
https://www.researchgate.net/scientific-contributions/35338587_Sabine_Devautour-Vinot
['Francois'] ['Henn'] and Guillaume Maurin
https://www.researchgate.net/profile/Guillaume_Maurin
['Francois'] ['Henn'] and Francois Henn
https://www.researchgate.net/profile/Francois_Henn <-------------
['Francois'] ['Henn'] and Christian Serre
https://www.researchgate.net/profile/Christian_Serre
['Francois'] ['Henn'] and Thomas Devic
https://www.researchgate.net/scientific-contributions/39843151_Thomas_Devic
['François'] ['Henn'] and Gérard Férey
https://www.researchgate.net/profile/Gerard_Ferey


In [None]:
# Testing: shows the first four author ids (in sorted order), each followed by its stored value

for author in sorted(authors_urls)[:4]:
    print(author, authors_urls[author], sep="\n")
    

In [None]:
# Number of authors tracked in the pairing dictionary
author_total = len(authors_urls)
print(author_total)

In [None]:
# Shows the stored name tokens of one sample author
sample_author_id = "040cebbec1f902a6fa063eef81989e4a82ce7515"
print(authors_ids[sample_author_id])

In [None]:
# Testing cell: runs several DOIs for one author through the whole pipeline and times it

overall_first_time = time.time()

# doi_list = ["10.1021/acs.iecr.5b03509", "10.1007/s10450-014-9639-3", "10.1002/adfm.200500563", "10.1002/adfm.200500561", "10.1002/adfm.200500563"]
# source = search_paper("10.1021/acs.iecr.5b03509", driver)   # 6 authors
# source = search_paper("10.1007/s10450-014-9639-3", driver)  # 9 authors
# source = search_paper("10.1002/adfm.200500561", driver)     # 4 authors
# source = search_paper("10.1002/adfm.200500563")     # doesn't exist
doi_list = ['10.1016/j.carbon.2009.06.046', '10.1016/j.coal.2004.05.002', '10.1016/j.coal.2005.07.003', '10.1016/j.coal.2007.01.005', '10.1016/j.coal.2010.08.013', '10.1016/s0166-5162(02)00078-2', '10.1016/s0375-6742(03)00122-5']

# soup_it takes (driver, author_tokens, pairing_dict, author_id); the old two-argument
# call raised a TypeError. The author's tokens are looked up by last name
# (soup_it reads the last name from token index 2), and results go into a scratch
# dict so the real authors_urls stays untouched.
# NOTE(review): assumes authors_ids is a dict of author id -> token list - confirm.
test_last_name = "Gensterblum"
test_author_id = next(
    (candidate for candidate, tokens in authors_ids.items() if test_last_name in tokens[2].split()),
    None,
)
test_pairings = {}

if test_author_id is None:
    print("No author with last name " + test_last_name + " in authors_ids")
else:
    driver = obtain_driver("https://www.researchgate.net/search")
    try:
        time.sleep(1)

        for doi in doi_list:
            soup_it(show_authors(search_paper(doi, driver)), authors_ids[test_author_id], test_pairings, test_author_id)
            time.sleep(.8)
    finally:
        # Close the browser even if a lookup raises
        driver.quit()

final_time = time.time() - overall_first_time
print(final_time)