In [46]:
import datetime
import os
import sys
import time
import urllib
from collections import Counter
from pprint import pprint

from bs4 import BeautifulSoup
from unidecode import unidecode
from webdriver_manager.firefox import GeckoDriverManager

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
def obtain_driver(url, addon_path=None):
    """
    Given a URL string, opens the URL in a Firefox instance with an ad-blocker add-on installed.

    The headless option is currently commented out, so a visible browser window is opened.

    url        -- string, page to load once the browser has started
    addon_path -- optional string, path to the .xpi add-on to install temporarily.
                  A leading "~" is expanded. Defaults to the uBlock Origin copy in the
                  Firefox profile of the machine this notebook was written on.

    Returns a reference to the webdriver object.

    If installing the add-on or loading the page fails, the browser is closed
    before the exception is re-raised, so no stray Firefox process is left behind.
    """

    # WARNING, THIS DEFAULT IS SPECIFIC TO THIS MACHINE (retrieves the installed uBlock Origin
    # from the Firefox on this computer.) Pass addon_path explicitly on any other computer.
    if addon_path is None:
        addon_path = "~/.mozilla/firefox/f4t6w0s5.default-release/extensions/uBlock0@raymondhill.net.xpi"

    driver_options = Options()
    # driver_options.headless = True

    print("\n\n*****" + str(datetime.datetime.now()) + "*****")

    driver = webdriver.Firefox(options=driver_options, executable_path=GeckoDriverManager().install())

    try:
        # No shell is involved when the path is handed to the driver, so "~" has to be
        # resolved here; expanduser leaves paths without "~" unchanged.
        driver.install_addon(os.path.expanduser(addon_path), temporary=True)

        driver.get(url)
    except BaseException:
        # The caller never receives the driver in this case, so it cannot quit it itself.
        driver.quit()
        raise

    time.sleep(.5)

    return driver

In [42]:
def common_char_count(string1, string2):
    """
    Counts common characters between two strings.

    A character that occurs several times is counted as often as it occurs
    in BOTH strings, i.e. min(occurrences in string1, occurrences in string2).
    """

    first_counts = Counter(string1)
    second_counts = Counter(string2)

    # Looking up a missing key in a Counter yields 0, so characters that only
    # appear in string1 contribute nothing to the total.
    return sum(min(count, second_counts[char]) for char, count in first_counts.items())

In [None]:
def click_and_wait(driver, element, find_elements_by):
    """
    Clicks on element and waits for a page load.

    driver == webdriver, element == string (class or css_selector), find_elements_by == integer
    find_elements_by: 0 for css_selector, 1 for class_name

    Returns True once the element was clicked and the old page was replaced,
    False (after printing "No button") if the element is not on the page.
    In the latter case nothing is clicked and no wait is performed, since no
    page load was triggered.

    Raises TimeoutException if the page is not replaced within 10 seconds of the click.
    """

    # Take the reference to the current <html> node BEFORE clicking. If it were fetched
    # after the click, a fast navigation could already have swapped in the new page and
    # the staleness wait below would watch the new page instead of the old one.
    old_page = driver.find_element_by_tag_name('html')

    try:
        if find_elements_by == 0:
            button = driver.find_element_by_css_selector(element)
        else:
            button = driver.find_element_by_class_name(element)
    except NoSuchElementException:
        print("No button")
        return False

    # The click is done through JavaScript rather than through button.click().
    driver.execute_script("arguments[0].click()", button)

    WebDriverWait(driver, 10).until(EC.staleness_of(old_page))

    return True

In [None]:
def search_paper(doi, driver):
    """
    Given a DOI and an already opened webdriver, types the DOI into the page's
    search box, submits it and follows the result link to open up the paper.

    The search box / button class names differ between the dedicated search page
    and the header search shown on other pages, so they are picked from the current URL.

    Returns the webdriver if the browser ended up outside the search pages,
    otherwise None (after printing "Paper not found (search)").

    Raises TimeoutException if submitting the search does not load a new page within 10 seconds.
    """

    if "www.researchgate.net/search" in driver.current_url:
        search_input = "search-form__input"
        search_button = "search-form__left-button"
    else:
        search_input = "lite-page__header-search-input"
        search_button = "lite-page__header-search-button"

    print("\n---" + "New search " + str(datetime.datetime.now()) + "---")
    print("DOI: " + doi)

    search_field = driver.find_element_by_class_name(search_input)
    time.sleep(.5) # added for stability
    search_field.clear()
    time.sleep(.25)
    search_field.send_keys(doi)

    enter_field = driver.find_element_by_class_name(search_button)

    # Remember the current <html> node before clicking, so the staleness wait is
    # guaranteed to watch the page that is about to be replaced.
    old_page = driver.find_element_by_tag_name('html')
    driver.execute_script("arguments[0].click()", enter_field)
    WebDriverWait(driver, 10).until(EC.staleness_of(old_page))

    try:
        click_and_wait(driver, "a[href*='search/data']", 0)
    except TimeoutException:
        # No new page appeared after the follow-up click. Do not abort the whole
        # run; the URL check below decides whether a paper page was reached.
        print("No page load after following the search result")

    if "www.researchgate.net/search" in driver.current_url:
        print("Paper not found (search)")
        return None
    else:
        print("Success")
        return driver

In [None]:
def show_authors(driver):
    """
    Given the webdriver, shows more authors by clicking the "show-more-authors"
    element if the page has one.

    Expanding the list is best-effort: a missing button or a failed click is
    reported on stdout and the driver is returned as it is.

    Returns the (same) webdriver, or None if no driver was passed in.
    """

    if driver is None:
        print("Exiting: No driver")
        return None

    first_start = time.time()

    try:
        button = driver.find_element_by_class_name("show-more-authors")
        driver.execute_script("arguments[0].click()", button)
    except NoSuchElementException:
        print("No button")
    except WebDriverException as error:
        # Any other browser-side failure: carry on with the authors that are visible.
        # Non-Selenium exceptions (e.g. KeyboardInterrupt) are deliberately not caught.
        print("Could not expand author list: " + str(error))

    first_end = time.time()

    print("DONE - " + str(first_end-first_start))

    return driver

In [71]:
def soup_it(driver, author_tokens, pairing_dict, author_id):
    """
    Given the webdriver, parses its source for author URLs.

    Every author link found inside a "nova-v-person-list-item__title" div is printed.
    A link whose text matches the wanted author is marked with an arrow and stored in
    pairing_dict[author_id]; if several links match, the last one wins.

    driver        -- webdriver showing a paper page, or None if the search failed
    author_tokens -- name parts of the wanted author; index 0 is used as the first
                     name (or initial) and index 2 as the last name
    pairing_dict  -- dict that is updated in place
    author_id     -- key under which a matching URL is stored

    Returns 0 if driver is None, otherwise 1.
    """

    # Check first, so a failed search does not cost a one second pause.
    if driver is None:
        print("No soup for you")
        return 0

    time.sleep(1) # Increased from .5 to 1 for stability

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    title_divs = soup.find_all('div', {"class": "nova-v-person-list-item__title"})

    for title_div in title_divs:

        link = title_div.find('a')
        if link is None:
            # Title without a link: there is no URL that could be paired.
            continue

        # get_text() also works when the link text is wrapped in nested tags,
        # where .string would be None.
        author_name = link.get_text()
        author_url = link.get("href")

        raw_first = author_tokens[0]
        raw_last = author_tokens[2]

        # Determines if the website's author has ASCII compatible characters or not (tests whole name.
        # Still doesn't address if one half of name uses UTF-8 only characters and the other half doesn't)
        if unidecode(author_name) == author_name:
            author_first = unidecode(raw_first)
            author_last = unidecode(raw_last)
        else:
            author_first = raw_first
            author_last = raw_last

        print(str(author_first) + " " + str(author_last) + " and " + author_name)

        has_full_first_name = len(raw_first) >= 2 and "." not in raw_first
        has_initial_only = len(raw_first) == 1 or (len(raw_first) == 2 and "." in raw_first)

        if has_full_first_name:
            is_match = author_first in author_name and author_last in author_name
        elif has_initial_only:
            # Compare the normalised initial and last name, exactly like the full-name
            # case, so an accented last name can match an ASCII-only name on the page.
            initial = author_first[:1]
            is_match = bool(initial) and initial in author_name and author_last in author_name
        else:
            is_match = False

        if is_match:
            print(str(author_url) + " <-------------")
            pairing_dict[author_id] = author_url
        else:
            print(author_url)

    return 1

In [None]:
# Send everything printed by the following cells to an append-mode log file.
# The guard makes the cell safe to run twice: without it a second run would save the
# log handle itself as "original" and the notebook output could not be restored.
if getattr(sys.stdout, "name", None) != "./scraping_log.txt":
    original_file_descriptor = sys.stdout
    # Author names containing non-ASCII characters are printed, so fix the encoding
    # instead of relying on the platform default.
    sys.stdout = open("./scraping_log.txt", "a", encoding="utf8")

In [72]:
# Load the two lookup tables stored under ./stored_authors. Both are later indexed with
# the same author key: authors_and_papers[author][0] is used as a DOI and
# authors_ids[author] is passed on as the author's name tokens.
# NOTE(review): eval() executes whatever text these files contain as Python code.
# If the files only hold plain literals (dicts / lists / strings), ast.literal_eval
# would be a safe drop-in -- confirm the file contents before switching.
with open("./stored_authors/authors_and_papers.txt", encoding="utf8") as papers_file:
    authors_and_papers = eval(papers_file.read())
with open("./stored_authors/authors_ids.txt", encoding="utf8") as authors_file:
    authors_ids = eval(authors_file.read())

In [None]:
# One entry per author key, initially without a URL (None); the values are
# filled in by the scraping loop.
authors_urls = dict.fromkeys(authors_and_papers)

In [None]:
# Scrape one paper per author and record the matching profile URL in authors_urls.
driver = obtain_driver("https://www.researchgate.net/search")

try:
    time.sleep(1)

    for author in authors_urls:
        papers = authors_and_papers[author]
        if not papers:
            # Nothing to search for; leave this author's URL as None.
            print("No papers listed for " + str(author))
            continue

        # Only the first listed paper of each author is looked up.
        paper_page = search_paper(papers[0], driver)
        soup_it(show_authors(paper_page), authors_ids[author], authors_urls, author)
        print(author)
        sys.stdout.flush()
        time.sleep(1) # To avoid captcha
finally:
    # Close the browser even if a search raised part-way through the batch.
    driver.quit()

In [None]:
# Persist the author -> URL pairings as a pretty-printed Python literal.
with open("./stored_authors/author_url_pairings.txt", "w") as pairings_file:
    pprint(authors_urls, pairings_file)

In [None]:
# Point stdout back at the notebook and close the log file so its buffered
# content is written out and the handle is released.
log_stream = sys.stdout
sys.stdout = original_file_descriptor
if log_stream is not original_file_descriptor:
    log_stream.close()

In [70]:
# Testing Froeba

driver = obtain_driver("https://www.researchgate.net/search")

try:
    time.sleep(1)

    author = "13a7a901aac8693e7b652cdf66ec4eeeace36f3f"

    soup_it(show_authors(search_paper("10.1021/ic201596x ", driver)), authors_ids[author], authors_urls, author)
finally:
    # Close the browser even when the search or the parsing raises.
    driver.quit()



*****2019-06-25 09:35:40.151321*****

Checking for linux64 geckodriver:v0.24.0 in cache
Driver found in /home/local/NIST/jfl2/.wdm/geckodriver/v0.24.0/linux64/geckodriver

---New search 2019-06-25 09:35:50.837872---
DOI: 10.1021/ic201596x 
Success
No button
DONE - 0.023989200592041016
Michael Froba and Daniela Frahm
https://www.researchgate.net/scientific-contributions/58927593_Daniela_Frahm
Michael Froba and Michael Fischer
https://www.researchgate.net/profile/Michael_Fischer3
Michael Froba and Frank Hoffmann
https://www.researchgate.net/profile/Frank_Hoffmann2
Michael Fröba and Michael Fröba
https://www.researchgate.net/profile/Michael_Froeba <-------------


In [62]:
# Testing Froeba

driver = obtain_driver("https://www.researchgate.net/search")

try:
    time.sleep(1)

    author = "13a7a901aac8693e7b652cdf66ec4eeeace36f3f"

    soup_it(show_authors(search_paper("10.1021/ic201596x ", driver)), authors_ids[author], authors_urls, author)
finally:
    # Close the browser even when the search or the parsing raises.
    driver.quit()



*****2019-06-25 09:28:00.318082*****

Checking for linux64 geckodriver:v0.24.0 in cache
Driver found in /home/local/NIST/jfl2/.wdm/geckodriver/v0.24.0/linux64/geckodriver

---New search 2019-06-25 09:28:10.772993---
DOI: 10.1021/ic201596x 
Success
No button
DONE - 0.01657867431640625
Michael Froba and Daniela Frahm
https://www.researchgate.net/scientific-contributions/58927593_Daniela_Frahm
Michael Froba and Michael Fischer
https://www.researchgate.net/profile/Michael_Fischer3
Michael Froba and Frank Hoffmann
https://www.researchgate.net/profile/Frank_Hoffmann2
Michael Fröba and Michael Fröba
https://www.researchgate.net/profile/Michael_Froeba <-------------


In [63]:
# Testing Francois

driver = obtain_driver("https://www.researchgate.net/search")

try:
    time.sleep(1)

    author = "00825555e861a984384ccc94310829a3ff525a07"

    soup_it(show_authors(search_paper("10.1039/B822834e", driver)), authors_ids[author], authors_urls, author)
finally:
    # Close the browser even when the search or the parsing raises.
    driver.quit()



*****2019-06-25 09:29:36.248766*****

Checking for linux64 geckodriver:v0.24.0 in cache
Driver found in /home/local/NIST/jfl2/.wdm/geckodriver/v0.24.0/linux64/geckodriver

---New search 2019-06-25 09:29:46.927861---
DOI: 10.1039/B822834e
Success
DONE - 0.019084453582763672
Francois Henn and Sabine Devautour-Vinot
https://www.researchgate.net/scientific-contributions/35338587_Sabine_Devautour-Vinot
Francois Henn and Guillaume Maurin
https://www.researchgate.net/profile/Guillaume_Maurin
Francois Henn and Francois Henn
https://www.researchgate.net/profile/Francois_Henn <-------------
Francois Henn and Christian Serre
https://www.researchgate.net/profile/Christian_Serre
Francois Henn and Thomas Devic
https://www.researchgate.net/scientific-contributions/39843151_Thomas_Devic
François Henn and Gérard Férey
https://www.researchgate.net/profile/Gerard_Ferey


In [None]:
# Testing: show the first four author keys (in sorted order), each followed by its stored URL

for author in sorted(authors_urls)[:4]:
    print(author, authors_urls[author], sep="\n")
    

In [None]:
# Sanity check: number of author keys tracked in authors_urls.
print(len(authors_urls))

In [None]:
# Spot check: show the authors_ids entry stored for one specific author key.
print(authors_ids["040cebbec1f902a6fa063eef81989e4a82ce7515"])

In [None]:
# Testing cell

overall_first_time = time.time()

driver = obtain_driver("https://www.researchgate.net/search")


# doi_list = ["10.1021/acs.iecr.5b03509", "10.1007/s10450-014-9639-3", "10.1002/adfm.200500563", "10.1002/adfm.200500561", "10.1002/adfm.200500563"]
# source = search_paper("10.1021/acs.iecr.5b03509", driver)   # 6 authors
# source = search_paper("10.1007/s10450-014-9639-3", driver)  # 9 authors
# source = search_paper("10.1002/adfm.200500561", driver)     # 4 authors
# source = search_paper("10.1002/adfm.200500563")     # doesn't exist
doi_list = ['10.1016/j.carbon.2009.06.046', '10.1016/j.coal.2004.05.002', '10.1016/j.coal.2005.07.003', '10.1016/j.coal.2007.01.005', '10.1016/j.coal.2010.08.013', '10.1016/s0166-5162(02)00078-2', '10.1016/s0375-6742(03)00122-5']

# soup_it takes (driver, author_tokens, pairing_dict, author_id). Only the surname is
# known here, so it is used for both the first-name and the last-name slot, which makes
# the match depend on the surname alone. Results go into a scratch dict so the real
# authors_urls table is not touched by this test.
test_tokens = ["Gensterblum", "", "Gensterblum"]
test_pairings = {}

try:
    time.sleep(1)

    for doi in doi_list:
        soup_it(show_authors(search_paper(doi, driver)), test_tokens, test_pairings, "Gensterblum")
        time.sleep(.8)
finally:
    # Close the browser even when one of the searches raises.
    driver.quit()

final_time = time.time() - overall_first_time
print(final_time)