In [1]:
from pprint import pprint
import urllib
import time
import sys
import datetime
import random
from collections import Counter

from bs4 import BeautifulSoup
from unidecode import unidecode
from webdriver_manager.firefox import GeckoDriverManager

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException

In [21]:
def obtain_driver(url):
    """
    Starts a new Firefox instance (geckodriver resolved through
    GeckoDriverManager) and installs the uBlock Origin add-on into it.

    Parameters:
    url == string. Currently unused: the initial driver.get(url) call is
           commented out, so no page is loaded here. The parameter is kept so
           existing callers keep working.

    Returns a reference to the webdriver object.
    """
    # Function-scope import: nothing else in this cell needs os.
    import os

    driver_options = Options()
    # Headless mode is currently switched off; uncomment to run without a window.
#     driver_options.headless = True

    print("\n\n*****" + str(datetime.datetime.now()) + "*****")

    driver = webdriver.Firefox(options=driver_options, executable_path=GeckoDriverManager().install())

    # WARNING, THIS IS SPECIFIC TO THIS MACHINE (retrieves the installed uBlock Origin from the Firefox on this computer.) Need a computer-agnostic way.
    # install_addon receives the path string as-is (no shell is involved), so
    # the leading "~" has to be expanded to the home directory explicitly.
    addon_path = os.path.expanduser(
        "~/.mozilla/firefox/f4t6w0s5.default-release/extensions/uBlock0@raymondhill.net.xpi"
    )
    driver.install_addon(addon_path, temporary=True)

#     driver.get(url)

    # Short pause after start-up before handing the driver back.
    time.sleep(.5)

    return driver

In [3]:
def click_and_wait(driver, element, find_elements_by):
    """
    Clicks on element and waits for a page load.

    Parameters:
    driver == webdriver
    element == string to find (class or css_selector)
    find_elements_by == integer: 0 for css_selector, anything else for class_name.

    Only works on an actual page load (will hang if the click only runs dynamic JS on the same webpage).
    If the element cannot be found or clicked, "No button" is printed and the
    wait still runs, so the call ends in a TimeoutException after 10 seconds.
    """

    # Take the handle to the current <html> element BEFORE clicking. If it is
    # fetched afterwards and the navigation has already happened, the handle
    # belongs to the new page and never becomes stale.
    old_page = driver.find_element_by_tag_name('html')

    try:
        if find_elements_by == 0:
            button = driver.find_element_by_css_selector(element)
        else:
            button = driver.find_element_by_class_name(element)

        # Click through JavaScript rather than through the element's own click().
        driver.execute_script("arguments[0].click()", button)
    except Exception:
        # Best effort, but no longer swallows KeyboardInterrupt / SystemExit.
        print("No button")

    # The old <html> element going stale signals that a new document replaced it.
    WebDriverWait(driver, 10).until(EC.staleness_of(old_page))

In [4]:
def search_paper(doi, driver, engine):
    """
    Given a DOI, an existing driver, and an engine number (0 for Google, 1 for DDG), search for paper.

    Parameters:
    doi == string, the DOI to look up
    driver == webdriver that performs the search (it is navigated away from its current page)
    engine == integer: 0 for Google ("I'm Feeling Lucky"), 1 for DuckDuckGo

    Returns the same webdriver when the browser ended up on a non-Google page,
    or None when it is still on a google.com URL ("Paper not found").

    Raises:
    ValueError if engine is neither 0 nor 1.
    Exception("Captcha'd") if the final URL contains "/sorry/".
    """

    if engine not in (0, 1):
        # Previously an unknown engine failed later on an unassigned local.
        raise ValueError("engine must be 0 (Google) or 1 (DuckDuckGo), got " + repr(engine))

    print("\n---" + "New search " + str(datetime.datetime.now()) + "---")
    print("DOI: " + doi)

    quoted_doi = '"' + doi + '"'

    if engine == 0:
        search_engine = "https://www.google.com"
        # Constructs the string being searched, randomizes it so it's not so robotic
        if random.randint(0, 1) == 0:
            query_string = quoted_doi + ' ' + '"researchgate.net"'
        else:
            query_string = '"researchgate.net"' + ' ' + quoted_doi
        # The original line was a bare expression whose result was thrown
        # away, so the PDF filter never reached the query.
        query_string += " -filetype:pdf"
    else:
        search_engine = "https://duckduckgo.com/"
        # NOTE(review): the leading backslash is presumably DuckDuckGo's
        # "jump to first result" shortcut - confirm.
        query_string = '\\research gate' + ' ' + quoted_doi

    driver.get(search_engine)

    # Enters search string into searchbox
    if engine == 0:
        try:
            search_box = driver.find_element_by_xpath("/html/body/div/div[3]/form/div[2]/div/div[1]/div/div[1]/input")
        except NoSuchElementException: # Tries div[2]
            print("Trying second xpath")
            search_box = driver.find_element_by_xpath("/html/body/div/div[3]/form/div[2]/div/div[1]/div/div[2]/input")
    else:
        search_box = driver.find_element_by_xpath('//*[@id="search_form_input_homepage"]')

    search_box.send_keys(query_string)

    time.sleep(random.randint(2, 7)) # to respect crawling

    if engine == 0:
        # Clicks on "I'm Feeling Lucky" button
        button = driver.find_element_by_xpath('//*[@id="gbqfbb"]')
        driver.execute_script("arguments[0].click()", button)
    else:
        search_box.send_keys(Keys.RETURN)

    # Waits for page to load
    # NOTE(review): the <html> handle is taken after the click, so a very fast
    # navigation could leave this waiting on the new page. Left unchanged
    # because the timing of the search engine's redirect chain is not
    # verifiable from this notebook.
    old_page = driver.find_element_by_tag_name('html')
    WebDriverWait(driver, 10).until(EC.staleness_of(old_page))
    time.sleep(2) # increased from .5 to 2 to respect crawling

    # Checks if paper was found
    landed_url = driver.current_url
    if "/sorry/" in landed_url:
        raise Exception("Captcha'd")
    if "google.com" in landed_url:
        # NOTE(review): only Google is recognised as "still on the search
        # engine"; a failed DuckDuckGo lookup is reported as a success.
        print("Paper not found")
        return None

    # Prints the landing URL. Need to compare it to the actual doi to see if it's the correct paper.
    print("Success: found at " + str(landed_url))
    return driver

In [5]:
def show_authors(driver):
    """
    Given the webdriver, clicks the link that shows more authors (if present).

    Parameters:
    driver == webdriver currently on the paper's page, or a falsy value
              (e.g. None when the paper search failed)

    Returns the same webdriver, or None when no driver was given.
    """

    if not driver:
        print("Exiting: No driver")
        return None

    started_at = time.time()

    time.sleep(2) # Increase from .5 to 2 for stability

    try:
        button = driver.find_element_by_xpath("/html/body/div[2]/main/section/section[1]/div[2]/a")
        driver.execute_script("arguments[0].click()", button)
    except Exception:
        # Best effort: the page is used as-is when the link is absent or not
        # clickable. Unlike a bare "except:", this lets KeyboardInterrupt and
        # SystemExit through so a long scrape can still be stopped.
        print("No button")

    time.sleep(1) # Increased to 1 for stability

    # Elapsed time for this step, including the two sleeps above.
    print("DONE - " + str(time.time() - started_at))

    return driver

In [6]:
def remove_chars(string):
    """Returns string with every hyphen and space character deleted."""

    # Three-argument maketrans: characters in the last argument map to None.
    drop_table = str.maketrans("", "", "- ")
    return string.translate(drop_table)

In [7]:
def common_chars(string1, string2):
    """
    Counts the characters two strings share, ignoring case.

    A character that occurs several times is counted as often as it occurs
    in both strings (the smaller of the two occurrence counts).
    """

    first_counts = Counter(string1.casefold())
    second_counts = Counter(string2.casefold())

    shared = 0
    for char, occurrences in first_counts.items():
        # Missing characters count as 0 in a Counter, so they add nothing.
        shared += min(occurrences, second_counts[char])
    return shared

In [8]:
def compare_names(query_name, rg_name):
    """
    Compare names while agnostic to spaces, hyphens, case and character order.

    Returns True when every character of query_name (after cleaning) is also
    available in rg_name, counting repeats - i.e. rg_name may contain extra
    characters. Returns False when exactly one of the two names is empty.
    """

    # Exactly one of the two being empty can never be a match
    if (query_name == "") != (rg_name == ""):
        return False

    # Removes spaces and - from names
    query_clean = remove_chars(query_name)
    rg_clean = remove_chars(rg_name)

    # The query cannot fit into a shorter ResearchGate name
    if len(query_clean) > len(rg_clean):
        return False

    # Every query character must be matched by a character of the other name
    return common_chars(query_clean, rg_clean) == len(query_clean)

In [9]:
def initial_check(name):
    """Tells whether name looks like an initial: a single character, or anything containing a period."""

    if len(name) == 1:
        return True
    return "." in name
# len(name) == 2 and

In [10]:
def compare_authors(query_author, rg_author):
    """
    Compares a queried author against a ResearchGate author name.

    Parameters:
    query_author : list of name parts for the queried author. Index 0 is read
                   as the first name and index 2 as the last name; index 1 is
                   not used by the comparison.
    rg_author    : string of researchgate author's full name

    Returns True when the first names are compatible according to
    compare_names (initials are used when either side is written with
    initials) and the last names are equal ignoring case. Returns False for an
    empty rg_author or an empty queried last name. When the queried first name
    is empty, only the last names are compared.
    """

    # Splits rg_author into tokens; an empty name can never match
    rg_tokens = rg_author.split()
    if not rg_tokens:
        return False

    # Checks if rg_author has any special non-ASCII characters. Translates query_author based on that and sets the author's first and last name strings.
    # Still doesn't address if one half of name uses UTF-8 only characters and the other half doesn't) but unlikely case
    if unidecode(rg_author) == rg_author:
        author_first = unidecode(query_author[0]).split()
        author_last = unidecode(query_author[2]).split()
    else:
        author_first = query_author[0].split()
        author_last = query_author[2].split()

    # Removes Jr from last name, unless "Jr" is the whole name. The length
    # check now guards both spellings; before, "and" bound tighter than "or",
    # so a lone "Jr." was popped and the later indexing failed on an empty list.
    if len(rg_tokens) > 1 and rg_tokens[-1] in ("Jr", "Jr."):
        rg_tokens.pop(-1)

    # Without a queried last name there is nothing reliable to compare
    if not author_last:
        return False

    # Assigns last name to last part of last name
    author_last = author_last[-1]
    rg_last = rg_tokens[-1]
    last_names_equal = author_last.casefold() == rg_last.casefold()

    # Deals with no first_name in query_author: only the last name can decide.
    # (The previous test compared the token list with "" and never fired, so
    # such input ran into an IndexError further down.)
    if not author_first:
        return last_names_equal

    # Incase rg_author uses first name initial, reduces every queried first-name token to its initial
    if initial_check(rg_tokens[0]):
        author_first = [name[0] + "." for name in author_first]

    # Merges first name for queried author
    merged_author_first = "".join(author_first)

    # True when the queried first name is (or has just been turned into) an initial
    query_uses_initial = initial_check(author_first[0])

    # Merges all but last name for researchgate name
    rg_first_parts = []
    for name in rg_tokens[:-1]:

        # Adds periods to initials in the name
        if len(name) == 1:
            name = name + "."

        # Converts name to an initial if the queried author is given as an initial
        if query_uses_initial:
            name = name[0] + "."

        rg_first_parts.append(name)
    merged_rg_first = "".join(rg_first_parts)

    return compare_names(merged_author_first, merged_rg_first) and last_names_equal

In [11]:
# a = ['Dong', 'Ok', 'Kim']
# b = "Dong Ok Kim"
# c = "Dong Wook Kim"
# compare_authors(a, b)

In [12]:
def soup_it(driver, author_tokens, pairing_dict, author_id):
    """
    Given the webdriver, parses its source for author URLs.

    Parameters:
    driver == webdriver, or None when no paper page was reached
    author_tokens == list containing the searched author's name in tokens (passed on to compare_authors)
    pairing_dict == Dictionary to store the result into (for the respective author indicated by author_id)
    author_id == The searched author's id for usage in pairing_dict

    Stores one of the following in pairing_dict[author_id]:
    "NO_PAPER"    driver was None
    "NOT_FOUND2"  no listed author matched
    "Duplicates"  more than one listed author matched
    the author's URL when exactly one listed author matched

    Returns -1 when driver is None, otherwise the number of matching authors.
    """

    time.sleep(1) # Increased to 1 for stability

    if driver is None:
        print("No soup for you: NO_PAPER")
        pairing_dict[author_id] = "NO_PAPER"
        return -1

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    author_items = soup.find_all('div', {"class": "nova-v-person-list-item__title"})

    print("Searching for: " + str(author_tokens))

    matched_urls = []

    for item in author_items:
        link = item.find('a')
        if link is None:
            # Entry without a link: nothing to compare or store
            continue

        # .string is None when the link holds nested tags; fall back to its text
        author_name = link.string
        if author_name is None:
            author_name = link.get_text()
        if not author_name.strip():
            continue

        author_url = link.get("href")

        if compare_authors(author_tokens, author_name):
            print(str(author_url) + " <---------------- " + author_name)
            matched_urls.append(author_url)
        else:
            # str() because href may be absent (None)
            print(str(author_url) + " ~ " + author_name)

    match_count = len(matched_urls)
    if match_count == 0:
        answer = "NOT_FOUND2"
    elif match_count > 1:
        answer = "Duplicates"
    else:
        answer = matched_urls[0]

    print("Writing: " + str(answer))
    pairing_dict[author_id] = answer

    return match_count

In [13]:
# Send everything printed from here on to an append-mode log file.
# The cell is safe to run more than once: the notebook's own stdout is
# remembered only the first time, and no second log handle is opened while the
# redirect is already active.
if "original_file_descriptor" not in globals():
    original_file_descriptor = sys.stdout
if sys.stdout is original_file_descriptor:
    # utf8 to match how the other files in this notebook are read
    sys.stdout = open("./scraping_log.txt", "a", encoding="utf8")

In [14]:
# Written from pairing_unknown_authors
# Loads the two lookup tables the scraping loop reads for each author:
#   authors_and_papers[author] - indexed and len()-checked by the loop; its
#                                first item is passed to search_paper as the DOI
#   authors_ids[author]        - its first three items are passed to soup_it
#                                as the author's name tokens
# NOTE(review): eval() executes whatever Python expression the file holds, so
# these files must be trusted. If they only ever contain plain literals,
# ast.literal_eval would be a safer reader - confirm the file format first.
with open("./stored_authors/authors_and_papers.txt", encoding="utf8") as papers_file:
    authors_and_papers = eval(papers_file.read())
with open("./stored_authors/authors_ids.txt", encoding="utf8") as authors_file:
    authors_ids = eval(authors_file.read())

In [17]:
# Loads the author -> URL pairings saved by an earlier run. On a first run the
# file does not exist yet (it is only written by the saving cell further down),
# so start from an empty dictionary instead of failing.
# NOTE(review): eval() executes whatever Python expression the file holds, so
# the file must be trusted.
try:
    with open("./stored_authors/author_url_pairings.txt", encoding="utf8") as author_pair_file:
        authors_urls = eval(author_pair_file.read())
except FileNotFoundError:
    authors_urls = {}

# Any new entries not already saved in author_url_pairings.txt start out unpaired (None)
for author in authors_and_papers:
    authors_urls.setdefault(author, None)

In [31]:
search_engine = "https://www.google.com"  # i.e. 0
# search_engine = "https://duckduckgo.com/" # i.e. 1

# Engine number as expected by search_paper
if search_engine == "https://www.google.com":
    engine_number = 0
elif search_engine == "https://duckduckgo.com/":
    engine_number = 1
else:
    # Fail here rather than with an undefined engine_number inside the loop
    raise ValueError("Unsupported search engine: " + search_engine)

# Only authors whose stored value equals replace_flag are (re)processed.
# replace_flag = "NOT_FOUND"
replace_flag = None


def _restart_driver(old_driver, url):
    """Quits old_driver and returns a fresh driver from obtain_driver, pausing around the swap."""
    old_driver.quit()
    time.sleep(1)
    new_driver = obtain_driver(url)
    time.sleep(2)
    return new_driver


driver = obtain_driver(search_engine)
time.sleep(2)

try:
    for author in authors_urls:
        if authors_urls[author] != replace_flag:
            continue

        # Author has no associated paper: nothing is requested from the web,
        # so neither a browser restart nor a crawl delay is needed.
        papers = authors_and_papers.get(author)
        if not papers:
            print("\n\nAUTHOR HAS NO PAPER")
            authors_urls[author] = "AUTHOR_HAS_NO_PAPER_IN_DATABASE"
            continue

        # Only the author's first paper is searched.
        paper_page = show_authors(search_paper(papers[0], driver, engine_number))
        success = soup_it(paper_page, authors_ids[author][:3], authors_urls, author)
        print(author)
        sys.stdout.flush()

        # -1: no paper page was reached, 0: no listed author matched.
        # Either way, continue with a fresh browser.
        if success in (-1, 0):
            driver = _restart_driver(driver, search_engine)

        time.sleep(random.randint(6, 12)) # Increased from 1 to 6-12 to respect crawling
finally:
    # Runs on errors as well (e.g. the "Captcha'd" exception from search_paper),
    # so the browser is not left running and the log is flushed.
    sys.stdout.flush()
    driver.quit()



*****2019-07-15 09:59:38.427325*****

Checking for linux64 geckodriver:v0.24.0 in cache
Driver found in /home/local/NIST/jfl2/.wdm/geckodriver/v0.24.0/linux64/geckodriver
TEST1
TEST2
TEST4

---New search 2019-07-15 09:59:44.820607---
DOI: 10.1016/j.memsci.2011.07.048
Success: found at https://www.researchgate.net/publication/229313266_Characterization_and_gas_transport_properties_of_MOF-5_membranes
No button
DONE - 3.0114729404449463
Searching for: ['Y.', 'S.', 'Lin']
https://www.researchgate.net/profile/Zhenxia_Zhao ~ Zhenxia Zhao
https://www.researchgate.net/scientific-contributions/82691542_Xiaoli_Ma ~ Xiaoli Ma
https://www.researchgate.net/profile/Zhong_Li ~ Zhong Li
https://www.researchgate.net/profile/Jerry_Lin8 <---------------- Jerry Y S Lin
Writing: https://www.researchgate.net/profile/Jerry_Lin8
0000-0001-5905-8336
TEST4

---New search 2019-07-15 10:00:04.518290---
DOI: 10.1016/j.micromeso.2009.01.016
Success: found at https://www.researchgate.net/publication/229331191_Atom

In [18]:
# Saves the pairings so a later run can resume from them. Written as utf8
# because the same file is read back with encoding="utf8" when it is loaded.
with open("./stored_authors/author_url_pairings.txt", "w", encoding="utf8") as dup_file:
    pprint(authors_urls, stream = dup_file)

In [19]:
# Point print() back at the notebook and close the log file that stdout was
# redirected to, so buffered output is written out and the handle is released.
log_stream = sys.stdout
sys.stdout = original_file_descriptor
if log_stream is not original_file_descriptor:
    log_stream.close()

In [48]:
# Number of authors tracked in authors_urls, whatever their stored value is
print(len(authors_urls))

997


In [22]:
# # Testing Dong Kim (Wook vs Ok) duplicates

# driver = obtain_driver("https://www.google.com")
# time.sleep(2)

# author = "dee98b81cb9252b78dc4cbe137a1e1983a19f047"

# soup_it(show_authors(search_paper("10.1016/j.jssc.2012.08.046", driver, 0)), authors_ids[author], authors_urls, author)

# driver.quit()



*****2019-07-01 09:24:24.254224*****

Checking for linux64 geckodriver:v0.24.0 in cache
Driver found in /home/local/NIST/jfl2/.wdm/geckodriver/v0.24.0/linux64/geckodriver

---New search 2019-07-01 09:24:31.453188---
DOI: 10.1016/j.jssc.2012.08.046
Success: found at https://www.researchgate.net/publication/256798038_Synthesis_of_MOF_having_hydroxyl_functional_side_groups_and_optimization_of_activation_process_for_the_maximization_of_its_BET_surface_area
No button
DONE - 2.0117602348327637
Searching for: ['Dong', 'Ok', 'Kim']
https://www.researchgate.net/scientific-contributions/2030683750_Jongsik_Kim ~ Jongsik Kim
https://www.researchgate.net/profile/Dong_Kim30 <---------------- Dong Ok Kim
https://www.researchgate.net/scientific-contributions/2153142096_Dong_Wook_Kim <---------------- Dong Wook Kim
https://www.researchgate.net/scientific-contributions/2030803930_Kil_Sagong ~ Kil Sagong
Writing: Duplicates
