In [16]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import time
import pandas as pd
import numpy as np
import regex as re
import requests
import os
import sys
from io import StringIO
import pymongo
import urllib

In [17]:
def extract_components_table():
    '''
    extract components table on the right side.
    '''
    names_cols = ['Component', 'Name', 'Formula', 'Mol weight', 'Structure']
    components_first_row = driver.find_element(By.XPATH, '//*[@id="dscomp_pane"]/table/tr[1]').text
    components_not_first_row = driver.find_element(By.XPATH, '//*[@id="dscomp_pane"]/table').text[len(components_first_row):]
    num_rows = len(re.findall('\n', components_not_first_row))
    num_img = len(re.findall("\n", driver.find_element(By.XPATH,"//*[@id='dscomp_pane']/table").text))


    components_df = pd.DataFrame( np.nan, index=2, columns = names_cols)
    
    
    df = pd.read_csv(StringIO(components_first_row), sep=' +', header=None)    
    df.columns = names_cols    
    
    return df

In [18]:
def extract_reference():
    '''
    return title and reference of data.
    '''
    return driver.find_element(By.ID, "refview").text.split('\n')[1:]  

In [171]:
def get_smiles(chemical_name):
    '''
    This function require "requests" package to run. 
    :param chemical_name: chemical name, alphanumeric,.
    
    '''
    smiles = []
    for i in re.split(" ", chemical_name, 1):
        r = requests.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/%s/property/CanonicalSMILES/txt"%i)
        
        if r.status_code == 200:
            smiles.append(r.content.decode('ascii').strip())
        else:
            try:
                smile = get_smile_from_google(i)
                smiles.append(smile)
            except:
                smiles.append("NA")
                print("SMILES searching failed")
    return '~'.join(smiles)

In [211]:
def get_smile_from_google(chemical_name):
    """
    returns SMILES structure by using the first pubchem search result on Google.
    Once on pubchem, it clicks the parent link and extract Smiles.
    input chemical_name: string
    """
    
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    driver = webdriver.Chrome()
    url = "https://www.google.com"
    driver.get(url)
    
    ## Pull up search results by key_words.
    search_box = driver.find_element(By.NAME, 'q')
    search_box.clear()
    search_box.send_keys(chemical_name)
    search_box.send_keys(Keys.RETURN)
    
    # find and navigate to pubchem website
    result = driver.find_elements_by_xpath("//a[@href]")
    pubchem_url = [ i.get_attribute("href") for i in result if 'pubchem' in i.get_attribute("href") ][0]
    driver.get(pubchem_url)
    
    ## Get parent compound link from pubchem webpage.
    time.sleep(2)
    result = driver.find_elements_by_xpath("//a[@href]")
    parent_link = [ i.get_attribute("href") for i in result if i.get_attribute("data-label") == "Content Link: Parent-Compound"][0]
    
    driver.get(parent_link)
    time.sleep(1)
    result = driver.find_element_by_xpath("//section[@id='Canonical-SMILES']/div[@class='section-content']/div[@class='section-content-item']/p")
    smile = result.text
    
    driver.close()
    #print(smile)
    return smile

In [212]:

get_smile_from_google("bis(trifluoromethanesulfonyl)imide")


'CC[S+](CC)CC'

In [98]:
def extract_component_table( ):
    '''
    requires: get_smiles function.
    
    return component table as dataframe, and save structure images to images folder in current folder.
    '''
    if not os.path.exists('./images'):
        os.mkdir("./images")
        
    names_cols = ['Component', 'Name', 'Formula', 'Mol weight', 'SMILES', 'Structure']
    ## Number of components. use for image saving loop.
    num_rows = len(re.findall("\n", driver.find_element(By.XPATH, "//*[@id='dscomp_pane']/table").text))
    component_matrix = []

    for i in range(num_rows):
        component_row = []
        for j in range(4):
            component_row.append(driver.find_element( By.XPATH, "//*[@id='dscomp_pane']/table/tr[{}]/td[{}]".format(i+2, j+1)).text)


        img_url = driver.find_element(By.XPATH, "//*[@id='dscomp_pane']/table/tr[{}]/td[5]/input".format(i+2)).get_attribute("src")
        img_name = re.split('=',img_url)[-1]

        ## Avoiding save if exists duplicated file.
        if not os.path.exists('./images/{}.jpeg'.format(img_name)):
            img = requests.get(img_url, headers={'user-agent':'Chrome/16.0'}, timeout=30)

            with open('./images/{}.jpeg'.format(img_name), 'wb') as imgf:
                imgf.write(img.content)
                
        component_row.append(get_smiles(component_row[1] ))
        component_row.append(img_url)
        component_matrix.append(component_row)
    
    df = pd.DataFrame(component_matrix, columns=names_cols)
    
    return df

In [99]:
def extract_data_table():
    '''
    Extract experimental data table on the right side.
    '''
    data_first_row = driver.find_element(By.XPATH, '//*[@id="dsdata_pane"]/table[2]/tr[1]').text
    data_not_first_row = driver.find_element(By.XPATH, '//*[@id="dsdata_pane"]/table[2]').text[len(data_first_row):]
    data_not_first_row = re.sub(" +± +", "±", data_not_first_row)
    df = pd.read_csv(StringIO(data_not_first_row), sep=' +', header=None, engine='python')
    num_cols = df.shape[1]

    names_cols = []
    for i in range(num_cols):
        names_cols.append(driver.find_element(By.XPATH, '//*[@id="dsdata_pane"]/table[2]/tr[1]/th[{}]'.format(i+1)).text)
    df.columns = names_cols
    
    return df

In [100]:
# Set up mongodb connection:
client = pymongo.MongoClient("mongodb://127.0.0.1:27017")
db = client.JiFeng
con = db.ILthermo

In [101]:
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
driver = webdriver.Chrome()
url = "https://ilthermo.boulder.nist.gov"
driver.get(url)
IL_name = "a"

In [117]:



## Pull up search results by key_words.
search_box = driver.find_element(By.ID, 'sbutton_label')
search_box.click()

IL_search_box = driver.find_element(By.ID, "cmp")
IL_search_box.send_keys(IL_name)

IL_search_button = driver.find_element(By.XPATH, "/html/body/div[4]/div[2]/div[2]/span[1]")
IL_search_button.click()

time.sleep(30)
num_pages = int(driver.find_element(By.XPATH, '//*[@id="dsgrid"]/div[4]/div/div[2]/span[2]/span[5]').text)


## Saving left table.



In [116]:
driver.maximize_window()
split_bar_id = "gridholder_splitter"
split_bar = driver.find_element(By.ID, split_bar_id)
time.sleep(2)
action = ActionChains(driver)
action.drag_and_drop_by_offset(split_bar, -500, 0).perform()



NoSuchWindowException: Message: no such window: window was already closed
  (Session info: chrome=102.0.5005.115)


In [121]:
if __name__ == "__main__":

## Calculate number of pages would be looping through
    num_pages = int(driver.find_element(By.XPATH, '//*[@id="dsgrid"]/div[4]/div/div[2]/span[2]/span[5]').text)
    #count = 0

    ## Looping each row in the left table.
    #next_page_locator = [2,3,4,5,6] + list(np.repeat(6, num_pages)) ## from beginning
    next_page_locator = list(np.repeat(6, num_pages)) ## continue

    for page in range(num_pages):

        ## Find all rows.
        IL_rows = driver.find_elements(By.XPATH, '//*[contains(@id, "dsgrid-row-")]')

        for i, IL_row in enumerate(IL_rows):
            time.sleep(.15)
            try:
                IL_row.click()
            except:
                print("IL_row failed.")
            time.sleep(.5)
            try:
                myElem = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.ID, 'refview')))
                print ("Page is ready!")
            except:
                print ("Loading took too much time!")
                
            ## Get info from left table
            t = re.sub("\) ", ")\n", IL_row.text).split('\n')
            ## Get title and reference.
            title, reference = extract_reference()
            ## extract basic information for each component:
            comp_table = extract_component_table().to_dict(orient = 'records')
            #print(comp_table)
            ## Make sure list contains three elements.
            if len(comp_table) == 1:
                comp_table.append({'Name':'None'})
                comp_table.append({'Name':'None'})
            elif len(comp_table) == 2:
                comp_table.append({'Name':'None'})
                
            ## Extract Data table:
            data_table = extract_data_table().to_dict(orient = 'records')
            
            
            ## define post array.        
            post_array = {"_id": count, "ref": t[0], "property": t[1], "phase": t[2], "datapoints": t[3],
                         "title": title, "reference": reference, 
                          "component 1": comp_table[0],
                          "component 2": comp_table[1],
                          "component 3": comp_table[2],
                          "data": data_table}
            try:
                con.insert_one(post_array)
            except:
                print("ID duplicated.")
            count += 1
            #print(count)
            #print(extract_component_table())



        next_page_button = driver.find_element(By.XPATH, '//*[@id="dsgrid"]/div[4]/div/div[2]/span[2]/span[{}]'.format(next_page_locator[page]))
        next_page_button.click()

Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
IL_row failed.
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is ready!
Page is re

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="dsgrid"]/div[4]/div/div[2]/span[2]/span[6]"}
  (Session info: chrome=102.0.5005.115)


In [90]:
count=count-5

In [120]:
#num_pages = int(driver.find_element(By.XPATH, '//*[@id="dsgrid"]/div[4]/div/div[2]/span[2]/span[5]').text)

for i in range(4502):
    next_page_locator = list(np.repeat(6, num_pages))
    next_page_button = driver.find_element(By.XPATH, '//*[@id="dsgrid"]/div[4]/div/div[2]/span[2]/span[{}]'.format(next_page_locator[i]))
    next_page_button.click()

In [118]:
count

45023

In [107]:
# import the random module, which includes functions related to generating pseudo-random numbers
import random

# generate two pseudo-random numbers between 1 and 10 inclusive
rand1 = random.randint(1,10)
rand2 = random.randint(1,10)

# ask the user to guess the numbers.
guess1 = input("Guess a number between 1 and 10, inclusive: ")
guess2 = input("Guess another number between 1 and 10, inclusive: ")

# check whether the user guessed correctly
if guess1.isnumeric() and int(guess1) > 0 and int(guess1) < 11:
    if guess2.isnumeric() and int(guess2) > 0 and int(guess2) < 11:
        if int(guess1) == rand1 and int(guess2)  == rand2:
            print("Right!")
        if int(guess1) != rand1 or int(guess2) != rand2:
            print("Totally wrong!")
        elif int(guess1) == rand1 or int(guess2) == rand2:
            print("Partially right!")
    else:
        print("Bad second number!")
else:
    print("Bad first number!")
print("Done!")

KeyboardInterrupt: Interrupted by user

In [70]:
extract_component_table()

Unnamed: 0,Component,Name,Formula,Mol weight,SMILES,Structure
0,1,methanol,CH4O,32.04,CO,https://ilthermo.boulder.nist.gov/ILT2/ilimage...
1,2,1-ethyl-3-methylimidazolium ethyl sulfate,C8H16N2O4S,236.29,CCN1C=C[N+](=C1)C~CCOS(=O)(=O)O\nCCOS(=O)(=O)[O-],https://ilthermo.boulder.nist.gov/ILT2/ilimage...


In [68]:
re.split(' ', '1-ethyl-3-methylimidazolium ethyl sulfate', 1)

['1-ethyl-3-methylimidazolium', 'ethyl sulfate']

In [9]:
driver = webdriver.Chrome()
url = "https://i6.cims.nyu.edu/~hp1369/index.html"
driver.get(url)
IL_name = "a"


## Pull up search results by key_words.


In [13]:
driver.set_window_size(1200, 800)
container = driver.find_element(By.CSS_SELECTOR, '.container')
container.size['width']

1200

In [22]:
browser_widths = [961, 1400]
driver = webdriver.Chrome()


for browser_width in browser_widths:
  # set the browser width
  driver.set_window_size(browser_width, 800)

  # the pages to test    
  pages = ['index.html', 'about_me.html', 'more_about_me.html', 'topic_of_interest.html']

  # test each page
  for page in pages:
    url = "{}/{}".format("https://i6.cims.nyu.edu/~hp1369", page)
    driver.get(url) # return to page of interest
    sleep(.5)
    container = driver.find_element_by_css_selector(".container")
    header = driver.find_element_by_tag_name("header")
    footer = driver.find_element_by_tag_name("footer")
    column1 = driver.find_element_by_css_selector(".column1")
    column2 = driver.find_element_by_css_selector(".column2")
    column3 = driver.find_element_by_css_selector(".column3")

    # determine margin and padding on container
    # pl = int(container.value_of_css_property('padding-left')[:-2])
    # pr = int(container.value_of_css_property('padding-right')[:-2])
    # ml = int(container.value_of_css_property('margin-left')[:-2])
    # mr = int(container.value_of_css_property('margin-right')[:-2])
    # available_width = container.size['width'] - pl - pr - ml - mr

    # check widths are appropriate
    assert container.size["width"] <= browser_width
    assert header.size["width"] <= container.size["width"]
    assert footer.size["width"] <= container.size["width"]
    assert column1.size["width"] <= container.size["width"]
    assert column2.size["width"] <= container.size["width"]
    assert column3.size["width"] <= container.size["width"]

    # check that the columns fit in one row
    assert column1.size['width'] + column2.size['width'] + column3.size['width'] <= container.size["width"]

    # remember the size of the container
    last_container_width = -1
    if last_container_width >= 0:
        assert container.size['width'] == last_container_width
    else:
        last_container_width = container.size['width']

    # check floats are appropriate
    assert container.value_of_css_property('float') == 'none'
    assert column1.value_of_css_property('float') == 'left'
    assert column2.value_of_css_property('float') == 'left'
    assert column3.value_of_css_property('float') == 'left'

In [24]:
assert 1 == 1

In [30]:
driver = webdriver.Chrome()


browser_widths = [481, 960]  # min and max tablet widths
last_container_width = -1 # keep track of the width we detect
# try out both min and max widths
for browser_width in browser_widths:
  # set the browser width
  driver.set_window_size(browser_width, 800)

  # the pages to test    
  pages = ['index.html', 'about_me.html', 'more_about_me.html']#, 'topic_of_interest.html']

  # test each page
  for page in pages:
    url = "{}/{}".format("https://i6.cims.nyu.edu/~hp1369", page)
    driver.get(url) # return to page of interest

    container = driver.find_element_by_css_selector(".container")
    header = driver.find_element_by_tag_name("header")
    footer = driver.find_element_by_tag_name("footer")
    column1 = driver.find_element_by_css_selector(".column1")
    column2 = driver.find_element_by_css_selector(".column2")
    column3 = driver.find_element_by_css_selector(".column3")

    # determine margin and padding on container
    # pl = int(container.value_of_css_property('padding-left')[:-2])
    # pr = int(container.value_of_css_property('padding-right')[:-2])
    # ml = int(container.value_of_css_property('margin-left')[:-2])
    # mr = int(container.value_of_css_property('margin-right')[:-2])
    # available_width = container.size['width'] - pl - pr - ml - mr

    # check widths are appropriate
    assert container.size["width"] <= browser_width
    assert header.size["width"] <= container.size["width"]
    assert footer.size["width"] <= container.size["width"]
    assert column1.size["width"] <= container.size["width"]
    assert column2.size["width"] <= container.size["width"]
    assert column3.size["width"] <= container.size["width"]

    # check that the columns fit in one row
    assert column1.size['width'] + column2.size['width'] <= container.size["width"]

    # remember the size of the container
    if last_container_width >= 0:
        assert container.size['width'] == last_container_width
    else:
        last_container_width = container.size['width']

    # check floats are appropriate
    assert container.value_of_css_property('float') == 'none'
    assert column1.value_of_css_property('float') == 'left'
    assert column2.value_of_css_property('float') == 'left'
    # assert column3.value_of_css_property('float') == 'left'

AssertionError: 

In [40]:
driver.set_window_size(1400, 800)

In [41]:
driver.get_window_size()

{'width': 1400, 'height': 800}