# **Let's now talk about scraping the data from the website.**

In [1]:
!pip install beautifulsoup4
!pip install requests



In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re, time, random, os, requests
import numpy as np
import pandas as pd

### Let's do a web search on wikipedia

In [3]:
# search for benzene
time.sleep(random.randint(1, 10))
chemical = 'benzene'
html = urlopen("https://en.wikipedia.org/wiki/" + chemical)

In [4]:
# check what's html
html

<http.client.HTTPResponse at 0x1694c3383a0>

In [5]:
# Since html is a http.client.HTTPResponse object, we can get its properties by dir(html)
print(html.__dir__())

['fp', 'debuglevel', '_method', 'headers', 'msg', 'version', 'status', 'reason', 'chunked', 'chunk_left', 'length', 'will_close', 'code', 'url', '__module__', '__init__', '_read_status', 'begin', '_check_close', '_close_conn', 'close', 'flush', 'readable', 'isclosed', 'read', 'readinto', '_read_next_chunk_size', '_read_and_discard_trailer', '_get_chunk_left', '_readall_chunked', '_readinto_chunked', '_safe_read', '_safe_readinto', 'read1', 'peek', 'readline', '_read1_chunked', '_peek_chunked', 'fileno', 'getheader', 'getheaders', '__iter__', 'info', 'geturl', 'getcode', '__doc__', '__abstractmethods__', '_abc_impl', 'detach', 'readinto1', 'write', '__next__', '__del__', '__new__', 'seek', 'tell', 'truncate', 'seekable', 'writable', '_checkClosed', '_checkSeekable', '_checkReadable', '_checkWritable', 'isatty', '__enter__', '__exit__', 'readlines', 'writelines', '__dict__', 'closed', '__repr__', '__hash__', '__str__', '__getattribute__', '__setattr__', '__delattr__', '__lt__', '__le__',

#### Let's check the status of the website
- 200 means OK. 
- 404 means Not Found. 
- 403 means Forbidden. 
- 500 means Internal Server Error. 

In [6]:
html.status

200

#### Now, let's take a look at the website's content

In [7]:
time.sleep(random.randint(1, 10))
html = urlopen("https://en.wikipedia.org/wiki/" + chemical)
html.read()



In [8]:
time.sleep(random.randint(1, 10))
html = urlopen("https://en.wikipedia.org/wiki/" + chemical)
html.readlines()
#split html to lines

[b'<!DOCTYPE html>\n',
 b'<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-appearance-disabled vector-feature-appearance-pinned-clientpref-0 vector-feature-night-mode-disabled skin-theme-clientpref-day vector-toc-available" lang="en" dir="ltr">\n',
 b'<head>\n',
 b'<meta charset="UTF-8">\n',
 b'<title>Benzene - Wikipedia</title>\n',
 b'<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-

#### It is clear that there is some type of structure with html

Basic Content


<h1>Main Heading</h1>

 	
<h2>Subheading</h2>

 	
<h3>Section heading</h3>

 	
<p>Paragraph text</p>

 	
<br>

<br /> – equivalent syntax (XHTML)

 	
<ul>
  <li>List item text</li>
</ul>

 	
<ol>
  <li>List item text</li>
</ol>

Text Formatting

 	
<b>words to be made bold</b>

<strong>equivalent syntax</strong>

 	
<i>words to be made italic</i>

<em>equivalent syntax</em>

 	
<u>words to underline</u>

 	
<tt>Shown in fixed-width font</tt>

 	
<span style="…">Uses CSS formatting</span>

<span class="…">Uses predefined style</span>

 	
<div style="…">Uses CSS formatting</div>

<div class="…">Uses predefined style</div>

Hypertext

 	
<a href="url">Link text to click on</a>

 	
<img src="url">

<img src="url" /> – XHTML equivalent

 	
<img src="url" align="left">

In [9]:
time.sleep(random.randint(1, 10))
html = urlopen("https://en.wikipedia.org/wiki/" + chemical)
soup = BeautifulSoup(html, "html.parser")

In [10]:
soup

<!DOCTYPE html>

<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-appearance-disabled vector-feature-appearance-pinned-clientpref-0 vector-feature-night-mode-disabled skin-theme-clientpref-day vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Benzene - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limit

In [11]:
print(soup.__dir__())



In [12]:
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-appearance-disabled vector-feature-appearance-pinned-clientpref-0 vector-feature-night-mode-disabled skin-theme-clientpref-day vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Benzene - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vec

In [13]:
with open('benzene.html', 'w') as f:
    f.write(soup.prettify())

In [14]:
text = soup.get_text()
text



In [15]:
'Melting point' in text

True

In [16]:
# Since text is a string, we can use its method, split, to split the text into lines by '\n' and remove empty lines
texts =  text.splitlines()
texts

['',
 '',
 '',
 '',
 'Benzene - Wikipedia',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'Jump to content',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'Main menu',
 '',
 '',
 '',
 '',
 '',
 'Main menu',
 'move to sidebar',
 'hide',
 '',
 '',
 '',
 '\t\tNavigation',
 '\t',
 '',
 '',
 'Main pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate',
 '',
 '',
 '',
 '',
 '',
 '\t\tContribute',
 '\t',
 '',
 '',
 'HelpLearn to editCommunity portalRecent changesUpload file',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'Search',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'Search',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'Create account',
 '',
 'Log in',
 '',
 '',
 '',
 '',
 '',
 '',
 

In [17]:
# We can remove empty lines by using filter
texts = list(filter(None, texts))
texts

['Benzene - Wikipedia',
 'Jump to content',
 'Main menu',
 'Main menu',
 'move to sidebar',
 'hide',
 '\t\tNavigation',
 '\t',
 'Main pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate',
 '\t\tContribute',
 '\t',
 'HelpLearn to editCommunity portalRecent changesUpload file',
 'Search',
 'Search',
 'Create account',
 'Log in',
 'Personal tools',
 ' Create account Log in',
 '\t\tPages for logged out editors learn more',
 'ContributionsTalk',
 'Contents',
 'move to sidebar',
 'hide',
 '(Top)',
 '1History',
 'Toggle History subsection',
 '1.1Discovery',
 '1.2Ring formula',
 '1.3Nomenclature',
 '1.4Early applications',
 '1.5Occurrence',
 '2Structure',
 '3Benzene derivatives',
 '4Production',
 'Toggle Production subsection',
 '4.1Catalytic reforming',
 '4.2Toluene hydrodealkylation',
 '4.3Toluene disproportionation',
 '4.4Steam cracking',
 '4.5Other methods',
 '5Uses',
 'Toggle Uses subsection',
 '5.1Component of gasoline',
 '6Reactions',
 'Toggle Reactions subsection',


### Let's get some properties of benzene

In [18]:
# Recall that we can use the method index to find the index of an element in a list
texts.index('Melting point')

144

In [19]:
# You can guess that the value of melting point is in the next element
texts[144:146]

['Melting point', '5.53\xa0°C (41.95\xa0°F; 278.68\xa0K)']

In [20]:
# To deal with \xa0, we can use the method replace
texts[145].replace(u'\xa0', u' ')

'5.53 °C (41.95 °F; 278.68 K)'

#### Let's repeat the same steps as above for different properties

In [21]:
properties = ['Molar mass', 'Chemical formula', 'Density', 'Melting point', 'Boiling point', 'Solubility in water', 'Solubility']
for property in properties:
    idx = texts.index(property)
    print([texts[idx], texts[idx+1].replace(u'\xa0', u' ')])

['Molar mass', '78.114 g·mol−1']
['Chemical formula', 'C6H6']
['Density', '0.8765(20) g/cm3[2]']
['Melting point', '5.53 °C (41.95 °F; 278.68 K)']
['Boiling point', '80.1 °C (176.2 °F; 353.2 K)']
['Solubility in water', '1.53 g/L (0 °C) 1.81 g/L (9 °C) 1.79 g/L (15 °C)[3][4][5] 1.84 g/L (30 °C) 2.26 g/L (61 °C) 3.94 g/L (100 °C) 21.7 g/kg (200 °C, 6.5 MPa) 17.8 g/kg (200 °C, 40 MPa)[6]']
['Solubility', 'Soluble in alcohol, CHCl3, CCl4, diethyl ether, acetone, acetic acid[6]']


To obtain all properties, let's search in the "Properties" section of the website.

In [22]:
idx_p = texts.index('Properties') #134
idx_s = texts.index('Structure') #176
properties = [i.replace(u'\xa0', u' ') for i in texts[idx_p+1:idx_s]]
properties

['Chemical formula',
 'C6H6',
 'Molar mass',
 '78.114 g·mol−1',
 'Appearance',
 'Colorless liquid',
 'Odor',
 'sweet aromatic',
 'Density',
 '0.8765(20) g/cm3[2]',
 'Melting point',
 '5.53 °C (41.95 °F; 278.68 K)',
 'Boiling point',
 '80.1 °C (176.2 °F; 353.2 K)',
 'Solubility in water',
 '1.53 g/L (0 °C) 1.81 g/L (9 °C) 1.79 g/L (15 °C)[3][4][5] 1.84 g/L (30 °C) 2.26 g/L (61 °C) 3.94 g/L (100 °C) 21.7 g/kg (200 °C, 6.5 MPa) 17.8 g/kg (200 °C, 40 MPa)[6]',
 'Solubility',
 'Soluble in alcohol, CHCl3, CCl4, diethyl ether, acetone, acetic acid[6]',
 'Solubility in ethanediol',
 '5.83 g/100 g (20 °C) 6.61 g/100 g (40 °C) 7.61 g/100 g (60 °C)[6]',
 'Solubility in ethanol',
 '20 °C, solution in ethanol: 1.2 mL/L (20% v/v)[7]',
 'Solubility in acetone',
 '20 °C, solution in acetone: 7.69 mL/L (38.46% v/v) 49.4 mL/L (62.5% v/v)[7]',
 'Solubility in diethylene glycol',
 '52 g/100 g (20 °C)[6]',
 'log P',
 '2.13',
 'Vapor pressure',
 '12.7 kPa (25 °C) 24.4 kPa (40 °C) 181 kPa (100 °C)[8]',
 'Con

It can be seen that the properties are in x, y format. Let's group them in pairs.

In [23]:
print(len(properties))
property_table = np.array(properties).reshape(int(len(properties)/2), 2)
property_table

42


array([['Chemical formula', 'C6H6'],
       ['Molar mass', '78.114 g·mol−1'],
       ['Appearance', 'Colorless liquid'],
       ['Odor', 'sweet aromatic'],
       ['Density', '0.8765(20) g/cm3[2]'],
       ['Melting point', '5.53 °C (41.95 °F; 278.68 K)'],
       ['Boiling point', '80.1 °C (176.2 °F; 353.2 K)'],
       ['Solubility in water',
        '1.53 g/L (0 °C) 1.81 g/L (9 °C) 1.79 g/L (15 °C)[3][4][5] 1.84 g/L (30 °C) 2.26 g/L (61 °C) 3.94 g/L (100 °C) 21.7 g/kg (200 °C, 6.5 MPa) 17.8 g/kg (200 °C, 40 MPa)[6]'],
       ['Solubility',
        'Soluble in alcohol, CHCl3, CCl4, diethyl ether, acetone, acetic acid[6]'],
       ['Solubility in ethanediol',
        '5.83 g/100 g (20 °C) 6.61 g/100 g (40 °C) 7.61 g/100 g (60 °C)[6]'],
       ['Solubility in ethanol',
        '20 °C, solution in ethanol: 1.2 mL/L (20% v/v)[7]'],
       ['Solubility in acetone',
        '20 °C, solution in acetone: 7.69 mL/L (38.46% v/v) 49.4 mL/L (62.5% v/v)[7]'],
       ['Solubility in diethylene glyco

Now, lets put the properties in a pandas dataframe.

In [24]:
colnames = property_table.T[0]
values = [property_table.T[1]] #要[]因爲panda data frame要是一個column
df = pd.DataFrame(values, columns=colnames)
df

Unnamed: 0,Chemical formula,Molar mass,Appearance,Odor,Density,Melting point,Boiling point,Solubility in water,Solubility,Solubility in ethanediol,...,Solubility in acetone,Solubility in diethylene glycol,log P,Vapor pressure,Conjugate acid,Conjugate base,UV-vis (λmax),Magnetic susceptibility (χ),Refractive index (nD),Viscosity
0,C6H6,78.114 g·mol−1,Colorless liquid,sweet aromatic,0.8765(20) g/cm3[2],5.53 °C (41.95 °F; 278.68 K),80.1 °C (176.2 °F; 353.2 K),1.53 g/L (0 °C) 1.81 g/L (9 °C) 1.79 g/L (15 °...,"Soluble in alcohol, CHCl3, CCl4, diethyl ether...",5.83 g/100 g (20 °C) 6.61 g/100 g (40 °C) 7.61...,...,"20 °C, solution in acetone: 7.69 mL/L (38.46% ...",52 g/100 g (20 °C)[6],2.13,12.7 kPa (25 °C) 24.4 kPa (40 °C) 181 kPa (100...,Benzenium[9],Benzenide[10],255 nm,−54.8·10−6 cm3/mol,1.5011 (20 °C) 1.4948 (30 °C)[6],0.7528 cP (10 °C) 0.6076 cP (25 °C) 0.4965 cP ...


#### Let's repeat this for different chemicals

Let's collect what we have thus far in a function first.

In [27]:
# A function to scrap wikipedia
def get_wiki(chemical):
    html = urlopen("https://en.wikipedia.org/wiki/" + chemical)
    if html.status != 200:
        return 'Page not found'
    else:
        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text()
        texts = list(filter(None, text.splitlines()))
        idx_p = texts.index('Properties')
        idx_s = texts.index('Structure')
        properties = [i.replace(u'\xa0', u' ') for i in texts[idx_p+1:idx_s]]
        property_table = np.array(properties).reshape(int(len(properties)/2), 2)
        colnames = property_table.T[0]
        values = [property_table.T[1]]
        df = pd.DataFrame(values, columns=colnames)
        return df

In [28]:
#import os
#data = json.load(open(os.path.join(os.path.expanduser('~'),'Downloads/periodictable.json'),encoding='utf-8'))
chemicals = open('solvents.csv').read().splitlines()
for chemical in chemicals:
    time.sleep(1)
    get_wiki(chemical.replace(' ', '_'))

ValueError: 'Structure' is not in list

#### Let's find the table that contains the properties

Let's parse benzene again

In [29]:
# search for benzene
time.sleep(random.randint(1, 10))
chemical = 'benzene'
html = urlopen("https://en.wikipedia.org/wiki/" + chemical)
soup = BeautifulSoup(html, "html.parser")

In [30]:
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-appearance-disabled vector-feature-appearance-pinned-clientpref-0 vector-feature-night-mode-disabled skin-theme-clientpref-day vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Benzene - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vec

In [31]:
soup.find('title')
# table, img, link, h1, h2
# it will return a tag
# only find the first match

<title>Benzene - Wikipedia</title>

In [33]:
soup.find_all('table')
# return a list

[<table class="infobox ib-chembox">
 <caption>Benzene
 </caption>
 <tbody><tr>
 <td colspan="2" style="text-align:center; padding:2px;"><span typeof="mw:File"><a class="mw-file-description" href="/wiki/File:Benzene-3D-vdW.png" title="Benzene molecule"><img alt="Benzene molecule" class="mw-file-element" data-file-height="981" data-file-width="1100" decoding="async" height="134" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/97/Benzene-3D-vdW.png/150px-Benzene-3D-vdW.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/97/Benzene-3D-vdW.png/225px-Benzene-3D-vdW.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/97/Benzene-3D-vdW.png/300px-Benzene-3D-vdW.png 2x" width="150"/></a></span><br/><div style="text-align:center;">Space-filling model</div>
 </td></tr>
 <tr>
 <td class="borderless" colspan="2" style="text-align:center">
 <table border="0" style="width:100%;display:inline-table;">
 <tbody><tr>
 <td style="border-right:1px solid #aaa; width:50%;"><figure class

In [36]:
tables = soup.find('table')
tables.get_text()

'\nBenzene\n\n\nSpace-filling model\n\n\n\n\n\nSkeletal formula detail of benzene.Geometry\n\nBenzene ball-and-stick modelBall and stick model\n\n\n\nBenzene at room temperature\n\n\nNames\n\n\nPreferred IUPAC name\nBenzene[1]\n\n\nOther names\nBenzol (historic/German)PhenanePhenylene hydrideCyclohexa-1,3,5-triene; 1,3,5-Cyclohexatriene (theoretical resonance isomers)[6]Annulene (not recommended[1])Phene (historic)\n\n\nIdentifiers\n\n\nCAS Number\n\n71-43-2\xa0Y\n\n\n3D model (JSmol)\n\nInteractive image\n\n\nChEBI\n\nCHEBI:16716\xa0Y\n\n\nChEMBL\n\nChEMBL277500\xa0Y\n\n\nChemSpider\n\n236\xa0Y\n\n\nECHA InfoCard\n\n100.000.685 \n\n\nEC Number\n\n200-753-7\n\n\nKEGG\n\nC01407\xa0Y\n\n\nPubChem CID\n\n241\n\n\nRTECS number\n\nCY1400000\n\n\nUNII\n\nJ64922108F\xa0Y\n\n\nCompTox Dashboard (EPA)\n\nDTXSID3039242 \n\n\n\nInChI\nInChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H\xa0YKey:\xa0UHOVQNZJYSORNB-UHFFFAOYSA-N\xa0Y\n\n\n\n\nSMILES\nc1ccccc1\n\n\n\nProperties\n\n\nChemical formula\n\nC6H6\n\n\nMolar

In [37]:
tables = soup.find_all('table')
print(tables[0].prettify())

<table class="infobox ib-chembox">
 <caption>
  Benzene
 </caption>
 <tbody>
  <tr>
   <td colspan="2" style="text-align:center; padding:2px;">
    <span typeof="mw:File">
     <a class="mw-file-description" href="/wiki/File:Benzene-3D-vdW.png" title="Benzene molecule">
      <img alt="Benzene molecule" class="mw-file-element" data-file-height="981" data-file-width="1100" decoding="async" height="134" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/97/Benzene-3D-vdW.png/150px-Benzene-3D-vdW.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/97/Benzene-3D-vdW.png/225px-Benzene-3D-vdW.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/97/Benzene-3D-vdW.png/300px-Benzene-3D-vdW.png 2x" width="150"/>
     </a>
    </span>
    <br/>
    <div style="text-align:center;">
     Space-filling model
    </div>
   </td>
  </tr>
  <tr>
   <td class="borderless" colspan="2" style="text-align:center">
    <table border="0" style="width:100%;display:inline-table;">
     <tbody

We can search by html tags

In [38]:
table = tables[0]
ths = table.find_all('th') # th = table header
print(ths)

[<th colspan="2" style="background: #f8eaba; text-align: center;">Names
</th>, <th colspan="2" style="background: #f8eaba; text-align: center;">Identifiers
</th>, <th colspan="2" style="background: #f8eaba; text-align: center;">Properties
</th>, <th colspan="2" style="background: #f8eaba; text-align: center;">Structure
</th>, <th colspan="2" style="background: #f8eaba; text-align: center;">Thermochemistry
</th>, <th colspan="2" style="background: #f8eaba; text-align: center;">Hazards
</th>, <th colspan="2" style="background: #f8eaba; text-align: center;">Related compounds
</th>, <th colspan="2" style="background: #f8eaba; text-align: center;">Supplementary data page
</th>]


In [41]:
table.find_all('th')[3].get_text().strip()

'Structure'

In [39]:
headers = [th.get_text() for th in ths]
print(headers)

['Names\n', 'Identifiers\n', 'Properties\n', 'Structure\n', 'Thermochemistry\n', 'Hazards\n', 'Related compounds\n', 'Supplementary data page\n']


In [42]:
headers = [th.get_text().rstrip() for th in ths]
print(headers)

['Names', 'Identifiers', 'Properties', 'Structure', 'Thermochemistry', 'Hazards', 'Related compounds', 'Supplementary data page']


In [56]:
table = soup.find_all('table')[0]
text = table.get_text()
texts = text.splitlines()
texts = list(filter(None, texts))
texts

headers = table.find_all('th')
headers = [header.get_text().rstrip() for header in headers]
idx_p_header = headers.index('Properties')
header_next = headers[idx_p_header+1]
idx_p_text = texts.index('Properties')
idx_next_text = texts.index(header_next)
print(texts[idx_p_text+1:idx_next_text])
result = np.array(texts[idx_p_text+1:idx_next_text])
result = result.reshape(result.shape[0]//2,2)
colnams = result.T[0]
values = [result.T[1]]
df = pd.DataFrame(values, columns=colnams)
df.insert(0, "Name", ["Benzene"], True)
df



['Chemical formula', 'C6H6', 'Molar mass', '78.114\xa0g·mol−1', 'Appearance', 'Colorless liquid', 'Odor', 'sweet aromatic', 'Density', '0.8765(20) g/cm3[2]', 'Melting point', '5.53\xa0°C (41.95\xa0°F; 278.68\xa0K)', 'Boiling point', '80.1\xa0°C (176.2\xa0°F; 353.2\xa0K)', 'Solubility in water', '1.53 g/L (0 °C) 1.81 g/L (9 °C) 1.79 g/L (15 °C)[3][4][5] 1.84 g/L (30 °C) 2.26 g/L (61 °C) 3.94 g/L (100 °C) 21.7 g/kg (200 °C, 6.5 MPa) 17.8 g/kg (200 °C, 40 MPa)[6]', 'Solubility', 'Soluble in alcohol, CHCl3, CCl4, diethyl ether, acetone, acetic acid[6]', 'Solubility in ethanediol', '5.83 g/100 g (20 °C) 6.61 g/100 g (40 °C) 7.61 g/100 g (60 °C)[6]', 'Solubility in ethanol', '20 °C, solution in ethanol: 1.2 mL/L (20% v/v)[7]', 'Solubility in acetone', '20 °C, solution in acetone: 7.69 mL/L (38.46% v/v) 49.4 mL/L (62.5% v/v)[7]', 'Solubility in diethylene glycol', '52 g/100 g (20 °C)[6]', 'log P', '2.13', 'Vapor pressure', '12.7 kPa (25 °C) 24.4 kPa (40 °C) 181 kPa (100 °C)[8]', 'Conjugate ac

Unnamed: 0,Name,Chemical formula,Molar mass,Appearance,Odor,Density,Melting point,Boiling point,Solubility in water,Solubility,...,Solubility in acetone,Solubility in diethylene glycol,log P,Vapor pressure,Conjugate acid,Conjugate base,UV-vis (λmax),Magnetic susceptibility (χ),Refractive index (nD),Viscosity
0,Benzene,C6H6,78.114 g·mol−1,Colorless liquid,sweet aromatic,0.8765(20) g/cm3[2],5.53 °C (41.95 °F; 278.68 K),80.1 °C (176.2 °F; 353.2 K),1.53 g/L (0 °C) 1.81 g/L (9 °C) 1.79 g/L (15 °...,"Soluble in alcohol, CHCl3, CCl4, diethyl ether...",...,"20 °C, solution in acetone: 7.69 mL/L (38.46% ...",52 g/100 g (20 °C)[6],2.13,12.7 kPa (25 °C) 24.4 kPa (40 °C) 181 kPa (100...,Benzenium[9],Benzenide[10],255 nm,−54.8·10−6 cm3/mol,1.5011 (20 °C) 1.4948 (30 °C)[6],0.7528 cP (10 °C) 0.6076 cP (25 °C) 0.4965 cP ...


In [43]:
text = table.get_text()
texts =  text.splitlines()
texts = list(filter(None, texts))
idx_p_header = headers.index('Properties')
idx_p = texts.index('Properties')
idx_stop = texts.index(headers[idx_p_header+1])
properties = [i.replace(u'\xa0', u' ') for i in texts[idx_p+1:idx_stop]]
properties

['Chemical formula',
 'C6H6',
 'Molar mass',
 '78.114 g·mol−1',
 'Appearance',
 'Colorless liquid',
 'Odor',
 'sweet aromatic',
 'Density',
 '0.8765(20) g/cm3[2]',
 'Melting point',
 '5.53 °C (41.95 °F; 278.68 K)',
 'Boiling point',
 '80.1 °C (176.2 °F; 353.2 K)',
 'Solubility in water',
 '1.53 g/L (0 °C) 1.81 g/L (9 °C) 1.79 g/L (15 °C)[3][4][5] 1.84 g/L (30 °C) 2.26 g/L (61 °C) 3.94 g/L (100 °C) 21.7 g/kg (200 °C, 6.5 MPa) 17.8 g/kg (200 °C, 40 MPa)[6]',
 'Solubility',
 'Soluble in alcohol, CHCl3, CCl4, diethyl ether, acetone, acetic acid[6]',
 'Solubility in ethanediol',
 '5.83 g/100 g (20 °C) 6.61 g/100 g (40 °C) 7.61 g/100 g (60 °C)[6]',
 'Solubility in ethanol',
 '20 °C, solution in ethanol: 1.2 mL/L (20% v/v)[7]',
 'Solubility in acetone',
 '20 °C, solution in acetone: 7.69 mL/L (38.46% v/v) 49.4 mL/L (62.5% v/v)[7]',
 'Solubility in diethylene glycol',
 '52 g/100 g (20 °C)[6]',
 'log P',
 '2.13',
 'Vapor pressure',
 '12.7 kPa (25 °C) 24.4 kPa (40 °C) 181 kPa (100 °C)[8]',
 'Con

In [44]:
property_table = np.array(properties).reshape(int(len(properties)/2), 2)
colnames = property_table.T[0]
values = [property_table.T[1]]
df = pd.DataFrame(values, columns=colnames)
print(df)

  Chemical formula      Molar mass        Appearance            Odor  \
0             C6H6  78.114 g·mol−1  Colorless liquid  sweet aromatic   

               Density                 Melting point  \
0  0.8765(20) g/cm3[2]  5.53 °C (41.95 °F; 278.68 K)   

                 Boiling point  \
0  80.1 °C (176.2 °F; 353.2 K)   

                                 Solubility in water  \
0  1.53 g/L (0 °C) 1.81 g/L (9 °C) 1.79 g/L (15 °...   

                                          Solubility  \
0  Soluble in alcohol, CHCl3, CCl4, diethyl ether...   

                            Solubility in ethanediol  ...  \
0  5.83 g/100 g (20 °C) 6.61 g/100 g (40 °C) 7.61...  ...   

                               Solubility in acetone  \
0  20 °C, solution in acetone: 7.69 mL/L (38.46% ...   

  Solubility in diethylene glycol log P  \
0           52 g/100 g (20 °C)[6]  2.13   

                                      Vapor pressure Conjugate acid  \
0  12.7 kPa (25 °C) 24.4 kPa (40 °C) 181 kPa (100... 

#### Let's modify our function

In [47]:
# A function to scrap wikipedia
def get_wiki(chemical):
    html = urlopen("https://en.wikipedia.org/wiki/" + chemical)
    if html.status != 200:
        return 'Page not found'
    else:
        # load html as soup
        soup = BeautifulSoup(html, "html.parser")
        # get tables from soup
        tables = soup.find_all('table')
        table = tables[0] # We assume table 0 is the table we want, this is dangerous
        ths = table.find_all('th') # th = table header
        headers = [th.get_text().rstrip() for th in ths]
        text = table.get_text()
        texts =  text.splitlines()
        texts = list(filter(None, texts))
        idx_p_header = headers.index('Properties')
        idx_p = texts.index('Properties')
        idx_stop = texts.index(headers[idx_p_header+1])
        properties = [i.replace(u'\xa0', u' ') for i in texts[idx_p+1:idx_stop]]
        property_table = np.array(properties).reshape(int(len(properties)/2), 2)
        colnames = property_table.T[0]
        values = [property_table.T[1]]
        df = pd.DataFrame(values, columns=colnames)
        return df

In [58]:
chemicals = open('solvents.csv').read().splitlines()
flag = False
for i, chemical in enumerate(chemicals[:45]):
    try:
        time.sleep(1)
        if not flag:
            df = get_wiki(chemical.replace(' ', '_'))
            df.insert(0, "Name", [chemical], True)
            flag = True
        else:
            df_new = get_wiki(chemical.replace(' ', '_'))
            df_new.insert(0, "Name", [chemical], True)
            df = pd.merge(df, df_new, how='outer')
        print(chemical, 'proceeded.')
    except Exception as e:
        print(chemical, 'failed because', e)

acetic acid proceeded.
acetone proceeded.
acetonitrile proceeded.
benzene proceeded.
1-butanol proceeded.
2-butanol proceeded.
2-butanone proceeded.
t-butyl alcohol proceeded.
carbon tetrachloride proceeded.
chlorobenzene proceeded.
chloroform proceeded.
cyclohexane proceeded.
1,2-dichloroethane proceeded.
diethylene glycol proceeded.
diethyl ether proceeded.
diglyme proceeded.
1,2-dimethoxyethane proceeded.
dimethyl-formamide failed because HTTP Error 404: Not Found
dimethyl sulfoxide proceeded.
1,4-dioxane proceeded.
ethanol proceeded.
ethyl acetate proceeded.
ethylene glycol proceeded.
glycerin proceeded.
heptane proceeded.
hexamethylphosphoramide proceeded.
hexamethylphosphorous triamide failed because list index out of range
hexane proceeded.
methanol proceeded.
methyl t-butyl ether proceeded.
methylene chloride proceeded.
N-methyl-2-pyrrolidinone failed because HTTP Error 404: Not Found
nitromethane proceeded.
pentane failed because 'Properties' is not in list
petroleum ether pro

In [59]:
df

Unnamed: 0,Name,Chemical formula,Molar mass,Appearance,Odor,Density,Melting point,Boiling point,Solubility in water,log P,...,Solubility in diethylene glycol,Conjugate acid,Solubility in other solvents,Solubility in dimethyl sulfoxide,Solubility in Diethyl ether,"Solubility in ethanol, acetone, diethyl ether, benzene","Critical point (T, P)",Solubility in Ethanol,Basicity (pKb),Solubility in diethyl ether
0,acetic acid,CH3COOH,60.052 g·mol−1,Colourless liquid,Heavily vinegar-like,1.049 g/cm3 (liquid); 1.27 g/cm3 (solid),16 to 17 °C; 61 to 62 °F; 289 to 290 K,118 to 119 °C; 244 to 246 °F; 391 to 392 K,Miscible,−0.28[4],...,,,,,,,,,,
1,acetone,C3H6O,58.080 g·mol−1,Colourless liquid,"Pungent, fruity[9]",0.7845 g/cm3 (25 °C)[10],−94.9 °C (−138.8 °F; 178.2 K)[10],56.08 °C (132.94 °F; 329.23 K)[10],Miscible[10],−0.24[11],...,,,,,,,,,,
2,acetonitrile,C2H3N,41.053 g·mol−1,Colorless liquid,"Faint, distinct, fruity",0.786 g/cm3 at 25°C,−46 to −44 °C; −51 to −47 °F; 227 to 229 K,81.3 to 82.1 °C; 178.2 to 179.7 °F; 354.4 to 3...,Miscible,−0.334,...,,,,,,,,,,
3,benzene,C6H6,78.114 g·mol−1,Colorless liquid,sweet aromatic,0.8765(20) g/cm3[2],5.53 °C (41.95 °F; 278.68 K),80.1 °C (176.2 °F; 353.2 K),1.53 g/L (0 °C) 1.81 g/L (9 °C) 1.79 g/L (15 °...,2.13,...,52 g/100 g (20 °C)[6],Benzenium[9],,,,,,,,
4,1-butanol,C4H10O,74.123 g·mol−1,"Colourless, refractive liquid","banana-like,[2] harsh, alcoholic and sweet",0.81 g/cm3,−89.8 °C (−129.6 °F; 183.3 K),117.7 °C (243.9 °F; 390.8 K),73 g/L at 25 °C,0.839,...,,,,,,,,,,
5,2-butanol,C4H10O,74.123 g·mol−1,,,0.808 g cm−3,−115 °C; −175 °F; 158 K,98 to 100 °C; 208 to 212 °F; 371 to 373 K,290 g/L[3],0.683,...,,,,,,,,,,
6,2-butanone,C4H8O,72.107 g·mol−1,Colorless liquid,Mint or acetone-like[3],0.8050 g/mL,−86 °C (−123 °F; 187 K),79.64 °C (175.35 °F; 352.79 K),27.5 g/100 mL,0.37[4],...,,,,,,,,,,
7,t-butyl alcohol,C4H10O,74.123 g·mol−1,Colorless solid,Camphorous,0.775 g/mL,25 to 26 °C; 77 to 79 °F; 298 to 299 K,82 to 83 °C; 179 to 181 °F; 355 to 356 K,miscible[2],0.584,...,,,,,,,,,,
8,carbon tetrachloride,CCl4,153.81 g·mol−1,Colourless liquid,chloroform-like odor,1.5867 g·cm−3 (liquid)1.831 g·cm−3 at −186 °C ...,−22.92 °C (−9.26 °F; 250.23 K),76.72 °C (170.10 °F; 349.87 K),0.097 g/100 mL (0 °C)0.081 g/100 mL (25 °C),2.64,...,,,,,,,,,,
9,chlorobenzene,C6H5Cl,112.56 g/mol,colorless liquid,almond-like[2],"1.11 g/cm3, liquid",−45.58 °C (−50.04 °F; 227.57 K),131.70 °C (269.06 °F; 404.85 K),0.5 g l−1 in water at 20 °C,,...,,,soluble in most organic solvents,,,,,,,


In [None]:
df.to_excel('solvent_chemfinder_wiki.xlsx', index=False)

#### We can also get images from the website

In [None]:
# search for benzene
time.sleep(random.randint(1, 10))
chemical = 'benzene'
html = urlopen("https://en.wikipedia.org/wiki/" + chemical)
soup = BeautifulSoup(html, "html.parser")

In [None]:
images = soup.find_all('img')
images

In [None]:
image_urls = [img.get('src') for img in images if img.get('src') is not None]
image_urls

In [None]:
alts = []
for image in images:
    alt = image.get('alt')
    src = image.get('src')
    print(alt, src)
    alts.append(alt)

In [None]:
for alt in set(alts):
    print(alt, alts.count(alt))

In [None]:
fdir = 'benzene'
if not os.path.exists(fdir):
    os.mkdir(fdir)
for image in images:
    alt = image.get('alt')
    src = image.get('src')
    if alt not in ['check', ''] and '/static' not in src and alt != None:
        if 'http' not in src:
            src = 'https:' + src
        try:
            response = requests.get(src)
            if response.status_code == 200:
                fname = alt.replace(' ', '_') + ".png"
                fpath = os.path.join(fdir, fname)
                with open(fpath, "wb") as file:
                    file.write(response.content)
                print(alt, 'saved.')
            else:
                print('Link not accessible.')
        except Exception as e:
            print(alt, 'failed because of', e)