In [2]:
# Compare three ways of scraping the same page: re, BeautifulSoup and lxml.
# Each name below is substituted into the row id 'places_<name>__row' that
# the scraper functions look up.
FIELDS = (
    'area',
    'population',
    'iso',
    'country',
    'capital',
    'continent',
    'tld',
    'currency_code',
    'currency_name',
    'phone',
    'postal_code_format',
    'postal_code_regex',
    'languages',
    'neighbours',
)

In [3]:
import re
def re_scraper(html, fields=None):
    """Extract country fields from a page using regular expressions.

    For every field name the leftmost ``<tr id="places_<field>__row">`` is
    located and the content of the next ``<td class="w2p_fw">`` cell is
    captured. The pattern is not compiled with ``re.DOTALL``, so the row tag,
    the value cell and its closing ``</td>`` must not be separated by
    newlines. The captured text is the raw inner HTML of the cell; nested tags
    are kept as-is.

    Args:
        html: page source to search.
        fields: optional iterable of field names to extract. When omitted,
            the module-level ``FIELDS`` tuple is used.

    Returns:
        dict mapping each field name to the captured cell content.

    Raises:
        AttributeError: if a field's row/cell cannot be matched, because
            ``re.search`` returns ``None`` in that case.
    """
    if fields is None:
        fields = FIELDS
    results = {}
    for field in fields:
        # Escape the name so caller-supplied fields cannot inject regex syntax.
        pattern = ('<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>'
                   % re.escape(field))
        results[field] = re.search(pattern, html).group(1)
    return results

In [4]:
from bs4 import BeautifulSoup
def bs_scraper(html, fields=None):
    """Extract country fields from a page using BeautifulSoup.

    The page is parsed with the built-in ``'html.parser'`` backend. Inside the
    first ``<table>`` each ``<tr id="places_<field>__row">`` is looked up and
    the ``.text`` of its ``<td class="w2p_fw">`` cell is stored.

    Args:
        html: page source to parse.
        fields: optional iterable of field names to extract. When omitted,
            the module-level ``FIELDS`` tuple is used.

    Returns:
        dict mapping each field name to the text of its value cell.

    Raises:
        AttributeError: if the table, a row or a value cell is missing,
            because ``find`` returns ``None`` when nothing matches.
    """
    if fields is None:
        fields = FIELDS
    soup = BeautifulSoup(html, 'html.parser')
    # The table lookup does not depend on the field, so do it once instead of
    # repeating it on every iteration.
    table = soup.find('table')
    results = {}
    for field in fields:
        row = table.find('tr', id='places_%s__row' % field)
        results[field] = row.find('td', class_='w2p_fw').text
    return results

In [5]:
import lxml.html
def lxml_scraper(html, fields=None):
    """Extract country fields from a page using lxml and CSS selectors.

    The page is parsed with ``lxml.html.fromstring``. For each field the
    selector ``table > tr#places_<field>__row > td.w2p_fw`` is evaluated and
    the ``text_content()`` of the first matching cell is stored. Field names
    are interpolated into the selector verbatim.

    Args:
        html: page source to parse.
        fields: optional iterable of field names to extract. When omitted,
            the module-level ``FIELDS`` tuple is used.

    Returns:
        dict mapping each field name to the text content of its value cell.

    Raises:
        IndexError: if the selector matches nothing for a field (the first
            element of the result is taken unconditionally).
    """
    if fields is None:
        fields = FIELDS
    tree = lxml.html.fromstring(html)
    results = {}
    for field in fields:
        selector = 'table > tr#places_%s__row > td.w2p_fw' % field
        results[field] = tree.cssselect(selector)[0].text_content()
    return results

In [19]:
import time
from download_iteration import download

In [15]:
# Each scraper is executed this many times by the timing loop.
NUM_ITERATIONS = 1000
# Fetch the sample country page once so every scraper is timed on the same HTML.
html = download(
    'http://example.webscraping.com/places/default/view/China-47')

Downloading: http://example.webscraping.com/places/default/view/China-47


In [26]:
# Time NUM_ITERATIONS runs of each scraper over the same page and report the
# total wall-clock time per scraper.
for name, scraper in [('Regular expressions', re_scraper),
                      ('BeautifulSoup', bs_scraper),
                      ('Lxml', lxml_scraper)]:
    # record start time of scrape
    start = time.time()
    for i in range(NUM_ITERATIONS):
        if scraper is re_scraper:
            # Clear re's compiled-pattern cache before each run; presumably so
            # the regex scraper does not benefit from patterns cached by
            # earlier iterations.
            re.purge()
        result = scraper(html)
        # check scraped result is as expected
        assert result['area'] == '9,596,960 square kilometres', \
            'unexpected area from %s: %r' % (name, result['area'])
    # record end time of scrape and output the total
    end = time.time()
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print('%s: %.2f seconds' % (name, end - start))

Regular expressions: 2.24 seconds
BeautifulSoup: 20.19 seconds
Lxml: 2.82 seconds


In [23]:
# Display the dict produced by the regex scraper for the downloaded page.
# In the recorded output, cells containing links ('continent', 'neighbours')
# come back as raw inner HTML with their tags intact.
re_scraper(html)

{'area': '9,596,960 square kilometres',
 'capital': 'Beijing',
 'continent': '<a href="/places/default/continent/AS">AS</a>',
 'country': 'China',
 'currency_code': 'CNY',
 'currency_name': 'Yuan Renminbi',
 'iso': 'CN',
 'languages': 'zh-CN,yue,wuu,dta,ug,za',
 'neighbours': '<div><a href="/places/default/iso/LA">LA </a><a href="/places/default/iso/BT">BT </a><a href="/places/default/iso/TJ">TJ </a><a href="/places/default/iso/KZ">KZ </a><a href="/places/default/iso/MN">MN </a><a href="/places/default/iso/AF">AF </a><a href="/places/default/iso/NP">NP </a><a href="/places/default/iso/MM">MM </a><a href="/places/default/iso/KG">KG </a><a href="/places/default/iso/PK">PK </a><a href="/places/default/iso/KP">KP </a><a href="/places/default/iso/RU">RU </a><a href="/places/default/iso/VN">VN </a><a href="/places/default/iso/IN">IN </a></div>',
 'phone': '86',
 'population': '1,330,044,000',
 'postal_code_format': '######',
 'postal_code_regex': '^(\\d{6})$',
 'tld': '.cn'}

In [24]:
# Display the dict produced by the BeautifulSoup scraper for the same page.
# In the recorded output the values are unicode strings containing only the
# cell text, without markup.
bs_scraper(html)

{'area': u'9,596,960 square kilometres',
 'capital': u'Beijing',
 'continent': u'AS',
 'country': u'China',
 'currency_code': u'CNY',
 'currency_name': u'Yuan Renminbi',
 'iso': u'CN',
 'languages': u'zh-CN,yue,wuu,dta,ug,za',
 'neighbours': u'LA BT TJ KZ MN AF NP MM KG PK KP RU VN IN ',
 'phone': u'86',
 'population': u'1,330,044,000',
 'postal_code_format': u'######',
 'postal_code_regex': u'^(\\d{6})$',
 'tld': u'.cn'}

In [25]:
# Display the dict produced by the lxml scraper for the same page.
# In the recorded output the values contain only the cell text, without markup.
lxml_scraper(html)

{'area': '9,596,960 square kilometres',
 'capital': 'Beijing',
 'continent': 'AS',
 'country': 'China',
 'currency_code': 'CNY',
 'currency_name': 'Yuan Renminbi',
 'iso': 'CN',
 'languages': 'zh-CN,yue,wuu,dta,ug,za',
 'neighbours': 'LA BT TJ KZ MN AF NP MM KG PK KP RU VN IN ',
 'phone': '86',
 'population': '1,330,044,000',
 'postal_code_format': '######',
 'postal_code_regex': '^(\\d{6})$',
 'tld': '.cn'}