In [18]:
import urllib2
import re
import lxml.html
import time

In [31]:
def download(url,user_agent='wswp',num_retries=2,scrape_callback=None):
    print 'Downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return download(url,user_agent ,num_retries-1)
    links = []
    if scrape_callback:
        links.extend(scrape_callback(url, html) or [])
    return html

In [3]:
def get_links(html):
    """Return a list of links from html

    html: page source as a string. None or an empty string yields [].
    Returns the href value of every <a> tag in document order, exactly as
    written in the markup (relative links are not resolved here).
    """
    # download() returns None for a failed request; treat that as "no links"
    # instead of letting the regex raise TypeError.
    if not html:
        return []
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']',
    re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)

In [16]:
def link_crawler(seed_url, link_regex,scrape_callback=None):
    """Crawl from seed_url, following links whose href matches link_regex.

    seed_url: first page to download; also the base used to resolve
        relative links.
    link_regex: pattern applied with re.match to each raw href.
    scrape_callback: optional callable invoked as scrape_callback(url, html)
        for every successfully downloaded page. If it returns an iterable of
        links, those are treated as additional candidates and filtered by
        link_regex like any other link.
    """
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if html is None:
            # download failed; there is nothing to scrape or to follow
            continue
        links = []
        if scrape_callback:
            # run the callback for every page, not only the last one crawled
            links.extend(scrape_callback(url, html) or [])
        links.extend(get_links(html))
        for link in links:
            # check if link matches expected regex
            if re.match(link_regex, link):
                # form absolute link
                link = urllib2.urlparse.urljoin(seed_url, link)
                # check if have already seen this link
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)

In [5]:
# Fetch the site's index page and collect the anchors that sit inside table
# cells; later cells read `url` and `td`.
url = 'http://example.webscraping.com'
# NOTE(review): download() returns None when the request fails, and the parse
# on the next line does not guard against that.
html=download(url)
tree = lxml.html.fromstring(html)
# CSS selector: <a> elements directly inside a <div> inside a <td> of a <tr>.
td = tree.cssselect('tr > td > div> a')

Downloading: http://example.webscraping.com


In [28]:
# Map each anchor's text to its href. Keys are the raw text_content() of the
# anchor, so they keep any surrounding whitespace (the displayed keys start
# with a space); values are site-relative paths.
data = {}
for p in td:
    data[p.text_content()]=p.get("href")
# Bare expression so the notebook displays the mapping.
data

{' Afghanistan': '/places/default/view/Afghanistan-1',
 ' Aland Islands': '/places/default/view/Aland-Islands-2',
 ' Albania': '/places/default/view/Albania-3',
 ' Algeria': '/places/default/view/Algeria-4',
 ' American Samoa': '/places/default/view/American-Samoa-5',
 ' Andorra': '/places/default/view/Andorra-6',
 ' Angola': '/places/default/view/Angola-7',
 ' Anguilla': '/places/default/view/Anguilla-8',
 ' Antarctica': '/places/default/view/Antarctica-9',
 ' Antigua and Barbuda': '/places/default/view/Antigua-and-Barbuda-10'}

In [24]:
# Download every page listed in `data`, one request per second.
for keys,values in data.items():
    # `values` is the site-relative href; resolve it against the seed URL.
    link = urllib2.urlparse.urljoin(url, values)
    html = download(link)
    # Pause one second between requests.
    time.sleep(1)

Downloading: http://example.webscraping.com/places/default/view/Aland-Islands-2
Downloading: http://example.webscraping.com/places/default/view/Angola-7
Downloading: http://example.webscraping.com/places/default/view/Afghanistan-1
Downloading: http://example.webscraping.com/places/default/view/Antigua-and-Barbuda-10
Downloading: http://example.webscraping.com/places/default/view/Algeria-4
Downloading: http://example.webscraping.com/places/default/view/American-Samoa-5
Downloading: http://example.webscraping.com/places/default/view/Antarctica-9
Downloading: http://example.webscraping.com/places/default/view/Anguilla-8
Downloading: http://example.webscraping.com/places/default/view/Albania-3
Downloading: http://example.webscraping.com/places/default/view/Andorra-6


In [20]:
import csv
class ScrapeCallback:
    """Callable that appends one row per country page to countries.csv.

    Instances are meant to be passed as a scrape_callback and are invoked as
    callback(url, html). Pages whose URL does not contain '/view/', and pages
    that failed to download (html is None), are ignored.
    """
    def __init__(self):
        """Open countries.csv for appending and write the header if needed."""
        import os
        path = 'countries.csv'
        # The file is opened in append mode, so only write the header when
        # the file is new or empty; otherwise every instance would add
        # another header row in the middle of the data.
        needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
        # Keep the handle so rows can be flushed as they are written.
        self._file = open(path, 'a')
        self.writer = csv.writer(self._file)
        self.fields = ('area', 'population', 'iso', 'country',
        'capital', 'continent', 'tld', 'currency_code',
        'currency_name', 'phone', 'postal_code_format',
        'postal_code_regex', 'languages',
        'neighbours')
        if needs_header:
            self.writer.writerow(self.fields)
            self._file.flush()
    def __call__(self, url, html):
        """Extract self.fields from a country page and append them as a row.

        url: address the page was downloaded from.
        html: page source, or None if the download failed.
        Returns None. Raises IndexError if a field's table cell is missing
        from a page that is processed.
        """
        # Nothing to record for failed downloads or non-country pages.
        if html is None or not re.search('/view/', url):
            return
        tree = lxml.html.fromstring(html)
        row = []
        for field in self.fields:
            row.append(tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
        # Write only when a row was actually built (this used to run for
        # every URL and failed on the unbound `row`).
        self.writer.writerow(row)
        # Push the row to disk right away so the file is current even while
        # this instance is still alive.
        self._file.flush()

In [38]:
# Field names looked up on each country page by lxml_scraper. This section of
# the notebook compares scrapers built on re, BeautifulSoup and lxml.
FIELDS = ['area', 'population', 'iso', 'country', 'capital',
'continent', 'tld', 'currency_code', 'currency_name', 'phone',
'postal_code_format', 'postal_code_regex', 'languages',
'neighbours']

In [59]:
import lxml.html
def lxml_scraper(html):
    """Parse a country page with lxml and return {field name: cell text}.

    html: page source to parse.
    Returns a dict with one entry per name in FIELDS, holding the text
    content of the first matching table cell. Raises IndexError when a
    field has no matching cell.
    """
    document = lxml.html.fromstring(html)
    # %s is filled with the field name to build that field's row selector.
    selector = 'table > tr#places_%s__row > td.w2p_fw'
    return dict(
        (name, document.cssselect(selector % name)[0].text_content())
        for name in FIELDS
    )

In [60]:
# Scrape every page listed in `data`: download it, let the CSV callback see
# it, and collect the parsed fields in `res`.
# Constructing ScrapeCallback opens countries.csv, so build it once for the
# whole run instead of once per download.
country_callback = ScrapeCallback()
res = []
for keys,values in data.items():
    link = urllib2.urlparse.urljoin(url, values)
    html = download(link,scrape_callback=country_callback)
    # download() returns None when the request fails; skip parsing then.
    if html is not None:
        # NOTE(review): re_scraper is not defined in the visible cells;
        # confirm it is defined elsewhere, or switch to lxml_scraper.
        res.append(re_scraper(html))
    # Pause one second between requests.
    time.sleep(1)
# Drop the only reference so CPython closes (and flushes) the callback's
# file handle before countries.csv is opened again further down.
del country_callback

Downloading: http://example.webscraping.com/places/default/view/Aland-Islands-2
Downloading: http://example.webscraping.com/places/default/view/Angola-7
Downloading: http://example.webscraping.com/places/default/view/Afghanistan-1
Downloading: http://example.webscraping.com/places/default/view/Antigua-and-Barbuda-10
Downloading: http://example.webscraping.com/places/default/view/Algeria-4
Downloading: http://example.webscraping.com/places/default/view/American-Samoa-5
Downloading: http://example.webscraping.com/places/default/view/Antarctica-9
Downloading: http://example.webscraping.com/places/default/view/Anguilla-8
Downloading: http://example.webscraping.com/places/default/view/Albania-3
Downloading: http://example.webscraping.com/places/default/view/Andorra-6


In [61]:
import csv
# Write the scraped records to countries.csv, one row per record.
# Column names come from the first record's keys.
# NOTE(review): res[0] raises IndexError if no page was scraped.
keys = res[0].keys()
# 'wb' truncates countries.csv, the same path ScrapeCallback appends to, so
# whatever the callback wrote earlier is replaced by this dump.
with open('countries.csv', 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(res)

In [62]:
# Show the first scraped record as a quick sanity check.
print res[0]

{'languages': 'sv-AX', 'area': '1,580 square kilometres', 'country': 'Aland Islands', 'postal_code_regex': '^(?:FI)*(\\d{5})$', 'tld': '.ax', 'currency_name': 'Euro', 'phone': '+358-18', 'neighbours': '<div><a href="/places/default/iso//"> </a></div>', 'iso': 'AX', 'postal_code_format': '#####', 'capital': 'Mariehamn', 'continent': '<a href="/places/default/continent/EU">EU</a>', 'currency_code': 'EUR', 'population': '26,711'}
