# Book - Web Scraping with Python - Collecting More Data From the Modern Web - O'Reilly

# Beautiful Soup 4 - https://www.crummy.com/software/BeautifulSoup/bs4/doc/

# Table of Contents

## Part 1 - Building Scrapers
1. Your First Web Scraper
2. Advanced HTML Parsing
3. Writing Web Crawlers
4. Web Crawling Models
5. Scrapy
6. Storing Data

## Part 2 - Advanced Scraping
7. Reading Documents
8. Cleaning Your Dirty Data
9. Reading and Writing Natural Languages
10. Crawling Through Forms and Logins
11. Scraping JavaScript
12. Crawling Through APIs
13. Image Processing and Text Recognition
14. Avoiding Scraping Traps
15. Testing Your Website with Scrapers
16. Web Crawling in Parallel
17. Scraping Remotely
18. The Legalities and Ethics of Web Scraping


# Part 1 - Building Scrapers 

## 1. Your First Web Scraper

In [None]:
from urllib.request import urlopen
html = urlopen('http://pythonscraping.com/pages/page1.html')
print(html.read())

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


URLLIB documentation: https://docs.python.org/3/library/urllib.html

### Introduction to Beautiful Soup

In [None]:
pip install beautifulsoup4



In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://pythonscraping.com/pages/page1.html')
bs = BeautifulSoup(html.read(), 'html.parser')
print(bs.h1)

<h1>An Interesting Title</h1>


In [None]:
pip install lxml



In [None]:
# NOTE(review): `html` looks like the response object whose body was already
# consumed by an earlier `html.read()` call; a second read() on the same
# response presumably yields no data, which would explain an empty parsed
# document -- confirm, and re-open the URL first if real content is wanted.
# NOTE(review): the 'html5lib' parser requires the html5lib package, while the
# preceding cell installs lxml -- confirm which parser is intended here.
bs = BeautifulSoup(html.read(), 'html5lib')

In [None]:
bs

<html><head></head><body></body></html>

In [None]:
from urllib.error import HTTPError
from urllib.error import URLError

In [None]:
try:
    html = urlopen('https://pythonscrapingthisurldoesnotexist.com')
except HTTPError as e:
    print(e)
except URLError as e:
    print('The server could not be found!')
else:
    print('It Worked!')

The server could not be found!


In [None]:
# Two failure modes are handled when looking up a nested tag:
#   * the chained attribute access raises AttributeError (the first lookup
#     produced nothing to look the second tag up on), or
#   * the access succeeds but the final lookup itself produced None.
try:
    badContent = bs.nonExistingTag.anotherTag
except AttributeError:
    print('Tag was not found')
else:
    # Identity comparison is the correct way to test for None.
    if badContent is None:
        print('Tag was not found')
    else:
        print(badContent)

Tag was not found


  name=tag_name


In [None]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

def getTitle(url):
    """Return the first <h1> found inside <body> of the page at *url*.

    Returns None when the page cannot be fetched (HTTP error status, or the
    server/resource cannot be reached at all) and when the parsed document
    has no <body> to look inside.
    """
    # Imported locally so the function does not depend on URLError having
    # been imported at module level.
    from urllib.error import URLError

    try:
        # The context manager closes the connection once the body is read.
        with urlopen(url) as response:
            markup = response.read()
    except URLError:
        # HTTPError is a subclass of URLError, so this covers both a bad
        # HTTP status and a server that could not be found.
        return None
    try:
        bs = BeautifulSoup(markup, 'html.parser')
        title = bs.body.h1
    except AttributeError:
        # bs.body was None, so there is nothing to read .h1 from.
        return None
    return title

title = getTitle('http://www.pythonscraping.com/pages/page1.html')

# getTitle signals failure by returning None; test for it by identity.
if title is None:
    print('Title could not be found')
else:
    print(title)

<h1>An Interesting Title</h1>


## 2. Advanced HTML Parsing

In [None]:
bs.find_all('table')

[]

In [None]:
html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html')
bs = BeautifulSoup(html.read(), 'html.parser')

# Collect every <span class="green"> and print its text.
# find_all is the current method name; findAll is only a legacy alias.
nameList = bs.find_all('span', {'class': 'green'})
for name in nameList:
    print(name.get_text())

In [None]:
# Count the text nodes that are exactly 'the prince'. `string=` is the current
# name of this filter; `text=` is the older spelling of the same argument.
nameList = bs.find_all(string='the prince')
print(len(nameList))

7


In [None]:
 bs.find_all(class_='green')

[<span class="green">Anna
 Pavlovna Scherer</span>, <span class="green">Empress Marya
 Fedorovna</span>, <span class="green">Prince Vasili Kuragin</span>, <span class="green">Anna Pavlovna</span>, <span class="green">St. Petersburg</span>, <span class="green">the prince</span>, <span class="green">Anna Pavlovna</span>, <span class="green">Anna Pavlovna</span>, <span class="green">the prince</span>, <span class="green">the prince</span>, <span class="green">the prince</span>, <span class="green">Prince Vasili</span>, <span class="green">Anna Pavlovna</span>, <span class="green">Anna Pavlovna</span>, <span class="green">the prince</span>, <span class="green">Wintzingerode</span>, <span class="green">King of Prussia</span>, <span class="green">le Vicomte de Mortemart</span>, <span class="green">Montmorencys</span>, <span class="green">Rohans</span>, <span class="green">Abbe Morio</span>, <span class="green">the Emperor</span>, <span class="green">the prince</span>, <span class="green">Pri

In [None]:
bs.find_all('', {'class':'green'})

[<span class="green">Anna
 Pavlovna Scherer</span>, <span class="green">Empress Marya
 Fedorovna</span>, <span class="green">Prince Vasili Kuragin</span>, <span class="green">Anna Pavlovna</span>, <span class="green">St. Petersburg</span>, <span class="green">the prince</span>, <span class="green">Anna Pavlovna</span>, <span class="green">Anna Pavlovna</span>, <span class="green">the prince</span>, <span class="green">the prince</span>, <span class="green">the prince</span>, <span class="green">Prince Vasili</span>, <span class="green">Anna Pavlovna</span>, <span class="green">Anna Pavlovna</span>, <span class="green">the prince</span>, <span class="green">Wintzingerode</span>, <span class="green">King of Prussia</span>, <span class="green">le Vicomte de Mortemart</span>, <span class="green">Montmorencys</span>, <span class="green">Rohans</span>, <span class="green">Abbe Morio</span>, <span class="green">the Emperor</span>, <span class="green">the prince</span>, <span class="green">Pri

In [None]:
html = urlopen('https://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html, 'html.parser')

for child in bs.find('table',{'id':'giftList'}).children:
    print(child)

In [None]:
html = urlopen('http://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html, 'html.parser')
for sibling in bs.find('table', {'id':'giftList'}).tr.next_siblings:
    print(sibling)

In [None]:
bs.find('table', {'id':'giftList'})

In [None]:
html = urlopen('http://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html, 'html.parser')
print(bs.find('img',{'src':'../img/gifts/img1.jpg'}).parent.previous_sibling.get_text())


$15.00



#### Regular Expressions

In [None]:
import re
html = urlopen('http://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html, 'html.parser')

# Select the gift images by matching their relative src path. The pattern is a
# raw string so that `\.` reaches the regex engine untouched instead of being
# parsed as an (invalid) string escape sequence.
images = bs.find_all('img', {'src': re.compile(r'\.\./img/gifts/img.*\.jpg')})
images

In [None]:
for image in images:
  print(image['src'])

../img/gifts/img1.jpg
../img/gifts/img2.jpg
../img/gifts/img3.jpg
../img/gifts/img4.jpg
../img/gifts/img6.jpg


In [None]:
bs.find_all(lambda tag: len(tag.attrs) == 2)

In [None]:
bs.find_all(lambda tag: tag.get_text() ==
    'Or maybe he\'s only resting?')

[<span class="excitingNote">Or maybe he's only resting?</span>]

## 3. Writing Web Crawlers


In [None]:
html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')

In [None]:
for link in bs.find_all('a'):
  if 'href' in link.attrs:
    print(link.attrs['href'])

In [None]:
for link in bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$')):
  if 'href' in link.attrs:
    print(link.attrs['href'])


In [None]:
import datetime
import random

In [None]:
# random.seed() accepts only None, int, float, str, bytes or bytearray; a
# datetime object is rejected with TypeError on Python 3.11+. Seed with the
# current time expressed as a float timestamp instead.
random.seed(datetime.datetime.now().timestamp())

In [None]:
def getLinks(articleUrl):
    """Return the article links found in the body of a Wikipedia page.

    articleUrl is a site-relative path such as '/wiki/Kevin_Bacon'. The
    result holds the <a> tags inside div#bodyContent whose href starts with
    /wiki/ and contains no colon.
    """
    # Fetch the page that was asked for rather than a fixed article.
    html = urlopen(f'http://en.wikipedia.org{articleUrl}')
    bs = BeautifulSoup(html, 'html.parser')
    return bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))


links = getLinks('/wiki/Kevin_Bacon')

In [None]:
# Random walk: pick one link from the current page, print it, then continue
# from the page it points to until a page with no article links is reached.
while len(links) > 0:
    newArticle = random.choice(links).attrs['href']
    print(newArticle)
    # Rebind `links`, the name the loop condition tests, so that the walk
    # actually moves on to the newly chosen article.
    links = getLinks(newArticle)

In [None]:
pages = set()

In [None]:
def getLinks(pageUrl):
    """Crawl Wikipedia recursively, printing every /wiki/ link not seen before.

    pageUrl is a site-relative path ('' for the front page). Each new link is
    printed, recorded in the module-level `pages` set, and then crawled in
    turn before the next link on the current page is examined.
    """
    global pages
    response = urlopen(f'http://en.wikipedia.org{pageUrl}')
    soup = BeautifulSoup(response, 'html.parser')
    wikiLink = re.compile('^(/wiki/)')
    for anchor in soup.find_all('a', href=wikiLink):
        if 'href' not in anchor.attrs:
            continue
        target = anchor.attrs['href']
        if target in pages:
            continue
        print(target)
        pages.add(target)
        getLinks(target)


getLinks('')

   

In [None]:
pages = set()
def getLinks(pageUrl):
    """Crawl Wikipedia recursively, printing some details of every page.

    For each page the <h1> text, the first paragraph of #mw-content-text and
    the edit link are printed. A page lacking any of these is reported and
    the crawl continues. Every /wiki/ link not yet in the module-level
    `pages` set is then recorded and crawled in turn.
    """
    global pages
    html = urlopen(f'http://en.wikipedia.org{pageUrl}')
    bs = BeautifulSoup(html, 'html.parser')
    try:
        print(bs.h1.get_text())
        print(bs.find(id='mw-content-text').find_all('p')[0])
        print(bs.find(id='ca-edit').find('span').find('a').attrs['href'])
    except (AttributeError, IndexError, KeyError):
        # AttributeError: an element was missing (lookup gave None).
        # IndexError: the content area holds no <p> at all.
        # KeyError: the edit link has no href attribute.
        print('This page is missing something! Continuing')

    for link in bs.find_all('a', href=re.compile('^/(wiki)/')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # We have encountered a new page
                newPage = link.attrs['href']
                print('-'*20)
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks('')

In [None]:
from urllib.parse import urlparse
pages = set()
# random.seed() rejects datetime objects with TypeError on Python 3.11+;
# seed with the current time as a float timestamp instead.
random.seed(datetime.datetime.now().timestamp())

In [None]:
def getInternalLinks(bs, includeUrl):
    """Return the internal links found on a page, as absolute URLs.

    bs is the parsed page; includeUrl is any URL on the site, of which only
    the scheme and host are used. A link counts as internal when its href
    starts with "/" or contains the site's scheme://host. Site-relative hrefs
    are made absolute, and each URL appears once, in document order.
    """
    site = urlparse(includeUrl)
    includeUrl = f'{site.scheme}://{site.netloc}'
    internalLinks = []

    # re.escape keeps the dots of the host name literal; unescaped they would
    # match any character and let look-alike hosts through.
    pattern = re.compile('^(/|.*' + re.escape(includeUrl) + ')')
    for link in bs.find_all('a', href=pattern):
        href = link.attrs['href']
        if href is None:
            continue
        absoluteUrl = includeUrl + href if href.startswith('/') else href
        # Compare the absolute form, which is what the list stores; comparing
        # the raw href would never detect a repeated relative link.
        if absoluteUrl not in internalLinks:
            internalLinks.append(absoluteUrl)
    return internalLinks

In [None]:
def getExternalLinks(bs, excludeUrl):
    """Return the external links found on a page, each one once.

    bs is the parsed page. A link counts as external when its href starts
    with "http" or "www" and does not contain excludeUrl anywhere. Order of
    first appearance in the document is preserved.
    """
    externalLinks = []
    # re.escape makes excludeUrl match literally; unescaped, the dots in a
    # host name would match any character and wrongly exclude other hosts.
    pattern = re.compile('^(http|www)((?!' + re.escape(excludeUrl) + ').)*$')
    for link in bs.find_all('a', href=pattern):
        href = link.attrs['href']
        if href not in externalLinks:
            externalLinks.append(href)
    return externalLinks

In [None]:
def getRandomExternalLink(startingPage):
    """Return a random external link reachable from startingPage.

    If the page has external links, one of them is returned. Otherwise a
    random internal link is followed and the search repeats from there.

    Raises:
        ValueError: the page has neither external nor internal links, so
            there is nowhere left to look.
    """
    html = urlopen(startingPage)
    bs = BeautifulSoup(html, 'html.parser')
    externalLinks = getExternalLinks(bs, urlparse(startingPage).netloc)
    if externalLinks:
        return random.choice(externalLinks)

    print('No external links, looking around the site for one')
    domain = f'{urlparse(startingPage).scheme}://{urlparse(startingPage).netloc}'
    internalLinks = getInternalLinks(bs, domain)
    if not internalLinks:
        # Fail with a message that names the page instead of an obscure
        # empty-range error from the random module.
        raise ValueError(f'No links of any kind found on {startingPage}')
    return getRandomExternalLink(random.choice(internalLinks))

In [None]:
def followExternalOnly(startingSite):
    """Hop from site to site along random external links, printing each hop.

    The walk has no stopping condition: this function never returns normally
    and ends only when an exception propagates (for example a failed request
    or a page without links). A loop is used instead of self-recursion so a
    long walk cannot exhaust the interpreter's recursion limit.
    """
    currentSite = startingSite
    while True:
        externalLink = getRandomExternalLink(currentSite)
        print(f'Random external link is: {externalLink}')
        currentSite = externalLink

In [None]:
followExternalOnly('http://oreilly.com')

In [None]:
allExtLinks = set()
allIntLinks = set()

In [None]:
def getAllExternalLinks(siteUrl):
    """Print every external link of a site, crawling its internal pages.

    New external links found on siteUrl are added to the module-level
    allExtLinks set and printed; every internal link not yet in allIntLinks
    is recorded there and then crawled recursively.
    """
    response = urlopen(siteUrl)
    domain = f'{urlparse(siteUrl).scheme}://{urlparse(siteUrl).netloc}'
    soup = BeautifulSoup(response, 'html.parser')

    internalLinks = getInternalLinks(soup, domain)
    externalLinks = getExternalLinks(soup, domain)

    # Report external links the first time they are seen.
    for external in externalLinks:
        if external in allExtLinks:
            continue
        allExtLinks.add(external)
        print(external)

    # Descend into internal pages that have not been visited yet.
    for internal in internalLinks:
        if internal in allIntLinks:
            continue
        allIntLinks.add(internal)
        getAllExternalLinks(internal)

In [None]:
allIntLinks.add('http://oreilly.com')
getAllExternalLinks('http://oreilly.com')

https://www.oreilly.com
https://learning.oreilly.com/accounts/login-check/
https://www.oreilly.com/online-learning/try-now.html
https://www.oreilly.com/online-learning/teams.html
https://www.oreilly.com/online-learning/government.html
https://www.oreilly.com/online-learning/academic.html
https://www.oreilly.com/online-learning/individuals.html
https://www.oreilly.com/online-learning/features.html
https://www.oreilly.com/online-learning/feature-certification.html
https://www.oreilly.com/online-learning/intro-interactive-learning.html
https://www.oreilly.com/online-learning/live-events.html
https://www.oreilly.com/online-learning/feature-answers.html
https://www.oreilly.com/radar/
https://www.oreilly.com/content-marketing-solutions.html
https://learning.oreilly.com/p/register/
https://www.oreilly.com/online-learning/the-cost-of-doing-nothing.html
https://learning.oreilly.com/search/?query=author%3A%22Arianne%20Dee%22&extended_publisher_data=true&highlight=true&include_assessments=false&i

## 4. Web Crawling Models

In [None]:
import requests
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import re
import datetime
import random
from urllib.parse import urlparse

In [None]:
class Content:
    """Plain record for one scraped page: its URL, title and body text."""

    def __init__(self, url, title, body):
        # Keep the three constructor arguments unchanged as attributes.
        self.url, self.title, self.body = url, title, body

def getPage(url):
    """Download url with requests and return it parsed with html.parser."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

def scrapeNYTimes(url):
    """Scrape a New York Times article into a Content object.

    The title is the text of the first <h1>; the body is the text of every
    <p class="story-content">, joined with newlines.
    """
    bs = getPage(url)
    title = bs.find('h1').text
    # The attribute filter must be a dict mapping attribute name to value;
    # the previous {'class', 'story-content'} was a set literal.
    lines = bs.find_all('p', {'class': 'story-content'})
    body = '\n'.join(line.text for line in lines)
    return Content(url, title, body)

def scrapeBrookings(url):
    """Scrape a Brookings article into a Content object.

    The title is the text of the first <h1>; the body is the text of the
    first <div class="post-body">.
    """
    soup = getPage(url)
    headline = soup.find('h1').text
    article = soup.find('div', {'class':'post-body'}).text
    return Content(url, headline, article)

In [None]:
url = 'https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/'

In [None]:
content = scrapeBrookings(url)

In [None]:
print(f'Title: {content.title}')

Title: Delivering inclusive urban access: 3 uncomfortable truths


In [None]:
print(f'URL: {content.url}\n')

URL: https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/



In [None]:
print(content.body)

In [None]:
url2 = 'https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html'

In [None]:
content2 = scrapeNYTimes(url2)

In [None]:
print('Title: {}'.format(content2.title))

Title: The Men Who Want to Live Forever


In [None]:
print('URL: {}\n'.format(content2.url))

URL: https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html



In [None]:
print(content2.body)




In [None]:
class Content:
    """One scraped page (URL, title, body) that can print itself."""

    def __init__(self, url, title, body):
        self.url, self.title, self.body = url, title, body

    def print(self):
        """
        Flexible printing function controls output
        """
        # One labelled line per field, in a fixed order.
        sections = (
            ("URL: {}", self.url),
            ("TITLE: {}", self.title),
            ("BODY:\n{}", self.body),
        )
        for template, value in sections:
            print(template.format(value))

In [None]:
class Website:
    """Describes how to scrape one site: its name, URL and CSS selectors."""

    def __init__(self, name, url, titleTag, bodyTag):
        # Site identity.
        self.name, self.url = name, url
        # Selectors used to locate the title and the body of a page.
        self.titleTag, self.bodyTag = titleTag, bodyTag

In [None]:
class Crawler:
    """Fetches pages and prints the title and body selected for a site."""

    def getPage(self, url):
        """Return the parsed page at url, or None if the request failed."""
        try:
            response = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        else:
            return BeautifulSoup(response.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """Return the newline-joined text of all matches, or '' if none."""
        matches = pageObj.select(selector)
        if matches is None or len(matches) == 0:
            return ''
        return '\n'.join(match.get_text() for match in matches)

    def parse(self, site, url):
        """Print the content of url when both title and body are found."""
        page = self.getPage(url)
        if page is None:
            return
        title = self.safeGet(page, site.titleTag)
        body = self.safeGet(page, site.bodyTag)
        if title == '' or body == '':
            return
        content = Content(url, title, body)
        content.print()

In [None]:
crawler = Crawler()

In [None]:
siteData = [
    ['O\'Reilly Media', 'http://oreilly.com',
    'h1', 'section#product-description'],
    ['Reuters', 'http://reuters.com', 'h1',
    'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu',
    'h1', 'div.post-body'],
    ['New York Times', 'http://nytimes.com',
    'h1', 'p.story-content']]

In [None]:
websites = []

In [None]:
for row in siteData:
  websites.append(Website(row[0], row[1], row[2], row[3]))

In [None]:
crawler.parse(websites[0], 'http://shop.oreilly.com/product/'\
    '0636920028154.do')
crawler.parse(websites[1], 'http://www.reuters.com/article/'\
    'us-usa-epa-pruitt-idUSKBN19W2D0')
crawler.parse(websites[2], 'https://www.brookings.edu/blog/'\
    'techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/')
crawler.parse(websites[3], 'https://www.nytimes.com/2018/01/'\
    '28/business/energy-environment/oil-boom.html')

In [None]:
class Content:
    """An article found for a search topic: topic, URL, title and body."""

    def __init__(self, topic, url, title, body):
        self.topic, self.url = topic, url
        self.title, self.body = title, body

    def print(self):
        """
        Flexible printing function controls output
        """
        # One labelled line per field, in a fixed order.
        sections = (
            ("New article found for topic: {}", self.topic),
            ("TITLE: {}", self.title),
            ("BODY:\n{}", self.body),
            ("URL: {}", self.url),
        )
        for template, value in sections:
            print(template.format(value))

In [None]:
class Website:
    """Contains information about website structure"""

    def __init__(self, name, url, searchUrl, resultListing,
                 resultUrl, absoluteUrl, titleTag, bodyTag):
        # Site identity.
        self.name, self.url = name, url
        # How to run a search and find result links in its listing.
        self.searchUrl, self.resultListing = searchUrl, resultListing
        self.resultUrl, self.absoluteUrl = resultUrl, absoluteUrl
        # Selectors for the title and body of a result page.
        self.titleTag, self.bodyTag = titleTag, bodyTag

In [None]:
class Crawler:
    """Searches a site for a topic and prints every article it can scrape."""

    def getPage(self, url):
        """Return the parsed page at url, or None if the request failed."""
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """Return the text of the first match for selector, or "" if none."""
        childObj = pageObj.select(selector)
        if childObj is not None and len(childObj) > 0:
            return childObj[0].get_text()
        return ""

    def search(self, topic, site):
        """
        Searches a given website for a given topic and records all pages found

        A result whose link or page cannot be retrieved is skipped; the
        remaining results are still processed.
        """
        bs = self.getPage(site.searchUrl + topic)
        if bs is None:
            # The search page itself could not be fetched: nothing to scan.
            print("Something was wrong with that page or URL. Skipping!")
            return
        for result in bs.select(site.resultListing):
            linkElems = result.select(site.resultUrl)
            if not linkElems or "href" not in linkElems[0].attrs:
                # This listing entry carries no usable link.
                continue
            url = linkElems[0].attrs["href"]
            # Check to see whether it's a relative or an absolute URL
            pageUrl = url if site.absoluteUrl else site.url + url
            page = self.getPage(pageUrl)
            if page is None:
                print("Something was wrong with that page or URL. Skipping!")
                # Skip only this result rather than abandoning the search.
                continue
            title = self.safeGet(page, site.titleTag)
            body = self.safeGet(page, site.bodyTag)
            if title != '' and body != '':
                # Argument order follows Content(topic, url, title, body).
                content = Content(topic, url, title, body)
                content.print()
crawler = Crawler()

# One row per site, in the positional order expected by Website:
# name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag
siteData = [
    [
        'O\'Reilly Media', 'http://oreilly.com',
        'https://ssearch.oreilly.com/?q=', 'article.product-result',
        'p.title a', True, 'h1', 'section#product-description',
    ],
    [
        'Reuters', 'http://reuters.com',
        'http://www.reuters.com/search/news?blob=',
        'div.search-result-content', 'h3.search-result-title a',
        False, 'h1', 'div.StandardArticleBody_body_1gnLA',
    ],
    [
        'Brookings', 'http://www.brookings.edu',
        'https://www.brookings.edu/search/?s=',
        'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body',
    ],
]

sites = []
for row in siteData:
    sites.append(Website(*row))

# Search every site for every topic.
topics = ['python', 'data science']
for topic in topics:
    print("GETTING INFO ABOUT: " + topic)
    for targetSite in sites:
        crawler.search(topic, targetSite)

In [None]:
class Website:
    """Describes a site to crawl: where it lives and how to read its pages."""

    def __init__(self, name, url, targetPattern, absoluteUrl,
                 titleTag, bodyTag):
        # Site identity.
        self.name, self.url = name, url
        # Which links to follow, and whether their hrefs are absolute.
        self.targetPattern, self.absoluteUrl = targetPattern, absoluteUrl
        # Selectors for the title and body of a page.
        self.titleTag, self.bodyTag = titleTag, bodyTag
       
class Content:
    """One scraped page (URL, title, body) that can print itself."""

    def __init__(self, url, title, body):
        self.url, self.title, self.body = url, title, body

    def print(self):
        """Write the URL, title and body to stdout, one labelled section each."""
        sections = (
            ("URL: {}", self.url),
            ("TITLE: {}", self.title),
            ("BODY:\n{}", self.body),
        )
        for template, value in sections:
            print(template.format(value))

In [None]:
class Crawler:
    """Crawls the links on a site's home page that match its target pattern."""

    def __init__(self, site):
        self.site = site
        # hrefs already handled, in the order they were first seen
        self.visited = []

    def getPage(self, url):
        """Return the parsed page at url, or None if the request failed."""
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """Return the newline-joined text of all matches, or '' if none."""
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join([elem.get_text() for
                elem in selectedElems])
        return ''

    def parse(self, url):
        """Print the content of url when both title and body are found."""
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, self.site.titleTag)
            body = self.safeGet(bs, self.site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()

    def crawl(self):
        """
        Get pages from website home page

        Does nothing when the home page itself cannot be fetched.
        """
        bs = self.getPage(self.site.url)
        if bs is None:
            # getPage signals a failed request with None; there are no links
            # to follow in that case.
            return
        targetPages = bs.find_all('a', href=re.compile(self.site.targetPattern))
        for targetPage in targetPages:
            targetPage = targetPage.attrs['href']
            if targetPage not in self.visited:
                self.visited.append(targetPage)
                if not self.site.absoluteUrl:
                    # Relative href: prefix it with the site's base URL.
                    targetPage = '{}{}'.format(self.site.url, targetPage)
                self.parse(targetPage)

# Describe Reuters: follow home-page links whose (relative) href starts with
# /article/, and read the headline and article body with these selectors.
reuters = Website(
    name='Reuters',
    url='https://www.reuters.com',
    targetPattern='^(/article/)',
    absoluteUrl=False,
    titleTag='h1',
    bodyTag='div.StandardArticleBody_body_1gnLA',
)
crawler = Crawler(reuters)
crawler.crawl()

In [None]:
class Website:
    """Common base class for all articles/pages"""
    def __init__(self, type, name, url, searchUrl, resultListing,
        resultUrl, absoluteUrl, titleTag, bodyTag):
        self.name = name
        self.url = url
        # Search-related settings, stored under the same attribute names the
        # other Website definitions in this file use.
        self.searchUrl = searchUrl
        self.resultListing = resultListing
        self.resultUrl = resultUrl
        self.absoluteUrl = absoluteUrl
        self.titleTag = titleTag
        self.bodyTag = bodyTag
        # The parameter is called `type`; there is no `pageType` name in
        # scope, so assigning from it would raise NameError.
        self.pageType = type

## 6. Storing Data

In [None]:
import requests
from urllib.request import urlopen
from urllib.request import urlretrieve
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import re
import datetime
import random
from urllib.parse import urlparse

In [None]:
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen('http://www.pythonscraping.com')
bs = BeautifulSoup(html, 'html.parser')
imageLocation = bs.find('img', {'class': 'pagelayer-img pagelayer-wp-title-img'})['src']
urlretrieve (imageLocation, 'logo01.png')

('logo01.png', <http.client.HTTPMessage at 0x7fdd239dd110>)

In [None]:
print(imageLocation)

https://pythonscraping.com/wp-content/uploads/2021/08/logo01.png


In [None]:
import os
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup

downloadDirectory = 'downloaded'
baseUrl = 'http://pythonscraping.com'

def getAbsoluteURL(baseUrl, source):
    """Turn a src value into an absolute URL under baseUrl, or return None.

    A leading "www." is dropped from the host, host-only values get an
    "http://" prefix, and relative paths are appended to baseUrl. None is
    returned when the resulting URL does not contain baseUrl, i.e. the
    resource lives on another site or uses another scheme.
    """
    # Imported locally so the function does not depend on module-level imports.
    from urllib.parse import urlparse

    if source.startswith('http://www.'):
        url = 'http://{}'.format(source[11:])
    elif source.startswith('http://'):
        url = source
    elif source.startswith('www.'):
        # Strip the "www." prefix before adding the scheme.
        url = 'http://{}'.format(source[4:])
    elif urlparse(source).scheme:
        # Already absolute with some other scheme (https:, data:, ...); it
        # must not be glued onto baseUrl as if it were a relative path.
        url = source
    else:
        url = '{}/{}'.format(baseUrl, source)
    if baseUrl not in url:
        return None
    return url

def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Return the local path for absoluteUrl, creating its directory.

    The path is downloadDirectory followed by what remains of absoluteUrl
    after removing every "www." and baseUrl. The directory part of that path
    is created if it does not exist yet.
    """
    path = absoluteUrl.replace('www.', '')
    path = path.replace(baseUrl, '')
    path = downloadDirectory+path
    directory = os.path.dirname(path)
    # An empty directory part means the current directory: nothing to create
    # (os.makedirs('') would raise). exist_ok=True avoids a separate
    # exists-then-create check.
    if directory:
        os.makedirs(directory, exist_ok=True)
    return path


html = urlopen('http://www.pythonscraping.com')
bs = BeautifulSoup(html, 'html.parser')
# Every tag that carries a src attribute is a candidate download.
downloadList = bs.find_all(src=True)
for download in downloadList:
    fileUrl = getAbsoluteURL(baseUrl, download['src'])
    if fileUrl is not None:
        print(fileUrl)
        # Download inside the loop and inside the None check, so that each
        # accepted file is fetched and a rejected src is never passed on.
        urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))

In [None]:
import csv

# Write a header and ten rows of sample numbers to test.csv.
# newline='' lets the csv module control line endings itself, and the `with`
# block closes the file even if a write fails.
with open('test.csv', 'w+', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(('number', 'number plus 2', 'number times 2'))
    for i in range(10):
        writer.writerow((i, i+2, i*2))

In [None]:
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://en.wikipedia.org/wiki/Comparison_of_text_editors')
bs = BeautifulSoup(html, 'html.parser')

# Take the first table with class "wikitable" on the page.
# find_all is the current method name; findAll is only a legacy alias.
table = bs.find_all('table', {'class': 'wikitable'})[0]
table

In [None]:
# All rows of the selected table (find_all is the current name of findAll).
rows = table.find_all('tr')
rows

In [None]:
# Write the table to editors.csv, one CSV row per table row.
# newline='' lets the csv module control line endings; utf-8 keeps non-ASCII
# cell text writable whatever the platform default encoding is.
with open('editors.csv', 'wt+', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    for row in rows:
        # Gather every cell of the row first, then write the row once.
        csvRow = [cell.get_text() for cell in row.find_all(['td', 'th'])]
        writer.writerow(csvRow)