In [1]:
# Import Python's requests library
import requests

In [2]:
# Prepare the request
url = 'https://jupyter.org'
req = requests.Request('GET', url)
req.headers['User-Agent'] = 'Mozilla/5.0'
req = req.prepare()

In [3]:
# Print the docstring for the prepared request string
req?

In [4]:
# Run the request and store the response in a variable
# We use the with statement so that the scope of the session is limited to the indented block
# This way, the session is closed automatically and we don't need to close it explicitly
with requests.Session() as sess:
    page = sess.send(req)

In [5]:
# Investigate the response - check to confirm a 200 status by printing the string representation
print(page)

<Response [200]>


In [6]:
# Print the first part of the html
page_html = page.text
page_html[:1000]

'<!DOCTYPE html>\n<html>\n\n  <head>\n\n    <meta charset="utf-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge">\n    <meta name="viewport" content="width=device-width, initial-scale=1">\n    <meta name="description" content="">\n    <meta name="author" content="">\n\n    <title>Project Jupyter | Home</title>\n    <meta property="og:title" content="Project Jupyter" />\n    <meta property="og:description" content="The Jupyter Notebook is a web-based interactive computing platform. The notebook combines live code, equations, narrative text, visualizations, interactive dashboards and other media.\n">\n    <meta property="og:url" content="https://www.jupyter.org" />\n    <meta property="og:image" content="https://jupyter.org/assets/homepage.png" />\n    <!-- Bootstrap Core CSS -->\n    <link rel="stylesheet" href="/css/bootstrap.min.css">\n    <link rel="stylesheet" href="/css/logo-nav.css?1594832988204333248">\n    <link rel="stylesheet" href="/css/cardlist.css">\n    <link r

In [7]:
# Pretty print the output using the BeautifulSoup library
from bs4 import BeautifulSoup
print(BeautifulSoup(page_html, 'html.parser').prettify()[:1000])

<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="" name="description"/>
  <meta content="" name="author"/>
  <title>
   Project Jupyter | Home
  </title>
  <meta content="Project Jupyter" property="og:title">
   <meta content="The Jupyter Notebook is a web-based interactive computing platform. The notebook combines live code, equations, narrative text, visualizations, interactive dashboards and other media.
" property="og:description"/>
   <meta content="https://www.jupyter.org" property="og:url">
    <meta content="https://jupyter.org/assets/homepage.png" property="og:image">
     <!-- Bootstrap Core CSS -->
     <link href="/css/bootstrap.min.css" rel="stylesheet"/>
     <link href="/css/logo-nav.css?1594832988204333248" rel="stylesheet"/>
     <link href="/css/cardlist.css" rel="stylesheet"/>
     <link href="/css/github-buttons.cs

In [8]:
# Use IPython to display the rendered HTML - note that no JS is executed and files with relatives paths aren't loaded
from IPython.display import HTML
HTML(page_html)

In [None]:
# Compare this output to the live website, which we can open using an iFrame
from IPython.display import IFrame
IFrame(src=url, height=800, width=800)

# NOTE: After doing this, it's a good idea to close the iFrame so that it doesn't eat memory.
# We do this by clicking the Current Outputs -> Clear option from the Cell menu.

In [10]:
# Shorthand method for making requests - this doesn't allow header customisation but involves less code
url = 'http://www.python.org/'
page = requests.get(url)
page

<Response [200]>

In [11]:
# Show the URL attribute of the response - note that it has been redirected to HTTPS
page.url

'https://www.python.org/'

In [12]:
# Show the redirect history
page.history

[<Response [301]>]

### Parsing HTML with Python
We will scrape the contents of the central bank interest rates Wikipedia page. Essentially, this will involve the following steps:
- First, view the page in a browser. Use Chrome Developer Tools to identify the elements of interest that we want to scrape for data.
- Get the page content using the requests library.
- Parse the content using BeautifulSoup.
- Display the attributes and methods available for the BeautifulSoup object and the tags we want to parse. In addition to the tables containing data, we might also want to get lists of image URLs, headings, etc.

The URL of the page we will scrape is https://en.wikipedia.org/wiki/List_of_countries_by_central_bank_interest_rates

In [13]:
# Get the page HTML
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_central_bank_interest_rates'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

In [14]:
# Display the attributes and methods for the BS4 object
try:
    import pdir
    dir = pdir
except:
    print('You need to install pdir with:\npip install pdir2')
dir(soup)

[0;33mproperty:[0m
[0;33mspecial attribute:[0m
    [0;36m__class__[0m[1;30m, [0m[0;36m__dict__[0m[1;30m, [0m[0;36m__doc__[0m[1;30m, [0m[0;36m__module__[0m[1;30m, [0m[0;36m__weakref__[0m
[0;33mabstract class:[0m
    [0;36m__subclasshook__[0m
[0;33mobject customization:[0m
    [0;36m__bool__[0m[1;30m, [0m[0;36m__format__[0m[1;30m, [0m[0;36m__hash__[0m[1;30m, [0m[0;36m__init__[0m[1;30m, [0m[0;36m__new__[0m[1;30m, [0m[0;36m__repr__[0m[1;30m, [0m[0;36m__sizeof__[0m[1;30m, [0m[0;36m__str__[0m
[0;33mrich comparison:[0m
    [0;36m__eq__[0m[1;30m, [0m[0;36m__ge__[0m[1;30m, [0m[0;36m__gt__[0m[1;30m, [0m[0;36m__le__[0m[1;30m, [0m[0;36m__lt__[0m[1;30m, [0m[0;36m__ne__[0m
[0;33mattribute access:[0m
    [0;36m__delattr__[0m[1;30m, [0m[0;36m__dir__[0m[1;30m, [0m[0;36m__getattr__[0m[1;30m, [0m[0;36m__getattribute__[0m[1;30m, [0m[0;36m__setattr__[0m
[0;33mclass customization:[0m
    [0;36m__init_sub

In [15]:
# Get the h1 heading for the page
h1 = soup.find_all('h1')
h1

[<h1 class="firstHeading" id="firstHeading" lang="en">List of countries by central bank interest rates</h1>]

In [16]:
# Print the HTML element attributes using the .attrs property
h1 = h1[0]
h1.attrs

{'id': 'firstHeading', 'class': ['firstHeading'], 'lang': 'en'}

In [17]:
# Get the visible text by printing h1.text
print(h1.text)

List of countries by central bank interest rates


In [18]:
# Get all the images on the page
imgs = soup.find_all('img')
len(imgs)

93

In [19]:
# Print the source of each image
# We use a list comprehension here to iterate through each element, find the src atttribute and select it if it exists
[element.attrs['src'] for element in imgs if 'src' in element.attrs.keys()]

['//upload.wikimedia.org/wikipedia/commons/thumb/3/36/Flag_of_Albania.svg/21px-Flag_of_Albania.svg.png',
 '//upload.wikimedia.org/wikipedia/commons/thumb/9/9d/Flag_of_Angola.svg/23px-Flag_of_Angola.svg.png',
 '//upload.wikimedia.org/wikipedia/commons/thumb/1/1a/Flag_of_Argentina.svg/23px-Flag_of_Argentina.svg.png',
 '//upload.wikimedia.org/wikipedia/commons/thumb/2/2f/Flag_of_Armenia.svg/23px-Flag_of_Armenia.svg.png',
 '//upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Australia_%28converted%29.svg/23px-Flag_of_Australia_%28converted%29.svg.png',
 '//upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Flag_of_Azerbaijan.svg/23px-Flag_of_Azerbaijan.svg.png',
 '//upload.wikimedia.org/wikipedia/commons/thumb/9/93/Flag_of_the_Bahamas.svg/23px-Flag_of_the_Bahamas.svg.png',
 '//upload.wikimedia.org/wikipedia/commons/thumb/2/2c/Flag_of_Bahrain.svg/23px-Flag_of_Bahrain.svg.png',
 '//upload.wikimedia.org/wikipedia/commons/thumb/f/f9/Flag_of_Bangladesh.svg/23px-Flag_of_Bangladesh.svg.png

In [20]:
# From Chrome dev tools, we know that the div containing the table has id="bodyContent". Select it.
body_content = soup.find('div', {'id': 'bodyContent'})

In [21]:
# Get the table headers
table_headers = body_content.find_all('th')[:6]
table_headers

[<th>Country or<br/>currency union</th>,
 <th>Central bank <br/> interest rate (%)</th>,
 <th>Date of last <br/> change
 </th>,
 <th>Average inflation rate 2013-2017 (%)
 <p>by <a href="/wiki/World_Bank" title="World Bank">WB</a> and <a href="/wiki/International_Monetary_Fund" title="International Monetary Fund">IMF</a><sup class="reference" id="cite_ref-1"><a href="#cite_note-1">[1]</a></sup><sup class="reference" id="cite_ref-2"><a href="#cite_note-2">[2]</a></sup> as in the <a href="/wiki/List_of_countries_by_inflation_rate" title="List of countries by inflation rate">List</a>
 </p>
 </th>,
 <th>Central bank interest rate <br/> minus <br/> average inflation rate (2013-2017)
 </th>,
 <th>Central bank interest rate <br/> divided by <br/> average inflation rate (2013-2017)
 </th>]

In [22]:
# Get the text itself without the tags and <br/>s
table_headers = [element.get_text().replace('\n', ' ') for element in table_headers]
table_headers

['Country orcurrency union',
 'Central bank  interest rate (%)',
 'Date of last  change ',
 'Average inflation rate 2013-2017 (%) by WB and IMF[1][2] as in the List  ',
 'Central bank interest rate  minus  average inflation rate (2013-2017) ',
 'Central bank interest rate  divided by  average inflation rate (2013-2017) ']

In [23]:
# Get the data for each cell in the second row - note we have six columns to parse
row_number = 2
d1, d2, d3, d4, d5, d6 = body_content.find_all('tr')[row_number].find_all('td')

In [24]:
# Print d1 and its text attribute
d1

<td align="left"><span class="flagicon"><img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9d/Flag_of_Angola.svg/23px-Flag_of_Angola.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/9d/Flag_of_Angola.svg/35px-Flag_of_Angola.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/9d/Flag_of_Angola.svg/45px-Flag_of_Angola.svg.png 2x" width="23"/> </span><a href="/wiki/Angola" title="Angola">Angola</a></td>

In [25]:
d1.text

'\xa0Angola'

In [26]:
# We want to remove the undesirable characters from the front - so we just want the text from the anchor tags
d1.find('a').text

'Angola'

In [27]:
# Show d3 and its text
d3

<td><span data-sort-value="000000002019-05-24-0000" style="white-space:nowrap">24 May 2019</span><sup class="reference" id="cite_ref-CentralBankNews_3-1"><a href="#cite_note-CentralBankNews-3">[3]</a></sup>
</td>

In [28]:
d3.text

'24 May 2019[3]\n'

In [29]:
# Parse this text properly
d3.find_all('span')[0].text

'24 May 2019'

In [30]:
# Perform the full scrape by iterating over the elements
data = []
for i, row in enumerate(body_content.find_all('tr')):
    row_data = row.find_all('td')
    if len(row_data) != 6:
        print ('Ignoring row {} because it does not have six fields'.format(i))
        continue
    d1, d2, d3, d4, d5, d6 = row_data
    try:
        d1 = d1.find('a').text
        d2 = float(d2.text)
        d3 = d3.find_all('span')[0].text
    except:
        print('Ignoring row {} because exception raised'.format(i))
        continue
    data.append([d1, d2, d3])

Ignoring row 0 because it does not have six fields
Ignoring row 26 because exception raised
Ignoring row 36 because exception raised
Ignoring row 89 because exception raised
Ignoring row 94 because it does not have six fields
Ignoring row 95 because it does not have six fields
Ignoring row 96 because it does not have six fields
Ignoring row 97 because it does not have six fields
Ignoring row 98 because it does not have six fields
Ignoring row 99 because it does not have six fields
Ignoring row 100 because it does not have six fields
Ignoring row 101 because it does not have six fields
Ignoring row 102 because it does not have six fields
Ignoring row 103 because it does not have six fields
Ignoring row 104 because it does not have six fields
Ignoring row 105 because it does not have six fields
Ignoring row 106 because it does not have six fields
Ignoring row 107 because it does not have six fields
Ignoring row 108 because it does not have six fields
Ignoring row 109 because it does not 

In [31]:
# Print the head of the scraped data
print(data[:10])

[['Albania', 1.0, '6 June 2016'], ['Angola', 15.5, '24 May 2019'], ['Argentina', 38.0, '5 March 2020'], ['Armenia', 5.75, '29 January 2019'], ['Australia', 0.25, '19 March 2020'], ['Azerbaijan', 7.25, '1 May 2020'], ['Bahamas', 4.0, '22 December 2016'], ['Bahrain', 2.5, '31 July 2019'], ['Bangladesh', 6.0, '6 April 2018'], ['Barbados', 7.0, '1 June 2009']]


In [32]:
# Save the data to a CSV file for later use
f_path = './country_interest_rates.csv'
with open(f_path, 'w') as f:
    f.write('{},{},{}\n'.format(*table_headers))
    for d in data:
        f.write('{},{},{}\n'.format(*d))

In [33]:
# Collect some more data - this time population data. We will use this and the previous data for creating visualisations
url = 'http://www.worldometers.info/world-population/population-by-country/'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

In [34]:
body_content = soup.find('table', {'id': 'example2'})

In [35]:
table_headers = body_content.find_all('th')
table_headers

[<th>#</th>,
 <th>Country (or dependency)</th>,
 <th>Population<br/> (2020)</th>,
 <th>Yearly<br/> Change</th>,
 <th>Net<br/> Change</th>,
 <th>Density<br/> (P/Km²)</th>,
 <th>Land Area<br/> (Km²)</th>,
 <th>Migrants<br/> (net)</th>,
 <th>Fert.<br/> Rate</th>,
 <th>Med.<br/> Age</th>,
 <th>Urban<br/> Pop %</th>,
 <th>World<br/> Share</th>]

In [36]:
# We are only interested in scraping the first three columns after the index number column at the start
table_headers = table_headers[1:4]
table_headers = [t.text.replace('\n', '') for t in table_headers]

In [37]:
table_headers

['Country (or dependency)', 'Population (2020)', 'Yearly Change']

In [38]:
# Get data for a sample row
row_number = 2
row_data = soup.find_all('tr')[row_number].find_all('td')

In [39]:
print(len(row_data))

12


In [40]:
# Print the first elements
print(row_data[:4])

[<td>2</td>, <td style="font-weight: bold; font-size:15px; text-align:left"><a href="/world-population/india-population/">India</a></td>, <td style="font-weight: bold;">1,380,004,385</td>, <td>0.99 %</td>]


In [41]:
# We want indicdes 1, 2 and 3
d1, d2, d3 = row_data[1:4]

In [42]:
print(d1.find('a').text)
print(d2.text)
print(d3.text)

India
1,380,004,385
0.99 %


In [43]:
# Since this works, scrape the data
data = []
for i, row in enumerate(soup.find_all('tr')):
    try:
        d1, d2, d3 = row.find_all('td')[1:4]
        d1 = d1.find('a').text
        d2 = d2.text
        d3 = d3.text
        data.append([d1, d2, d3])
    except:
        print('Error parsing row {}'.format(i))

Error parsing row 0


In [44]:
# Print the head of the scraped data
print(data[:10])

[['China', '1,439,323,776', '0.39 %'], ['India', '1,380,004,385', '0.99 %'], ['United States', '331,002,651', '0.59 %'], ['Indonesia', '273,523,615', '1.07 %'], ['Pakistan', '220,892,340', '2.00 %'], ['Brazil', '212,559,417', '0.72 %'], ['Nigeria', '206,139,589', '2.58 %'], ['Bangladesh', '164,689,383', '1.01 %'], ['Russia', '145,934,462', '0.04 %'], ['Mexico', '128,932,753', '1.06 %']]


In [45]:
# Save this data to another CSV for later use
f_path = './country_populations.csv'
with open(f_path, 'w') as f:
    f.write('{},{},{}\n'.format(*table_headers))
    for d in data:
        f.write('{},{},{}\n'.format(*d))