# Chapter 16: Extracting text from web pages

## 16.1 The Structure of HTML Documents

In [51]:
# Defining a simple HTML string.
# Fix: the element opened with <html> must be closed with </html>
# (the original closed it with a non-existent </hello> tag).
html_contents = "<html>Hello</html>"

In [52]:
# Rendering an HTML string
from IPython.display import display, HTML
def render(html_contents):
    """Display the given HTML string as rendered HTML in the cell output."""
    rendered = HTML(html_contents)
    display(rendered)


render(html_contents)

In [53]:
# Defining the page title as an HTML <title> element
title_text = "Data Science is Fun"
title = f"<title>{title_text}</title>"

In [54]:
# Placing the title element in front of the visible text inside <html>
html_contents = "<html>" + title + "Hello</html>"
render(html_contents)

In [55]:
# Adding a head and body to the HTML string.
# The <title> element is wrapped in <head>; the visible text goes in <body>.
head = f"<head>{title}</head>"
body = "<body>Hello</body>"
# Fix: interpolate `head` (previously built but never used) instead of the
# bare `title`, so the document actually contains the <head> section.
html_contents = f"<html>{head} {body}</html>"

In [56]:
# Adding a level-one header (<h1>) in front of the body text
header = "<h1>Data Science is Fun</h1>"
body = "<body>" + header + "Hello</body>"
html_contents = "<html>" + title + " " + body + "</html>"
render(html_contents)

In [57]:
# Building two <p> elements; each holds its "Paragraph i " label repeated 40 times
paragraphs = ''.join(
    "<p>" + f"Paragraph {i} " * 40 + "</p>"
    for i in range(2)
)

# Replacing the body text with the paragraphs and rendering the result
body = "<body>" + header + paragraphs + "</body>"
html_contents = "<html>" + title + " " + body + "</html>"
render(html_contents)

In [58]:
# Adding id attributes to the paragraphs
paragraph_tags = []
for i in range(2):
    repeated_text = f"Paragraph {i} " * 40
    # Each paragraph gets an id of the form 'paragraph <i>'
    paragraph_tags.append(f"<p id='paragraph {i}'>{repeated_text}</p>")
paragraphs = ''.join(paragraph_tags)

# Rebuilding the body and the full document from the id-tagged paragraphs
body = "<body>{}{}</body>".format(header, paragraphs)
html_contents = "<html>{} {}</html>".format(title, body)
render(html_contents)

In [59]:
# Adding a hyperlink to the HTML string
# NOTE(review): the linked page's own title is spelled "Bookcamp"; confirm
# whether "Bootcamp" is intended. Left unchanged because saved outputs show it.
link_text = "Data Science Bootcamp"
url = "https://www.manning.com/books/data-science-bookcamp"
# An <a> element: `href` holds the destination, the inner text is what is displayed
hyperlink = f"<a href='{url}'>{link_text}</a>"
new_paragraph = f"<p id='paragraph 2'>Here is a link to {hyperlink}</p>"
# Guard the append so that re-running this cell does not add the same
# paragraph (and its id) to `paragraphs` a second time.
if new_paragraph not in paragraphs:
    paragraphs += new_paragraph
body = f"<body>{header}{paragraphs}</body>"
html_contents = f"<html>{title} {body}</html>"
render(html_contents)

In [60]:
# Defining the names of the data science libraries to list on the page
libraries = [
    'NumPy',
    'SciPy',
    'Pandas',
    'Scikit-Learn',
]

In [61]:
# Wrapping each library name in an <li> (list item) tag
items = ''.join(f"<li>{library}</li>" for library in libraries)

In [62]:
# Adding an unordered (<ul>) list, introduced by its own <h2> header, to the HTML string
unstructured_list = "<ul>" + items + "</ul>"
header2 = '<h2>Common Data Science Libraries</h2>'
body = "<body>" + header + paragraphs + header2 + unstructured_list + "</body>"
html_contents = "<html>" + title + " " + body + "</html>"
render(html_contents)

In [63]:
# Grouping the page into <div> elements: two share class 'text', the third is empty
div1 = "<div id='paragraphs' class='text'>" + paragraphs + "</div>"
div2 = "<div id='list' class='text'>" + header2 + unstructured_list + "</div>"
div3 = "<div id='empty' class='empty'></div>"
body = "<body>" + "".join([header, div1, div2, div3]) + "</body>"
html_contents = "<html>" + title + " " + body + "</html>"
render(html_contents)

In [64]:
# Printing the altered HTML string as raw text (the markup itself, not the rendered page)
print(html_contents)

<html><title>Data Science is Fun</title> <body><h1>Data Science is Fun</h1><div id='paragraphs' class='text'><p id='paragraph 0'>Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 </p><p id='paragraph 1'>Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragra

In [65]:
# Extracting the HTML title using only string splitting
fragments = html_contents.split('>')
for index, fragment in enumerate(fragments):
    # The piece that ends with '<title' is the opening tag (its '>' was consumed by the split)
    if not fragment.endswith('<title'):
        continue
    # The following piece starts with the title text and runs up to the next '<'
    title = fragments[index + 1].split('<')[0]
    print(title)
    break

Data Science is Fun


## 16.2 Parsing HTML using Beautiful Soup

In [66]:
# Importing the BeautifulSoup class under the alias bs
from bs4 import BeautifulSoup as bs

In [67]:
# Initializing BeautifulSoup using an HTML string.
# Fix: name the parser explicitly. Without it Beautiful Soup chooses among
# whatever parsers are installed (and warns), so results can vary by machine.
# NOTE(review): 'lxml' is assumed because the saved prettify() output shows an
# inserted <head>; confirm lxml is installed in the target environment.
soup = bs(html_contents, 'lxml')

In [68]:
# Printing readable HTML with Beautiful Soup: prettify() returns the parsed
# document as an indented string, one tag per line
print(soup.prettify())

<html>
 <head>
  <title>
   Data Science is Fun
  </title>
 </head>
 <body>
  <h1>
   Data Science is Fun
  </h1>
  <div class="text" id="paragraphs">
   <p id="paragraph 0">
    Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0
   </p>
   <p id="paragraph 1">
    Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Parag

In [69]:
# Extracting the title with Beautiful Soup by passing the tag name to find()
# NOTE: this rebinds `title`, which earlier cells bound to plain strings
title = soup.find('title')
print(title)

<title>Data Science is Fun</title>


In [70]:
# Outputting the data type of the object returned by soup.find('title')
print(type(title))

<class 'bs4.element.Tag'>


In [71]:
# Outputting the title's text attribute (the tag's contents without the surrounding markup)
print(title.text)

Data Science is Fun


In [72]:
# Accessing the title's text attribute from soup: attribute-style access
# (soup.title) must give the same text as the tag found with soup.find('title')
assert soup.title.text == title.text

In [73]:
# Accessing the body's text attribute from soup
# NOTE: `body` is rebound here from the earlier HTML string to the soup's body tag
body = soup.body
print(body.text)

Data Science is FunParagraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Here is a link to Dat

In [74]:
# Accessing the text of the first paragraph; the assert checks that reaching
# it through body.p and through soup.p gives the same text
assert body.p.text == soup.p.text
print(soup.p.text)

Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 


In [75]:
# Accessing all paragraphs in the body and printing each under a numbered heading
# NOTE: `paragraphs` is rebound here from the earlier HTML string to the find_all() result
paragraphs = body.find_all('p')
for i, paragraph in enumerate(paragraphs):
    print(f"\nPARAGRAPH {i}:", paragraph.text, sep="\n")


PARAGRAPH 0:
Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 

PARAGRAPH 1:
Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 

PARAGRAPH

In [76]:
# Accessing all bullet points (<li> tags) in the body
bullet_texts = [item.text for item in body.find_all('li')]
print(bullet_texts)

['NumPy', 'SciPy', 'Pandas', 'Scikit-Learn']


In [77]:
# Accessing a paragraph by its id attribute, passed to find() as a keyword argument
paragraph_2 = soup.find(id='paragraph 2')
print(paragraph_2.text)

Here is a link to Data Science Bootcamp


In [78]:
# Accessing attributes of a tag with get(): first the paragraph's own id,
# then the href of the <a> tag nested inside the paragraph
assert paragraph_2.get('id') == 'paragraph 2'
print(paragraph_2.a.get('href'))

https://www.manning.com/books/data-science-bookcamp


In [79]:
# Accessing divisions by their shared class attribute
# (`class_` carries a trailing underscore because `class` is a Python keyword)
text_divisions = soup.find_all('div', class_='text')
for division in text_divisions:
    division_id = division.get('id')
    print(f"\nDivision with id '{division_id}'")
    print(division.text)


Division with id 'paragraphs'
Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 0 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Paragraph 1 Here is a

In [80]:
# Paragraph deletion with Beautiful Soup: the first paragraph is looked up
# through `body`, the second through `soup`, and each is removed with decompose()
first_paragraph = body.find(id='paragraph 0')
first_paragraph.decompose()
second_paragraph = soup.find(id='paragraph 1')
second_paragraph.decompose()
# Printing what is left in the 'paragraphs' division
remaining_division = body.find(id='paragraphs')
print(remaining_division.text)

Here is a link to Data Science Bootcamp


In [81]:
# Initializing an empty paragraph Tag with soup.new_tag()
new_paragraph = soup.new_tag('p')
print(new_paragraph)

<p></p>


In [82]:
# Updating the text of the empty paragraph by assigning to its `string` attribute
new_paragraph.string = "This paragraph is new"
print(new_paragraph)

<p>This paragraph is new</p>


In [83]:
# Paragraph insertion with Beautiful Soup: the new paragraph is appended to
# the division whose id is 'empty', then the whole document is rendered
empty_division = soup.find(id='empty')
empty_division.append(new_paragraph)
render(soup.prettify())

## 16.3 Downloading and Parsing Online Data

In [84]:
# Importing the urlopen function
from urllib.request import urlopen

In [85]:
# Downloading an HTML document
url = "https://www.manning.com/books/data-science-bookcamp"
# Fix: open the response in a `with` block so the connection is closed once
# the body has been read (the original never closed it).
with urlopen(url) as response:
    html_contents = response.read()
# read() returns bytes; only the first 1000 are printed to keep the output short
print(html_contents[:1000])

b'\n<!DOCTYPE html>\n<!--[if lt IE 7 ]> <html lang="en" class="no-js ie6 ie"> <![endif]-->\n<!--[if IE 7 ]>    <html lang="en" class="no-js ie7 ie"> <![endif]-->\n<!--[if IE 8 ]>    <html lang="en" class="no-js ie8 ie"> <![endif]-->\n<!--[if IE 9 ]>    <html lang="en" class="no-js ie9 ie"> <![endif]-->\n<!--[if (gt IE 9)|!(IE)]><!--> <html lang="en" class="no-js"><!--<![endif]-->\n\n<head>\n    <meta name="theme-color" content="#333333">\n    <title>Data Science Bookcamp</title>\n\n\n\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n<meta http-equiv="X-UA-Compatible" content="IE=edge">\n<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=0">\n<meta name="application-name" content="Data Science Bookcamp"/>\n<meta name="apple-mobile-web-app-title" content="Data Science Bookcamp"/>\n\n<meta property="og:title" content="Data Science Bookcamp"/>\n<meta name="twitter:title" content="Data Science Bookcamp"/>\n\n<meta name="tw

In [86]:
# Accessing the title with Beautiful Soup
# Fix: name the parser explicitly. Without it Beautiful Soup chooses among
# whatever parsers are installed (and warns), so results can vary by machine.
# NOTE(review): 'lxml' is an assumption about the environment; confirm it is installed.
soup = bs(html_contents, 'lxml')
print(soup.title.text)

Data Science Bookcamp


In [87]:
# Accessing a description of this book: print the text of every division
# whose <h2> header reads 'about the book' (compared case-insensitively)
for division in soup.find_all('div'):
    header = division.h2
    # Divisions without an <h2> yield None and are skipped
    if header is not None and header.text.lower() == 'about the book':
        print(division.text)


about the book

Data Science Bookcamp doesn’t stop with surface-level theory and toy examples. As you work through each project, you’ll learn how to troubleshoot common problems like missing data, messy data, and algorithms that don’t quite fit the model you’re building. You’ll appreciate the detailed setup instructions and the fully explained solutions that highlight common failure points. In the end, you’ll be confident in your skills because you can see the results.
    


# Key Words / Notes:
* Use `find()` to find the first instance of a tag
* Use `find_all()` to return an iterable result set of all instances of a tag
* Can use the `tag.decompose()` function to remove tags from a soup object
* Can use the `new_tag()` function to create a new tag, which can then be added to a soup object (e.g. with `append()`)