In [None]:
# If installs are required
!pip install html5lib

In [None]:
!pip install bs4

In [None]:
from bs4 import BeautifulSoup # this module helps in web scraping.
import requests  # this module helps us to download a web page

Beautiful Soup is a Python library for pulling data out of HTML and XML files; in this lab we will focus on HTML files. It works by representing the HTML as a set of objects with methods used to parse the HTML. We can navigate the HTML as a tree and/or filter out what we are looking for.

In [None]:
%%html
<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body>
<!-- Sample page: each <h3> holds a player name and the <p> after it holds that player's salary. -->
<h3><b id='boldest'>Lebron James</b></h3>
<p> Salary: $ 92,000,000 </p>
<h3> Stephen Curry</h3>
<p> Salary: $85,000, 000 </p>
<h3> Kevin Durant </h3>
<p> Salary: $73,200, 000</p>
</body>
</html>

In [None]:
# The sample page as one Python string, split by element for readability.
# Adjacent string literals are concatenated, so the value has no added whitespace.
html = (
    "<!DOCTYPE html>"
    "<html><head><title>Page Title</title></head>"
    "<body>"
    "<h3><b id='boldest'>Lebron James</b></h3>"
    "<p> Salary: $ 92,000,000 </p>"
    "<h3> Stephen Curry</h3>"
    "<p> Salary: $85,000, 000 </p>"
    "<h3> Kevin Durant </h3>"
    "<p> Salary: $73,200, 000</p>"
    "</body></html>"
)

To parse a document, pass it into the BeautifulSoup constructor. The BeautifulSoup object represents the document as a nested data structure:

In [None]:
# Parse the sample page with the html5lib parser.
soup = BeautifulSoup(html, "html5lib")

First, the document is converted to Unicode (a character standard that extends ASCII), and HTML entities are converted to Unicode characters. Beautiful Soup transforms a complex HTML document into a complex tree of Python objects. The BeautifulSoup object can create other types of objects. In this lab, we will cover BeautifulSoup and Tag objects, which for the purposes of this lab behave identically. Finally, we will look at NavigableString objects.

We can use the method prettify() to display the HTML in the nested structure:

In [None]:
# Print the parsed document as text produced by prettify().
print(soup.prettify())

Let's say we want the title of the page and the name of the top paid player. We can use the Tag. The Tag object corresponds to an HTML tag in the original document, for example, the tag title.

In [None]:
# Attribute access by tag name gives the <title> Tag of the document.
tag_object = soup.title
print("tag object:", tag_object)

In [None]:
# Show the Python class of the object returned by soup.title.
print("tag object type:", type(tag_object))

In [None]:
# Rebind tag_object to the <h3> tag reached through soup.h3.
tag_object = soup.h3
tag_object

As stated above, the Tag object is a tree of objects. We can access the child of the tag or navigate down the branch as follows:

In [None]:
# Step down the tree: the <b> tag nested inside the <h3>.
tag_child = tag_object.b
tag_child

In [None]:
# Step back up the tree from the child to its parent.
parent_tag = tag_child.parent
parent_tag

In [None]:
# Display tag_object again for comparison with parent_tag above.
tag_object

In [None]:
# Display the parent of tag_object.
tag_object.parent

In [None]:
# The node that immediately follows tag_object at the same level of the tree.
sibling_1 = tag_object.next_sibling
sibling_1

In [None]:
# Move one more step along the same level, starting from sibling_1.
sibling_2 = sibling_1.next_sibling
sibling_2

Use the object sibling_2 and the method next_sibling to find the salary of Stephen Curry:

In [None]:
# code here

A tag may have any number of attributes. For example, the tag `<b id='boldest'>` has an attribute id whose value is boldest. You can access a tag's attributes by treating the tag like a dictionary:

In [None]:
# Dictionary-style lookup of the id attribute on the tag.
tag_child['id']

In [None]:
# The attributes are also available directly through the tag's attrs member:
tag_child.attrs

In [None]:
# The attribute value can also be read with the tag's get() method.
tag_child.get('id')

A string corresponds to a bit of text or content within a tag. Beautiful Soup uses the NavigableString class to contain this text. In our HTML we can obtain the name of the first player by extracting the string of the Tag object tag_child as follows:

In [None]:
# The text held directly inside the <b> tag.
tag_string = tag_child.string
tag_string

In [None]:
# Show the class of the object returned by .string.
type(tag_string)

In [None]:
# Convert the tag's string to a plain Python str.
unicode_string = str(tag_string)
unicode_string

Filters allow you to find complex patterns; the simplest filter is a string. In this section we will pass a string to different filter methods, and Beautiful Soup will perform a match against that exact string. Consider the following HTML of rocket launches:

In [None]:
%%html
<!-- Rocket launches: one header row followed by three data rows. -->
<table>
  <tr>
    <td id='flight' >Flight No</td>
    <td>Launch site</td>
    <td>Payload mass</td>
   </tr>
  <tr>
    <td>1</td>
    <td><a href='https://en.wikipedia.org/wiki/Florida'>Florida</a></td>
    <td>300 kg</td>
  </tr>
  <tr>
    <td>2</td>
    <td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td>
    <td>94 kg</td>
  </tr>
  <tr>
    <td>3</td>
    <td><a href='https://en.wikipedia.org/wiki/Florida'>Florida</a> </td>
    <td>80 kg</td>
  </tr>
</table>

In [None]:
# We can store it as a string in the variable table.
# Every anchor is closed with </a>; an unclosed <a> makes the parser
# generate extra nested anchor tags that are not in the intended table.
# Adjacent string literals are concatenated with no added whitespace.
table = (
    "<table>"
    "<tr><td id='flight'>Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>"
    "<tr> <td>1</td><td><a href='https://en.wikipedia.org/wiki/Florida'>Florida</a></td><td>300 kg</td></tr>"
    "<tr><td>2</td><td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td><td>94 kg</td></tr>"
    "<tr><td>3</td><td><a href='https://en.wikipedia.org/wiki/Florida'>Florida</a> </td><td>80 kg</td></tr>"
    "</table>"
)

In [None]:
# Parse the launch table with the html5lib parser.
table_bs = BeautifulSoup(table, "html5lib")

The find_all() method looks through a tag's descendants and retrieves all descendants that match your filters.

The Method signature for find_all(name, attrs, recursive, string, limit, **kwargs)

When we set the name parameter to a tag name, the method will extract all the tags with that name, together with their children.

In [None]:
# Collect every <tr> tag found in the parsed table.
table_rows = table_bs.find_all("tr")
table_rows

The result is a Python iterable, just like a list; each element is a Tag object:

In [None]:
# Index the result like a list to take the first row.
first_row = table_rows[0]
first_row

In [None]:
# Print the class of a single element of table_rows.
print(type(first_row))

In [None]:
# Attribute access by tag name: the <td> reached through first_row.td.
first_row.td

If we iterate through the list, each element corresponds to a row in the table:

In [None]:
# Walk the rows in order, printing each one with its position.
for i, row in enumerate(table_rows):
    print(f"row {i} is {row}")

Since each row is a Tag object, we can apply the method find_all to it and extract the table cells into the variable cells using the tag td; that is, all the descendants of the row with the name td. The result is a list in which each element corresponds to a cell and is a Tag object, so we can iterate through this list as well. We can extract the content using the string attribute.

In [None]:
# For every row, list its <td> cells together with their column position.
for i, row in enumerate(table_rows):
    print("row", i)
    cells = row.find_all('td')  # all <td> tags found under this row
    for j, cell in enumerate(cells):
        print('column', j, "cell", cell)

If we use a list we can match against any item in that list.

In [None]:
# A list of names matches a tag whose name is any item in the list.
list_input = table_bs.find_all(name=["tr", "td"])
list_input

If the argument is not recognized, it will be turned into a filter on the tag's attributes. For example, with the id argument, Beautiful Soup will filter against each tag's id attribute. The first td element has an id value of flight, so we can filter based on that id value.

In [None]:
# Filter on the id attribute: keep tags whose id is "flight".
table_bs.find_all(id="flight")

We can find all the elements that have links to the Florida Wikipedia page:

In [None]:
# Filter on the href attribute: keep tags that link to the Florida page.
list_input = table_bs.find_all(href="https://en.wikipedia.org/wiki/Florida")
list_input

If we set the href argument to True, the code finds all tags that have an href attribute, regardless of its value:

In [None]:
table_bs.find_all(href=True)
# See the Beautiful Soup documentation for more about attribute filters.

Using the logic above, find all the elements without href value.

In [None]:
# code here

Using the soup object soup, find the element with the id attribute content set to "boldest".

In [None]:
# code here

With the string argument you can search for strings instead of tags. Here we find all the strings that are exactly Florida:

In [None]:
# Search by text content rather than by tag name.
table_bs.find_all(string="Florida")

The find_all() method scans the entire document looking for results. If you are looking for only one element, you can instead use the find() method, which returns the first matching element in the document. Consider the following two tables:

In [None]:
%%html
<!-- Two tables told apart by class: 'rocket' and 'pizza'. -->
<h3>Rocket Launch </h3>

<p>
<table class='rocket'>
  <tr>
    <td>Flight No</td>
    <td>Launch site</td>
    <td>Payload mass</td>
  </tr>
  <tr>
    <td>1</td>
    <td>Florida</td>
    <td>300 kg</td>
  </tr>
  <tr>
    <td>2</td>
    <td>Texas</td>
    <td>94 kg</td>
  </tr>
  <tr>
    <td>3</td>
    <td>Florida </td>
    <td>80 kg</td>
  </tr>
</table>
</p>
<p>

<h3>Pizza Party  </h3>


<table class='pizza'>
  <tr>
    <td>Pizza Place</td>
    <td>Orders</td>
    <td>Slices </td>
   </tr>
  <tr>
    <td>Domino's Pizza</td>
    <td>10</td>
    <td>100</td>
  </tr>
  <tr>
    <td>Little Caesars</td>
    <td>12</td>
    <td >144 </td>
  </tr>
  <tr>
    <td>Papa John's </td>
    <td>15 </td>
    <td>165</td>
  </tr>
</table>
</p>

In [None]:
# Both tables as one string, split by element for readability.
# The pizza table and its enclosing <p> are closed explicitly, so the markup
# does not depend on the parser closing them at end of input.
# Adjacent string literals are concatenated with no added whitespace.
two_tables = (
    "<h3>Rocket Launch </h3>"
    "<p><table class='rocket'>"
    "<tr><td>Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>"
    "<tr><td>1</td><td>Florida</td><td>300 kg</td></tr>"
    "<tr><td>2</td><td>Texas</td><td>94 kg</td></tr>"
    "<tr><td>3</td><td>Florida </td><td>80 kg</td></tr>"
    "</table></p>"
    "<p><h3>Pizza Party  </h3>"
    "<table class='pizza'>"
    "<tr><td>Pizza Place</td><td>Orders</td> <td>Slices </td></tr>"
    "<tr><td>Domino's Pizza</td><td>10</td><td>100</td></tr>"
    "<tr><td>Little Caesars</td><td>12</td><td >144 </td></tr>"
    "<tr><td>Papa John's </td><td>15 </td><td>165</td></tr>"
    "</table></p>"
)

In [None]:
# Parse the two-table markup with Python's built-in html.parser.
two_tables_bs = BeautifulSoup(two_tables, "html.parser")

In [None]:
# find() with only a tag name: search for a <table> in the document.
two_tables_bs.find("table")

In [None]:
# Narrow the search by CSS class; the keyword is class_ because class is reserved in Python.
two_tables_bs.find("table",class_='pizza')

Downloading And Scraping The Contents Of A Web Page

We use get to download the contents of the webpage in text format and store in a variable called data:

In [None]:
# Page to download; use HTTPS so the request is not sent in clear text.
url = "https://www.ibm.com"

In [None]:
# Download the page. The timeout stops the cell from hanging on a stalled
# connection, and raise_for_status() turns an HTTP error response into an
# exception instead of letting an error page be parsed as if it were the content.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

In [None]:
# Build a soup object from the downloaded text held in 'data'.
soup = BeautifulSoup(data, "html5lib")

In [None]:
# Scrape all links
# In HTML an anchor/link is represented by the tag <a>.
# href=True keeps only anchors that carry an href attribute.
for link in soup.find_all("a", href=True):
    print(link.get("href"))

In [None]:
# Scrape all image tags
# In HTML an image is represented by the tag <img>.
# For each one, print the whole tag and then its src attribute.
for link in soup.find_all("img"):
    print(link)
    print(link.get("src"))

In [None]:
# Scrape data from HTML tables.
# The URL below points to a page with an HTML table of colors and color codes.
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/HTMLColorCodes.html"

Before proceeding to scrape a web site, you need to examine the contents and the way data is organized on the website. Open the above url in your browser and check how many rows and columns there are in the color table.

In [None]:
# Get the contents of the webpage in text format and store it in a variable called data.
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() raises on an HTTP error response rather than handing an
# error page to the parser.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

In [None]:
# Parse the downloaded page with the html5lib parser.
soup = BeautifulSoup(data, "html5lib")

In [None]:
# Find an HTML table in the web page; in HTML a table is represented by the tag <table>.
table = soup.find("table")

In [None]:
# Get all rows from the table and print "<color name>---><color code>" for each.
for row in table.find_all('tr'):  # in html a table row is represented by the tag <tr>
    # Get all cells in this row.
    cols = row.find_all('td')  # in html a table cell is represented by the tag <td>
    if len(cols) < 4:
        # Indexing cols[2]/cols[3] would raise IndexError on a row that has
        # fewer than four <td> cells (for example a header made of <th>).
        continue
    # get_text() is used for both cells: unlike .string it never yields None
    # when a cell contains nested markup, and strip=True trims stray whitespace.
    color_name = cols[2].get_text(strip=True)  # value in column 3
    color_code = cols[3].get_text(strip=True)  # value in column 4
    print("{}--->{}".format(color_name, color_code))