In [32]:
#Crawl a web page and store its raw content
import requests
URL = 'https://en.wikipedia.org/wiki/Albert_Einstein'
#requests has no default timeout; without one this cell can hang forever
#if the server never answers
crawled_page = requests.get(URL, timeout=10)

#.content is the response body as raw bytes
html_doc = crawled_page.content

In [33]:
#Preview only the first 500 bytes; the complete page is far too large to display
html_doc[:500]



In [34]:
#Replace the crawled page with a small, fixed HTML snippet so that the
#following examples are short and easy to follow (this overwrites html_doc)
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tilly" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [35]:
#Show the raw HTML string; pprint breaks the long string into one quoted
#piece per source line, which is easier to read than a single repr
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(html_doc)

("<html><head><title>The Dormouse's story</title></head>\n"
 '<body>\n'
 '<p class="title"><b>The Dormouse\'s story</b></p>\n'
 '\n'
 '<p class="story">Once upon a time there were three little sisters; and their '
 'names were\n'
 '<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,\n'
 '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and\n'
 '<a href="http://example.com/tilly" class="sister" id="link3">Tillie</a>;\n'
 'and they lived at the bottom of a well.</p>\n'
 '\n'
 '<p class="story">...</p>\n')


In [36]:
#Create a BeautifulSoup object, which represents the document as a nested data structure
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

#prettify() renders the parsed tree with one tag or string per line,
#indented according to its nesting depth
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tilly" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


# Kinds of objects

In [37]:
#Select a tag by using its name as an attribute of the soup, in this case
#the "title" tag; keep it in title_tag for the following cells
title_tag=soup.title
title_tag

<title>The Dormouse's story</title>

In [38]:
#The find() method returns the same tag as the attribute access soup.title
soup.find('title')

<title>The Dormouse's story</title>

In [39]:
#If a tag occurs more than once, find_all() returns every match
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tilly" id="link3">Tillie</a>]

In [40]:
#.name is the name of the tag (here the name of the "title" tag)
title_tag.name

'title'

In [41]:
#.string is the text enclosed by the tag
title_tag.string

"The Dormouse's story"

In [42]:
#.attrs holds the attributes belonging to a tag as a dict
#(soup.a is the first <a> tag; note that the 'class' value is a list)
a_tag=soup.a
a_tag.attrs

{'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}

In [43]:
#Index the tag like a dict to read the value of one specific attribute
a_tag['id']

'link1'

# Navigating the tree - downwards

In [44]:
#The <body> tag together with everything nested inside it
soup.body

<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tilly" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>

In [45]:
#Chain tag names to drill down: get the first <b> tag beneath the <body> tag
first_b_tag=soup.body.b
first_b_tag

<b>The Dormouse's story</b>

In [46]:
#A tag's direct children can be iterated with .children
#(.children is an iterator; the list form is .contents).
#The whitespace between tags counts as text children, hence the blank lines in the output
for child in soup.body.children:
    print(child)



<p class="title"><b>The Dormouse's story</b></p>


<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tilly" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>


<p class="story">...</p>




In [47]:
#.descendants walks the whole subtree: the children, their children and so on,
#including the text inside each tag
for child in soup.body.descendants:
    print(child)



<p class="title"><b>The Dormouse's story</b></p>
<b>The Dormouse's story</b>
The Dormouse's story


<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tilly" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
Once upon a time there were three little sisters; and their names were

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Elsie
,

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
Lacie
 and

<a class="sister" href="http://example.com/tilly" id="link3">Tillie</a>
Tillie
;
and they lived at the bottom of a well.


<p class="story">...</p>
...




# Navigating the tree - upwards

In [48]:
#.parent is the enclosing element: the <head> tag is the parent of the <title> tag
title_tag = soup.title
title_tag.parent

<head><title>The Dormouse's story</title></head>

In [49]:
#You can iterate over all of an element's parents with .parents.
#The loop starts at the closest enclosing tag and ends at the document itself
link = soup.a

for parent in link.parents:
    print(parent.name)

p
body
html
[document]


# Searching the tree

Search within tags

In [50]:
#Search all <a> tags in document
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tilly" id="link3">Tillie</a>]

In [51]:
#Search with a regular expression: the compiled pattern is matched against the tag names
#Find all tags whose names start with the letter "b"; in this case, the <body> tag and the <b> tag:
import re
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)

body
b


In [52]:
#Find all tags whose names contain the letter "t"
#(the pattern is not anchored, so it matches anywhere in the name; here <html> and <title>)
for tag in soup.find_all(re.compile("t")):
    print(tag.name)

html
title


Search within IDs

In [53]:
#If you pass in a value for an argument called id, Beautiful Soup will filter against each tag's "id" attribute
#(the result is a list even when only one tag matches)
soup.find_all(id='link2')

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [54]:
#Find all tags with an attribute "href" that contains the string "ie"
#(.../tilly does not contain "ie", so the third link is not returned)
soup.find_all(href=re.compile("ie"))

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

Search within content / string

In [55]:
#Find all strings equal to "Elsie"; with string= the result contains the text itself, not the tags
soup.find_all(string="Elsie")

['Elsie']

In [56]:
#Find all strings which contain "Dormouse" (one in the <title> tag, one in the <b> tag)
soup.find_all(string=re.compile("Dormouse"))

["The Dormouse's story", "The Dormouse's story"]

In [57]:
#Show the enclosing tags of the found strings
dormouse_list=soup.find_all(string=re.compile("Dormouse"))

#.parent of a found string is the tag that directly contains it
for content in dormouse_list:
    print(content.parent)

<title>The Dormouse's story</title>
<b>The Dormouse's story</b>


# Often used

In [58]:
#Extract all the text from a page, without any of the markup
print(soup.get_text())

The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



In [59]:
#Extract all the text from the body of a page only
#(the title text from <head> is therefore not part of the output)
print(soup.body.get_text())


The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



In [60]:
#Extract all the URLs found within a page's <a> tags
for link in soup.find_all('a'):
    #get('href') reads the value of the tag's href attribute
    print(link.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tilly


In [61]:
#Crawl all referenced pages and collect the responses
crawled_subpages=[]

for link in soup.find_all('a'):
    url=link.get('href')
    #Request the link's own target (lower-case url); the upper-case URL is the
    #start page from the first cell and would be fetched once per link instead.
    #The timeout keeps one unresponsive server from hanging the whole loop
    crawled_subpages.append(requests.get(url, timeout=10))

In [62]:
#Preview the first 500 bytes of the first crawled subpage instead of dumping the whole body
crawled_subpages[0].content[:500]

