In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Sample markup parsed in the cells below: a title paragraph plus a "story"
# paragraph containing three <a class="sister"> links with ids link1..link3.
# Note that the markup has no closing </body> tag.
html_doc="""
<!DOCTYPE html>
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
</html>
"""

In [3]:
# Build the parse tree for the sample markup using the 'html.parser' backend.
soup = BeautifulSoup(html_doc, features='html.parser')

In [4]:
# Print the parsed tree with one tag per line, indented by nesting depth.
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>



### Selecting elements by HTML tag

In [5]:
# Attribute access by tag name returns the <title> element of the document.
soup.title

<title>The Dormouse's story</title>

In [6]:
# Same attribute-style navigation, this time for the <body> element.
soup.body

<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>

In [7]:
# Collect every <p> element in the document, then display the list.
p_tags = soup.find_all(name='p')
p_tags

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [8]:
# Show only the text of each paragraph, with the markup stripped.
for paragraph in p_tags:
    paragraph_text = paragraph.get_text()
    print(paragraph_text)

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...


In [9]:
# Collect every <a> (link) element in the document, then display the list.
a_tags = soup.find_all(name='a')
a_tags

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [10]:
# Show the visible text of each link.
for link in a_tags:
    link_text = link.get_text()
    print(link_text)

Elsie
Lacie
Tillie


In [11]:
# Read an attribute value from each link: get('href') gives the link target.

for link in a_tags:
    target_url = link.get('href')
    print(target_url)

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [12]:
# .parent steps up from <title> to the tag that encloses it; .string then
# gives that tag's text.
soup.title.parent.string

"The Dormouse's story"

In [13]:
# .name reports which tag the parent of <title> is.
soup.title.parent.name

'head'

In [14]:
# Count occurrences of the substring 'were' in the document's text.
soup.get_text().count('were')

2

### Selecting elements with CSS selectors

In [15]:
# Display the parsed document again as a reference for the selector examples.
soup


<!DOCTYPE html>

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [16]:
soup.select('#link1') # a leading '#' selects by id

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [17]:
soup.select(".sister") # a leading '.' selects by class name

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [18]:
soup.select('a') # a bare name selects by tag name

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [19]:
# Print the text of every <a> element matched by the selector.
for link in soup.select('a'):
    link_text = link.get_text()
    print(link_text)

Elsie
Lacie
Tillie


In [20]:
# select() returns a list, so a single match can be picked by its index.
first_link = soup.select('a')[0]
print(first_link.get_text())

Elsie


In [21]:
# Combine a tag name and a class: 'p.story' matches <p> tags with class "story".
soup.select('p.story')

[<p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [22]:
# Text of the first matching paragraph; the newlines from the markup are kept.
soup.select('p.story')[0].get_text()

'Once upon a time there were three little sisters; and their names were\nElsie,\nLacie and\nTillie;\nand they lived at the bottom of a well.'

### activity

* all fun facts
* names of all places
* content (name plus fact) of ONLY cities
* names of only cities (not facts)

In [23]:
# Practice markup for the activity: two <div class="city"> blocks and one
# <div class="country"> block, each holding an <h2> name and a <p> fact.
html_doc2 = """<!DOCTYPE html>
<html>
<head> Geography</head>
<body>
<div class="city">
  <h2>London</h2>
  <p>London is the most popular tourist destination in the world.</p>
</div>
<div class="city">
  <h2>Paris</h2>
  <p>Paris was originally a Roman City called Lutetia.</p>
</div>
<div class="country">
  <h2>Spain</h2>
  <p>Spain produces 43,8% of all the world’s Olive Oil.</p>
</div>
</body>
</html>"""

In [24]:
# Build the parse tree for the activity markup using the 'html.parser' backend.
soup2 = BeautifulSoup(html_doc2, features='html.parser')

#### fun facts

In [25]:
# The fun facts live in the <p> elements; collect them all and display the list.
p_tags2 = soup2.find_all(name='p')
p_tags2

[<p>London is the most popular tourist destination in the world.</p>,
 <p>Paris was originally a Roman City called Lutetia.</p>,
 <p>Spain produces 43,8% of all the world’s Olive Oil.</p>]

In [26]:
# Print each fact as plain text.
for fact in p_tags2:
    fact_text = fact.get_text()
    print(fact_text)

London is the most popular tourist destination in the world.
Paris was originally a Roman City called Lutetia.
Spain produces 43,8% of all the world’s Olive Oil.


#### names of all places

In [27]:
# The place names live in the <h2> elements; collect them all and display the list.
h2_tags2 = soup2.find_all(name='h2')
h2_tags2

[<h2>London</h2>, <h2>Paris</h2>, <h2>Spain</h2>]

In [28]:
# Print each place name as plain text.
for heading in h2_tags2:
    place_name = heading.get_text()
    print(place_name)

London
Paris
Spain


#### content (name + fact) of ONLY cities

In [29]:
# Restrict the search to <div> elements whose class attribute is "city".
soup2.find_all("div", attrs={"class": "city"})

[<div class="city">
 <h2>London</h2>
 <p>London is the most popular tourist destination in the world.</p>
 </div>,
 <div class="city">
 <h2>Paris</h2>
 <p>Paris was originally a Roman City called Lutetia.</p>
 </div>]

In [30]:
# Print the full text (name and fact) of each city block.
for city_div in soup2.find_all("div", attrs={"class": "city"}):
    city_content = city_div.get_text()
    print(city_content)


London
London is the most popular tourist destination in the world.


Paris
Paris was originally a Roman City called Lutetia.



#### names of only cities

In [31]:
# Every <h2> in the document. This list also contains the country (Spain),
# so an unscoped h2 search alone does not give the city names.
h2_tags2=soup2.find_all('h2')
h2_tags2

[<h2>London</h2>, <h2>Paris</h2>, <h2>Spain</h2>]

In [32]:
# Print only the <h2> name found inside each city block.
for city_div in soup2.find_all("div", attrs={"class": "city"}):
    city_name = city_div.h2.get_text()
    print(city_name)

London
Paris
