In [1]:
import re
import urllib
from urllib.parse import urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup

## String methods

In [2]:
# Profile page that the string-method examples download and parse.
url = "http://olympus.realpython.org/profiles/aphrodite"

In [3]:
# Send the request. The result is an HTTPResponse object, not the page text;
# the body still has to be read from it.
page = urlopen(url) # HTTPResponse object
page

<http.client.HTTPResponse at 0x2978d4f8e80>

In [4]:
# Pull the raw body off the response, then convert the bytes to text as UTF-8.
html_bytes = page.read()
html = str(html_bytes, encoding="utf-8")

In [5]:
# print() renders the line breaks in the markup instead of showing "\n" escapes.
print(html)

<html>
<head>
<title>Profile: Aphrodite</title>
</head>
<body bgcolor="yellow">
<center>
<br><br>
<img src="/static/aphrodite.gif" />
<h2>Name: Aphrodite</h2>
<br><br>
Favorite animal: Dove
<br><br>
Favorite color: Red
<br><br>
Hometown: Mount Olympus
</center>
</body>
</html>



In [6]:
# str.find returns the position of the first character of the match.
title_index = html.find("<title>") # index of the opening <title> tag
title_index

14

In [7]:
# Skip past the opening tag itself: the title text begins right after "<title>".
start_index = title_index + len("<title>") # index where the title text starts
start_index

21

In [8]:
# The closing tag starts one position past the end of the title text,
# which is exactly the exclusive end needed for slicing.
end_index = html.find("</title>") # index of the closing </title> tag
end_index

39

In [9]:
# Slice out everything between the opening and closing title tags.
title = html[start_index:end_index]
title

'Profile: Aphrodite'

In [10]:
url = "http://olympus.realpython.org/profiles/poseidon"
# Use the response as a context manager so the connection is closed as soon
# as the body has been read, instead of staying open for the kernel's lifetime.
with urlopen(url) as page:
    html = page.read().decode("utf-8")
print(html)

<html>
<head>
<title >Profile: Poseidon</title>
</head>
<body bgcolor="yellow">
<center>
<br><br>
<img src="/static/poseidon.jpg" />
<h2>Name: Poseidon</h2>
<br><br>
Favorite animal: Dolphin
<br><br>
Favorite color: Blue
<br><br>
Hometown: Sea
</center>
</body>
</html>



In [11]:
# The plain string search breaks on this page: its opening tag is written
# "<title >" (with a space), so find("<title>") returns -1 and start_index
# becomes -1 + len("<title>") == 6. The slice therefore starts near the top of
# the document rather than at the title text.
start_index = html.find("<title>") + len("<title>")
end_index = html.find("</title>")
title = html[start_index:end_index]
title

'\n<head>\n<title >Profile: Poseidon'

## Regular expressions

### Finding and matching patterns

In [12]:
# "b*" means zero or more "b", so "ac" matches even though it contains no "b".
re.findall("ab*c","ac")

['ac']

In [13]:
# Only the part that fits the pattern is returned; the trailing "d" is left out.
re.findall("ab*c","abcd")

['abc']

In [14]:
# The first "ac" matches; the leftover "c" alone does not fit the pattern.
re.findall("ab*c","acc")

['ac']

In [15]:
# findall returns every non-overlapping match: "abc" followed by "ac".
re.findall("ab*c","abcac")

['abc', 'ac']

In [16]:
# No match: the "d" sitting between "b" and "c" breaks the pattern.
re.findall("ab*c","abdc")

[]

In [17]:
# re.IGNORECASE lets the lowercase pattern match the uppercase text.
re.findall("ab*c","ABC",re.IGNORECASE)

['ABC']

In [18]:
# Greedy quantifier: ".*" grabs as much as it can, so a single match runs from
# the first "<" all the way to the last ">" and is replaced as one unit.
string = re.sub(
    "<.*>",
    "ELEPHANTS",
    "Everything is <replaced> if it's in <tags>.",
)

In [19]:
# Display the text after the greedy substitution.
string

'Everything is ELEPHANTS.'

In [20]:
# Non-greedy quantifier: ".*?" stops at the nearest ">", so each tag is matched
# and replaced on its own.
string = re.sub(
    "<.*?>",
    "ELEPHANTS",
    "Everything is <replaced> if it's in <tags>.",
)
string

"Everything is ELEPHANTS if it's in ELEPHANTS."

In [21]:
url = "http://olympus.realpython.org/profiles/dionysus"
# Use the response as a context manager so the connection is closed as soon
# as the body has been read, instead of staying open for the kernel's lifetime.
with urlopen(url) as page:
    html = page.read().decode("utf-8")
print(html)

<html>
<head>
<TITLE >Profile: Dionysus</title  / >
</head>
<body bgcolor="yellow">
<center>
<br><br>
<img src="/static/dionysus.jpg" />
<h2>Name: Dionysus</h2>
<img src="/static/grapes.png"><br><br>
Hometown: Mount Olympus
<br><br>
Favorite animal: Leopard <br>
<br>
Favorite Color: Wine
</center>
</body>
</html>



In [22]:
# Tolerate sloppy markup: ".*?" inside each tag allows stray whitespace or
# characters (e.g. "<TITLE >" and "</title  / >"), and IGNORECASE allows any
# letter case.
pattern = "<title.*?>.*?</title.*?>"
match_results = re.search(pattern, html, re.IGNORECASE)
if match_results is None:
    # re.search returns None when nothing matches; fail with a clear message
    # rather than an AttributeError from calling .group() on None.
    raise ValueError("no <title> element found in the page")
title = match_results.group()
title = re.sub("<.*?>", "", title)  # strip the surrounding HTML tags

In [23]:
# Show the extracted title now that the tags have been stripped.
print(title)

Profile: Dionysus


## BeautifulSoup

In [24]:
url = "http://olympus.realpython.org/profiles/dionysus"
# Use the response as a context manager so the connection is closed as soon
# as the body has been read, instead of staying open for the kernel's lifetime.
with urlopen(url) as page:
    html = page.read().decode("utf-8")

In [25]:
# print() renders the line breaks in the markup instead of showing "\n" escapes.
print(html)

<html>
<head>
<TITLE >Profile: Dionysus</title  / >
</head>
<body bgcolor="yellow">
<center>
<br><br>
<img src="/static/dionysus.jpg" />
<h2>Name: Dionysus</h2>
<img src="/static/grapes.png"><br><br>
Hometown: Mount Olympus
<br><br>
Favorite animal: Leopard <br>
<br>
Favorite Color: Wine
</center>
</body>
</html>



In [26]:
# Parse the downloaded markup; "html.parser" selects Python's built-in HTML parser.
soup = BeautifulSoup(html,"html.parser")

In [27]:
# get_text() returns only the document's text, with every tag removed.
print(soup.get_text())



Profile: Dionysus





Name: Dionysus

Hometown: Mount Olympus

Favorite animal: Leopard 

Favorite Color: Wine






### Finding images

In [28]:
# Collect every <img> tag in the document.
soup.find_all("img")

[<img src="/static/dionysus.jpg"/>, <img src="/static/grapes.png"/>]

In [29]:
# Unpack the result into two names; this relies on the page containing
# exactly two <img> tags and raises ValueError for any other count.
image1, image2 = soup.find_all("img")

In [30]:
image1.name # the tag's type, here "img"

'img'

Get the source of the image.

In [31]:
# Tag attributes are read with dict-style indexing.
image1["src"] # "src" is the only attribute on this tag

'/static/dionysus.jpg'

In [32]:
# Source path of the second image.
image2["src"]

'/static/grapes.png'

### Finding title

In [33]:
# The <title> tag is reachable as an attribute of the soup; the parser has
# already normalised the messy "<TITLE >" / "</title  / >" markup.
soup.title

<title>Profile: Dionysus</title>

In [34]:
# .string gives the text contained in the tag, without the tag itself.
soup.title.string

'Profile: Dionysus'

In [35]:
# Keyword arguments filter on attribute values: only <img> tags whose src
# equals this exact string are returned.
soup.find_all("img", src="/static/dionysus.jpg")

[<img src="/static/dionysus.jpg"/>]

### URL Tutorial

First get the base url.

In [36]:
# Site root; the root-relative hrefs found on the profiles page are appended to it.
base_url = "http://olympus.realpython.org"

Then request the profiles page and decode its HTML content.

In [37]:
# Use the response as a context manager so the connection is closed as soon
# as the body has been read, instead of staying open for the kernel's lifetime.
with urlopen(base_url + "/profiles") as html_page:
    html_text = html_page.read().decode("utf-8")

Now we create a BeautifulSoup object.

In [40]:
# Parse the profiles page so its links can be searched.
soup = BeautifulSoup(html_text,"html.parser")

In [42]:
# Every <a> (anchor) tag on the page, i.e. all of its links.
soup.find_all("a")

[<a href="/profiles/aphrodite">Aphrodite</a>,
 <a href="/profiles/poseidon">Poseidon</a>,
 <a href="/profiles/dionysus">Dionysus</a>]

In [41]:
# Build an absolute URL for every link on the page.
# - href=True skips anchors that have no href attribute, which would otherwise
#   raise KeyError on link["href"].
# - urljoin resolves the href against the site root and also copes with
#   relative or already-absolute hrefs, unlike plain string concatenation.
for link in soup.find_all("a", href=True):
    link_url = urljoin(base_url, link["href"])
    print(link_url)

http://olympus.realpython.org/profiles/aphrodite
http://olympus.realpython.org/profiles/poseidon
http://olympus.realpython.org/profiles/dionysus
