In [10]:
# Practice Python 17 - Decode a webpage

# Use the BeautifulSoup and requests Python packages to print out
# a list of all the article titles on the New York Times Homepage

# Mark, the New York Times website wasn't too friendly, so use this example instead:
# https://www.dataquest.io/blog/web-scraping-tutorial-python/

In [11]:
from bs4 import BeautifulSoup

In [12]:
import requests

In [32]:
# Download the sample page. An explicit timeout keeps this cell from hanging
# indefinitely if the server never answers (requests applies no timeout by default).
page = requests.get(
    "http://dataquestio.github.io/web-scraping-pages/simple.html",
    timeout=10,
)

In [33]:
# HTTP status code of the response; 200 means the download succeeded.
page.status_code

200

In [34]:
# Parse the downloaded bytes into a navigable tree using Python's built-in HTML parser.
soup = BeautifulSoup(markup=page.content, features="html.parser")

In [35]:
# Alternative: evaluate `page.content` to see the raw, unformatted HTML instead.
# prettify() returns the parsed document as an indented string.
print(soup.prettify()) # Show a clean, readable version of the HTML

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [36]:
# Top-level nodes of the parsed document; list() materializes soup.children so they can be displayed.
list(soup.children)

['html', '\n', <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [37]:
# Show the class of each top-level node. soup.children can be iterated directly,
# so there is no need to copy it into a list first.
[type(item) for item in soup.children]

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [38]:
# Index 2 is the <html> Tag: the first two top-level nodes are the doctype and a newline string.
html = list(soup.children)[2]

In [39]:
# Printing a Tag renders it, together with everything nested inside it, as HTML markup.
print(html)

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>


In [40]:
# Direct children of <html>: the <head> and <body> tags, separated by newline strings.
list(html.children)

['\n', <head>
 <title>A simple example page</title>
 </head>, '\n', <body>
 <p>Here is some simple content for this page.</p>
 </body>, '\n']

In [43]:
# Index 3 is the <body> Tag (child order: newline, <head>, newline, <body>, newline).
body = list(html.children)[3]

In [44]:
# Display the selected node to confirm it is the <body> tag.
body

<body>
<p>Here is some simple content for this page.</p>
</body>

In [45]:
# Direct children of <body>: the single <p> tag surrounded by newline strings.
list(body.children)

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [46]:
# Index 1 is the <p> Tag; indexes 0 and 2 are the surrounding newline strings.
p = list(body.children)[1]

In [49]:
# get_text() returns the text inside the tag with the markup stripped.
p.get_text()

'Here is some simple content for this page.'

In [50]:
# find_all() searches the whole tree and returns a list of every matching tag,
# which avoids the manual child-by-child navigation used in the earlier cells.
soup.find_all('p')

[<p>Here is some simple content for this page.</p>]

In [51]:
# find_all() returns a list, so take the first match before extracting its text.
soup.find_all('p')[0].get_text()

'Here is some simple content for this page.'

In [52]:
# Search for tags by class and ID

In [53]:
# Load a second sample page whose tags carry class and id attributes.
# The timeout prevents an indefinite hang (requests applies no timeout by default),
# and raise_for_status() stops here on an HTTP error instead of parsing an error page.
page = requests.get(
    "http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html",
    timeout=10,
)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')
soup

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>

In [54]:
# class_ (trailing underscore because `class` is a Python keyword) filters by CSS class;
# this returns only the <p> tags whose classes include "outer-text".
soup.find_all('p', class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [55]:
# With no tag name given, any tag carrying the "outer-text" class is matched.
soup.find_all(class_="outer-text")

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [56]:
# Filter on the id attribute; the result is still a list, here holding the tag with id="first".
soup.find_all(id="first")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]