## Web Scraping with Python Using Beautiful Soup

In [7]:
import requests
page = requests.get("https://dataquestio.github.io/web-scraping-pages/simple.html")
page

<Response [200]>

In [9]:
page.content

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'

In [10]:
# Parsing a page with BeautifulSoup

from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')

In [11]:
# We can now print out the HTML content of the page, formatted nicely, using the prettify method on the BeautifulSoup object.

print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [12]:
# As all the tags are nested, we can move through the structure one level at a time. We can first select all the elements 
# at the top level of the page using the children property of soup.

# Note that children returns a list generator, so we need to call the list function on:

list(soup.children)

# The above tells us that there are two tags at the top level of the page — the initial <!DOCTYPE html> tag, and the <html> tag. 
# There is a newline character (n) in the list as well

['html',
 '\n',
 <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [13]:
# Let’s see what the type of each element in the list i

[type(item) for item in list(soup.children)]

# As we can see, all of the items are BeautifulSoup objects
# 1 The first is a Doctype object, which contains information about the type of the document
# 2 The second is a NavigableString, which represents text found in the HTML document.
# 3 The final item is a Tag object, which contains other nested tags

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [14]:
# The most important object type, and the one we’ll deal with most often, is the Tag object.

# We can now select the html tag and its children by taking the third item in the list

html = list(soup.children)[2]

In [15]:
# Now, we can find the children inside the html tag

list(html.children)

['\n',
 <head>
 <title>A simple example page</title>
 </head>,
 '\n',
 <body>
 <p>Here is some simple content for this page.</p>
 </body>,
 '\n']

In [18]:
# As we can see above, there are two tags here, head, and body. We want to extract the text inside the p tag, 
# so we’ll dive into the body

body = list(html.children)[3]

In [19]:
# Now, we can get the p tag by finding the children of the body ta

list(body.children)

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [20]:
# We can now isolate the p tag

p = list(body.children)[1]

In [21]:
# Once we’ve isolated the tag, we can use the get_text method to extract all of the text inside the tag

p.get_text()

'Here is some simple content for this page.'

In [22]:
# Finding all instances of a tag at once

In [23]:
soup = BeautifulSoup(page.content, 'html.parser')
soup.find_all('p')

[<p>Here is some simple content for this page.</p>]

In [24]:
# Note that find_all returns a list, so we’ll have to loop through, or use list indexing, it to extract text

soup.find_all('p')[0].get_text()

'Here is some simple content for this page.'

In [25]:
# find the first instance of a tag

soup.find('p')

<p>Here is some simple content for this page.</p>

In [26]:
# Searching the tag by class

page = requests.get("https://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
soup = BeautifulSoup(page.content, 'html.parser')
soup

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>

In [27]:
# find_all method to search for items by class or by id

soup.find_all('p', class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [28]:
# we’ll look for any tag that has the class outer-text

soup.find_all(class_="outer-text")

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [29]:
# We can also search for elements by id

soup.find_all(id="first")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

In [43]:
import bs4
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime

In [45]:
url='https://www.estesparkweather.net/archive_reports.phpdate=202005'
page = requests.get(url)
print(page)
soup =  BeautifulSoup(page.content,'html.parser')
print(soup)

<Response [404]>
<!DOCTYPE html>

<html style="height:100%">
<head>
<meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
<title> 404 Not Found
</title></head>
<body style="color: #444; margin:0;font: normal 14px/20px Arial, Helvetica, sans-serif; height:100%; background-color: #fff;">
<div style="height:auto; min-height:100%; "> <div style="text-align: center; width:800px; margin-left: -400px; position:absolute; top: 30%; left:50%;">
<h1 style="margin:0; font-size:150px; line-height:150px; font-weight:bold;">404</h1>
<h2 style="margin-top:20px;font-size: 30px;">Not Found
</h2>
<p>The resource requested could not be found on this server!</p>
</div></div><div style="color:#f0f0f0; font-size:12px;margin:auto;padding:0px 30px 0px 30px;position:relative;clear:both;height:100px;margin-top:-101px;background-color:#474747;border-top: 1px solid rgba(0,0,0,0.15);box-shadow: 0 1px 0 rgba(255, 255, 255, 0.3) inset;">
<br/>Proudly powered by  <a href="http://www

In [62]:
import requests
response = requests.get("https://news.ycombinator.com/")
if response.status_code != 200:
	print("Error fetching page")
	exit()
else:
	content = response.content
print(content)

b'<html lang="en" op="news"><head><meta name="referrer" content="origin"><meta name="viewport" content="width=device-width, initial-scale=1.0"><link rel="stylesheet" type="text/css" href="news.css?FJc2HJ5ky117YsmUhGiy">\n        <link rel="shortcut icon" href="favicon.ico">\n          <link rel="alternate" type="application/rss+xml" title="RSS" href="rss">\n        <title>Hacker News</title></head><body><center><table id="hnmain" border="0" cellpadding="0" cellspacing="0" width="85%" bgcolor="#f6f6ef">\n        <tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" width="100%" style="padding:2px"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img src="y18.gif" width="18" height="18" style="border:1px white solid;"></a></td>\n                  <td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>\n              <a href="newest">new</a> | <a href="front">past</a> | 

In [63]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

In [64]:
# The HTML title of the page
print(soup.title)

<title>Hacker News</title>


In [65]:
# The test title of the page
print(soup.title.string)

Hacker News


In [66]:
# All links in the page
nb_links = len(soup.find_all('a'))
print(f"There are {nb_links} links in this page")

There are 226 links in this page


In [67]:
# Text from the page
print(soup.get_text())




Hacker News

Hacker News
new | past | comments | ask | show | jobs | submit 
login




1. Widelands is a free, open-source real-time strategy game (widelands.org)
159 points by doener 3 hours ago  | hide | 21 comments 


2. Classical Data Structures That Can Outperform Learned Indexes (2018) (stanford.edu)
142 points by signa11 5 hours ago  | hide | 18 comments 


3. Email Authenticity 101: DKIM, Dmarc, and SPF (alexblackie.com)
268 points by alexblackie 9 hours ago  | hide | 62 comments 


4. TSMC’s Speciality Technologies (techtaiwan.com)
59 points by Klyonova 4 hours ago  | hide | 13 comments 


5. Software is a Process and hidden driver of Productivity (eth.link)
11 points by Wildgoose 1 hour ago  | hide | discuss 


6. Sirum (YC W15) Is Hiring a Lead Engineer to Improve Medicine Access
7 minutes ago | hide 


7. How to avoid being hit by a laser in a room of mirrors [video] (youtube.com)
191 points by sigil 7 hours ago  | hide | 56 comments 


8. Nerds Don't Respond to Marketin

In [69]:
first_link = soup.a
print(first_link)

<a href="https://news.ycombinator.com"><img height="18" src="y18.gif" style="border:1px white solid;" width="18"/></a>


In [70]:
# The text of the link
print(first_link.text)




In [72]:
# The href of the link
print(first_link.get('href'))

https://news.ycombinator.com


In [73]:
pagespace = soup.find(id="pagespace")
print(pagespace)

<tr id="pagespace" style="height:10px" title=""></tr>


In [74]:
# class is a reserved keyword in Python, hence the '_'
athing = soup.find(class_="athing")
print(athing)

<tr class="athing" id="28196178">
<td align="right" class="title" valign="top"><span class="rank">1.</span></td> <td class="votelinks" valign="top"><center><a href="vote?id=28196178&amp;how=up&amp;goto=news" id="up_28196178"><div class="votearrow" title="upvote"></div></a></center></td><td class="title"><a class="storylink" href="https://www.widelands.org/">Widelands is a free, open-source real-time strategy game</a><span class="sitebit comhead"> (<a href="from?site=widelands.org"><span class="sitestr">widelands.org</span></a>)</span></td></tr>


In [75]:
from collections import Counter
all_hrefs = [a.get('href') for a in soup.find_all('a')]
top_3_links = Counter(all_hrefs).most_common(3)
print(top_3_links)

[('item?id=28196178', 2), ('item?id=28195439', 2), ('item?id=28194477', 2)]
