In [1]:
from urllib.error import HTTPError
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup

In [2]:
# This doesn't work, why?
# A plain urlopen() call is refused by the server with "403 Forbidden"
# (presumably because of urllib's default User-Agent; the next cell sends a
# browser-like one instead). The error is the point of this cell, so we catch
# and display it rather than letting it abort a "Restart & Run All".
url = "https://medium.com/harvard-open-data-project"
try:
    # The context manager closes the connection even if the read fails.
    with urlopen(url) as response:
        html = response.read()
except HTTPError as err:
    # str(err) is e.g. "HTTP Error 403: Forbidden"
    print(f"HTTPError: {err}")

HTTPError: HTTP Error 403: Forbidden

In [3]:
# We can disguise our request by sending a browser-like User-Agent header
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# Read the whole body as bytes; the `with` block closes the HTTP connection
# as soon as we are done instead of leaving the response object open.
with urlopen(req) as response:
    html = response.read()

In [4]:
# Parse the downloaded HTML into a BeautifulSoup object.
# 'lxml' names the underlying HTML parser (don't worry too much about it for now).
soup = BeautifulSoup(markup=html, features='lxml')
# Show what kind of object we got back
type(soup)

bs4.BeautifulSoup

In [5]:
# Grab the page's <title> element (soup.title is shorthand for this lookup)
title = soup.find('title')
print(title)

<title>Harvard College Open Data Project – Medium</title>


In [None]:
# How do we get the text of the webpage?
# get_text() returns all the text in the document with the tags stripped out
text = soup.get_text()
print(text)

In [21]:
# Let's get every link (<a> tag) on the page -- this includes navigation and
# sign-in links as well as the links to the articles.
# find_all is the current name of this method; findAll is a deprecated alias.
all_links = soup.find_all('a')
print(all_links)

[<a class="siteNav-logo u-fillTransparentBlackDarker u-flex0 u-flexCenter u-paddingTop0" data-log-event="home" href="https://medium.com/"><span class="svgIcon svgIcon--logoMonogram svgIcon--45px"><svg class="svgIcon-use" height="45" width="45"><path d="M5 40V5h35v35H5zm8.56-12.627c0 .555-.027.687-.318 1.03l-2.457 2.985v.396h6.974v-.396l-2.456-2.985c-.291-.343-.344-.502-.344-1.03V18.42l6.127 13.364h.714l5.256-13.364v10.644c0 .29 0 .342-.185.528l-1.848 1.796v.396h9.19v-.396l-1.822-1.796c-.184-.186-.21-.238-.21-.528V15.937c0-.291.026-.344.21-.528l1.823-1.797v-.396h-6.471l-4.622 11.542-5.203-11.542h-6.79v.396l2.14 2.64c.239.292.291.37.291.768v10.353z"></path></svg></span><span class="u-textScreenReader">Homepage</span></a>, <a class="button button--primary button--chromeless u-accentColor--buttonNormal is-inSiteNavBar u-xs-hide js-signInButton" data-action="sign-in-prompt" data-action-source="--------------------------nav_reg" data-redirect="https://medium.com/harvard-open-data-project" hr

In [None]:
# BeautifulSoup can get attributes for you! (saves you the hassle of using capture groups)
# .get("href") gives None for an <a> tag that has no href instead of raising.
for anchor in all_links:
    href = anchor.get("href")
    print(href)

In [23]:
# Let's find and print all the article titles (they live in <h3> tags).
# find_all is the current name of this method; findAll is a deprecated alias.
article_titles = soup.find_all('h3')
print(article_titles)

[<h3 class="u-contentSansBold u-lineHeightTightest u-xs-fontSize24 u-paddingBottom2 u-paddingTop5 u-fontSize32"><div class="u-letterSpacingTight u-lineHeightTighter u-breakWord u-textOverflowEllipsis u-lineClamp4 u-fontSize30 u-size12of12 u-xs-size12of12 u-xs-fontSize24">Harvard Student Opinion on the Second Democratic Debate</div></h3>, <h3 class="u-contentSansBold u-lineHeightTightest u-xs-fontSize24 u-paddingBottom2 u-paddingTop5 u-fontSize32"><div class="u-letterSpacingTight u-lineHeightTighter u-breakWord u-textOverflowEllipsis u-lineClamp3 u-fontSize24">How do Harvard students feel about the 2020 election?</div></h3>, <h3 class="u-contentSansBold u-lineHeightTightest u-xs-fontSize24 u-paddingBottom2 u-paddingTop5 u-fontSize32"><div class="u-letterSpacingTight u-lineHeightTighter u-breakWord u-textOverflowEllipsis u-lineClamp3 u-fontSize24">Harvard Housing Part 2: How Do Students Form Groups?</div></h3>, <h3 class="u-contentSansBold u-lineHeightTightest u-xs-fontSize24 u-paddingBo

In [24]:
# How can we clean up all this data?
# Ask each <h3> tag for its text directly instead of turning the whole list
# into a string and parsing it a second time. This also leaves us with a
# proper Python list holding one string per title.
title_texts = [heading.get_text() for heading in article_titles]
# Keep the same "[title, title, ...]" display format as before
cleantext = "[" + ", ".join(title_texts) + "]"
print(cleantext)

# Not demonstrated here, but BeautifulSoup is also excellent for extracting text that is laid out in tables!

[Harvard Student Opinion on the Second Democratic Debate, How do Harvard students feel about the 2020 election?, Harvard Housing Part 2: How Do Students Form Groups?, Harvard Housing Part 1: Who Gets Quadded?, Recentering Social Life at Harvard, What’s Almost Left Unsaid: An Analysis of Harvard Confessions, Harvard’s Most (and Least) Desired Houses Updated: Housing Day 2019, How wealthy are Harvard student clubs?, A Pair of 35 lb Dumbbells Almost Won the Cabot Midterm Election, Demystifying UC Grants: Funding and Accountability Gaps in Harvard Student Group Funding, Single Gender Social Organizations and Safety, Predicting the 2018 UC Presidential Election Winners, Where Are Harvard’s Female Professors?]


In [None]:
# But what happens if your scraping requires you to click "next"?
# That's where Scrapy and Selenium come in (though there are workarounds for BeautifulSoup).
# Scrapy: incredibly fast (up to 20 times faster according to some sites), asynchronous, and supports XPath and CSS selectors.
# Selenium: generally not recommended, since it was originally intended for automated testing of web applications, but it can be very useful if the website you're scraping relies heavily on AJAX.