<a href="https://colab.research.google.com/github/JonathanCornish/requests-html-practice/blob/master/Requests_HTML_Reddit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Scraping data using JavaScript

In [0]:
pip install requests_html



In [0]:
# Official documentation of the requests-html package: https://requests-html.readthedocs.io/en/latest/

Initial setup

In [0]:
# Loading the necessary packages
from requests_html import HTMLSession

In [0]:
# Establish/open a session; it is used for the GET request below
# and is closed with session.close() at the end of the notebook
session = HTMLSession()

In [0]:
# Submit a GET request for the page we are going to scrape
# A timeout (in seconds) is passed so the cell cannot hang indefinitely if the server never responds
r = session.get("https://en.wikipedia.org/wiki/Association_football", timeout=30)
# Display the HTTP status code of the response
r.status_code

200

In [0]:
# The HTML of the response is exposed as an HTML object through the '.html' attribute
# (it is an attribute, not a method, so no parentheses are needed)
r.html

<HTML url='https://en.wikipedia.org/wiki/Association_football'>

Links

In [0]:
# We can extract all link addresses directly with '.links'
# The addresses are returned as they appear in the page, so the result mixes absolute and relative URLs
urls = r.html.links
urls

{'https://web.archive.org/web/20071011144942/http://fifa.com/flash/lotg/football/en/Laws1_04.htm',
 'https://www.fifa.com/flash/lotg/football/en/Laws3_02.htm',
 '/wiki/Wikipedia:Media_help',
 '/wiki/Glossary_of_association_football_terms',
 '/wiki/Birmingham',
 '/wiki/Nutmeg_(association_football)',
 '/wiki/Field_lacrosse',
 'https://ka.wikipedia.org/wiki/%E1%83%A4%E1%83%94%E1%83%AE%E1%83%91%E1%83%A3%E1%83%A0%E1%83%97%E1%83%98',
 '/wiki/Special:BookSources/978-1-136-43763-2',
 '/wiki/Canoe_polo',
 '/wiki/1871_in_association_football',
 '/wiki/Freestyle_football',
 '/wiki/Czech_handball',
 '/wiki/1904_in_association_football',
 'https://foundation.wikimedia.org/wiki/Privacy_policy',
 '/wiki/Rabona',
 '/wiki/1996_in_association_football',
 '/wiki/African_Nations_Championship',
 '/wiki/1984_Summer_Paralympics',
 'https://www.fifa.com/mm/document/footballdevelopment/refereeing/81/42/36/log2013en%5fneutral.pdf',
 'https://www.fifa.com/aboutfifa/organisation/ifab/media/news/newsid=707751/',


In [0]:
# Note that '.links' returns the addresses as they are written in the page, so many of them are relative URLs (e.g. '/wiki/Birmingham')

In [0]:
# To get absolute URLs we can use '.absolute_links' instead of '.links'
# Relative addresses such as '/wiki/...' come back as full 'https://en.wikipedia.org/wiki/...' URLs
full_path_urls = r.html.absolute_links
full_path_urls

{'https://web.archive.org/web/20071011144942/http://fifa.com/flash/lotg/football/en/Laws1_04.htm',
 'https://www.fifa.com/flash/lotg/football/en/Laws3_02.htm',
 'https://en.wikipedia.org/wiki/South_American_U-15_Championship',
 'https://en.wikipedia.org/wiki/File:World_Map_FIFA.svg',
 'https://en.wikipedia.org/wiki/OFC_Nations_Cup',
 'https://en.wikipedia.org/wiki/Age_fraud_in_association_football',
 'https://en.wikipedia.org/wiki/1920_in_association_football',
 'https://ka.wikipedia.org/wiki/%E1%83%A4%E1%83%94%E1%83%AE%E1%83%91%E1%83%A3%E1%83%A0%E1%83%97%E1%83%98',
 'https://en.wikipedia.org/wiki/Asian_Cup',
 'https://en.wikipedia.org/wiki/Auto_polo',
 'https://en.wikipedia.org/wiki/FA_Premier_League_2005-06',
 'https://en.wikipedia.org/wiki/Administration_(British_football)',
 'https://en.wikipedia.org/wiki/Rezball',
 'https://foundation.wikimedia.org/wiki/Privacy_policy',
 'https://en.wikipedia.org/wiki/1912_in_association_football',
 'https://en.wikipedia.org/wiki/Calcio_Fiorentino

In [0]:
# An important thing to note is that these links (from both '.links' and '.absolute_links') are returned in a SET, not a LIST
# A set is unordered and cannot be indexed, so convert it with list(...) if you need positions
type(urls)

set

Searching for elements

In [0]:
# A quick note: requests-html uses CSS selectors for searching
# We will cover them in the next section,
# but here is a more thorough look into it: https://www.w3schools.com/cssref/css_selectors.asp

In [0]:
# We can search for elements similarly to Beautiful Soup using the find() method
# By default it returns a list of ALL matching elements, i.e. it behaves like Beautiful Soup's find_all()

# find all 'a' tags
links = r.html.find("a")
links

[<Element 'a' id='top'>,
 <Element 'a' href='/wiki/Wikipedia:Featured_articles' title='This is a featured article. Click here for more information.'>,
 <Element 'a' href='/wiki/Wikipedia:Protection_policy#semi' title='This article is semi-protected.'>,
 <Element 'a' href='/wiki/File:Football_(soccer)_Part_One.ogg' title='Listen to this article'>,
 <Element 'a' class=('mw-jump-link',) href='#mw-head'>,
 <Element 'a' class=('mw-jump-link',) href='#p-search'>,
 <Element 'a' class=('mw-disambig',) href='/wiki/Soccer_(disambiguation)' title='Soccer (disambiguation)'>,
 <Element 'a' href='/wiki/Soccer_Team_(band)' title='Soccer Team (band)'>,
 <Element 'a' href='/wiki/Football' title='Football'>,
 <Element 'a' class=('image',) href='/wiki/File:Football_iu_1996.jpg'>,
 <Element 'a' class=('mw-redirect',) href='/wiki/Goal_(sport)' title='Goal (sport)'>,
 <Element 'a' href='/wiki/Sports_governing_body' title='Sports governing body'>,
 <Element 'a' href='/wiki/FIFA' title='FIFA'>,
 <Element 'a' 

In [0]:
# The result of find() is a list, so a single element can be picked by index
links[4]

<Element 'a' class=('mw-jump-link',) href='#mw-head'>

In [0]:
# To get the raw HTML of an element use its '.html' attribute
links[4].html

'<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>'

In [0]:
# Check the type of the raw HTML of an element
type(links[4].html)

str

In [0]:
# To extract the text inside an element (without the surrounding tag), use ".text", just like in Beautiful Soup
links[4].text

'Jump to navigation'

In [0]:
# To obtain a dictionary of the element's attributes, use '.attrs' (as in Beautiful Soup)
# Note that the value of 'class' is given as a tuple of class names
links[10].attrs

{'class': ('mw-redirect',),
 'href': '/wiki/Goal_(sport)',
 'title': 'Goal (sport)'}

In [0]:
# This package offers a couple of ways to filter tags based on their text

# Keep only the 'a' tags whose text (not their 'href' attribute) contains the string 'wikipedia'
# Note: the match is not case-sensitive
r.html.find("a", containing="wikipedia")

[<Element 'a' href='//en.wikipedia.org/wiki/Wikipedia:Contact_us'>,
 <Element 'a' href='/wiki/Wikipedia:About' title='Wikipedia:About'>,
 <Element 'a' href='/wiki/Wikipedia:About' title='Find out about Wikipedia'>,
 <Element 'a' href='//shop.wikimedia.org' title='Visit the Wikipedia store'>,
 <Element 'a' href='https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en' title='Support us'>,
 <Element 'a' href='/wiki/Category:Wikipedia_articles_with_NDL_identifiers' title='Category:Wikipedia articles with NDL identifiers'>,
 <Element 'a' href='/wiki/Category:Wikipedia_articles_with_NARA_identifiers' title='Category:Wikipedia articles with NARA identifiers'>,
 <Element 'a' href='/wiki/Category:Wikipedia_articles_with_LCCN_identifiers' title='Category:Wikipedia articles with LCCN identifiers'>,
 <Element 'a' href='/wiki/Category:Wikipedia_articles_with_HDS_identifiers' title='Category:Wikipedia articles 

In [0]:
# Show just the text of each 'a' tag whose text contains 'wikipedia'
[link.text for link in r.html.find("a", containing="wikipedia")]

['Contact Wikipedia',
 'About Wikipedia',
 'About Wikipedia',
 'Wikipedia store',
 'Donate to Wikipedia',
 'Wikipedia articles with NDL identifiers',
 'Wikipedia articles with NARA identifiers',
 'Wikipedia articles with LCCN identifiers',
 'Wikipedia articles with HDS identifiers',
 'Wikipedia articles with GND identifiers',
 'Wikipedia articles with BNF identifiers',
 'Wikipedia indefinitely semi-protected pages',
 'https://en.wikipedia.org/w/index.php?title=Association_football&oldid=956365495']

In [0]:
# To get only the first matching element (like Beautiful Soup's .find()), pass first=True
# The element itself is then returned instead of a list
r.html.find("p", first=True)

<Element 'p' class=('mw-empty-elt',)>

Searching for text

In [0]:
# The package also offers searching text based on the parse library
# The search() method can be thought of as the opposite of str.format():
# it finds the text instead of inserting it in the specified place

# For further details see https://pypi.org/project/parse/ 
# and https://docs.python.org/3/library/string.html#format-string-syntax

In [0]:
# The method searches for a matching string, where '{}' is replaced by the returned text
# The match comes back wrapped in a Result object
r.html.search("known{}soccer")

<Result (' as football field, football ground, ',) {}>

In [0]:
# To access the captured text, get the first element (index 0) of the Result
r.html.search("known{}soccer")[0]

' as football field, football ground, '

In [0]:
# search() returns only the first match found in the page
# To get all matching strings use search_all(), which returns a list of Result objects
# Note that the search runs on the raw HTML, so the captured text may contain tags
r.html.search_all("known{}soccer")

[<Result (' as football field, football ground, ',) {}>,
 <Result (' as <b>football</b> or <b>',) {}>,
 <Result (' as the <a href="/wiki/Laws_of_the_Game_(association_football)" title="Laws of the Game (association football)">Laws of the Game</a>. The ball is 68–70&#160;cm (27–28&#160;in) in circumference and known as the <i><a href="/wiki/Ball_(association_football)" title="Ball (association football)">football</a></i>. The two teams each compete to get the ball into the other team\'s goal (between the posts and under the bar), thereby scoring a goal. The team that has scored more goals at the end of the game is the winner; if both teams have scored an equal number of goals then the game is a draw. Each team is led by a <a href="/wiki/Captain_(association_football)" title="Captain (association football)">captain</a> who has only one official responsibility as mandated by the Laws of the Game: to represent their team in the coin toss prior to kick-off or <a href="/wiki/Penalty_kick_(as

In [0]:
# Count how many matches search_all() finds for this template
len(r.html.search_all("known{}soccer"))

19

In [0]:
# Further details at:
# -- https://pypi.org/project/parse/
# -- https://docs.python.org/3/library/string.html#format-string-syntax

CSS Selectors

In [0]:
# CSS selectors are a notation for selecting (filtering) different HTML elements (aka tags)
# The name stems from the styling language CSS - 
# in order for a style to be applied, you first need a way to specify (or 'select') the element the style will be applied to

In [0]:
# You can find a complete CSS selectors reference at: https://www.w3schools.com/cssref/css_selectors.asp
# Let's showcase some CSS selectors below, with examples from the same wiki page

Select elements based on tag name

In [0]:
# Select by tag name as usual: the selector is simply the tag name

# Select all 'span' tags
r.html.find("span")

[<Element 'span' id='Etymology'>,
 <Element 'span' id='Names'>,
 <Element 'span' class=('toctogglespan',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber',)>,
 <Element 'span' class=('toctext',)>,
 <Element 'span' class=('tocnumber

In [0]:
# Another example: select all 'div' tags (returned as a list of elements)
r.html.find("div")

[<Element 'div' class=('noprint',) id='mw-page-base'>,
 <Element 'div' class=('noprint',) id='mw-head-base'>,
 <Element 'div' class=('mw-body',) id='content' role='main'>,
 <Element 'div' class=('mw-body-content',) id='siteNotice'>,
 <Element 'div' class=('mw-indicators', 'mw-body-content')>,
 <Element 'div' class=('mw-indicator',) id='mw-indicator-featured-star'>,
 <Element 'div' class=('mw-indicator',) id='mw-indicator-pp-default'>,
 <Element 'div' class=('mw-indicator',) id='mw-indicator-spoken-icon'>,
 <Element 'div' class=('mw-body-content',) id='bodyContent'>,
 <Element 'div' class=('noprint',) id='siteSub'>,
 <Element 'div' id='contentSub'>,
 <Element 'div' id='jump-to-nav'>,
 <Element 'div' class=('mw-content-ltr',) dir='ltr' id='mw-content-text' lang='en'>,
 <Element 'div' class=('mw-parser-output',)>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=(

In [0]:
# That was the same as in Beautiful Soup
# However, CSS selectors now diverge a bit in style

Select elements based upon ID   

In [0]:
# Remember that the 'id' attribute is unique - no two elements may have the same 'id'
# Thus, filtering by 'id' will return either one or no results

In [0]:
# To search by id use '#id_here' (hashtag)

In [0]:
# Select the tag with id='Name'
# The result is still a list, here holding a single element
r.html.find("#Name")

[<Element 'span' class=('mw-headline',) id='Name'>]

In [0]:
# Note that the id value is case-sensitive: '#name' does not match id='Name', so an empty list is returned
r.html.find("#name")

[]

In [0]:
# Another example
# Since we know that it will return at most one element, we may get rid of the list by setting 'first' to 'True'
# The element itself is then returned instead of a one-element list
r.html.find("#Duration_and_tie-breaking_methods", first=True)

<Element 'span' class=('mw-headline',) id='Duration_and_tie-breaking_methods'>

Selecting by Class

In [0]:
# We can filter by class name with '.' (a dot)
# .class_name_here

In [0]:
# Select all tags that have the class 'mw-headline'
r.html.find(".mw-headline")

[<Element 'span' class=('mw-headline',) id='Name'>,
 <Element 'span' class=('mw-headline',) id='History'>,
 <Element 'span' class=('mw-headline',) id="Women's_association_football">,
 <Element 'span' class=('mw-headline',) id="Early_women's_football">,
 <Element 'span' class=('mw-headline',) id='20th_and_21st_century'>,
 <Element 'span' class=('mw-headline',) id='Gameplay'>,
 <Element 'span' class=('mw-headline',) id='Laws'>,
 <Element 'span' class=('mw-headline',) id='Players,_equipment,_and_officials'>,
 <Element 'span' class=('mw-headline',) id='Ball'>,
 <Element 'span' class=('mw-headline',) id='Pitch'>,
 <Element 'span' class=('mw-headline',) id='Duration_and_tie-breaking_methods'>,
 <Element 'span' class=('mw-headline',) id='90-minute_ordinary_time'>,
 <Element 'span' class=('mw-headline',) id='Tie-breaking'>,
 <Element 'span' class=('mw-headline',) id='Ball_in_and_out_of_play'>,
 <Element 'span' class=('mw-headline',) id='Misconduct'>,
 <Element 'span' class=('mw-headline',) id=

In [0]:
# Select all tags that have 'metadata' among their classes
# The tag name does not matter, and the tags may carry other classes too
r.html.find(".metadata")

[<Element 'table' class=('metadata', 'mbox-small') role='presentation' style='background-color:#f9f9f9;border:1px solid #aaa;color:#000'>,
 <Element 'div' aria-labelledby='sister-projects' class=('metadata', 'plainlinks', 'sistersitebox', 'plainlist', 'mbox-small') role='navigation' style='border:1px solid #aaa; padding:0; background:#f9f9f9;'>]

In [0]:
# We can stack different CSS selectors one after the other

In [0]:
# For instance, we can search for elements that have both the 'metadata' and the 'plainlinks' class
r.html.find(".metadata.plainlinks")  # Note there is no space between the two

[<Element 'div' aria-labelledby='sister-projects' class=('metadata', 'plainlinks', 'sistersitebox', 'plainlist', 'mbox-small') role='navigation' style='border:1px solid #aaa; padding:0; background:#f9f9f9;'>]

Selecting Based on Other Attributes

In [0]:
# If we want to search for tags by attributes other than 'class' and 'id', we should use this notation:

# [attribute] -- selects all tags that have defined the attribute
# [attribute=value] -- selects all tags with that particular value of the attribute
# [attribute*=value] -- attribute contains the SUBSTRING 'value'
# [attribute~=value] -- attribute contains the WORD 'value'
# [attribute|=value] -- attribute is exactly 'value', or starts with 'value' followed by a hyphen '-'
# [attribute^=value] -- attribute begins with 'value'
# [attribute$=value] -- attribute ends with 'value'

In [0]:
# Select all tags that have a 'target' attribute, whatever its value
r.html.find("[target]")

[<Element 'a' href='//upload.wikimedia.org/wikipedia/commons/3/30/O_Jogo_Bonito_%28The_Beautiful_Game%29.webm' target='new' title='Play media'>]

In [0]:
# For instance, select all tags whose 'role' attribute is set to exactly 'note'
r.html.find("[role=note]")

[<Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,

In [0]:
# Select all tags that contain the string 'wikipedia' in their 'href' attribute
# This matches on the attribute value (not on the tag's text) and applies to any tag name, not only 'a'
r.html.find("[href*=wikipedia]")

[<Element 'link' href='/static/apple-touch/wikipedia.png' rel=('apple-touch-icon',)>,
 <Element 'link' href='/static/favicon/wikipedia.ico' rel=('shortcut', 'icon')>,
 <Element 'link' href='//en.wikipedia.org/w/api.php?action=rsd' rel=('EditURI',) type='application/rsd+xml'>,
 <Element 'link' href='https://en.wikipedia.org/wiki/Association_football' rel=('canonical',)>,
 <Element 'a' href='//upload.wikimedia.org/wikipedia/commons/3/30/O_Jogo_Bonito_%28The_Beautiful_Game%29.webm' target='new' title='Play media'>,
 <Element 'a' class=('internal',) href='//upload.wikimedia.org/wikipedia/commons/a/a1/Football_%28soccer%29_Part_One.ogg' title='Football (soccer) Part One.ogg'>,
 <Element 'a' class=('internal',) href='//upload.wikimedia.org/wikipedia/commons/c/cb/Football_%28soccer%29_Part_Two.ogg' title='Football (soccer) Part Two.ogg'>,
 <Element 'a' class=('external', 'text') href='https://en.wikipedia.org/w/index.php?title=Template:Association_football&action=edit'>,
 <Element 'a' class=(

In [0]:
# Note that this technique works for 'class' and 'id', as well

Combining different filters together into a compound selector

In [0]:
# We can stack all the different selectors we looked at up until now for a more precise filtering

In [0]:
# Looking at the last example, we see that there are 'link' elements selected along with the 'a' tags
# We can look only at the 'a' tags using a compound selector

# We have a selector that filters 'a' tags and a selector that filters tags with 'href' containing 'wikipedia'
# By combining them we can select only the 'a' tags containing 'wikipedia' in their 'href' attribute
r.html.find("a[href*=wikipedia]")

[<Element 'a' href='//upload.wikimedia.org/wikipedia/commons/3/30/O_Jogo_Bonito_%28The_Beautiful_Game%29.webm' target='new' title='Play media'>,
 <Element 'a' class=('internal',) href='//upload.wikimedia.org/wikipedia/commons/a/a1/Football_%28soccer%29_Part_One.ogg' title='Football (soccer) Part One.ogg'>,
 <Element 'a' class=('internal',) href='//upload.wikimedia.org/wikipedia/commons/c/cb/Football_%28soccer%29_Part_Two.ogg' title='Football (soccer) Part Two.ogg'>,
 <Element 'a' class=('external', 'text') href='https://en.wikipedia.org/w/index.php?title=Template:Association_football&action=edit'>,
 <Element 'a' class=('external', 'text') href='https://en.wikipedia.org/w/index.php?title=Template:International_football&action=edit'>,
 <Element 'a' class=('external', 'text') href='https://en.wikipedia.org/w/index.php?title=Template:Association_football_laws&action=edit'>,
 <Element 'a' class=('external', 'text') href='https://en.wikipedia.org/w/index.php?title=Template:Association_footba

In [0]:
# Select all 'a' tags with class 'internal' (tag name immediately followed by '.class_name')
r.html.find("a.internal")

[<Element 'a' class=('internal',) href='/wiki/File:AstonVilla1896-97.jpg' title='Enlarge'>,
 <Element 'a' class=('internal',) href='/wiki/File:Mia1997.JPG' title='Enlarge'>,
 <Element 'a' class=('internal',) href='/wiki/File:Women%27s_football_match_Menai_Bridge_against_Penrhos_(24622680915).jpg' title='Enlarge'>,
 <Element 'a' class=('internal',) href='/wiki/File:U20-WorldCup2007-Okotie-Onka_edit2.jpg' title='Enlarge'>,
 <Element 'a' class=('internal',) href='/wiki/File:Slidetackle.JPG' title='Enlarge'>,
 <Element 'a' class=('internal',) href='/wiki/File:Howard_Webb3.jpg' title='Enlarge'>,
 <Element 'a' class=('internal',) href='/wiki/File:Football_pitch_metric.svg' title='Enlarge'>,
 <Element 'a' class=('internal',) href='/wiki/File:Didier_Drogba_Manuel_Neuer_last_penalty_kick_Champions_League_Final_2012.jpg' title='Enlarge'>,
 <Element 'a' class=('internal',) href='/wiki/File:Shunsuke1_20080622.jpg' title='Enlarge'>,
 <Element 'a' class=('internal',) href='/wiki/File:FIFA-Headquarte

In [0]:
# Select all 'div' tags that have both classes 'thumb' and 'tright'
# Tags with additional classes are matched as well
r.html.find("div.thumb.tright")

[<Element 'div' class=('thumb', 'tmulti', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tmulti', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>,
 <Element 'div' class=('thumb', 'tright')>]

In [0]:
# Select all 'div' notes again, but this time filtering on the class as well
r.html.find("div[role=note][class='hatnote navigation-not-searchable']")

# Some important notes:
# - when using the [] syntax for class, instead of '.', you need to specify the whole value, i.e. all classes
# - when an attribute value contains a space, you need to enclose it in quotes

[<Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,
 <Element 'div' class=('hatnote', 'navigation-not-searchable') role='note'>,

Incorporating tag hierarchy

In [0]:
# Sometimes we may want to search for an element nested inside another tag
# We can achieve this with a space

In [0]:
# Select all 'span' tags that are nested inside an 'h2' tag (the space means 'inside')
r.html.find("h2 span")

[<Element 'span' class=('mw-headline',) id='Name'>,
 <Element 'span' class=('mw-headline',) id='History'>,
 <Element 'span' class=('mw-headline',) id='Gameplay'>,
 <Element 'span' class=('mw-headline',) id='Laws'>,
 <Element 'span' class=('mw-headline',) id='Governing_bodies'>,
 <Element 'span' class=('mw-headline',) id='International_competitions'>,
 <Element 'span' class=('mw-headline',) id='Domestic_competitions'>,
 <Element 'span' class=('mw-headline',) id='Professionalism'>,
 <Element 'span' class=('mw-headline',) id='Hooliganism'>,
 <Element 'span' class=('mw-headline',) id='Variants_and_casual_play'>,
 <Element 'span' class=('mw-headline',) id='See_also'>,
 <Element 'span' class=('mw-headline',) id='Notes'>,
 <Element 'span' class=('mw-headline',) id='References'>,
 <Element 'span' class=('mw-headline',) id='External_links'>]

In [0]:
# If we use '>' instead of a space, the parent tag must be the direct parent

# Select only the paragraphs that are directly contained in a 'div' (their immediate parent is a 'div')
r.html.find("div > p")

[<Element 'p' class=('mw-empty-elt',)>,
 <Element 'p' class=('mw-empty-elt',)>,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <Element 'p' >,
 <

In [0]:
# When we are done scraping the page, it is a good idea to close the session object we opened at the beginning
# Otherwise, if we keep opening new sessions, a lot of background processes may consume memory and processor resources

In [0]:
# Close the session object that was opened at the start of the notebook
session.close()