# Set-up and Workflow

### Importing the packages

In [1]:
# Load the packages
import requests
from bs4 import BeautifulSoup

### Making a GET request

In [2]:
# Defining the url of the site
base_site = "https://en.wikipedia.org/wiki/Music"

# Making a get request
response = requests.get(base_site)
response.status_code

200

In [197]:
# Extracting the HTML
html = response.content

# Checking that the reply is indeed an HTML code by inspecting the first 100 symbols
html[:100]

b'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-la'

### Making the soup

In [198]:
type(html)

bytes

In [4]:
# Convert HTML to a BeautifulSoup object. This will allow us to parse out content from the HTML more easily.
# Using the default parser as it is included in Python
soup = BeautifulSoup(html, "html.parser")

### Exporting the HTML to a file

In [5]:
# It is extremely useful to be able to check this file when searching where some info is located
# or to see how was the document parsed

# Exporting the HTML to a file
with open('Wiki_response.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))


# the 'with' statement is shorthand for a 'try-finally' block
# open is function for opening/creating a file to edit
# the 'wb' argument signifies the mode in which to edit the file - Writing in Bytes format
# .prettify() modifies the HTML code with additional indentations for better readability

# Searching and navigating the HTML tree

## Searching - find() and find_all()

In [6]:
# The soup variable (BeautifulSoup object) we defined earlier can be seen as representing the whole document
soup

<!DOCTYPE html>

<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-appearance-disabled vector-feature-appearance-pinned-clientpref-0 vector-feature-night-mode-disabled skin-theme-clientpref-day vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Music - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited

In [7]:
# We can search by tag name
# This returns as the element with all its contents and nested elements inside
# html tags https://www.w3schools.com/tags/
soup.find('head')

<head>
<meta charset="utf-8"/>
<title>Music - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-appearance-disabled vector-feature-appearance-pinned-clientpref-0 vector-feature-night-mode-disabled skin-theme-clientpref-day vector-toc-available";var cookie=document.cookie.match(/(?:^|; )enwikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=className;}());RLCONF={"wgBreakFr

In [8]:
# If there is no result it returns None
# Note: None is not displayed in IPython unless print() or repr() is used
soup.find('video')

In [9]:
# Display the None value
print(soup.find('video'))

None


In [10]:
# verify the type of output
type(soup.find('video'))

NoneType

In [11]:
# .find() returns only the first such result
soup.find('a')

<a class="mw-jump-link" href="#bodyContent">Jump to content</a>

In [12]:
# If we want all the results we use find_all() 
links = soup.find_all('a')
links

[<a class="mw-jump-link" href="#bodyContent">Jump to content</a>,
 <a accesskey="z" href="/wiki/Main_Page" title="Visit the main page [z]"><span>Main page</span></a>,
 <a href="/wiki/Wikipedia:Contents" title="Guides to browsing Wikipedia"><span>Contents</span></a>,
 <a href="/wiki/Portal:Current_events" title="Articles related to current events"><span>Current events</span></a>,
 <a accesskey="x" href="/wiki/Special:Random" title="Visit a randomly selected article [x]"><span>Random article</span></a>,
 <a href="/wiki/Wikipedia:About" title="Learn about Wikipedia and how it works"><span>About Wikipedia</span></a>,
 <a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" title="How to contact Wikipedia"><span>Contact us</span></a>,
 <a href="https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&amp;utm_medium=sidebar&amp;utm_campaign=C13_en.wikipedia.org&amp;uselang=en" title="Support us by donating to the Wikimedia Foundation"><span>Donate</span></a>,
 <a href=

In [13]:
# find_all returns a list of all results
isinstance(links, list)

True

In [14]:
# We must be careful when using find_all()
# If no result is found it returns an empty list
soup.find_all('video')

[]

In [15]:
# How many links are on the page?
len(links)

2499

In [16]:
# Usually, we prefer to store the result in a variable
# Let's store the body of a table in a table variable
table = soup.find('tbody')

In [17]:
# Inspect the value of the variable
table

<tbody><tr><td class="sidebar-pretitle" style="background:antiquewhite;">Part of <a href="/wiki/Category:Performing_arts" title="Category:Performing arts">a series</a> on</td></tr><tr><th class="sidebar-title-with-pretitle" style="background:antiquewhite;;display:block;margin-bottom:0.4em;"><a href="/wiki/Performing_arts" title="Performing arts">Performing arts</a></th></tr><tr><td class="sidebar-content hlist">
<ul><li><a href="/wiki/Acrobatics" title="Acrobatics">Acrobatics</a></li>
<li><a href="/wiki/Ballet" title="Ballet">Ballet</a></li>
<li><a href="/wiki/List_of_circus_skills" title="List of circus skills">Circus skills</a></li>
<li><a href="/wiki/Clown" title="Clown">Clown</a></li>
<li><a href="/wiki/Dance" title="Dance">Dance</a></li>
<li><a href="/wiki/Gymnastics" title="Gymnastics">Gymnastics</a></li>
<li><a href="/wiki/Magic_(illusion)" title="Magic (illusion)">Magic</a></li>
<li><a href="/wiki/Mime_artist" title="Mime artist">Mime</a></li>
<li><a class="mw-selflink selflink

In [18]:
# Inspect the type of the variable
type(table)

bs4.element.Tag

In [19]:
# A tag can be searched in the same way we search the whole document
table.find_all('td')

[<td class="sidebar-pretitle" style="background:antiquewhite;">Part of <a href="/wiki/Category:Performing_arts" title="Category:Performing arts">a series</a> on</td>,
 <td class="sidebar-content hlist">
 <ul><li><a href="/wiki/Acrobatics" title="Acrobatics">Acrobatics</a></li>
 <li><a href="/wiki/Ballet" title="Ballet">Ballet</a></li>
 <li><a href="/wiki/List_of_circus_skills" title="List of circus skills">Circus skills</a></li>
 <li><a href="/wiki/Clown" title="Clown">Clown</a></li>
 <li><a href="/wiki/Dance" title="Dance">Dance</a></li>
 <li><a href="/wiki/Gymnastics" title="Gymnastics">Gymnastics</a></li>
 <li><a href="/wiki/Magic_(illusion)" title="Magic (illusion)">Magic</a></li>
 <li><a href="/wiki/Mime_artist" title="Mime artist">Mime</a></li>
 <li><a class="mw-selflink selflink">Music</a></li>
 <li><a href="/wiki/Opera" title="Opera">Opera</a></li>
 <li><a href="/wiki/Professional_wrestling" title="Professional wrestling">Professional wrestling</a></li>
 <li><a href="/wiki/Pupp

In [20]:
# Since we used find_all, the result is a list
len(table.find_all('td'))

3

## Navigating the tree

In [105]:
table

<tbody><tr><td class="sidebar-pretitle" style="background:antiquewhite;">Part of <a href="/wiki/Category:Performing_arts" title="Category:Performing arts">a series</a> on</td></tr><tr><th class="sidebar-title-with-pretitle" style="background:antiquewhite;;display:block;margin-bottom:0.4em;"><a href="/wiki/Performing_arts" title="Performing arts">Performing arts</a></th></tr><tr><td class="sidebar-content hlist">
<ul><li><a href="/wiki/Acrobatics" title="Acrobatics">Acrobatics</a></li>
<li><a href="/wiki/Ballet" title="Ballet">Ballet</a></li>
<li><a href="/wiki/List_of_circus_skills" title="List of circus skills">Circus skills</a></li>
<li><a href="/wiki/Clown" title="Clown">Clown</a></li>
<li><a href="/wiki/Dance" title="Dance">Dance</a></li>
<li><a href="/wiki/Gymnastics" title="Gymnastics">Gymnastics</a></li>
<li><a href="/wiki/Magic_(illusion)" title="Magic (illusion)">Magic</a></li>
<li><a href="/wiki/Mime_artist" title="Mime artist">Mime</a></li>
<li><a class="mw-selflink selflink

In [107]:
# A tag's children are stored in a list, accessed with .contents
table.contents

[<tr><td class="sidebar-pretitle" style="background:antiquewhite;">Part of <a href="/wiki/Category:Performing_arts" title="Category:Performing arts">a series</a> on</td></tr>,
 <tr><th class="sidebar-title-with-pretitle" style="background:antiquewhite;;display:block;margin-bottom:0.4em;"><a href="/wiki/Performing_arts" title="Performing arts">Performing arts</a></th></tr>,
 <tr><td class="sidebar-content hlist">
 <ul><li><a href="/wiki/Acrobatics" title="Acrobatics">Acrobatics</a></li>
 <li><a href="/wiki/Ballet" title="Ballet">Ballet</a></li>
 <li><a href="/wiki/List_of_circus_skills" title="List of circus skills">Circus skills</a></li>
 <li><a href="/wiki/Clown" title="Clown">Clown</a></li>
 <li><a href="/wiki/Dance" title="Dance">Dance</a></li>
 <li><a href="/wiki/Gymnastics" title="Gymnastics">Gymnastics</a></li>
 <li><a href="/wiki/Magic_(illusion)" title="Magic (illusion)">Magic</a></li>
 <li><a href="/wiki/Mime_artist" title="Mime artist">Mime</a></li>
 <li><a class="mw-selflink

In [22]:
len(table.contents)

4

In [23]:
table.contents[1]

<tr><th class="sidebar-title-with-pretitle" style="background:antiquewhite;;display:block;margin-bottom:0.4em;"><a href="/wiki/Performing_arts" title="Performing arts">Performing arts</a></th></tr>

In [24]:
# We can also go up the tree with .parent
table.parent

<table class="sidebar nomobile nowraplinks"><tbody><tr><td class="sidebar-pretitle" style="background:antiquewhite;">Part of <a href="/wiki/Category:Performing_arts" title="Category:Performing arts">a series</a> on</td></tr><tr><th class="sidebar-title-with-pretitle" style="background:antiquewhite;;display:block;margin-bottom:0.4em;"><a href="/wiki/Performing_arts" title="Performing arts">Performing arts</a></th></tr><tr><td class="sidebar-content hlist">
<ul><li><a href="/wiki/Acrobatics" title="Acrobatics">Acrobatics</a></li>
<li><a href="/wiki/Ballet" title="Ballet">Ballet</a></li>
<li><a href="/wiki/List_of_circus_skills" title="List of circus skills">Circus skills</a></li>
<li><a href="/wiki/Clown" title="Clown">Clown</a></li>
<li><a href="/wiki/Dance" title="Dance">Dance</a></li>
<li><a href="/wiki/Gymnastics" title="Gymnastics">Gymnastics</a></li>
<li><a href="/wiki/Magic_(illusion)" title="Magic (illusion)">Magic</a></li>
<li><a href="/wiki/Mime_artist" title="Mime artist">Mime

In [25]:
# table.parent is also a tag
# Thus, we can use .parent on it as well
table.parent.parent

<div class="mw-content-ltr mw-parser-output" dir="ltr" lang="en"><div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">Form of art using sound</div>
<style data-mw-deduplicate="TemplateStyles:r1033289096">.mw-parser-output .hatnote{font-style:italic}.mw-parser-output div.hatnote{padding-left:1.6em;margin-bottom:0.5em}.mw-parser-output .hatnote i{font-style:normal}.mw-parser-output .hatnote+link+.hatnote{margin-top:-0.5em}</style><div class="hatnote navigation-not-searchable" role="note">For other uses, see <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>.</div>
<p class="mw-empty-elt">
</p>
<figure class="mw-default-size" typeof="mw:File/Thumb"><a class="mw-file-description" href="/wiki/File:The_Sounds_of_Earth_-_GPN-2000-001976.jpg"><img class="mw-file-element" data-file-height="2388" data-file-width="2389" decoding="async" height="260" src="//upload.wikimedia.org/wikipedia/commons/th

In [26]:
# We use .parent to go up the tree
# But what about .children?
table.children

<list_iterator at 0x106b009d0>

In [113]:
# If we want a list of an element's children, we need to use table.contents as shown before
# .children is an iterator over that list, 
# which means we can use it in a for loop to iterate over all the children

for child in table.children:
    print(child)

<tr><td class="sidebar-pretitle" style="background:antiquewhite;">Part of <a href="/wiki/Category:Performing_arts" title="Category:Performing arts">a series</a> on</td></tr>
<tr><th class="sidebar-title-with-pretitle" style="background:antiquewhite;;display:block;margin-bottom:0.4em;"><a href="/wiki/Performing_arts" title="Performing arts">Performing arts</a></th></tr>
<tr><td class="sidebar-content hlist">
<ul><li><a href="/wiki/Acrobatics" title="Acrobatics">Acrobatics</a></li>
<li><a href="/wiki/Ballet" title="Ballet">Ballet</a></li>
<li><a href="/wiki/List_of_circus_skills" title="List of circus skills">Circus skills</a></li>
<li><a href="/wiki/Clown" title="Clown">Clown</a></li>
<li><a href="/wiki/Dance" title="Dance">Dance</a></li>
<li><a href="/wiki/Gymnastics" title="Gymnastics">Gymnastics</a></li>
<li><a href="/wiki/Magic_(illusion)" title="Magic (illusion)">Magic</a></li>
<li><a href="/wiki/Mime_artist" title="Mime artist">Mime</a></li>
<li><a class="mw-selflink selflink">Mus

## Searching by attributes

In [28]:
# We can search for tags based on their attributes, in addition to their name
soup.find('div', id = 'siteSub')

<div class="noprint" id="siteSub">From Wikipedia, the free encyclopedia</div>

In [128]:
soup.find_all("h2", class_="vector-pinnable-header-label")

[<h2 class="vector-pinnable-header-label">Contents</h2>]

### Passing attributes as function parameters

In [132]:
# By writing them as function parameters
# Notice that since class is a reserved word, we write class_
soup.find_all('a', class_ = 'vector-toc-link')

[<a class="vector-toc-link" href="#">
 <div class="vector-toc-text">(Top)</div>
 </a>,
 <a class="vector-toc-link" href="#Etymology_and_terminology">
 <div class="vector-toc-text">
 <span class="vector-toc-numb">1</span>Etymology and terminology</div>
 </a>,
 <a class="vector-toc-link" href="#History">
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2</span>History</div>
 </a>,
 <a class="vector-toc-link" href="#Origins_and_prehistory">
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2.1</span>Origins and prehistory</div>
 </a>,
 <a class="vector-toc-link" href="#Antiquity">
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2.2</span>Antiquity</div>
 </a>,
 <a class="vector-toc-link" href="#Asian_cultures">
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2.3</span>Asian cultures</div>
 </a>,
 <a class="vector-toc-link" href="#Western_classical">
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2.4</span>Western classica

In [133]:
# We can filter against multiple attributes at once
soup.find('a', class_ = 'vector-toc-link', href = '#Etymology_and_terminology')

<a class="vector-toc-link" href="#Etymology_and_terminology">
<div class="vector-toc-text">
<span class="vector-toc-numb">1</span>Etymology and terminology</div>
</a>

### Placing the attributes in a dictionary

In [139]:
# By writting the attributes in a dictionary
soup.find('a', attrs={'class':'vector-toc-link', 'href':'#Etymology_and_terminology'})

<a class="vector-toc-link" href="#Etymology_and_terminology">
<div class="vector-toc-text">
<span class="vector-toc-numb">1</span>Etymology and terminology</div>
</a>

In [140]:
soup.find('div', {'id' : 'vector-main-menu'})

<div class="vector-main-menu vector-pinnable-element" id="vector-main-menu">
<div class="vector-pinnable-header vector-main-menu-pinnable-header vector-pinnable-header-unpinned" data-feature-name="main-menu-pinned" data-pinnable-element-id="vector-main-menu" data-pinned-container-id="vector-main-menu-pinned-container" data-unpinned-container-id="vector-main-menu-unpinned-container">
<div class="vector-pinnable-header-label">Main menu</div>
<button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-main-menu.pin">move to sidebar</button>
<button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-main-menu.unpin">hide</button>
</div>
<div class="vector-menu mw-portlet mw-portlet-navigation" id="p-navigation">
<div class="vector-menu-heading">
		Navigation
	</div>
<div class="vector-menu-content">
<ul class="vector-menu-content-list">
<li class="mw-list

# Extracting data from the HTML tree

In [145]:
# Let's use some placeholder object to manipulate in the examples below
hyplink = soup.find('a', class_ = 'mw-jump-link')
type(hyplink)

bs4.element.Tag

In [142]:
# We can obtain the name of the tag with the .name attribute
hyplink.name

'a'

## Getting the attribute value

In [36]:
# We can access a tag’s attributes by treating the tag just like a dictionary

In [143]:
# First way
hyplink['href']

'#bodyContent'

In [146]:
# Notice how multi-valued attributes, such as class, return a list
hyplink['class']

['mw-jump-link']

In [147]:
# Second way
hyplink.get('href')

'#bodyContent'

In [148]:
# Again, class returns a list
hyplink.get('class')

['mw-jump-link']

#### Differences between these methods manifest when the key is missing

In [41]:
# tag['missing-key'] returns an error
# a['id'] will raise an error, if uncommented

In [149]:
# tag.get('missing-key') returns a default value None
hyplink.get('id')

In [150]:
# We can use repr() function to display all special characters and combinations (None, \n...)
print(hyplink.get('id'))

None


In [151]:
# We can also get all attribute name-value pairs in a dictionary
hyplink.attrs

{'class': ['mw-jump-link'], 'href': '#bodyContent'}

## Extracting the text

### .string vs .text

In [152]:
# We can access the raw string of an element by using .string
hyplink.string

'Jump to content'

In [153]:
# Alternativelly we can use .text
hyplink.text

'Jump to content'

#### They exhibit different behaviour when the element contains more than one distinct string

In [157]:
# This paragraph has many nested elements, with lots of different fragments of text
p = soup.find_all('p')[1]
p

<p><b>Music</b> is the arrangement of <a href="/wiki/Sound" title="Sound">sound</a> to create some combination of <a href="/wiki/Musical_form" title="Musical form">form</a>, <a href="/wiki/Harmony" title="Harmony">harmony</a>, <a href="/wiki/Melody" title="Melody">melody</a>, <a href="/wiki/Rhythm" title="Rhythm">rhythm</a>, or otherwise <a href="/wiki/Musical_expression" title="Musical expression">expressive content</a>.<sup class="reference" id="cite_ref-FOOTNOTE''OED''§&amp;nbsp;1_1-0"><a href="#cite_note-FOOTNOTE''OED''§&amp;nbsp;1-1">[1]</a></sup><sup class="reference" id="cite_ref-FOOTNOTE''AHD''§&amp;nbsp;1_2-0"><a href="#cite_note-FOOTNOTE''AHD''§&amp;nbsp;1-2">[2]</a></sup><sup class="reference" id="cite_ref-FOOTNOTEEpperson2022§_para._1_3-0"><a href="#cite_note-FOOTNOTEEpperson2022§_para._1-3">[3]</a></sup> However, <a href="/wiki/Definition_of_music" title="Definition of music">definitions of music</a> vary depending on culture,<sup class="reference" id="cite_ref-FOOTNOTEMit

In [158]:
# .text returns everything inside the element
p.text

'Music is the arrangement of sound to create some combination of form, harmony, melody, rhythm, or otherwise expressive content.[1][2][3] However, definitions of music vary depending on culture,[4] though it is an aspect of all human societies and a cultural universal.[5] While scholars agree that music is defined by a few specific elements, there is no consensus on their precise definitions.[6] The creation of music is commonly divided into musical composition, musical improvisation, and musical performance,[7] though the topic itself extends into academic disciplines, criticism, philosophy, psychology, and therapeutic contexts. Music may be performed using a vast range of instruments, including the human voice to sing, and thus is often credited for its extreme versatility and opportunity for creativity.[8]\n'

In [49]:
# .string returns None when there is more than 1 string
p.string

In [154]:
print(p.string)

None


In [51]:
p.parent

<div class="mw-content-ltr mw-parser-output" dir="ltr" lang="en"><div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">Form of art using sound</div>
<style data-mw-deduplicate="TemplateStyles:r1033289096">.mw-parser-output .hatnote{font-style:italic}.mw-parser-output div.hatnote{padding-left:1.6em;margin-bottom:0.5em}.mw-parser-output .hatnote i{font-style:normal}.mw-parser-output .hatnote+link+.hatnote{margin-top:-0.5em}</style><div class="hatnote navigation-not-searchable" role="note">For other uses, see <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>.</div>
<p class="mw-empty-elt">
</p>
<figure class="mw-default-size" typeof="mw:File/Thumb"><a class="mw-file-description" href="/wiki/File:The_Sounds_of_Earth_-_GPN-2000-001976.jpg"><img class="mw-file-element" data-file-height="2388" data-file-width="2389" decoding="async" height="260" src="//upload.wikimedia.org/wikipedia/commons/th

In [52]:
# We can stack different operations one after the other
p.parent.text

'Form of art using sound\nFor other uses, see Music (disambiguation).\n\n\nGrooved side of the Voyager Golden Record launched along the Voyager probes to space, which feature music from around the world\nPart of a series onPerforming arts\nAcrobatics\nBallet\nCircus skills\nClown\nDance\nGymnastics\nMagic\nMime\nMusic\nOpera\nProfessional wrestling\nPuppetry\nSpeech\nStand-up comedy\nStreet performance\nTheatre\nVentriloquism\nvte\nMusic is the arrangement of sound to create some combination of form, harmony, melody, rhythm, or otherwise expressive content.[1][2][3] However, definitions of music vary depending on culture,[4] though it is an aspect of all human societies and a cultural universal.[5] While scholars agree that music is defined by a few specific elements, there is no consensus on their precise definitions.[6] The creation of music is commonly divided into musical composition, musical improvisation, and musical performance,[7] though the topic itself extends into academic d

In [53]:
# semi-properly displayed text
print(p.parent.text)

Form of art using sound
For other uses, see Music (disambiguation).


Grooved side of the Voyager Golden Record launched along the Voyager probes to space, which feature music from around the world
Part of a series onPerforming arts
Acrobatics
Ballet
Circus skills
Clown
Dance
Gymnastics
Magic
Mime
Music
Opera
Professional wrestling
Puppetry
Speech
Stand-up comedy
Street performance
Theatre
Ventriloquism
vte
Music is the arrangement of sound to create some combination of form, harmony, melody, rhythm, or otherwise expressive content.[1][2][3] However, definitions of music vary depending on culture,[4] though it is an aspect of all human societies and a cultural universal.[5] While scholars agree that music is defined by a few specific elements, there is no consensus on their precise definitions.[6] The creation of music is commonly divided into musical composition, musical improvisation, and musical performance,[7] though the topic itself extends into academic disciplines, criticism, ph

In [54]:
# We can also use .get_text() instead of .text
p.parent.get_text()

'Form of art using sound\nFor other uses, see Music (disambiguation).\n\n\nGrooved side of the Voyager Golden Record launched along the Voyager probes to space, which feature music from around the world\nPart of a series onPerforming arts\nAcrobatics\nBallet\nCircus skills\nClown\nDance\nGymnastics\nMagic\nMime\nMusic\nOpera\nProfessional wrestling\nPuppetry\nSpeech\nStand-up comedy\nStreet performance\nTheatre\nVentriloquism\nvte\nMusic is the arrangement of sound to create some combination of form, harmony, melody, rhythm, or otherwise expressive content.[1][2][3] However, definitions of music vary depending on culture,[4] though it is an aspect of all human societies and a cultural universal.[5] While scholars agree that music is defined by a few specific elements, there is no consensus on their precise definitions.[6] The creation of music is commonly divided into musical composition, musical improvisation, and musical performance,[7] though the topic itself extends into academic d

In [55]:
print(p.parent.get_text())

Form of art using sound
For other uses, see Music (disambiguation).


Grooved side of the Voyager Golden Record launched along the Voyager probes to space, which feature music from around the world
Part of a series onPerforming arts
Acrobatics
Ballet
Circus skills
Clown
Dance
Gymnastics
Magic
Mime
Music
Opera
Professional wrestling
Puppetry
Speech
Stand-up comedy
Street performance
Theatre
Ventriloquism
vte
Music is the arrangement of sound to create some combination of form, harmony, melody, rhythm, or otherwise expressive content.[1][2][3] However, definitions of music vary depending on culture,[4] though it is an aspect of all human societies and a cultural universal.[5] While scholars agree that music is defined by a few specific elements, there is no consensus on their precise definitions.[6] The creation of music is commonly divided into musical composition, musical improvisation, and musical performance,[7] though the topic itself extends into academic disciplines, criticism, ph

In [56]:
# We can also extract the whole text of the webpage
# CAUTION: This includes JavaScript text, CSS and other not directly displayed text
print(soup.text)





Music - Wikipedia



































Jump to content







Main menu





Main menu
move to sidebar
hide



		Navigation
	


Main pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate





		Contribute
	


HelpLearn to editCommunity portalRecent changesUpload file



















Search











Search





























Create account

Log in








Personal tools





 Create account Log in





		Pages for logged out editors learn more



ContributionsTalk




























Contents
move to sidebar
hide




(Top)





1Etymology and terminology







2History



Toggle History subsection





2.1Origins and prehistory







2.2Antiquity







2.3Asian cultures







2.4Western classical





2.4.1Early music







2.4.2Common practice period





2.4.2.1Baroque







2.4.2.2Classicism







2.4.2.3Romanticism











2.520th and 21st century









3Creation



Toggle Creation subsection





3.1Composition

### .strings and .stripped_strings

In [160]:
p

<p><b>Music</b> is the arrangement of <a href="/wiki/Sound" title="Sound">sound</a> to create some combination of <a href="/wiki/Musical_form" title="Musical form">form</a>, <a href="/wiki/Harmony" title="Harmony">harmony</a>, <a href="/wiki/Melody" title="Melody">melody</a>, <a href="/wiki/Rhythm" title="Rhythm">rhythm</a>, or otherwise <a href="/wiki/Musical_expression" title="Musical expression">expressive content</a>.<sup class="reference" id="cite_ref-FOOTNOTE''OED''§&amp;nbsp;1_1-0"><a href="#cite_note-FOOTNOTE''OED''§&amp;nbsp;1-1">[1]</a></sup><sup class="reference" id="cite_ref-FOOTNOTE''AHD''§&amp;nbsp;1_2-0"><a href="#cite_note-FOOTNOTE''AHD''§&amp;nbsp;1-2">[2]</a></sup><sup class="reference" id="cite_ref-FOOTNOTEEpperson2022§_para._1_3-0"><a href="#cite_note-FOOTNOTEEpperson2022§_para._1-3">[3]</a></sup> However, <a href="/wiki/Definition_of_music" title="Definition of music">definitions of music</a> vary depending on culture,<sup class="reference" id="cite_ref-FOOTNOTEMit

In [57]:
# All strings inside an element can be accessed separatelly by using the .strings iterator

In [159]:
for s in p.strings:
    print(s)

Music
 is the arrangement of 
sound
 to create some combination of 
form
, 
harmony
, 
melody
, 
rhythm
, or otherwise 
expressive content
.
[1]
[2]
[3]
 However, 
definitions of music
 vary depending on culture,
[4]
 though it is an aspect of all human societies and a 
cultural universal
.
[5]
 While scholars agree that music is defined by a 
few specific elements
, there is 
no consensus
 on their precise definitions.
[6]
 The creation of music is commonly divided into 
musical composition
, 
musical improvisation
, and musical 
performance
,
[7]
 though the topic itself extends into 
academic disciplines
, 
criticism
, 
philosophy
, 
psychology
, and 
therapeutic contexts
. Music may be performed using a vast range of 
instruments
, including the 
human voice
 to 
sing
, and thus is often credited for its extreme versatility and opportunity for creativity.
[8]




In [161]:
# The extra whitespace can be removed by using the .stripped_strings iterator instead
for s in p.stripped_strings:
    print(s)

Music
is the arrangement of
sound
to create some combination of
form
,
harmony
,
melody
,
rhythm
, or otherwise
expressive content
.
[1]
[2]
[3]
However,
definitions of music
vary depending on culture,
[4]
though it is an aspect of all human societies and a
cultural universal
.
[5]
While scholars agree that music is defined by a
few specific elements
, there is
no consensus
on their precise definitions.
[6]
The creation of music is commonly divided into
musical composition
,
musical improvisation
, and musical
performance
,
[7]
though the topic itself extends into
academic disciplines
,
criticism
,
philosophy
,
psychology
, and
therapeutic contexts
. Music may be performed using a vast range of
instruments
, including the
human voice
to
sing
, and thus is often credited for its extreme versatility and opportunity for creativity.
[8]


# Practical examples

## Links - absolute path URL

In [60]:
# Let's use the variable links we defined a couple of lectures ago for this example
# It contains all the 'a' tags on this page
links

[<a class="mw-jump-link" href="#bodyContent">Jump to content</a>,
 <a accesskey="z" href="/wiki/Main_Page" title="Visit the main page [z]"><span>Main page</span></a>,
 <a href="/wiki/Wikipedia:Contents" title="Guides to browsing Wikipedia"><span>Contents</span></a>,
 <a href="/wiki/Portal:Current_events" title="Articles related to current events"><span>Current events</span></a>,
 <a accesskey="x" href="/wiki/Special:Random" title="Visit a randomly selected article [x]"><span>Random article</span></a>,
 <a href="/wiki/Wikipedia:About" title="Learn about Wikipedia and how it works"><span>About Wikipedia</span></a>,
 <a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" title="How to contact Wikipedia"><span>Contact us</span></a>,
 <a href="https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&amp;utm_medium=sidebar&amp;utm_campaign=C13_en.wikipedia.org&amp;uselang=en" title="Support us by donating to the Wikimedia Foundation"><span>Donate</span></a>,
 <a href=

In [61]:
# Let's choose one link to manipulate
link = links[26]
link

<a class="vector-toc-link" href="#Antiquity">
<div class="vector-toc-text">
<span class="vector-toc-numb">2.2</span>Antiquity</div>
</a>

In [62]:
# Get the link's text
link.string

In [63]:
# Extract the link's URL
link['href']

'#Antiquity'

In [64]:
# This is a relative URL
# To obtain the absolute URL address we will use urljoin

from urllib.parse import urljoin

In [65]:
# Now we need the address of the current page + the relative URL to compute the full-path URL
base_site

'https://en.wikipedia.org/wiki/Music'

In [66]:
relative_url = link['href']
relative_url

'#Antiquity'

In [67]:
full_url = urljoin(base_site, relative_url)
full_url

'https://en.wikipedia.org/wiki/Music#Antiquity'

## Processing multiple links at once

In [68]:
# We will work with:
links

[<a class="mw-jump-link" href="#bodyContent">Jump to content</a>,
 <a accesskey="z" href="/wiki/Main_Page" title="Visit the main page [z]"><span>Main page</span></a>,
 <a href="/wiki/Wikipedia:Contents" title="Guides to browsing Wikipedia"><span>Contents</span></a>,
 <a href="/wiki/Portal:Current_events" title="Articles related to current events"><span>Current events</span></a>,
 <a accesskey="x" href="/wiki/Special:Random" title="Visit a randomly selected article [x]"><span>Random article</span></a>,
 <a href="/wiki/Wikipedia:About" title="Learn about Wikipedia and how it works"><span>About Wikipedia</span></a>,
 <a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" title="How to contact Wikipedia"><span>Contact us</span></a>,
 <a href="https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&amp;utm_medium=sidebar&amp;utm_campaign=C13_en.wikipedia.org&amp;uselang=en" title="Support us by donating to the Wikimedia Foundation"><span>Donate</span></a>,
 <a href=

In [174]:
# Examining the link's addresses
[l.get('href') for l in links]   # Note that if l['href'] was written instead of l.get('href'), this would produce an error

['#bodyContent',
 '/wiki/Main_Page',
 '/wiki/Wikipedia:Contents',
 '/wiki/Portal:Current_events',
 '/wiki/Special:Random',
 '/wiki/Wikipedia:About',
 '//en.wikipedia.org/wiki/Wikipedia:Contact_us',
 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en',
 '/wiki/Help:Contents',
 '/wiki/Help:Introduction',
 '/wiki/Wikipedia:Community_portal',
 '/wiki/Special:RecentChanges',
 '/wiki/Wikipedia:File_upload_wizard',
 '/wiki/Main_Page',
 '/wiki/Special:Search',
 '/w/index.php?title=Special:CreateAccount&returnto=Music',
 '/w/index.php?title=Special:UserLogin&returnto=Music',
 '/w/index.php?title=Special:CreateAccount&returnto=Music',
 '/w/index.php?title=Special:UserLogin&returnto=Music',
 '/wiki/Help:Introduction',
 '/wiki/Special:MyContributions',
 '/wiki/Special:MyTalk',
 '#',
 '#Etymology_and_terminology',
 '#History',
 '#Origins_and_prehistory',
 '#Antiquity',
 '#Asian_cultures',
 '#Western_class

In [176]:
# Notice that some links don't have URL (None appears)

# Dropping the links without href attribute
clean_links = [l.get('href') for l in links if l.get('href') != None]
len(clean_links)

2497

In [72]:
# Transforming to absolute path URLs
full_urls = [urljoin(base_site, url) for url in relative_urls]
full_urls

['https://en.wikipedia.org/wiki/Music#bodyContent',
 'https://en.wikipedia.org/wiki/Main_Page',
 'https://en.wikipedia.org/wiki/Wikipedia:Contents',
 'https://en.wikipedia.org/wiki/Portal:Current_events',
 'https://en.wikipedia.org/wiki/Special:Random',
 'https://en.wikipedia.org/wiki/Wikipedia:About',
 'https://en.wikipedia.org/wiki/Wikipedia:Contact_us',
 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en',
 'https://en.wikipedia.org/wiki/Help:Contents',
 'https://en.wikipedia.org/wiki/Help:Introduction',
 'https://en.wikipedia.org/wiki/Wikipedia:Community_portal',
 'https://en.wikipedia.org/wiki/Special:RecentChanges',
 'https://en.wikipedia.org/wiki/Wikipedia:File_upload_wizard',
 'https://en.wikipedia.org/wiki/Main_Page',
 'https://en.wikipedia.org/wiki/Special:Search',
 'https://en.wikipedia.org/w/index.php?title=Special:CreateAccount&returnto=Music',
 'https://en.wikipedia.org/w/index.

In [73]:
# Extracting only URLs pointing to Wikipedia (internal URLs)
internal_links = [url for url in full_urls if 'wikipedia.org' in url]
internal_links

['https://en.wikipedia.org/wiki/Music#bodyContent',
 'https://en.wikipedia.org/wiki/Main_Page',
 'https://en.wikipedia.org/wiki/Wikipedia:Contents',
 'https://en.wikipedia.org/wiki/Portal:Current_events',
 'https://en.wikipedia.org/wiki/Special:Random',
 'https://en.wikipedia.org/wiki/Wikipedia:About',
 'https://en.wikipedia.org/wiki/Wikipedia:Contact_us',
 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en',
 'https://en.wikipedia.org/wiki/Help:Contents',
 'https://en.wikipedia.org/wiki/Help:Introduction',
 'https://en.wikipedia.org/wiki/Wikipedia:Community_portal',
 'https://en.wikipedia.org/wiki/Special:RecentChanges',
 'https://en.wikipedia.org/wiki/Wikipedia:File_upload_wizard',
 'https://en.wikipedia.org/wiki/Main_Page',
 'https://en.wikipedia.org/wiki/Special:Search',
 'https://en.wikipedia.org/w/index.php?title=Special:CreateAccount&returnto=Music',
 'https://en.wikipedia.org/w/index.

# Extracting data from nested tags

In [181]:
# Our objective now is to extract all links that can be found under a section heading
# Marked as 'Main article:' or 'See also:'
# By quick inspection, we see that these are contained in div tags with attribute 'role' set to 'note'

div_notes = soup.find_all("div", {"role": "note"})
div_notes

[<div class="hatnote navigation-not-searchable" role="note">For other uses, see <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>.</div>,
 <div class="hatnote navigation-not-searchable" role="note">Main article: <a href="/wiki/History_of_music" title="History of music">History of music</a></div>,
 <div class="hatnote navigation-not-searchable" role="note">Further information: <a class="mw-redirect" href="/wiki/Origins_of_music" title="Origins of music">Origins of music</a> and <a href="/wiki/Prehistoric_music" title="Prehistoric music">Prehistoric music</a></div>,
 <div class="hatnote navigation-not-searchable" role="note">Main article: <a href="/wiki/Ancient_music" title="Ancient music">Ancient music</a></div>,
 <div class="hatnote navigation-not-searchable" role="note">Main article: <a href="/wiki/Music_of_Asia" title="Music of Asia">Music of Asia</a></div>,
 <div class="hatnote navigation-not-searchable" role="note">

In [75]:
div_notes[0]

<div class="hatnote navigation-not-searchable" role="note">For other uses, see <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>.</div>

In [76]:
# We can apply find() and find_all() to a tag in the same way we do it to the whole document
div_notes[0].find('a')

<a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>

In [189]:
# A naive approach to get all links would be to use find
div_links = [div.find('a') for div in div_notes]
div_links

[<a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>,
 <a href="/wiki/History_of_music" title="History of music">History of music</a>,
 <a class="mw-redirect" href="/wiki/Origins_of_music" title="Origins of music">Origins of music</a>,
 <a href="/wiki/Ancient_music" title="Ancient music">Ancient music</a>,
 <a href="/wiki/Music_of_Asia" title="Music of Asia">Music of Asia</a>,
 <a href="/wiki/Classical_music" title="Classical music">Classical music</a>,
 <a href="/wiki/Baroque_music" title="Baroque music">Baroque music</a>,
 <a href="/wiki/Classical_period_(music)" title="Classical period (music)">Classical period (music)</a>,
 <a href="/wiki/Romantic_music" title="Romantic music">Romantic music</a>,
 <a href="/wiki/20th-century_music" title="20th-century music">20th-century music</a>,
 <a href="/wiki/Musical_composition" title="Musical composition">Musical composition</a>,
 <a href="/wiki/Performance" title="Performance"

In [78]:
len(div_links)

39

In [186]:
# However, some divs have more than 1 link
div_notes[2]

<div class="hatnote navigation-not-searchable" role="note">Further information: <a class="mw-redirect" href="/wiki/Origins_of_music" title="Origins of music">Origins of music</a> and <a href="/wiki/Prehistoric_music" title="Prehistoric music">Prehistoric music</a></div>

In [187]:
# This div has 2 links in it
div_notes[2].find_all('a')

[<a class="mw-redirect" href="/wiki/Origins_of_music" title="Origins of music">Origins of music</a>,
 <a href="/wiki/Prehistoric_music" title="Prehistoric music">Prehistoric music</a>]

In [81]:
# Therefore we need to use find_all
# Let's use a for loop

# Define initially empty list of links
div_links = []

for div in div_notes:
    anchors = div.find_all('a')
    
    # Need to add every link from anchors to div_links
    for a in anchors:
        div_links.append(a)
    
    # Can use div_links.extend(anchors) instead of the for loop
    

In [82]:
div_links

[<a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>,
 <a href="/wiki/History_of_music" title="History of music">History of music</a>,
 <a class="mw-redirect" href="/wiki/Origins_of_music" title="Origins of music">Origins of music</a>,
 <a href="/wiki/Prehistoric_music" title="Prehistoric music">Prehistoric music</a>,
 <a href="/wiki/Ancient_music" title="Ancient music">Ancient music</a>,
 <a href="/wiki/Music_of_Asia" title="Music of Asia">Music of Asia</a>,
 <a href="/wiki/Classical_music" title="Classical music">Classical music</a>,
 <a href="/wiki/Baroque_music" title="Baroque music">Baroque music</a>,
 <a href="/wiki/Classical_period_(music)" title="Classical period (music)">Classical period (music)</a>,
 <a href="/wiki/Romantic_music" title="Romantic music">Romantic music</a>,
 <a href="/wiki/20th-century_music" title="20th-century music">20th-century music</a>,
 <a href="/wiki/Musical_composition" title="Musical co

In [83]:
# We now have a complete list
len(div_links)

44

In [84]:
# Let's get the URLs
note_urls = [urljoin(base_site, l.get('href')) for l in div_links]
note_urls

['https://en.wikipedia.org/wiki/Music_(disambiguation)',
 'https://en.wikipedia.org/wiki/History_of_music',
 'https://en.wikipedia.org/wiki/Origins_of_music',
 'https://en.wikipedia.org/wiki/Prehistoric_music',
 'https://en.wikipedia.org/wiki/Ancient_music',
 'https://en.wikipedia.org/wiki/Music_of_Asia',
 'https://en.wikipedia.org/wiki/Classical_music',
 'https://en.wikipedia.org/wiki/Baroque_music',
 'https://en.wikipedia.org/wiki/Classical_period_(music)',
 'https://en.wikipedia.org/wiki/Romantic_music',
 'https://en.wikipedia.org/wiki/20th-century_music',
 'https://en.wikipedia.org/wiki/Musical_composition',
 'https://en.wikipedia.org/wiki/Performance',
 'https://en.wikipedia.org/wiki/Musical_improvisation',
 'https://en.wikipedia.org/wiki/Musical_notation',
 'https://en.wikipedia.org/wiki/Elements_of_music',
 'https://en.wikipedia.org/wiki/Pitch_(music)',
 'https://en.wikipedia.org/wiki/Melody',
 'https://en.wikipedia.org/wiki/Harmony',
 'https://en.wikipedia.org/wiki/Rhythm',
 'h

In [85]:
len(note_urls)

44

# Scraping multiple pages automatically - Extracting all the text from the note URLs

In [86]:
# We will use the links we obtained above
note_urls

['https://en.wikipedia.org/wiki/Music_(disambiguation)',
 'https://en.wikipedia.org/wiki/History_of_music',
 'https://en.wikipedia.org/wiki/Origins_of_music',
 'https://en.wikipedia.org/wiki/Prehistoric_music',
 'https://en.wikipedia.org/wiki/Ancient_music',
 'https://en.wikipedia.org/wiki/Music_of_Asia',
 'https://en.wikipedia.org/wiki/Classical_music',
 'https://en.wikipedia.org/wiki/Baroque_music',
 'https://en.wikipedia.org/wiki/Classical_period_(music)',
 'https://en.wikipedia.org/wiki/Romantic_music',
 'https://en.wikipedia.org/wiki/20th-century_music',
 'https://en.wikipedia.org/wiki/Musical_composition',
 'https://en.wikipedia.org/wiki/Performance',
 'https://en.wikipedia.org/wiki/Musical_improvisation',
 'https://en.wikipedia.org/wiki/Musical_notation',
 'https://en.wikipedia.org/wiki/Elements_of_music',
 'https://en.wikipedia.org/wiki/Pitch_(music)',
 'https://en.wikipedia.org/wiki/Melody',
 'https://en.wikipedia.org/wiki/Harmony',
 'https://en.wikipedia.org/wiki/Rhythm',
 'h

In [87]:
# The objective is to get all the useful text from those wikipedia pages

# We will do that by extracting all text contained in a paragraph element,
# for all paragraphs on a page,
# for all pages (in note_urls)

In [204]:
# initialize list to store paragraph text for each webpage
import time
start_time = time.time()
par_text = []


# creating a loop counter
i = 0

# Loop through each URL in note_urls
for url in note_urls:
    
    # connect to every webpage
    note_resp = requests.get(url)
    
    # checking if the request is successful
    if note_resp.status_code == 200:            # Everything is OK!
        print('URL #{0}: {1}'.format(i+1,url))    # print out the number of iteration and the URL to keep track of place in loop
    
    else:                                       # Something is wrong!
        print('Status code {0}: Skipping URL #{1}: {2}'.format(note_resp.status_code, i+1, url))
        i = i+1
        continue
        
    
    # get HTML from webpage
    note_html = note_resp.content
    
    # convert HTML to BeautifulSoup object
    note_soup = BeautifulSoup(note_html, "lxml") #compare running time with "html.parser"
    
    # find all "p" tags on the webpage
    note_pars = note_soup.find_all("p")
    
    # Get the text from each "p" tag
    text = [p.text for p in note_pars]
    
    # Append text from each "p" tag to our list, par_text
    par_text.append(text)
    
    # Incrementing the loop counter
    i = i+1

execution_time = time.time() - start_time
print("Execution time in seconds:", execution_time)

URL #1: https://en.wikipedia.org/wiki/Music_(disambiguation)
URL #2: https://en.wikipedia.org/wiki/History_of_music
URL #3: https://en.wikipedia.org/wiki/Origins_of_music
URL #4: https://en.wikipedia.org/wiki/Prehistoric_music
URL #5: https://en.wikipedia.org/wiki/Ancient_music
URL #6: https://en.wikipedia.org/wiki/Music_of_Asia
URL #7: https://en.wikipedia.org/wiki/Classical_music
URL #8: https://en.wikipedia.org/wiki/Baroque_music
URL #9: https://en.wikipedia.org/wiki/Classical_period_(music)
URL #10: https://en.wikipedia.org/wiki/Romantic_music
URL #11: https://en.wikipedia.org/wiki/20th-century_music
URL #12: https://en.wikipedia.org/wiki/Musical_composition
URL #13: https://en.wikipedia.org/wiki/Performance
URL #14: https://en.wikipedia.org/wiki/Musical_improvisation
URL #15: https://en.wikipedia.org/wiki/Musical_notation
URL #16: https://en.wikipedia.org/wiki/Elements_of_music
URL #17: https://en.wikipedia.org/wiki/Pitch_(music)
URL #18: https://en.wikipedia.org/wiki/Melody
URL #

In [230]:
# Inspecting the result for the first page
par_text[0]

['Music is an art form consisting of sound and silence, expressed through time.\n',
 'Music may also refer to:\n']

In [232]:
# We see that we have a list of all paragraph strings
# It would be more useful to have all the text as one string, not as a list of strings

# Merging all paragraphs of the first page into one long string
page_text = "".join(par_text[0])
page_text

'Music is an art form consisting of sound and silence, expressed through time.\nMusic may also refer to:\n'

In [213]:
len(par_text)

44

In [239]:
# Let's do that for all pages

# Merging all paragraphs for all pages
page_text = ["".join(text) for text in par_text]#

# Make it nicer to read
string_text = "".join(first_text)
print(string_text)

Music is an art form consisting of sound and silence, expressed through time.
Music may also refer to:

Although definitions of music vary wildly throughout the world, every known culture partakes in it, and it is thus considered a cultural universal. The origins of music remain highly contentious; commentators often relate it to the origin of language, with much disagreement surrounding whether music arose before, after or simultaneously with language. Many theories have been proposed by scholars from a wide range of disciplines, though none has achieved broad approval. Most cultures have their own mythical origins concerning the invention of music, generally rooted in their respective mythological, religious or philosophical beliefs.
The music of prehistoric cultures is first firmly dated to c. 40,000 BP of the Upper Paleolithic by evidence of bone flutes, though it remains unclear whether or not the actual origins lie in the earlier Middle Paleolithic period (300,000 to 50,000 BP). 

In [242]:
# Inspect result
print(page_text[0])

Music is an art form consisting of sound and silence, expressed through time.
Music may also refer to:



In [243]:
note_urls

['https://en.wikipedia.org/wiki/Music_(disambiguation)',
 'https://en.wikipedia.org/wiki/History_of_music',
 'https://en.wikipedia.org/wiki/Origins_of_music',
 'https://en.wikipedia.org/wiki/Prehistoric_music',
 'https://en.wikipedia.org/wiki/Ancient_music',
 'https://en.wikipedia.org/wiki/Music_of_Asia',
 'https://en.wikipedia.org/wiki/Classical_music',
 'https://en.wikipedia.org/wiki/Baroque_music',
 'https://en.wikipedia.org/wiki/Classical_period_(music)',
 'https://en.wikipedia.org/wiki/Romantic_music',
 'https://en.wikipedia.org/wiki/20th-century_music',
 'https://en.wikipedia.org/wiki/Musical_composition',
 'https://en.wikipedia.org/wiki/Performance',
 'https://en.wikipedia.org/wiki/Musical_improvisation',
 'https://en.wikipedia.org/wiki/Musical_notation',
 'https://en.wikipedia.org/wiki/Elements_of_music',
 'https://en.wikipedia.org/wiki/Pitch_(music)',
 'https://en.wikipedia.org/wiki/Melody',
 'https://en.wikipedia.org/wiki/Harmony',
 'https://en.wikipedia.org/wiki/Rhythm',
 'h

In [93]:
# Creating a dictionary with the (key,value) pairs being (url,text)
url_to_text = dict(zip(note_urls, page_text))  # You don't need to know the specifics of these functions

In [94]:
print(url_to_text['https://en.wikipedia.org/wiki/Music_theory'])


Music theory is the study of the practices and possibilities of music. The Oxford Companion to Music describes three interrelated uses of the term "music theory": The first is the "rudiments", that are needed to understand music notation (key signatures, time signatures, and rhythmic notation); the second is learning scholars' views on music from antiquity to the present; the third is a sub-topic of musicology that "seeks to define processes and general principles in music". The musicological approach to theory differs from music analysis "in that it takes as its starting-point not the individual work or performance but the fundamental materials from which it is built."[1]
Music theory is frequently concerned with describing how musicians and composers make music, including tuning systems and composition methods among other topics. Because of the ever-expanding conception of what constitutes music, a more inclusive definition could be the consideration of any sonic phenomena, includin

In [95]:
# A word of caution:
# We have not extracted all of the main content's text,
# as some text may be contained in lists and tables, outside of paragraphs we scraped