### Importing the packages

In [1]:
# Load the packages
import requests
from bs4 import BeautifulSoup

### Making a GET request

In [2]:
# Defining the url of the site
base_site = "https://en.wikipedia.org/wiki/Music"

# Making a GET request.
# requests has no default timeout, so without one a stalled connection would
# hang this cell forever; 10 seconds is generous for a single page.
response = requests.get(base_site, timeout=10)
# Raise immediately on a 4xx/5xx status instead of parsing an error page below
response.raise_for_status()
response

<Response [200]>

In [4]:
# Extracting the HTML (raw bytes of the response body)
html = response.content
# Preview only the first bytes; echoing the entire page floods the notebook output
html[:500]

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Music - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"fe02a1ae-2e43-4345-8113-8e5a9843e50b","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Music","wgTitle":"Music","wgCurRevisionId":1119417309,"wgRevisionId":1119417309,"wgArticleId":18839,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles containing Ancient Greek (to 1453)-language text","Webarchive template wayback links","Pages containing links to subscription-only content","Wikipedia articles nee

### Making the soup

In [5]:
# Parse the raw HTML into a BeautifulSoup tree so that elements can be looked up by tag.
# "html.parser" is Python's built-in parser, so no extra parser package is needed.
soup = BeautifulSoup(markup=html, features="html.parser")

### 1. Extract all existing titles of links

In [16]:
# Collect every anchor (<a>) element in the document and count them
links = soup.find_all(name='a')
len(links)

2469

In [42]:
# Dropping the links without 'href' attribute.
# Tag.get() returns None for a missing attribute; None is checked by identity
# (`is not None`) rather than with `!=`.
links_href = [link for link in links if link.get("href") is not None]
len(links_href)

2466

In [41]:
# Read the "title" attribute of each remaining link (None where a link has no title)
titles = [anchor.get("title") for anchor in links_href]
len(titles)

2466

In [40]:
# Removing the 'None' titles, i.e. entries for links that had no 'title' attribute.
# None is checked by identity (`is not None`) rather than with `!=`.
titles = [title for title in titles if title is not None]
len(titles)

1894

### 2. Extract all heading 2 strings.

In [44]:
# Gather every <h2> element on the page and display them for inspection
h2 = soup.find_all(name="h2")
h2

[<h2 id="mw-toc-heading">Contents</h2>,
 <h2><span class="mw-headline" id="Etymology_and_terminology">Etymology and terminology</span></h2>,
 <h2><span class="mw-headline" id="History">History</span></h2>,
 <h2><span class="mw-headline" id="Creation">Creation</span></h2>,
 <h2><span class="mw-headline" id="Art_and_entertainment">Art and entertainment</span></h2>,
 <h2><span class="mw-headline" id="Elements">Elements</span></h2>,
 <h2><span class="mw-headline" id="Philosophy">Philosophy</span></h2>,
 <h2><span class="mw-headline" id="Psychology">Psychology</span></h2>,
 <h2><span class="mw-headline" id="Sociological_aspects">Sociological aspects</span></h2>,
 <h2><span class="mw-headline" id="Media_and_technology">Media and technology</span></h2>,
 <h2><span class="mw-headline" id="Education">Education</span></h2>,
 <h2><span class="mw-headline" id="Academic_study">Academic study</span></h2>,
 <h2><span class="mw-headline" id="Therapy">Therapy</span></h2>,
 <h2><span class="mw-headline"

In [47]:
# Get the text of each heading.
# get_text() is used rather than .string: .string is None whenever a tag has
# more than one child node, whereas get_text() always returns the combined
# text beneath the tag.
text = [heading.get_text() for heading in h2]
text

['Contents',
 'Etymology and terminology',
 'History',
 'Creation',
 'Art and entertainment',
 'Elements',
 'Philosophy',
 'Psychology',
 'Sociological aspects',
 'Media and technology',
 'Education',
 'Academic study',
 'Therapy',
 'See also',
 'References',
 'Further reading',
 'External links',
 'Navigation menu']

### 3. Print the whole footer text.

In [52]:
# By inspection: we see that the footer is contained inside a <footer> tag
footer = soup.find("footer")
# find() returns None when nothing matches; fail with a clear message instead
# of an AttributeError on `.text`
assert footer is not None, "No <footer> element found on the page"
print(footer.text)



 This page was last edited on 1 November 2022, at 13:36 (UTC).
Text is available under the Creative Commons Attribution-ShareAlike License 3.0;
additional terms may apply.  By using this site, you agree to the Terms of Use and Privacy Policy. Wikipedia® is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization.


Privacy policy
About Wikipedia
Disclaimers
Contact Wikipedia
Mobile view
Developers
Statistics
Cookie statement






