# Set-up and Workflow

### Importing the packages

In [1]:
# Load the packages
import requests
from bs4 import BeautifulSoup

### Making a GET request

In [2]:
# URL of the page to scrape
base_site = "https://en.wikipedia.org/wiki/Music"

# Send the GET request. requests applies no timeout by default, so pass one
# explicitly (in seconds) to keep this cell from hanging on an unresponsive server.
response = requests.get(base_site, timeout=10)

# Display the HTTP status code of the reply (200 indicates success)
response.status_code

200

In [3]:
# Raw body of the response (a bytes object)
html = response.content

# Sanity check: peek at the first 100 bytes to confirm the reply really is HTML
html[0:100]

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title'

### Making the soup

In [4]:
# Parse the raw HTML into a BeautifulSoup object so that content can be pulled out of it more easily.
# "html.parser" is the parser bundled with Python, so nothing extra has to be installed.
soup = BeautifulSoup(markup=html, features="html.parser")

### Getting links on page

In [5]:
# find_all() collects every <a> tag in the document
links = soup.find_all(name='a')
links

[<a id="top"></a>,
 <a href="/wiki/Wikipedia:Protection_policy#semi" title="This article is semi-protected."><img alt="Page semi-protected" data-file-height="512" data-file-width="512" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/30px-Semi-protection-shackle.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/40px-Semi-protection-shackle.svg.png 2x" width="20"/></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#searchInput">Jump to search</a>,
 <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>,
 <a class="external text" href="https://en.wikipedia.org/w/index.php?title=Music&amp;action=edit">improve it</a>,
 <a href="/wiki/Talk:Music" title="Talk:Music

In [9]:
# Examining the links' title attributes.
# .get('title') returns None for a tag without that attribute,
# whereas link['title'] would raise a KeyError for such a tag.
[link.get('title') for link in links]

[None,
 'This article is semi-protected.',
 None,
 None,
 'Music (disambiguation)',
 None,
 'Talk:Music',
 'Help:Maintenance template removal',
 None,
 'Wikipedia:Verifiability',
 None,
 'Help:Referencing for beginners',
 None,
 None,
 None,
 None,
 None,
 None,
 'Help:Maintenance template removal',
 'Wikipedia:Article size',
 'Wikipedia:Splitting',
 'Wikipedia:Summary style',
 'Help:Section',
 'Talk:Music',
 'Help:Maintenance template removal',
 None,
 'Enlarge',
 'Voyager Golden Record',
 'Voyager program',
 'Category:Performing arts',
 'Performing arts',
 'Acrobatics',
 'Ballet',
 'List of circus skills',
 'Clown',
 'Dance',
 'Gymnastics',
 'Magic (illusion)',
 'Mime artist',
 None,
 'Opera',
 'Professional wrestling',
 'Puppetry',
 'Public speaking',
 'Stand-up comedy',
 'Theatre',
 'Ventriloquism',
 'Template:Performing arts',
 'Template talk:Performing arts',
 None,
 'The arts',
 'Sound',
 'Musical form',
 'Harmony',
 'Melody',
 'Rhythm',
 'Musical expression',
 None,
 None,
 Non

In [10]:
# Notice that some links don't have a title (None appears in the list of titles)

# Keep only the links that carry a title attribute.
# `is not None` is the idiomatic identity check for None (PEP 8).
clean_links = [link for link in links if link.get('title') is not None]

In [13]:
# Collect the title attribute of every link that has one
titles_of_links = [anchor.get('title') for anchor in clean_links]
titles_of_links

['This article is semi-protected.',
 'Music (disambiguation)',
 'Talk:Music',
 'Help:Maintenance template removal',
 'Wikipedia:Verifiability',
 'Help:Referencing for beginners',
 'Help:Maintenance template removal',
 'Wikipedia:Article size',
 'Wikipedia:Splitting',
 'Wikipedia:Summary style',
 'Help:Section',
 'Talk:Music',
 'Help:Maintenance template removal',
 'Enlarge',
 'Voyager Golden Record',
 'Voyager program',
 'Category:Performing arts',
 'Performing arts',
 'Acrobatics',
 'Ballet',
 'List of circus skills',
 'Clown',
 'Dance',
 'Gymnastics',
 'Magic (illusion)',
 'Mime artist',
 'Opera',
 'Professional wrestling',
 'Puppetry',
 'Public speaking',
 'Stand-up comedy',
 'Theatre',
 'Ventriloquism',
 'Template:Performing arts',
 'Template talk:Performing arts',
 'The arts',
 'Sound',
 'Musical form',
 'Harmony',
 'Melody',
 'Rhythm',
 'Musical expression',
 'Definition of music',
 'Cultural universal',
 'Elements of music',
 'Elements of music',
 'Musical composition',
 'Musica