# Lab | Web scraping multiple pages

In [1]:
import pandas as pd
import numpy as np 
import requests
import time
import re

from bs4 import BeautifulSoup

In [2]:
def get_bs_object(url):
    soup = None
    
    response = requests.get(url)
    
    if (response.status_code < 300):
        soup = BeautifulSoup(response.content, "html.parser");
    
    return soup

## Get songs of the century

In [3]:
url = "https://en.wikipedia.org/wiki/Songs_of_the_Century"

In [4]:
soup = get_bs_object(url)

In [5]:
#soup.prettify()

In [6]:
bs_tags = soup.select("td")

In [7]:
tags = []

for element in bs_tags:
    tags.append(element.get_text())

In [8]:
tags

['1',
 '"Over the Rainbow" (Harold Arlen, E.Y. Harburg)',
 'Judy Garland',
 '1939\n',
 '2',
 '"White Christmas" (Irving Berlin)',
 'Bing Crosby',
 '1942\n',
 '3',
 '"This Land Is Your Land" (Woody Guthrie)',
 'Woody Guthrie',
 '1940\n',
 '4',
 '"Respect" (Otis Redding)',
 'Aretha Franklin',
 '1967\n',
 '5',
 '"American Pie" (Don McLean)',
 'Don McLean',
 '1972\n',
 '6',
 '"Boogie Woogie Bugle Boy" (Don Raye, Hughie Prince)',
 'The Andrews Sisters',
 '1941\n',
 '7',
 'West Side Story (album)',
 'Leonard Bernstein and Stephen Sondheim',
 '1957\n',
 '8',
 '"Take Me Out to the Ball Game" (Jack Norworth, Albert Von Tilzer)',
 'Billy Murray',
 '1908\n',
 '9',
 '"You\'ve Lost That Lovin\' Feelin\'" (Phil Spector, Barry Mann and Cynthia Weil)',
 'The Righteous Brothers',
 '1964\n',
 '10',
 '"The Entertainer" (Scott Joplin)',
 'Scott Joplin',
 '1902\n',
 '11',
 '"In the Mood" (Wingy Manone, Andy Razaf, Joe Garland)',
 'Glenn Miller Orchestra',
 '1940\n',
 '12',
 '"Rock Around the Clock" (Max C.

In [9]:
pattern_numbers = r'[^\d+]'

list_songs = []

for song in tags:
    if re.findall(pattern_numbers, song):
        list_songs.append(song)

In [10]:
pattern_date = r'\d{4}\n'

In [11]:
list_songs = [song for song in list_songs if not re.search(pattern_date, song)]

In [12]:
list_songs

['"Over the Rainbow" (Harold Arlen, E.Y. Harburg)',
 'Judy Garland',
 '"White Christmas" (Irving Berlin)',
 'Bing Crosby',
 '"This Land Is Your Land" (Woody Guthrie)',
 'Woody Guthrie',
 '"Respect" (Otis Redding)',
 'Aretha Franklin',
 '"American Pie" (Don McLean)',
 'Don McLean',
 '"Boogie Woogie Bugle Boy" (Don Raye, Hughie Prince)',
 'The Andrews Sisters',
 'West Side Story (album)',
 'Leonard Bernstein and Stephen Sondheim',
 '"Take Me Out to the Ball Game" (Jack Norworth, Albert Von Tilzer)',
 'Billy Murray',
 '"You\'ve Lost That Lovin\' Feelin\'" (Phil Spector, Barry Mann and Cynthia Weil)',
 'The Righteous Brothers',
 '"The Entertainer" (Scott Joplin)',
 'Scott Joplin',
 '"In the Mood" (Wingy Manone, Andy Razaf, Joe Garland)',
 'Glenn Miller Orchestra',
 '"Rock Around the Clock" (Max C. Freedman, James E. Myers)',
 'Bill Haley & His Comets',
 '"When the Saints Go Marching In"',
 'Louis Armstrong',
 '"You Are My Sunshine"',
 'Jimmie Davis',
 '"Mack the Knife" (Kurt Weill, Berthol

In [13]:
list_songs = [song.replace("'", '').replace('"', "") for song in list_songs]

In [14]:
songs = [song for song in list_songs[0::2]]

In [15]:
songs

['Over the Rainbow (Harold Arlen, E.Y. Harburg)',
 'White Christmas (Irving Berlin)',
 'This Land Is Your Land (Woody Guthrie)',
 'Respect (Otis Redding)',
 'American Pie (Don McLean)',
 'Boogie Woogie Bugle Boy (Don Raye, Hughie Prince)',
 'West Side Story (album)',
 'Take Me Out to the Ball Game (Jack Norworth, Albert Von Tilzer)',
 'Youve Lost That Lovin Feelin (Phil Spector, Barry Mann and Cynthia Weil)',
 'The Entertainer (Scott Joplin)',
 'In the Mood (Wingy Manone, Andy Razaf, Joe Garland)',
 'Rock Around the Clock (Max C. Freedman, James E. Myers)',
 'When the Saints Go Marching In',
 'You Are My Sunshine',
 'Mack the Knife (Kurt Weill, Bertholt Brecht)',
 '(I Cant Get No) Satisfaction (Mick Jagger, Keith Richards)',
 'Take the A Train (Billy Strayhorn, Joya Sherrill)',
 'Blueberry Hill (Larry Stock, Al Lewis)',
 'God Bless America (Irving Berlin)',
 'The Stars and Stripes Forever (John Philip Sousa)',
 'I Heard It Through the Grapevine (Norman Whitfield, Barrett Strong)',
 '(S

In [16]:
artist = [artist for artist in list_songs[1::2]]

In [17]:
artist

['Judy Garland',
 'Bing Crosby',
 'Woody Guthrie',
 'Aretha Franklin',
 'Don McLean',
 'The Andrews Sisters',
 'Leonard Bernstein and Stephen Sondheim',
 'Billy Murray',
 'The Righteous Brothers',
 'Scott Joplin',
 'Glenn Miller Orchestra',
 'Bill Haley & His Comets',
 'Louis Armstrong',
 'Jimmie Davis',
 'Bobby Darin',
 'The Rolling Stones',
 'Duke Ellington Orchestra',
 'Fats Domino',
 'Kate Smith',
 'Sousas Band',
 'Marvin Gaye',
 'Otis Redding',
 'Tony Bennett',
 'The Beach Boys',
 'Ben E. King']

In [18]:
century_songs = pd.DataFrame(list(zip(songs, artist)), columns=['Song', 'Artist'])

In [19]:
century_songs

Unnamed: 0,Song,Artist
0,"Over the Rainbow (Harold Arlen, E.Y. Harburg)",Judy Garland
1,White Christmas (Irving Berlin),Bing Crosby
2,This Land Is Your Land (Woody Guthrie),Woody Guthrie
3,Respect (Otis Redding),Aretha Franklin
4,American Pie (Don McLean),Don McLean
5,"Boogie Woogie Bugle Boy (Don Raye, Hughie Prince)",The Andrews Sisters
6,West Side Story (album),Leonard Bernstein and Stephen Sondheim
7,"Take Me Out to the Ball Game (Jack Norworth, A...",Billy Murray
8,"Youve Lost That Lovin Feelin (Phil Spector, Ba...",The Righteous Brothers
9,The Entertainer (Scott Joplin),Scott Joplin


In [20]:
pattern = r'\(.*?\)'

In [21]:
century_songs['Song'] = century_songs['Song'].apply(lambda x: re.sub(pattern, '', x))

In [22]:
century_songs

Unnamed: 0,Song,Artist
0,Over the Rainbow,Judy Garland
1,White Christmas,Bing Crosby
2,This Land Is Your Land,Woody Guthrie
3,Respect,Aretha Franklin
4,American Pie,Don McLean
5,Boogie Woogie Bugle Boy,The Andrews Sisters
6,West Side Story,Leonard Bernstein and Stephen Sondheim
7,Take Me Out to the Ball Game,Billy Murray
8,Youve Lost That Lovin Feelin,The Righteous Brothers
9,The Entertainer,Scott Joplin


------------

## Get singles of 2022

In [23]:
request_url = "https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_2022"

In [24]:
soup = get_bs_object(request_url)

In [25]:
soup

<!DOCTYPE html>

<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Billboard Year-End Hot 100 singles of 2022 - Wikipedia</title>
<script>document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled";(function(){var cookie=document.cookie.mat

In [26]:
song_tags = soup.find_all("tbody")

In [27]:
song_tags

[<tbody><tr>
 <th scope="col" style="background:#dde;"><abbr title="Number">No.</abbr>
 </th>
 <th scope="col" style="background:#dde;">Title
 </th>
 <th scope="col" style="background:#dde;">Artist(s)
 </th></tr>
 <tr>
 <td scope="row">1
 </td>
 <td>"<a href="/wiki/Heat_Waves" title="Heat Waves">Heat Waves</a>"</td>
 <td><a href="/wiki/Glass_Animals" title="Glass Animals">Glass Animals</a>
 </td></tr>
 <tr>
 <td scope="row">2
 </td>
 <td>"<a href="/wiki/As_It_Was" title="As It Was">As It Was</a>"</td>
 <td><a href="/wiki/Harry_Styles" title="Harry Styles">Harry Styles</a>
 </td></tr>
 <tr>
 <td scope="row">3
 </td>
 <td>"<a href="/wiki/Stay_(The_Kid_Laroi_and_Justin_Bieber_song)" title="Stay (The Kid Laroi and Justin Bieber song)">Stay</a>"</td>
 <td><a href="/wiki/The_Kid_Laroi" title="The Kid Laroi">The Kid Laroi</a> and <a href="/wiki/Justin_Bieber" title="Justin Bieber">Justin Bieber</a>
 </td></tr>
 <tr>
 <td scope="row">4
 </td>
 <td>"<a href="/wiki/Easy_on_Me" title="Easy on Me"

In [28]:
table = song_tags[0]

In [29]:
table

<tbody><tr>
<th scope="col" style="background:#dde;"><abbr title="Number">No.</abbr>
</th>
<th scope="col" style="background:#dde;">Title
</th>
<th scope="col" style="background:#dde;">Artist(s)
</th></tr>
<tr>
<td scope="row">1
</td>
<td>"<a href="/wiki/Heat_Waves" title="Heat Waves">Heat Waves</a>"</td>
<td><a href="/wiki/Glass_Animals" title="Glass Animals">Glass Animals</a>
</td></tr>
<tr>
<td scope="row">2
</td>
<td>"<a href="/wiki/As_It_Was" title="As It Was">As It Was</a>"</td>
<td><a href="/wiki/Harry_Styles" title="Harry Styles">Harry Styles</a>
</td></tr>
<tr>
<td scope="row">3
</td>
<td>"<a href="/wiki/Stay_(The_Kid_Laroi_and_Justin_Bieber_song)" title="Stay (The Kid Laroi and Justin Bieber song)">Stay</a>"</td>
<td><a href="/wiki/The_Kid_Laroi" title="The Kid Laroi">The Kid Laroi</a> and <a href="/wiki/Justin_Bieber" title="Justin Bieber">Justin Bieber</a>
</td></tr>
<tr>
<td scope="row">4
</td>
<td>"<a href="/wiki/Easy_on_Me" title="Easy on Me">Easy on Me</a>"</td>
<td><a 

In [30]:
table = table.find_all('td')

In [31]:
table

[<td scope="row">1
 </td>,
 <td>"<a href="/wiki/Heat_Waves" title="Heat Waves">Heat Waves</a>"</td>,
 <td><a href="/wiki/Glass_Animals" title="Glass Animals">Glass Animals</a>
 </td>,
 <td scope="row">2
 </td>,
 <td>"<a href="/wiki/As_It_Was" title="As It Was">As It Was</a>"</td>,
 <td><a href="/wiki/Harry_Styles" title="Harry Styles">Harry Styles</a>
 </td>,
 <td scope="row">3
 </td>,
 <td>"<a href="/wiki/Stay_(The_Kid_Laroi_and_Justin_Bieber_song)" title="Stay (The Kid Laroi and Justin Bieber song)">Stay</a>"</td>,
 <td><a href="/wiki/The_Kid_Laroi" title="The Kid Laroi">The Kid Laroi</a> and <a href="/wiki/Justin_Bieber" title="Justin Bieber">Justin Bieber</a>
 </td>,
 <td scope="row">4
 </td>,
 <td>"<a href="/wiki/Easy_on_Me" title="Easy on Me">Easy on Me</a>"</td>,
 <td><a href="/wiki/Adele" title="Adele">Adele</a>
 </td>,
 <td scope="row">5
 </td>,
 <td>"<a href="/wiki/Shivers_(Ed_Sheeran_song)" title="Shivers (Ed Sheeran song)">Shivers</a>"</td>,
 <td><a href="/wiki/Ed_Sheeran" 

In [32]:
test_table = [label.get_text() for label in table]

In [33]:
test_table

['1\n',
 '"Heat Waves"',
 'Glass Animals\n',
 '2\n',
 '"As It Was"',
 'Harry Styles\n',
 '3\n',
 '"Stay"',
 'The Kid Laroi and Justin Bieber\n',
 '4\n',
 '"Easy on Me"',
 'Adele\n',
 '5\n',
 '"Shivers"',
 'Ed Sheeran\n',
 '6\n',
 '"First Class"',
 'Jack Harlow\n',
 '7\n',
 '"Big Energy"',
 'Latto\n',
 '8\n',
 '"Ghost"',
 'Justin Bieber\n',
 '9\n',
 '"Super Gremlin"',
 'Kodak Black\n',
 '10\n',
 '"Cold Heart (Pnau remix)"',
 'Elton John and Dua Lipa\n',
 '11\n',
 '"Wait for U"',
 'Future featuring Drake and Tems\n',
 '12\n',
 '"About Damn Time"',
 'Lizzo\n',
 '13\n',
 '"Bad Habits"',
 'Ed Sheeran\n',
 '14\n',
 '"Thats What I Want"',
 'Lil Nas X\n',
 '15\n',
 '"Enemy"',
 'Imagine Dragons and JID\n',
 '16\n',
 '"Industry Baby"',
 'Lil Nas X and Jack Harlow\n',
 '17\n',
 '"ABCDEFU"',
 'Gayle\n',
 '18\n',
 '"Need to Know"',
 'Doja Cat\n',
 '19\n',
 '"Wasted on You"',
 'Morgan Wallen\n',
 '20\n',
 '"Me Porto Bonito"',
 'Bad Bunny and Chencho Corleone\n',
 '21\n',
 '"Woman"',
 'Doja Cat\n',
 

In [34]:
pattern = r'\d{1}\n'

In [35]:
test_table = [row for row in test_table if not re.search(pattern, row)]

In [36]:
test_table = [row.replace("'", "").replace('"', '').replace("\n", "") for row in test_table]

In [37]:
test_table

['Heat Waves',
 'Glass Animals',
 'As It Was',
 'Harry Styles',
 'Stay',
 'The Kid Laroi and Justin Bieber',
 'Easy on Me',
 'Adele',
 'Shivers',
 'Ed Sheeran',
 'First Class',
 'Jack Harlow',
 'Big Energy',
 'Latto',
 'Ghost',
 'Justin Bieber',
 'Super Gremlin',
 'Kodak Black',
 'Cold Heart (Pnau remix)',
 'Elton John and Dua Lipa',
 'Wait for U',
 'Future featuring Drake and Tems',
 'About Damn Time',
 'Lizzo',
 'Bad Habits',
 'Ed Sheeran',
 'Thats What I Want',
 'Lil Nas X',
 'Enemy',
 'Imagine Dragons and JID',
 'Industry Baby',
 'Lil Nas X and Jack Harlow',
 'ABCDEFU',
 'Gayle',
 'Need to Know',
 'Doja Cat',
 'Wasted on You',
 'Morgan Wallen',
 'Me Porto Bonito',
 'Bad Bunny and Chencho Corleone',
 'Woman',
 'Doja Cat',
 'Tití Me Preguntó',
 'Bad Bunny',
 'Running Up That Hill (A Deal with God)',
 'Kate Bush',
 'We Dont Talk About Bruno',
 'Carolina Gaitán, Mauro Castillo, Adassa, Rhenzy Feliz, Diane Guerrero, Stephanie Beatriz and the Encanto cast',
 'Late Night Talking',
 'Harry

In [38]:
pop_artists = test_table[1::2]

In [39]:
pop_songs = test_table[0::2]

In [40]:
pop_songs_2022_df = pd.DataFrame(list(zip(pop_artists, pop_songs)), columns = ['Artist', 'Song'])

-------------------------------------

In [41]:
pop_songs_2022_df.head()

Unnamed: 0,Artist,Song
0,Glass Animals,Heat Waves
1,Harry Styles,As It Was
2,The Kid Laroi and Justin Bieber,Stay
3,Adele,Easy on Me
4,Ed Sheeran,Shivers


In [42]:
century_songs.head()

Unnamed: 0,Song,Artist
0,Over the Rainbow,Judy Garland
1,White Christmas,Bing Crosby
2,This Land Is Your Land,Woody Guthrie
3,Respect,Aretha Franklin
4,American Pie,Don McLean
