# Scraping one page

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### getting the soup from the web page

In [2]:
url = 'https://www.popvortex.com/music/charts/top-100-songs.php'

In [3]:
response = requests.get(url)
response.status_code

200

In [4]:
soup = BeautifulSoup(response.content, "html.parser")
# soup

In [5]:
#chart-position-1 > div.chart-content.col-xs-12.col-sm-8 > p

### looking for the information we want (title and artists)

In [6]:
soup.select('#chart-position-1 > div.chart-content.col-xs-12.col-sm-8 > p')

[<p class="title-artist"><cite class="title">Unholy</cite><em class="artist">Sam Smith &amp; Kim Petras</em></p>]

In [7]:
soup.select('.title')[0].get_text()

'Unholy'

In [8]:
soup.select('.artist')[0].get_text()

'Sam Smith & Kim Petras'

In [9]:
len(soup.select('.title'))

100

### making 2 lists: titles and artists 

In [10]:
title = []
artist = []


num_iter = len(soup.select('.title'))

t_list = soup.select('.title')
a_list = soup.select('.artist')

for i in range(num_iter):
    title.append(t_list[i].get_text())
    artist.append(a_list[i].get_text())

### converting the lists into a data frame

In [11]:
top_songs = pd.DataFrame({'title':title,
                         'artist':artist})

In [12]:
top_songs

Unnamed: 0,title,artist
0,Unholy,Sam Smith & Kim Petras
1,I'm Good (Blue),David Guetta & Bebe Rexha
2,Thank God,Kane Brown & Katelyn Brown
3,wait in the truck,HARDY & Lainey Wilson
4,Everywhere,Fleetwood Mac
...,...,...
95,Boulevard of Broken Dreams (feat. John Gallagh...,Green Day
96,Betty (Get Money),Yung Gravy
97,Boom Clap,Charli XCX
98,One Way or Another (Hocus Pocus 2 Version),"Bette Midler, Sarah Jessica Parker & Kathy Najimy"


### finding a way to get a random song from the title column values and getting the input-output

In [13]:
import random

In [14]:
song = input("What's your favourite song?  ")

if song in top_songs['title'].values:
    print('Maybe this song will also like you! : ' + random.choice(title))
else:
    print("Sorry we don't have any suggestions")

What's your favourite song?  Unholy
Maybe this song will also like you! :1,2,3 Eoi!


# Scraping multiple pages

In [15]:
url1 = 'https://playback.fm/charts/top-100-songs/1960'

### Respectful scraping:

In [16]:
from time import sleep
from random import randint

In [17]:
iterations = range(1960,2021, 10)
pages = []

for i in iterations:
    # assemble the url:
    start_at= str(i)
    url = "https://playback.fm/charts/top-100-songs/" + start_at
    
    # download html with a get request:
#     response = requests.get(url)
    response = requests.get(url, headers = {"Accept-Language": "en-US"})

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # store response into "pages" list
    pages.append(response)

    # respectful nap:
    wait_time = randint(1,4000)
    print("I will sleep for " + str(wait_time/1000) + " second/s.")
    sleep(wait_time/1000)

Status code: 200
I will sleep for 2.541 second/s.
Status code: 200
I will sleep for 2.421 second/s.
Status code: 200
I will sleep for 1.359 second/s.
Status code: 200
I will sleep for 3.943 second/s.
Status code: 200
I will sleep for 1.436 second/s.
Status code: 200
I will sleep for 1.89 second/s.
Status code: 200
I will sleep for 1.895 second/s.


### checking information from one page to get the titles and artists 

In [18]:
# print(BeautifulSoup(pages[0].content, "html.parser").prettify())

In [19]:
soup1 = BeautifulSoup(pages[0].content, "html.parser")

In [20]:
# get the title
soup1.select('span > a')[0].get_text()

'\nAre You Lonesome Tonight?\n'

In [21]:
# get the artist
soup1.select('a.artist')[0].get_text()

'\nElvis Presley\n'

### Getting the list of titles and the list of artists from each page

In [22]:
titles = []


for i in range(len(pages)):
    parsed = BeautifulSoup(pages[i].content, "html.parser")
#     print(parsed)
    songs_html = parsed.select('td > span.song')
#     print(songs_html)
    for j in range(len(songs_html)):
        title = songs_html[j].get_text()
        titles.append(title)  


print(len(titles)) 


700


In [23]:
artists = []


for i in range(len(pages)):
    parsed = BeautifulSoup(pages[i].content, "html.parser")
#     print(parsed)
    songs_html = parsed.select('td > a.artist')
#     print(songs_html)
    for j in range(len(songs_html)):
#         print(songs_html[j])
        artist = songs_html[j].get_text()
        artists.append(artist)


print(len(artists)) # output: 631?, no 200 (just took first 4 pages)


700


### Converting the lists into data frames

In [24]:
decades_songs = pd.DataFrame({'title':titles,
                             'artist':artists})

decades_songs

Unnamed: 0,title,artist
0,\n\nAre You Lonesome Tonight?\n\n,\nElvis Presley\n
1,\n\nIt's Now Or Never\n\n,\nElvis Presley\n
2,\n\nMarina\n\n,\nRocco Granata\n
3,\n\nThe Twist\n\n,\nChubby Checker\n
4,\n\nTheme From 'A Summer Place'\n\n,\nPercy Faith\n
...,...,...
695,\n\nKings & Queens\n\n,\nAva Max\n
696,\n\nBang!\n\n,\nAJR\n
697,\n\nBandit\n\n,\nJuice Wrld & YoungBoy Never Broke Again\n
698,\n\nPhysical\n\n,\nDua Lipa\n


### Cleaning the \n

In [25]:
for col in decades_songs.columns:
     decades_songs[col] = decades_songs[col].str.replace('\n','')

In [26]:
decades_songs

Unnamed: 0,title,artist
0,Are You Lonesome Tonight?,Elvis Presley
1,It's Now Or Never,Elvis Presley
2,Marina,Rocco Granata
3,The Twist,Chubby Checker
4,Theme From 'A Summer Place',Percy Faith
...,...,...
695,Kings & Queens,Ava Max
696,Bang!,AJR
697,Bandit,Juice Wrld & YoungBoy Never Broke Again
698,Physical,Dua Lipa


# Getting another 100 songs to add to the top_songs data frame

In [27]:
url = 'https://www.lahiguera.net/musicalia/lista/lista_espanola_canciones/'

In [28]:
response = requests.get(url)
response.status_code

200

In [34]:
soup3 = BeautifulSoup(response.content, "html.parser")
soup3.select('#principal > div.listado > article:nth-child(2) > b:nth-child(4)')

[<b>Quevedo || BZRP Music Sessions #52</b>]

In [69]:
soup3.select('#principal > div.listado > article:nth-child(20)')[0].get_text()


'19\xa0\xa017\xa0\xa018Mariposas(Aitana)Aitana, Sangiovanni '

# Getting another 40 songs to add to the top_songs data frame

In [70]:
url = 'https://los40.com/lista40/'

In [71]:
response = requests.get(url)
response.status_code

200

In [77]:
soup4 = BeautifulSoup(response.content, "html.parser")
soup4.select('#portada > main > div.contenedor_principal.estirar > div.contenido_principal.estirar > div.columnas_principal_y_secundaria > div > div.lista40.principal > div:nth-child(4) > div.data-video > div.info_grupo > p')[0].get_text()

'Carretera y manta'

In [76]:
soup4.select('#portada > main > div.contenedor_principal.estirar > div.contenido_principal.estirar > div.columnas_principal_y_secundaria > div > div.lista40.principal > div:nth-child(4) > div.data-video > div.info_grupo > h4 > a')[0].get_text()

'Ana Mena;Belinda'

In [84]:
len(soup4.select('#portada > main > div.contenedor_principal.estirar > div.contenido_principal.estirar > div.columnas_principal_y_secundaria > div > div.lista40.principal > div.article'))

40

In [85]:
my_soup = soup4.select('#portada > main > div.contenedor_principal.estirar > div.contenido_principal.estirar > div.columnas_principal_y_secundaria > div > div.lista40.principal > div.article')

In [105]:
my_soup[0].select('p')[0].get_text()

'Las 12'

In [107]:
my_soup[0].select('h4 > a')[0].get_text()

'Ana Mena;Belinda'

In [114]:
my_soup[19].select('p')[0].get_text()

'Latidos'

In [116]:
my_soup[1].select('h4')[0].get_text()

'Rosalía'

In [119]:
song = []
artist = []


for i in range(len(my_soup)):
#     print(i)
    song.append(my_soup[i].select('p')[0].get_text())
    artist.append(my_soup[i].select('h4')[0].get_text())

In [120]:
spain_40 = pd.DataFrame({'title':song, 'artist':artist})
spain_40

Unnamed: 0,title,artist
0,Las 12,Ana Mena;Belinda
1,Despechá,Rosalía
2,Quevedo: Bzrp music sessions Vol. 52,Bizarrap;Quevedo
3,Mariposas,Aitana;Sangiovanni
4,I ain't worried,OneRepublic
5,Don't you worry,The Black Eyed Peas;David Guetta;Shakira
6,Hold me closer,Britney Spears;Elton John
7,Envolver,Anitta
8,As it was,Harry Styles
9,Carretera y manta,Pablo Alborán


In [121]:
spain_40['artist'] = spain_40['artist'].str.replace(';',' & ')
spain_40

Unnamed: 0,title,artist
0,Las 12,Ana Mena & Belinda
1,Despechá,Rosalía
2,Quevedo: Bzrp music sessions Vol. 52,Bizarrap & Quevedo
3,Mariposas,Aitana & Sangiovanni
4,I ain't worried,OneRepublic
5,Don't you worry,The Black Eyed Peas & David Guetta & Shakira
6,Hold me closer,Britney Spears & Elton John
7,Envolver,Anitta
8,As it was,Harry Styles
9,Carretera y manta,Pablo Alborán


In [122]:
top_hits = pd.concat([top_songs,spain_40],axis=0)
top_hits

Unnamed: 0,title,artist
0,Unholy,Sam Smith & Kim Petras
1,I'm Good (Blue),David Guetta & Bebe Rexha
2,Thank God,Kane Brown & Katelyn Brown
3,wait in the truck,HARDY & Lainey Wilson
4,Everywhere,Fleetwood Mac
...,...,...
35,Pegao,Camilo
36,SloMo,Chanel
37,Levantaremos al sol,Álvaro de Luna
38,Los niños del parque,Walls


In [123]:
top_hits.to_csv('top_hits.csv', index=False)