# Scraping one page

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### getting the soup from the web page

In [2]:
# Address of the PopVortex top-100-songs chart page scraped in this section.
url = "https://www.popvortex.com/music/charts/top-100-songs.php"

In [3]:
# Download the chart page. An explicit timeout is passed because requests
# waits indefinitely by default, which would hang the kernel if the server
# never answers.
response = requests.get(url, timeout=30)
# Last expression of the cell, so the HTTP status code is displayed.
response.status_code

200

In [30]:
# Parse the downloaded HTML so it can be queried with CSS selectors.
soup = BeautifulSoup(response.content, features="html.parser")

In [5]:
# CSS selector for the title/artist paragraph of the #1 chart entry:
# #chart-position-1 > div.chart-content.col-xs-12.col-sm-8 > p

### looking for the information we want (titles and artists)

In [6]:
# Check what the selector returns for the #1 chart entry.
soup.select('#chart-position-1 > div.chart-content.col-xs-12.col-sm-8 > p')

[<p class="title-artist"><cite class="title">Unholy</cite><em class="artist">Sam Smith &amp; Kim Petras</em></p>]

In [7]:
# Text of the first element carrying the "title" class.
soup.select('.title')[0].get_text()

'Unholy'

In [8]:
# Text of the first element carrying the "artist" class.
soup.select('.artist')[0].get_text()

'Sam Smith & Kim Petras'

In [9]:
# Number of "title" elements found on the page.
len(soup.select('.title'))

99

### making 2 lists: titles and artists 

In [10]:
# Collect the text of every title element and every artist element.
# Each selector is evaluated once (the page used to be searched twice for
# '.title'), and the two result lists are walked in parallel with zip().
t_list = soup.select('.title')
a_list = soup.select('.artist')

# Kept for any later cell that reads it: number of title elements found.
num_iter = len(t_list)

title = []
artist = []

# zip() stops at the shorter list, so a page with fewer artist elements than
# title elements no longer raises IndexError; the lists always end up the
# same length, which the data frame built from them requires.
for title_tag, artist_tag in zip(t_list, a_list):
    title.append(title_tag.get_text())
    artist.append(artist_tag.get_text())

### converting the lists into a data frame

In [11]:
# Combine the two parallel lists into a two-column table, one row per song.
song_columns = {"title": title, "artist": artist}
top_songs = pd.DataFrame(song_columns)

In [12]:
# Display the scraped chart as the cell's output.
top_songs

Unnamed: 0,title,artist
0,Unholy,Sam Smith & Kim Petras
1,Eagle (feat. KB),Transformation Worship
2,Everywhere,Fleetwood Mac
3,I'm Good (Blue),David Guetta & Bebe Rexha
4,Make It With You,Bread
...,...,...
94,Perfectly Loved (feat. TobyMac),Rachael Lampa
95,Surrender,Godsmack
96,Betty (Get Money),Yung Gravy
97,Country On,Luke Bryan


### Suggesting a random song from the title column in response to user input

In [13]:
import random

In [14]:
# Ask for a favourite song; if it is on the chart, suggest a different one.
song = input("What's your favourite song?  ")

# Candidates come from the data frame itself, so the suggestion does not
# depend on a loose `title` variable still holding the list of titles.
# The song the user typed is left out so it is never suggested back to them.
if song in top_songs['title'].values:
    other_songs = [t for t in top_songs['title'] if t != song]
else:
    other_songs = []

if other_songs:
    print('Maybe you will also like this song: ' + random.choice(other_songs))
else:
    # Either the song is not on the chart or there is nothing else to offer.
    print("Sorry we don't have any suggestions")

What's your favourite song?  Unholy
Maybe this song will also like you! You Proof


# Scraping multiple pages

In [15]:
# Example of a per-year chart address on playback.fm (here: the year 1960).
url1 = "https://playback.fm/charts/top-100-songs/1960"

### Respectful scraping:

In [16]:
from time import sleep
from random import randint

In [17]:
# Download one chart page per decade start year: 1960, 1970, ..., 2020.
iterations = range(1960, 2021, 10)
pages = []

for year in iterations:
    # assemble the url.
    # Loop-local names are used on purpose: assigning to `url` and `response`
    # here would overwrite the single-page variables defined earlier, and a
    # re-run of the earlier parsing cell would then parse a playback.fm page.
    page_url = "https://playback.fm/charts/top-100-songs/" + str(year)

    # download html with a get request; the timeout keeps a stalled server
    # from hanging the notebook (requests sets no timeout by default)
    page_response = requests.get(page_url,
                                 headers={"Accept-Language": "en-US"},
                                 timeout=30)

    # monitor the process by printing the status code
    print("Status code: " + str(page_response.status_code))

    # store response into "pages" list
    pages.append(page_response)

    # respectful nap: a random pause between 1 and 4000 milliseconds
    wait_time = randint(1, 4000)
    print("I will sleep for " + str(wait_time/1000) + " second/s.")
    sleep(wait_time/1000)

Status code: 200
I will sleep for 3.992 second/s.
Status code: 200
I will sleep for 3.721 second/s.
Status code: 200
I will sleep for 1.992 second/s.
Status code: 200
I will sleep for 3.522 second/s.
Status code: 200
I will sleep for 2.783 second/s.
Status code: 200
I will sleep for 3.65 second/s.
Status code: 200
I will sleep for 1.316 second/s.


### checking information from one page to get the titles and artists 

In [29]:
# print(BeautifulSoup(pages[0].content, "html.parser").prettify())

In [19]:
# Parse the first downloaded page to work out which selectors to use.
first_page = pages[0]
soup1 = BeautifulSoup(first_page.content, features="html.parser")

In [20]:
# get the title: text of the first link nested directly inside a <span>
# (the raw text still carries the surrounding newline characters)
soup1.select('span > a')[0].get_text()

'\nAre You Lonesome Tonight?\n'

In [21]:
# get the artist: text of the first link carrying the "artist" class
# (the raw text still carries the surrounding newline characters)
soup1.select('a.artist')[0].get_text()

'\nElvis Presley\n'

### Getting the list of titles and the list of artists from each page

In [22]:
# Gather the raw title text of every song on every downloaded page.
titles = []

for page in pages:
    parsed = BeautifulSoup(page.content, "html.parser")
    # A generator feeds extend() directly, so no variable named `title` is
    # assigned here; assigning one would replace the `title` list built in
    # the single-page section with a single string.
    titles.extend(tag.get_text() for tag in parsed.select('td > span.song'))

print(len(titles))


700


In [23]:
# Gather the raw artist text of every song on every downloaded page.
artists = []

for page in pages:
    parsed = BeautifulSoup(page.content, "html.parser")
    # A generator feeds extend() directly, so no variable named `artist` is
    # assigned here; assigning one would replace the `artist` list built in
    # the single-page section with a single string.
    artists.extend(tag.get_text() for tag in parsed.select('td > a.artist'))

# This count must equal len(titles): the two lists are later paired up
# position by position in a data frame.
print(len(artists))


700


### Converting the lists into data frames

In [24]:
# Pair every title with its artist, one row per song across all pages.
decade_columns = {"title": titles, "artist": artists}
decades_songs = pd.DataFrame(decade_columns)

decades_songs

Unnamed: 0,title,artist
0,\n\nAre You Lonesome Tonight?\n\n,\nElvis Presley\n
1,\n\nIt's Now Or Never\n\n,\nElvis Presley\n
2,\n\nMarina\n\n,\nRocco Granata\n
3,\n\nThe Twist\n\n,\nChubby Checker\n
4,\n\nTheme From 'A Summer Place'\n\n,\nPercy Faith\n
...,...,...
695,\n\nKings & Queens\n\n,\nAva Max\n
696,\n\nBang!\n\n,\nAJR\n
697,\n\nBandit\n\n,\nJuice Wrld & YoungBoy Never Broke Again\n
698,\n\nPhysical\n\n,\nDua Lipa\n


### Cleaning the `\n` characters

In [25]:
# Remove every newline character from each text column.
# regex=False pins a literal replacement: the default of `regex` differs
# between pandas versions, so it is stated explicitly here.
# apply() builds the cleaned frame in one step and leaves no loop variable
# behind in the notebook namespace.
decades_songs = decades_songs.apply(
    lambda column: column.str.replace('\n', '', regex=False)
)

In [26]:
# Display the frame again to confirm the newline characters are gone.
decades_songs

Unnamed: 0,title,artist
0,Are You Lonesome Tonight?,Elvis Presley
1,It's Now Or Never,Elvis Presley
2,Marina,Rocco Granata
3,The Twist,Chubby Checker
4,Theme From 'A Summer Place',Percy Faith
...,...,...
695,Kings & Queens,Ava Max
696,Bang!,AJR
697,Bandit,Juice Wrld & YoungBoy Never Broke Again
698,Physical,Dua Lipa
