In [2]:
import json
import os
import time

import pandas as pd
import requests
import spotipy
from bs4 import BeautifulSoup
from spotipy.oauth2 import SpotifyClientCredentials

## Scrape Billboard

In [2]:
# Build one Billboard Hot 100 chart URL per calendar month, always dated the
# 1st of the month, for every year from 1960 through 2022.
years = list(range(1960, 2023))
months = list(range(1, 13))

# Years are the outer loop and months the inner one, so the list is in
# chronological order; year and month are zero-padded (e.g. .../1960-01-01/).
urls = [
    f'https://www.billboard.com/charts/hot-100/{year:04d}-{month:02d}-01/'
    for year in years
    for month in months
]

In [4]:
# Scrape every chart page in `urls` and map each song title to the URL of the
# chart it was found on.
# NOTE: the dictionary is keyed by title, so a title that shows up on several
# charts keeps only the URL of the last chart scraped.
song_url_dict = {}

# placeholder key used whenever a chart position's title cannot be parsed
MISSING_KEY = 'could_not_find'

for url in urls:
    result = requests.get(url)
    soup = BeautifulSoup(result.text, "html.parser")

    # get song 1 (selected with its own tag/class, separately from songs 2-100)
    topSong = soup.find("a", {"href": "#",
                            "class": "c-title__link lrv-a-unstyle-link"})

    # add to dictionary and remove linebreak characters.
    # find() returns None when the page lacks the expected markup; record the
    # placeholder instead of crashing and losing everything scraped so far.
    if topSong is None:
        song_name = MISSING_KEY
    else:
        song_name = topSong.text.strip().replace('\n', '')
    song_url_dict[song_name] = url

    # get songs 2-100
    song = soup.find_all("h3", {"class": "c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 lrv-u-font-size"
                                         "-18@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max "
                                         "a-truncate-ellipsis u-max-width-330 u-max-width-230@tablet-only",
                                "id": "title-of-a-story"})

    # add to dictionary and remove linebreak characters; positions missing
    # from the page are recorded under the placeholder key
    for i in range(99):
        if i < len(song):
            song_name = song[i].text.strip().replace('\n', '')
        else:
            song_name = MISSING_KEY
        song_url_dict[song_name] = url

# remove the missing key-value pair (None when every title was found, instead
# of raising KeyError)
removed_value = song_url_dict.pop(MISSING_KEY, None) # 1 song from march 1977 https://www.billboard.com/charts/hot-100/1977-03-01/'

In [20]:
# Persist the scraped title -> URL mapping as JSON so the slow Billboard
# scrape does not have to be repeated.
serialized_songs = json.dumps(song_url_dict)
with open("song_url_dict.json", "w") as file:
    file.write(serialized_songs)

## Scrape Spotify

In [None]:
# Reload the title -> URL mapping that was saved to JSON.
with open("song_url_dict.json", "r") as file:
    raw_json = file.read()
song_url_dict = json.loads(raw_json)

In [14]:
# Look up every scraped song on Spotify and collect its audio features into a
# dataframe, one row per song for which features were returned.

# create a requests session
session = requests.Session()

# set up credentials (blank in the notebook; fill in before running and do not
# commit real secrets)
client_id = ''
client_secret = ''

# create the Spotify client
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager, requests_session=session)

MAX_RETRIES = 3         # retries allowed after the first failed attempt
RETRY_WAIT_SECONDS = 5  # pause between attempts


def _call_with_retries(api_call, song_name):
    """Run ``api_call()`` and retry when it raises ``ReadTimeout``.

    Parameters:
        api_call: zero-argument callable that performs the API request.
        song_name: song title, used only in the progress messages.

    Returns:
        Whatever ``api_call()`` returns, or None once ``MAX_RETRIES`` retries
        have also timed out, so the caller can skip the song explicitly.
        Exceptions other than ReadTimeout propagate unchanged.
    """
    retries = 0
    while True:
        try:
            return api_call()
        except requests.exceptions.ReadTimeout:
            if retries < MAX_RETRIES:
                retries += 1
                print(f'ReadTimeout error encountered for "{song_name}". Retrying in {RETRY_WAIT_SECONDS} seconds (retry {retries}/{MAX_RETRIES})')
                time.sleep(RETRY_WAIT_SECONDS)
            else:
                print(f'ReadTimeout error encountered for "{song_name}". Maximum retries reached. Skipping.')
                return None


audio_features = []
song_names = []
urls = []

for song_name, url in song_url_dict.items(): # for each song name and url

    # search for the track; None means every attempt timed out, so skip the
    # song rather than reusing the previous song's search result
    results = _call_with_retries(lambda: sp.search(q=song_name, type='track', limit=1), song_name)
    if results is None:
        continue

    # nothing to look up when the search matched no track
    items = results['tracks']['items']
    if results['tracks']['total'] <= 0 or not items:
        continue
    track_id = items[0]['id']

    # fetch the audio features of the best match; a None or empty response
    # (including exhausted retries) is treated as "no features"
    feature_list = _call_with_retries(lambda: sp.audio_features(track_id), song_name)
    features = feature_list[0] if feature_list else None

    # check if features is None before appending to list
    if features is not None:
        audio_features.append(features)
        song_names.append(song_name)
        urls.append(url)

# convert features to a dataframe
df = pd.DataFrame(audio_features)

# add relevant columns to the dataframe (the three lists are appended to
# together, so they always have the same length)
df['song_name'] = song_names
df['scraped_url'] = urls

# extract date from URL and add to new column: the URLs end in ".../<date>/",
# so the date is the second-to-last "/"-separated piece
df['date'] = df['scraped_url'].apply(lambda x: x.split('/')[-2])


In [10]:
# Write the audio-feature dataframe to the data folder one level above the
# notebook's working directory.
data_dir = os.path.join("..", "data")
outfile = os.path.join(data_dir, "top_songs_by_month.csv")
df.to_csv(path_or_buf=outfile)