In [4]:
import requests
from bs4 import BeautifulSoup
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import os

## Scrape Wikipedia

In [17]:
# Years covered by the analysis: 1960 through 2022 inclusive.
years = list(range(1960, 2023))

# One Wikipedia "Billboard Year-End Hot 100 singles" page URL per year,
# in the same order as `years`.
urls = [
    f'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{yr}'
    for yr in years
]
    

In [7]:
# Scrape one column of the chart table from every page in `urls`.
# column_entries_list[k] holds the entries scraped from urls[k]. Exactly one
# list is appended per URL (possibly empty) so that positions stay aligned
# with `years`, which a later cell relies on.
REQUEST_TIMEOUT_S = 30  # seconds to wait for each page before giving up
column_index = 1        # 0-based position among a row's <td> cells

column_entries_list = []
for url in urls:

    # fetch the page; raise on HTTP errors rather than parsing an error page
    response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
    response.raise_for_status()

    # parse the page content
    soup = BeautifulSoup(response.content, 'html.parser')

    # find the table that contains the data. The dict form only matches a
    # class attribute that is exactly "wikitable sortable", so fall back to a
    # CSS selector that also accepts tables carrying additional classes.
    table = soup.find('table', {'class': 'wikitable sortable'})
    if table is None:
        table = soup.select_one('table.wikitable.sortable')

    column_entries = []
    if table is None:
        print(f'No chart table found at {url}; recording no entries for this page')
    else:
        # skip the header row, then read the chosen cell from each data row
        for row in table.find_all('tr')[1:]:
            cells = row.find_all('td')
            if len(cells) <= column_index:
                # row has too few <td> cells to index safely; skip it
                continue
            # NOTE(review): if a page marks up its first column as <th>,
            # index 1 among the <td> cells is a different column than on
            # pages that use <td> throughout — verify against sample pages.
            entry = cells[column_index].text.strip()
            # strip every double-quote character from the entry
            column_entries.append(entry.replace('"', ''))

    # add this page's entries to the list of lists
    column_entries_list.append(column_entries)

## Scrape Spotify

In [8]:
# Spotify API credentials are read from the environment so that secrets never
# live in the notebook. SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET are the
# variable names spotipy itself documents for client credentials.
client_id = os.environ.get('SPOTIPY_CLIENT_ID', '')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET', '')

client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Maps each year to a DataFrame of audio features for the songs that were
# found on Spotify, with `song_name` and `year` columns added.
dfs_by_year = {}

# column_entries_list is positionally aligned with `years`, so pair them up.
for year, column_entries in zip(years, column_entries_list):

    audio_features = []
    song_names = []

    for song_name in column_entries:

        # NOTE(review): the query is the bare title with no artist, so the
        # top hit is not guaranteed to be the charting recording.
        results = sp.search(q=song_name, type='track', limit=1)

        # rely on the returned items themselves rather than the reported total
        items = results['tracks']['items']
        if not items:
            continue

        # audio_features returns a list; an entry may be None for a track
        # that has no features available
        features_list = sp.audio_features(items[0]['id'])
        features = features_list[0] if features_list else None
        if features is None:
            continue

        # the two lists are appended together so they stay the same length
        audio_features.append(features)
        song_names.append(song_name)

    # convert features to a dataframe
    df = pd.DataFrame(audio_features)

    # add the song name and year to the dataframe
    df['song_name'] = song_names
    df['year'] = year

    # store the dataframe in the dictionary
    dfs_by_year[year] = df


In [29]:
# Stack the per-year frames into one table. Passing the dict directly makes
# pandas use its keys (the years) as the outer level of the resulting index.
full_df = pd.concat(dfs_by_year)

# Save as CSV under ../data, creating the directory first so the write does
# not fail at the very end of a long scraping run.
outfile = os.path.join("..", "data", "top_songs_by_year.csv") # point to data location
os.makedirs(os.path.dirname(outfile), exist_ok=True)
full_df.to_csv(outfile)