In [4]:
import requests
from bs4 import BeautifulSoup
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd

## Scrape Wikipedia

In [17]:
# get years of interest
years = list(range(1960, 2023)) # range of years
urls = []

# specify the URL of the Wikipedia page(s)
for year in years:
    url = f'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{year}'
    urls.append(url)
    

In [7]:
# iterate over each URL and scrape the column entries
column_entries_list = []
for url in urls:
    
    # make a GET request to the URL and get the page content
    response = requests.get(url)
    content = response.content

    # parse the page content 
    soup = BeautifulSoup(content, 'html.parser')

    # find the table that contains the data 
    table = soup.find('table', {'class': 'wikitable sortable'})

    # scrape first column
    column_index = 1
    column_entries = []

    # iterate over each row in the table and get the entry in the specified column
    for row in table.find_all('tr')[1:]:
        column_entry = row.find_all('td')[column_index].text.strip()
        column_entries.append(column_entry)

    # remove one of the quotation marks from each element in the list
    column_entries = [element.replace('"', '') for element in column_entries]

    # add the list of column entries to the list of lists
    column_entries_list.append(column_entries)

## Scrape Spotify

In [8]:
# set up the credentials
client_id = 'ac02b942434b4829916fd286f9466ba9'
client_secret = '72034cfab81a4335bf8f8525b0715dcb'

client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

dfs_by_year = {} # initialize dictionary

for i, column_entries in enumerate(column_entries_list): # for each year's top

    audio_features = []
    song_names = []
    
    for song_name in column_entries: # for each song name within the year
        
        results = sp.search(q=song_name, type='track', limit=1)

        if results['tracks']['total'] > 0:
            track_id = results['tracks']['items'][0]['id']
            features = sp.audio_features(track_id)[0]
            
            # check if features is None before appending to list
            if features is not None:
                audio_features.append(features)
                song_names.append(song_name)

    # convert features to a dataframe
    df = pd.DataFrame(audio_features)

    # add the song name and year to the dataframe
    df['song_name'] = song_names
    df['year'] = years[i]

    # store the dataframe in the dictionary
    dfs_by_year[years[i]] = df


In [12]:
# inspect
dfs_by_year[2022].head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,song_name,year
0,0.761,0.525,11,-6.9,1,0.0944,0.44,7e-06,0.0921,0.531,80.87,audio_features,3USxtqRwSYz57Ewm6wWRMp,spotify:track:3USxtqRwSYz57Ewm6wWRMp,https://api.spotify.com/v1/tracks/3USxtqRwSYz5...,https://api.spotify.com/v1/audio-analysis/3USx...,238805,4,Heat Waves,2022
1,0.52,0.731,6,-5.338,0,0.0557,0.342,0.00101,0.311,0.662,173.93,audio_features,4LRPiXqCikLlN15c3yImP7,spotify:track:4LRPiXqCikLlN15c3yImP7,https://api.spotify.com/v1/tracks/4LRPiXqCikLl...,https://api.spotify.com/v1/audio-analysis/4LRP...,167303,4,As It Was,2022
2,0.621,0.31,9,-10.164,0,0.0283,0.945,6.1e-05,0.117,0.125,111.893,audio_features,789CxjEOtO76BVD1A9yJQH,spotify:track:789CxjEOtO76BVD1A9yJQH,https://api.spotify.com/v1/tracks/789CxjEOtO76...,https://api.spotify.com/v1/audio-analysis/789C...,240707,4,Stay,2022
3,0.604,0.366,5,-7.519,1,0.0282,0.578,0.0,0.133,0.13,141.981,audio_features,0gplL1WMoJ6iYaPgMCL0gX,spotify:track:0gplL1WMoJ6iYaPgMCL0gX,https://api.spotify.com/v1/tracks/0gplL1WMoJ6i...,https://api.spotify.com/v1/audio-analysis/0gpl...,224695,4,Easy on Me,2022
4,0.788,0.859,2,-2.724,1,0.0856,0.281,0.0,0.0424,0.822,141.02,audio_features,50nfwKoDiSYg8zOCREWAm5,spotify:track:50nfwKoDiSYg8zOCREWAm5,https://api.spotify.com/v1/tracks/50nfwKoDiSYg...,https://api.spotify.com/v1/audio-analysis/50nf...,207853,4,Shivers,2022


In [18]:
# put dataframes into list
dfs = []
for year in years:
    if year in dfs_by_year:
        dfs.append(dfs_by_year[year])

# concatenate dataframes from list
full_df = pd.concat(dfs, axis=0)

In [19]:
full_df.tail()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,song_name,year
95,0.527,0.461,7,-5.908,1,0.0269,0.118,0.0,0.0831,0.227,128.153,audio_features,0De9jFjJ4eRLl7Yww2eBw1,spotify:track:0De9jFjJ4eRLl7Yww2eBw1,https://api.spotify.com/v1/tracks/0De9jFjJ4eRL...,https://api.spotify.com/v1/audio-analysis/0De9...,214405,3,Flower Shops,2022
96,0.745,0.65,2,-11.814,1,0.346,0.0451,0.00758,0.111,0.386,144.047,audio_features,5vUnjhBzRJJIAOJPde6zDx,spotify:track:5vUnjhBzRJJIAOJPde6zDx,https://api.spotify.com/v1/tracks/5vUnjhBzRJJI...,https://api.spotify.com/v1/audio-analysis/5vUn...,152137,4,To the Moon,2022
97,0.714,0.472,2,-7.375,1,0.0864,0.013,5e-06,0.266,0.238,131.121,audio_features,3nqQXoyQOWXiESFLlDF1hG,spotify:track:3nqQXoyQOWXiESFLlDF1hG,https://api.spotify.com/v1/tracks/3nqQXoyQOWXi...,https://api.spotify.com/v1/audio-analysis/3nqQ...,156943,4,Unholy,2022
98,0.471,0.846,0,-5.269,1,0.0389,0.00279,3e-06,0.145,0.539,100.089,audio_features,4FdPnT2cFrpWCmWZd7GXc3,spotify:track:4FdPnT2cFrpWCmWZd7GXc3,https://api.spotify.com/v1/tracks/4FdPnT2cFrpW...,https://api.spotify.com/v1/audio-analysis/4FdP...,213719,4,One Mississippi,2022
99,0.591,0.814,4,-4.986,1,0.0468,0.015,0.0,0.117,0.815,149.9,audio_features,13G5xv1wUKvJYbK0wYmioN,spotify:track:13G5xv1wUKvJYbK0wYmioN,https://api.spotify.com/v1/tracks/13G5xv1wUKvJ...,https://api.spotify.com/v1/audio-analysis/13G5...,195760,4,Circles Around This Town,2022


In [20]:
# save as csv
full_df.to_csv("top_songs_by_year.csv")