In [1]:
import os

import pandas as pd
import requests
import spotipy
from bs4 import BeautifulSoup
from spotipy.oauth2 import SpotifyClientCredentials

## Scrape Wikipedia

In [2]:
# Chart years to analyse; each one maps to its own Wikipedia
# "Billboard Year-End Hot 100 singles" page.
years = [2019, 2020]  # decide which years

# one page URL per year, in the same order as `years`
urls = [
    f'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{year}'
    for year in years
]

print(urls)

['https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_2019', 'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_2020']


In [3]:
# Scrape one column of the chart table from every page in `urls`.
# `column_entries_list` ends up with one list of strings per URL, in the same
# order as `urls`; any page that cannot be scraped raises instead of being
# skipped, so the lists never get out of step with the URLs they came from.
column_index = 1  # zero-based index of the <td> cell to read (the second column)
column_entries_list = []
for url in urls:

    # make a GET request; a timeout keeps the cell from hanging forever and
    # raise_for_status() stops us from parsing an HTTP error page as if it were the chart
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # find the table that contains the data; a CSS selector still matches when
    # the table carries additional classes, unlike an exact class-string lookup
    table = soup.select_one('table.wikitable.sortable')
    if table is None:
        raise ValueError(f'No sortable wikitable found at {url}')

    # iterate over each row after the header row and read the requested column
    column_entries = []
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if len(cells) <= column_index:
            # row has too few <td> cells (e.g. a header-only row): nothing to read
            continue
        # strip surrounding whitespace, then drop every double-quote character
        column_entries.append(cells[column_index].text.strip().replace('"', ''))

    # add this page's entries to the list of lists
    column_entries_list.append(column_entries)

# print the entries scraped from the first URL only
print(column_entries_list[0])


['Old Town Road', 'Sunflower', 'Without Me', 'Bad Guy', 'Wow', 'Happier', '7 Rings', 'Talk', 'Sicko Mode', 'Sucker', 'High Hopes', 'Thank U, Next', 'Truth Hurts', 'Dancing with a Stranger', 'Señorita', "I Don't Care", 'Eastside', 'Going Bad', 'Shallow', 'Better', 'No Guidance', 'Girls Like You', 'Sweet but Psycho', 'Suge', 'Middle Child', 'Drip Too Hard', 'Someone You Loved', 'Ransom', "If I Can't Have You", 'Goodbyes', 'Zeze', 'Better Now', 'Youngblood', 'Money in the Grave', 'Speechless', "Break Up with Your Girlfriend, I'm Bored", 'Please Me', 'Money', 'You Need to Calm Down', 'Panini', 'Look Back at It', 'A Lot', 'Me!', 'Mia', 'Pop Out', 'Beautiful Crazy', 'Thotiana', 'Lucid Dreams', 'Mo Bamba', 'Beautiful People', 'Wake Up in the Sky', 'Whiskey Glasses', "God's Country", 'Be Alright', 'Pure Water', 'The Git Up', 'Taki Taki', 'Close to Me', 'Envy Me', 'You Say', 'Hey Look Ma, I Made It', 'Circles', 'Beer Never Broke My Heart', 'The London', 'Con Calma', 'Murder on My Mind', "When t

## Fetch audio features from the Spotify API

In [5]:
# set up the credentials
# Read the Spotify API credentials from the environment instead of hardcoding
# them: secrets stored in a notebook are exposed to everyone who can read the
# file or its version history. Any key that has ever been committed in plain
# text should be rotated in the Spotify developer dashboard.
client_id = os.environ['SPOTIPY_CLIENT_ID']
client_secret = os.environ['SPOTIPY_CLIENT_SECRET']

client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

dfs_by_year = {}  # maps year -> DataFrame of audio features for that year's songs

# pair each year with its scraped entries directly rather than indexing `years` by position
for year, column_entries in zip(years, column_entries_list):

    audio_features = []
    song_names = []

    for song_name in column_entries:  # for each song name within the year

        # NOTE(review): the query is the bare title and only the top hit is kept,
        # so the match can be a different song that shares the title. Adding the
        # artist to the query would make the lookup more reliable.
        results = sp.search(q=song_name, type='track', limit=1)

        # skip titles without a hit; checking the returned items themselves
        # (rather than the reported total) rules out an IndexError on items[0]
        items = results['tracks']['items']
        if not items:
            continue

        features = sp.audio_features(items[0]['id'])[0]

        # features can be None; skip those so names and feature rows stay aligned
        if features is not None:
            audio_features.append(features)
            song_names.append(song_name)

    # convert the features to a dataframe (one row per matched song)
    df = pd.DataFrame(audio_features)

    # add the song name column to the dataframe
    df['song_name'] = song_names

    # add the year column to the dataframe
    df['year'] = year

    # store the dataframe in the dictionary
    dfs_by_year[year] = df


In [6]:
# inspect the first rows for the first configured year
# (keyed via `years` so the cell keeps working when the year list is edited)
dfs_by_year[years[0]].head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,song_name,year
0,0.878,0.619,6,-5.56,1,0.102,0.0533,0.0,0.113,0.639,136.041,audio_features,2YpeDb67231RjR0MgVLzsG,spotify:track:2YpeDb67231RjR0MgVLzsG,https://api.spotify.com/v1/tracks/2YpeDb67231R...,https://api.spotify.com/v1/audio-analysis/2Ype...,157067,4,Old Town Road,2019
1,0.76,0.479,2,-5.574,1,0.0466,0.556,0.0,0.0703,0.913,89.911,audio_features,3KkXRkHbMCARz0aVfEt68P,spotify:track:3KkXRkHbMCARz0aVfEt68P,https://api.spotify.com/v1/tracks/3KkXRkHbMCAR...,https://api.spotify.com/v1/audio-analysis/3KkX...,158040,4,Sunflower,2019
2,0.908,0.669,7,-2.827,1,0.0738,0.00286,0.0,0.237,0.662,112.238,audio_features,7lQ8MOhq6IN2w8EYcFNSUk,spotify:track:7lQ8MOhq6IN2w8EYcFNSUk,https://api.spotify.com/v1/tracks/7lQ8MOhq6IN2...,https://api.spotify.com/v1/audio-analysis/7lQ8...,290320,4,Without Me,2019
3,0.701,0.425,7,-10.965,1,0.375,0.328,0.13,0.1,0.562,135.128,audio_features,2Fxmhks0bxGSBdJ92vM42m,spotify:track:2Fxmhks0bxGSBdJ92vM42m,https://api.spotify.com/v1/tracks/2Fxmhks0bxGS...,https://api.spotify.com/v1/audio-analysis/2Fxm...,194088,4,Bad Guy,2019
4,0.829,0.539,11,-7.359,0,0.208,0.136,2e-06,0.103,0.388,99.96,audio_features,7xQAfvXzm3AkraOtGPWIZg,spotify:track:7xQAfvXzm3AkraOtGPWIZg,https://api.spotify.com/v1/tracks/7xQAfvXzm3Ak...,https://api.spotify.com/v1/audio-analysis/7xQA...,149547,4,Wow,2019


In [7]:
# inspect the column names for the first configured year
# (keyed via `years` so the cell keeps working when the year list is edited)
dfs_by_year[years[0]].columns.tolist()

['danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'type',
 'id',
 'uri',
 'track_href',
 'analysis_url',
 'duration_ms',
 'time_signature',
 'song_name',
 'year']

In [None]:
# TODO: concatenate the per-year DataFrames in dfs_by_year into a single DataFrame