In [1]:
# import modules
import numpy as np
import pandas as pd

# spotipy
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# full-width display
# NOTE: `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# CONFIDENTIAL.py included in .gitignore, contains private API key
# get a client_id and client_secret from developer.spotify.com
from CONFIDENTIAL import client_id, client_secret

display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas format
pd.set_option('display.float_format', '{:_.0f}'.format)
# NOTE: underscore separators ('_') are better than commas (',') because
# numbers with underscores work in Python without any extra effort.
# NOTE: this format shows floats with zero decimals, so fractional columns
# (e.g. danceability, energy) are displayed rounded to 0 or 1.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

### The Billboard 100

In [2]:
# Billboard Top 100 Historical Data
# link:  https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs
# via:  https://toolbox.google.com/datasetsearch
# NOTE: absolute, machine-specific path; point it at your local copy of charts.csv
url_billboard = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'

In [3]:
# load the weekly chart rows and convert the 'date' column to datetimes
df_billboard = (
    pd.read_csv(url_billboard)
    .assign(date=lambda charts: pd.to_datetime(charts['date']))
)
df_billboard.head(10)

Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board
0,2021-11-06,1,Easy On Me,Adele,1,1,3
1,2021-11-06,2,Stay,The Kid LAROI & Justin Bieber,2,1,16
2,2021-11-06,3,Industry Baby,Lil Nas X & Jack Harlow,3,1,14
3,2021-11-06,4,Fancy Like,Walker Hayes,4,3,19
4,2021-11-06,5,Bad Habits,Ed Sheeran,5,2,18
5,2021-11-06,6,Way 2 Sexy,Drake Featuring Future & Young Thug,6,1,8
6,2021-11-06,7,Shivers,Ed Sheeran,9,7,7
7,2021-11-06,8,Good 4 U,Olivia Rodrigo,7,1,24
8,2021-11-06,9,Need To Know,Doja Cat,11,9,20
9,2021-11-06,10,Levitating,Dua Lipa,8,2,56


In [4]:
# earliest and latest chart dates in the data
(df_billboard['date'].min(), df_billboard['date'].max())

(Timestamp('1958-08-04 00:00:00'), Timestamp('2021-11-06 00:00:00'))

### Unique Songs from The Billboard 100

In [71]:
# one row per distinct (song, artist) pair on the billboard 100,
# sorted by artist then song, with a blank 'id' column to be filled in later
df_billboard_songs = (
    df_billboard[['song', 'artist']]
    .drop_duplicates()
    .sort_values(['artist', 'song'])
    .reset_index(drop=True)
    .assign(id="")
)

df_billboard_songs.shape

(29681, 3)

### 1.2M songs with Metadata

In [100]:
# there are ~30k songs on the Billboard 100 list
# let's see how many are here:
# https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs
# via:  https://www.kaggle.com/datasets/
# Spotify 1.2M+ Songs
# NOTE: absolute, machine-specific path; point it at your local copy of tracks_features.csv
url_1M_songs = r'D:\RYERSON\820\Datasets\Spotify 1.2M+ Songs\tracks_features.csv'

# create the dataframe with the large number of songs metadata
df_1M_songs = pd.read_csv(url_1M_songs)

# song ids from the 1M dataset, kept as a set: they are only used for
# `track_id in metadata_ids` membership tests, which are constant-time on a set
# but a full linear scan on a list of this size
metadata_ids = set(df_1M_songs['id'])

### Spotify API setup

https://developer.spotify.com/terms/

https://developer.spotify.com/dashboard/applications/cd5ce2cb690543ff9967e817d4665543

##### Useful Spotify API Features

https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features

https://developer.spotify.com/documentation/web-api/reference/#/operations/get-recommendations

In [27]:
# Spotify Web API client, authenticated with the client_id / client_secret
# imported from CONFIDENTIAL.py; find_id() reads this module-level `spotify` object
spotify = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id = client_id,
        client_secret = client_secret
    )
)

In [105]:
def find_id(track_title, artist_name, metadata_ids=None):
    """
    Search Spotify for a track and return a Spotify track id for it.

    The search is sent through the module-level `spotify` client with the
    query 'artist:<artist_name> track:<track_title>'.

    Parameters
    ----------
    track_title : str
        Title of the song.
    artist_name : str
        Artist string to search for.
    metadata_ids : container of str, optional
        Track ids for which metadata is already available (anything that
        supports `in`; a set is fastest). When given, the first search
        result whose id is in this container is preferred over the top result.

    Returns
    -------
    str or None
        * None if the search returns no results; in the author's spot checks
          in the Spotify music player these songs were not on Spotify.
        * otherwise the id of the first result found in `metadata_ids`,
          or the id of the first result when there is no such match
          (or `metadata_ids` is not given).

    Notes
    -----
    * For searches with multiple results, all ids were identical in the
      test cases the author ran.
    * Some searches fail with a 404 error raised by `spotify.search`, even
      though the track seems to exist on Spotify. The error is not handled
      here, so callers that loop over many songs should catch it.
    """
    query = 'artist:' + artist_name + ' track:' + track_title
    track_info = spotify.search(q=query, type='track')
    items = track_info['tracks']['items']

    if not items:  # track doesn't exist on Spotify
        return None

    if metadata_ids is not None:
        # results arrive in the order Spotify returned them; stop at the first
        # one we have metadata for instead of letting later matches overwrite it
        for item in items:
            if item['id'] in metadata_ids:
                return item['id']

    # no match in the metadata: fall back to the first result
    return items[0]['id']

In [106]:
# spot check: look up the Spotify id of one known track
track_title = 'Heathens'
artist_name = 'twenty one pilots'
find_id(track_title, artist_name, metadata_ids)

'6i0V12jOa3mr6uu4WYhUBr'

In [107]:
# show the metadata row for the id returned by the spot check
df_1M_songs.loc[df_1M_songs['id'] == '6i0V12jOa3mr6uu4WYhUBr']

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date
959826,6i0V12jOa3mr6uu4WYhUBr,Heathens,Heathens,3J8W9AOjQhnBLCX33m3atT,['Twenty One Pilots'],['3YQKmKGau1PzlVlkL1iodx'],1,1,False,1,0,4,-9,0,0,0,0,0,1,90,195920,4,2016,2016-06-16


In [82]:
# run the raw search so the full result list can be inspected
track_title = 'Heathens'
artist_name = 'twenty one pilots'

track_info = spotify.search(
    q=f'artist:{artist_name} track:{track_title}',
    type='track',
)

In [98]:
# print every result id and flag the ones present in the 1M-song metadata
for result in track_info['tracks']['items']:
    track_id = result['id']
    print(track_id)
    if track_id in metadata_ids:
        print('match')

# maybe make a function that returns the match or if not available, the first entry

6i0V12jOa3mr6uu4WYhUBr
match
3JP0pKp18uaz66YdMpQQDf
3vZM2544ETmOCCHuBPZRzm
2N68XdwQ97tF145Ska3g44
3MIlaQrNHVh4mWROnr42Xp
5pyyN3d68eSFbwJRT0SIYO
4JpyiVLXZMX1JocLO4MQiu
0uB0a0FmuW3DZmfzG0ORlT
7LmyQJQrEsLq8YDnGieT6g
53unsqxbbaoznkaxaNk6eP


### Add Spotify IDs to billboard songs

In [78]:
# artist id, oops
# # add troublesome songs manually
# df_billboard_songs.loc[df_billboard_songs.song == 'Jai Ho! (You Are My Destiny)', 'id'] = '7Kpqjspw4Y7HrvItIRcBiW'
# df_billboard_songs.loc[df_billboard_songs.song == 'Wholy Holy', 'id'] = '0E9JjpG9gVNPcb8cTjRqBo'  # multiple versions, none match perfectly


In [76]:
%%time
# populate with ids, where available
for i, row in df_billboard_songs.iterrows():
    if i%100 == 0:
        print(i)
    if df_billboard_songs['id'].iloc[i] != "":  # start over where we finished
        continue
    else:
        artist = row[1]
        song = row[0]
        df_billboard_songs['id'].iloc[i] = find_id(song, artist)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200


HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:Aretha Franklin With James Cleveland & The Southern California Community Choir track:Wholy Holy', 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.


SpotifyException: http status: 404, code:-1 - https://api.spotify.com/v1/search?q=artist%3AAretha+Franklin+With+James+Cleveland+%26+The+Southern+California+Community+Choir+track%3AWholy+Holy&limit=10&offset=0&type=track:
 Not found., reason: None

In [79]:
# how many id have we added
# NOTE: this counts every row whose id is no longer the "" placeholder, so it
# also includes rows where find_id returned None (song not found on Spotify)
sum(df_billboard_songs.id != "")

1283

In [80]:
# save progress so far
# (relative path: the file is written to the current working directory)
df_billboard_songs.to_csv('df_billboard_songs.csv', index=False)

In [None]:
# now merge with entire billboard list



In [None]:
# check how many matches in 1.2M songs, what percent? SQL dataset? manual scraping?



### merging without spotify id is not working flawlessly, use API to get id, and check how many matches

In [None]:
# many indirect matches, need to be clever to merge
# match artist, look for billboard.song in 1M.name.str

In [None]:
# build lowercase merge keys from 'artist' and 'song'
# NOTE: df_billboard_songs is rebound to just these two key columns, so
# 'song', 'artist' and 'id' are no longer available on it afterwards and
# this cell cannot be re-run without rebuilding the frame first
df_billboard_songs = df_billboard_songs.assign(
    artist_lower=df_billboard_songs['artist'].str.lower(),
    track_lower=df_billboard_songs['song'].str.lower(),
)[['artist_lower', 'track_lower']]

In [None]:
# lowercase copies of the artist-list string and the track name, added in place
# (note the plural 'artists_lower': it holds the whole lowercased 'artists' string)
df_1M_songs['artists_lower'] = df_1M_songs['artists'].str.lower()
df_1M_songs['track_lower'] = df_1M_songs['name'].str.lower()

In [None]:
# now check if df_billboard_songs entries match 1M_songs entries

# lowercase billboard track titles as a plain Python list
billboard_tracks = df_billboard_songs['track_lower'].tolist()

# df_billboard_songs['ID'] = df_1M_songs[(df_billboard_songs.track_lower in df_1M_songs.track_lower) & (df_billboard_songs.artist_lower in df_1M_songs.artists_lower)]


In [None]:
# zero rows: displays only the column headers of the 1M-song frame
df_1M_songs.head(0)

In [None]:
# random sample of 100 billboard songs
# NOTE: no random_state is set, so the sample differs on every run
df_temp = df_billboard_songs.sample(100)

### Regex method only takes the first artist, so it doesn't work properly: it misses out-of-order artists

In [None]:
# formatting in df_1M_songs is difficult to parse
# need to extract list-style formatting using Regex:
# https://stackoverflow.com/questions/71469808/how-to-replace-a-list-with-first-element-of-list-in-pandas-dataframe-column
# regex:
#   (             # start capturing
#   (?<=\[["'])   # if preceded by [" or ['
#   [^"']*        # get all text until " or '
#   |             # OR
#   ^[^"']+$      # get whole string if it doesn't contain " or '
#   )             # stop capturing
#
# The pattern is a raw string so that `\[` reaches the regex engine as
# written; in a normal string literal it is an invalid escape sequence.
# Only the FIRST artist of each list-style 'artists' string is captured.
df_1M_songs['artist_lower'] = df_1M_songs['artists'].str.extract(
    r'((?<=\[["\'])[^"\']*|^[^"\']+$)', expand=False
)
df_1M_songs['artist_lower'] = df_1M_songs['artist_lower'].str.lower()
df_1M_songs['track_lower'] = df_1M_songs['name'].str.lower()

In [None]:
# now, let's see how many id matches we can get with the billboard tracks
# left join on the lowercase artist only; `indicator=True` adds a '_merge' column
merge_on = ['artist_lower']

df_temp = df_billboard_songs.merge(
    df_1M_songs,
    how='left',
    on=merge_on,
    indicator=True,
)
df_temp.shape

In [None]:
# peek at the first rows of the artist-only merge
df_temp.head()

In [None]:
# how many tracks have 'yankovic' in the extracted first-artist column?
df_1M_songs[df_1M_songs.artist_lower.str.contains('yankovic')].shape

In [None]:
# same check against the full, original-case 'artists' string, for comparison
df_1M_songs[df_1M_songs.artists.str.contains('Yankovic')].shape

In [None]:
# merge billboard 100 with 1.2M songs

# change merging fields to lowercase with matching names
df_billboard['artist_lower'] = df_billboard['artist'].str.lower()
df_billboard['track_lower'] = df_billboard['song'].str.lower()

# need to extract list-style formatting using Regex:
# https://stackoverflow.com/questions/71469808/how-to-replace-a-list-with-first-element-of-list-in-pandas-dataframe-column
# regex:
#   (             # start capturing
#   (?<=\[["'])   # if preceded by [" or ['
#   [^"']*        # get all text until " or '
#   |             # OR
#   ^[^"']+$      # get whole string if it doesn't contain " or '
#   )             # stop capturing
#
# The pattern is a raw string so that `\[` reaches the regex engine as
# written; in a normal string literal it is an invalid escape sequence.
# Only the FIRST artist of each list-style 'artists' string is captured.
df_1M_songs['artist_lower'] = df_1M_songs['artists'].str.extract(
    r'((?<=\[["\'])[^"\']*|^[^"\']+$)', expand=False
)
df_1M_songs['artist_lower'] = df_1M_songs['artist_lower'].str.lower()
df_1M_songs['track_lower'] = df_1M_songs['name'].str.lower()

# join tables
# outer join keeps unmatched rows from both sides; '_merge' records the origin
df = pd.merge(df_billboard, df_1M_songs, how='outer', on=['artist_lower', 'track_lower'], indicator=True)
df.shape, df[df._merge=='both'].shape

In [None]:
# how many missing songs?
# distinct (artist, track) pairs that appear only on the billboard side
(
    df.loc[df['_merge'] == 'left_only']
    .drop_duplicates(['artist_lower', 'track_lower'])
    .reset_index()
    .shape
)

In [None]:
# how many found songs?
# distinct (artist, track) pairs present in both tables
(
    df.loc[df['_merge'] == 'both']
    .drop_duplicates(['artist_lower', 'track_lower'])
    .reset_index()
    .shape
)
# lame

In [None]:
# distinct billboard-only (artist, track) pairs, sorted for manual inspection
missing_songs = (
    df.loc[df['_merge'] == 'left_only']
    .drop_duplicates(['artist_lower', 'track_lower'])
    .sort_values(['artist_lower', 'track_lower'])
    .reset_index()
    .loc[:, ['artist_lower', 'track_lower']]
)
missing_songs

In [None]:
# manually check database for these
# narrow view of the 1M-song frame: merge keys plus the album
manual_check = df_1M_songs.loc[:, ['artist_lower', 'track_lower', 'album']]

In [None]:
# exact-title lookup for one of the missing songs
manual_check.loc[manual_check['track_lower'] == 'velcro fly']

In [None]:
# can i use this to join?
# substring lookup: also finds titles that merely contain 'velcro fly'
manual_check[manual_check.track_lower.str.contains('velcro fly')]

# don't like this solution, it will be humongous
# https://stackoverflow.com/questions/58011182/pandas-merge-a-dataframe-on-an-exact-and-partial-match

In [None]:
# every 'zz top' track in the metadata, sorted by title
manual_check.loc[manual_check['artist_lower'] == 'zz top'].sort_values('track_lower')
# they're all in there, maybe we need a partial match for the merge?

In [None]:
# which songs were not matched to billboard 100?
# distinct (artist, track) pairs that appear only on the 1M-song side
unmatched = (
    df.loc[df['_merge'] == 'right_only']
    .drop_duplicates(['artist_lower', 'track_lower'])
    .loc[:, ['artists', 'artist_lower', 'track_lower']]
    .sort_values(['artist_lower', 'track_lower'])
    .reset_index()
)

In [None]:
# eyeball 20 random unmatched tracks (unseeded, so different on every run)
unmatched.sample(20)

### Can we merge Billboard 100 with Song Metadata?
### not well (~90% missing)
### NLP section only
    * we could use this merge for NLP
    * this could be a representative sample
    * discard for now because it would be nice to use the entire Billboard 100 dataset

In [None]:
# Billboard Top 100 Historical Data
# link:  https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs
# via:  https://toolbox.google.com/datasetsearch
# NOTE: absolute, machine-specific path; point it at your local copy of charts.csv
url_billboard = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'

# Music Dataset: Lyrics and Metadata from 1950 to 2019
# link:  https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs
# TODO(review): the link above is the same as the Billboard link, which looks
#   like a copy-paste slip; replace it with the actual source of tcc_ceds_music.csv
# via:  https://toolbox.google.com/datasetsearch
# info re metadata:  https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features
# also available via Spotify web API:
    # link:  https://developer.spotify.com/documentation/web-api/
# NOTE: absolute, machine-specific path; point it at your local copy of tcc_ceds_music.csv
url_songdata = r'D:\RYERSON\820\Datasets\Music Dataset Lyrics and Metadata from 1950 to 2019\tcc_ceds_music.csv'


In [None]:
# reload the charts and add lowercase join keys
# the key names match the columns of the other dataframe, whose values are lowercase
df_billboard = pd.read_csv(url_billboard).assign(
    artist_name=lambda charts: charts['artist'].str.lower(),
    track_name=lambda charts: charts['song'].str.lower(),
)

df_billboard.head(1)

In [None]:
# load the lyrics/metadata dataset
df_songdata = pd.read_csv(url_songdata)
# rename left disabled: the billboard frame gets matching 'artist_name' / 'track_name' columns instead
# df_songdata.rename(columns={'artist_name': 'artist', 'track_name': 'song'}, inplace=True)
df_songdata.head(1)

In [None]:
# (rows, columns) of each table before merging
df_billboard.shape, df_songdata.shape

In [None]:
# left join: keep every billboard row, attach song metadata where the
# lowercase artist and track names match; '_merge' records whether a match was found
df = df_billboard.merge(
    df_songdata,
    how='left',
    on=['artist_name', 'track_name'],
    indicator=True,
)
(df.shape, df.loc[df['_merge'] == 'both'].shape)

In [None]:
# lots of missing hits, let's check what's up
# keep only the join keys and the merge indicator for inspection
temp_columns = ['artist_name', 'track_name', '_merge']
temp = df[temp_columns]

In [None]:
# eyeball 10 random rows (unseeded, so different on every run)
temp.sample(10)

In [None]:
# spot check: which tracks does the song dataset hold for this artist?
df_songdata[df_songdata.artist_name=='vikki carr']

In [None]:
# spot check: which tracks does the song dataset hold for this artist?
df_songdata[df_songdata.artist_name=='walk the moon']

In [None]:
# ok, how many songs are missing? should we use the spotify API instead?

# missing tracks: distinct billboard rows with no match in the song dataset
missing = temp.loc[temp['_merge'] == 'left_only'].drop_duplicates()

# total tracks: distinct (song, artist) pairs on the billboard
all_tracks = df_billboard.loc[:, ['song', 'artist']].drop_duplicates()

In [None]:
# share of billboard tracks with no match
# NOTE: `missing` is de-duplicated on the lowercase keys while `all_tracks`
# uses the original-case columns, so the ratio is approximate
len(missing) / len(all_tracks)
# 90% of songs are missing