In [1]:
# import modules
import pandas as pd
import numpy as np
import spotipy

# full-width display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas format
pd.set_option('display.float_format', '{:_.0f}'.format)
# NOTE: underscore separaters ('_') are better than commas (',') because 
# numbers with underscores work in Python without any extra effort.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# spotipy
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# used for AUTHORIZATION WORKFLOW (abandoned)
# CONFIDENTIAL.py included in .gitignore, contains private API key
# use link above to get a client_id and client_secret from developer.spotify.com
from CONFIDENTIAL import client_id, client_secret, redirect_uri

### The Billboard 100

In [2]:
# Billboard Top 100 Historical Data
# link:  https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs
# via:  https://toolbox.google.com/datasetsearch
url_billboard = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'

df_billboard = pd.read_csv(url_billboard)
df_billboard['date'] = pd.to_datetime(df_billboard['date'])

### Unique Songs from The Billboard 100 Dataset

In [3]:
# just the songs on the billboard 100, once per song
df_billboard_songs = df_billboard[['song', 'artist']].drop_duplicates().sort_values(['artist', 'song']).reset_index(drop=True)

# add a blank id column
df_billboard_songs['id'] = ''

df_billboard_songs.shape

(29681, 3)

### 1.2M Songs with Metadata (csv)
https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs

In [5]:
# there are ~30k songs on the Billboard 100 list
# let's see how many are here:
# https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs
# via:  https://www.kaggle.com/datasets/
# Spotify 1.2M+ Songs
url_1M_songs = r'D:\RYERSON\820\Datasets\Spotify 1.2M+ Songs\tracks_features.csv'

# create the dataframe with the large number of songs metadata
df_1M_songs = pd.read_csv(url_1M_songs)

# make a list of song ids from the 1M dataset
metadata_ids = list(df_1M_songs.id)

### Get Spotify IDs using API
(to join with large datasets)

https://developer.spotify.com/terms/

##### Useful Spotify API Features

https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features

https://developer.spotify.com/documentation/web-api/reference/#/operations/get-recommendations

##### Other Links

my app:
https://developer.spotify.com/dashboard/applications/cd5ce2cb690543ff9967e817d4665543

a tutorial: 
https://www.youtube.com/watch?v=cU8YH2rhN6A&ab_channel=Elbert

In [None]:
# # AUTHORIZATION WORKFLOW 
# # (doens't work anymore, I think I was throttled, but can't get any error messages)

# # https://developer.spotify.com/documentation/general/guides/authorization/scopes/
# # i think this is the right scope, but nothing works when I connect to my app
# scope = 'user-read-private'

# oauth = spotipy.SpotifyOAuth(
#     client_id=client_id,
#     client_secret=client_secret,
#     redirect_uri=redirect_uri,
#     scope=scope
# )

# oauth = spotipy.SpotifyOAuth(
#     client_id=client_id,
#     client_secret=client_secret,
#     redirect_uri=redirect_uri
# )

# cached_token = oauth.get_cached_token()

# # create a spotify object
# spotify = spotipy.Spotify(auth=cached_token['access_token'])

### TEMPORARY TOKEN WORKFLOW

##### get a temporary authorization token from: https://developer.spotify.com/console/get-search-item

In [48]:
# input the temporary token
TEMP_TOKEN = input('Enter token: ')

Enter token: BQAQQ8tBekmiEmil6S3FiqdL_vosBNEm7fH2wow7izhFYQ6wcn-z8baA6VtQp-47KcOUN5izYuJllx9lqaTuyqcQaLALtS0lbVomikxHwF41cLM-eGTKPWdHdzSqxndBeTyS9m93KnPeurMwjxzT5Q-LJx8VtuhD7BSrwHisEIf9


In [49]:
# create a spotify object
spotify = spotipy.Spotify(auth=TEMP_TOKEN)

In [8]:
def find_id(track_title, artist_name, metadata_ids):
    """ 
    for searches with multiple results, all id were identical for the test cases I ran 
    some searches return no results, in this case the song is not on spotify
        confirmed by spot checks in the spotify music player
    some tracks give a 404 error, 
        these seem to exist in Spotify but 404 anyway
        not sure why the API does this but 
    """
    track_info = spotify.search(q='artist:' + artist_name + ' track:' + track_title, type='track')
    
    if track_info['tracks']['items'] == []:  # if track doesn't exist on Spotify
        return None
    else:
        # set the id to the first result
        track_id = track_info['tracks']['items'][0]['id']
        
        # check if there is a better match
        number_of_results = len(track_info['tracks']['items'])
        
        for i in range(number_of_results):
            alt_track_id = track_info['tracks']['items'][i]['id']
            if alt_track_id in metadata_ids:
                track_id = track_info['tracks']['items'][i]['id']
        
        return track_id

In [9]:
# TEST
find_id("You Can't Turn Me Off (In The Middle Of Turning Me On)", 'High Inergy', metadata_ids)

'7CfCUjAZFndcpAWLogl9J3'

### Add Spotify IDs to billboard songs

In [10]:
# load saved csv as df
df_billboard_songs = pd.read_csv('df_billboard_songs.csv', keep_default_na=False)

Unnamed: 0,song,artist,id
0,Misty,"""Groove"" Holmes",7DZsH0df0GuULl0FGwXMfd
1,What Now My Love,"""Groove"" Holmes",11Aldbvo6UCcVhBzv4oUdw
2,May The Bird Of Paradise Fly Up Your Nose,"""Little"" Jimmy Dickens",1WpoMGLjlEUHlWhilsOkJA
3,I Know I Know,"""Pookie"" Hudson",
4,Amish Paradise,"""Weird Al"" Yankovic",6nofbMbadUdrtZmIsBHyYE


In [11]:
df_billboard_songs.id.nunique()

17180

In [12]:
# start over at
start_over_at = 22953

# populate df_billboard_songs with ids, where available
for i, row in df_billboard_songs.iterrows():
    
    # start over at
    if i < start_over_at-1:
        continue
    
    # show status
    if i%10 == 0:
        print(i, end='  ')
    if i%100 == 0:
        print()
        
    # start over where we finished
    if df_billboard_songs['id'].iloc[i] != '':  
        continue
    # append id, NONE, or 'ERROR'
    else:
        artist = row[1]
        song = row[0]
        try:
            # unless there is an error, will append the id or None
            df_billboard_songs['id'].iloc[i] = find_id(song, artist, metadata_ids)
        except:  # any error needs to be dealt with manually
            # if there is an error, change id to 'ERROR'
            print('ERROR:  ', artist, song)
            df_billboard_songs['id'].iloc[i] = 'ERROR'
        # save every 1000 rows, if new
        if i%1000 == 0:
            df_billboard_songs.to_csv('df_billboard_songs_TEMP.csv', index=False)
        
# save final dataframe
df_billboard_songs.to_csv('df_billboard_songs.csv', index=False)

22960  22970  22980  22990  23000  
23010  23020  23030  23040  23050  23060  23070  23080  23090  

HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:Steve Allen and His Orchestra with The Copacabana Trio track:Cuando Calienta El Sol (When The Sun Is Hot)', 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.


23100  
ERROR:   Steve Allen and His Orchestra with The Copacabana Trio Cuando Calienta El Sol (When The Sun Is Hot)
23110  23120  23130  23140  23150  23160  23170  23180  23190  23200  
23210  23220  23230  23240  23250  23260  23270  23280  23290  23300  
23310  23320  23330  23340  23350  23360  23370  23380  23390  23400  
23410  23420  23430  23440  23450  23460  23470  23480  23490  23500  
23510  23520  23530  23540  23550  23560  23570  23580  23590  23600  
23610  23620  23630  

HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': "artist:Talking Heads track:Life During Wartime (This Ain't No Party...This Ain't No Disco...This Ain't", 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.


ERROR:   Talking Heads Life During Wartime (This Ain't No Party...This Ain't No Disco...This Ain't
23640  23650  23660  23670  23680  23690  23700  
23710  23720  23730  23740  23750  23760  23770  23780  23790  23800  
23810  23820  23830  23840  23850  23860  23870  23880  23890  23900  
23910  23920  23930  23940  23950  23960  23970  23980  23990  24000  
24010  24020  24030  24040  24050  24060  24070  24080  24090  24100  
24110  24120  24130  24140  24150  24160  24170  24180  24190  24200  
24210  24220  24230  24240  24250  

HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:The Barden Bellas, The Treblemakers & The BU Harmonics track:Riff Off: Mickey / Like A Virgin / Hit Me With Your Best Shot...', 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.


24260  ERROR:   The Barden Bellas, The Treblemakers & The BU Harmonics Riff Off: Mickey / Like A Virgin / Hit Me With Your Best Shot...
24270  24280  24290  24300  
24310  24320  24330  24340  24350  24360  24370  24380  24390  24400  
24410  24420  24430  24440  24450  24460  24470  24480  24490  24500  
24510  24520  24530  24540  24550  24560  24570  24580  24590  24600  
24610  24620  24630  24640  24650  24660  24670  

HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:The Chi-lites track:There Will Never Be Any Peace (Until God Is Seated At The Conference Table)', 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.


ERROR:   The Chi-lites There Will Never Be Any Peace (Until God Is Seated At The Conference Table)
24680  24690  24700  
24710  24720  24730  24740  24750  24760  24770  24780  24790  24800  
24810  24820  24830  24840  24850  24860  24870  24880  24890  24900  
24910  24920  24930  24940  24950  24960  24970  24980  24990  25000  
25010  25020  25030  25040  25050  25060  25070  25080  25090  25100  
25110  25120  25130  25140  25150  25160  25170  25180  25190  25200  
25210  25220  25230  25240  25250  25260  25270  25280  25290  25300  
25310  25320  25330  25340  25350  25360  25370  25380  25390  25400  
25410  25420  25430  25440  25450  25460  25470  25480  25490  25500  
25510  25520  25530  25540  25550  25560  25570  25580  25590  25600  
25610  25620  25630  25640  25650  25660  25670  25680  25690  25700  
25710  25720  25730  25740  25750  25760  25770  25780  25790  25800  
25810  25820  25830  25840  25850  25860  25870  25880  25890  25900  
25910  25920  25930  25940 

HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:The Partridge Family Starring Shirley Jones Featuring David Cassidy track:Breaking Up Is Hard To Do', 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.
HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': "artist:The Partridge Family Starring Shirley Jones Featuring David Cassidy track:Doesn't Somebody Want To Be Wanted", 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.


ERROR:   The Partridge Family Starring Shirley Jones Featuring David Cassidy Breaking Up Is Hard To Do
ERROR:   The Partridge Family Starring Shirley Jones Featuring David Cassidy Doesn't Somebody Want To Be Wanted


HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:The Partridge Family Starring Shirley Jones Featuring David Cassidy track:I Woke Up In Love This Morning', 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.
HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': "artist:The Partridge Family Starring Shirley Jones Featuring David Cassidy track:I'll Meet You Halfway", 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.
HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': "artist:The Partridge Family Starring Shirley Jones Featuring David Cassidy track:It's One Of Those Nights (Yes Love)", 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.


ERROR:   The Partridge Family Starring Shirley Jones Featuring David Cassidy I Woke Up In Love This Morning
26370  ERROR:   The Partridge Family Starring Shirley Jones Featuring David Cassidy I'll Meet You Halfway
ERROR:   The Partridge Family Starring Shirley Jones Featuring David Cassidy It's One Of Those Nights (Yes Love)


HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:The Partridge Family Starring Shirley Jones Featuring David Cassidy track:Looking Through The Eyes Of Love', 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.


ERROR:   The Partridge Family Starring Shirley Jones Featuring David Cassidy Looking Through The Eyes Of Love
26380  26390  

HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:The Pipes And Drums And The Military Band Of The Royal Scots Dragoon Guards track:Amazing Grace', 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.


26400  
ERROR:   The Pipes And Drums And The Military Band Of The Royal Scots Dragoon Guards Amazing Grace
26410  26420  26430  26440  26450  26460  26470  26480  26490  26500  
26510  26520  26530  26540  26550  26560  26570  26580  26590  26600  
26610  26620  26630  26640  26650  26660  26670  26680  26690  26700  
26710  26720  26730  26740  26750  26760  26770  26780  26790  26800  
26810  26820  26830  26840  26850  26860  26870  26880  26890  26900  
26910  26920  26930  26940  26950  

HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:The Star Wars Intergalactic Droid Choir & Chorale track:What Can You Get A Wookiee For Christmas (When He Already Owns A Comb?)', 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.


ERROR:   The Star Wars Intergalactic Droid Choir & Chorale What Can You Get A Wookiee For Christmas (When He Already Owns A Comb?)
26960  26970  26980  26990  27000  
27010  27020  27030  27040  27050  27060  27070  27080  27090  27100  
27110  27120  27130  27140  27150  27160  27170  27180  27190  27200  
27210  27220  27230  27240  27250  27260  27270  27280  27290  27300  
27310  27320  27330  27340  27350  27360  27370  27380  27390  27400  
27410  27420  27430  27440  27450  27460  27470  27480  27490  27500  
27510  27520  27530  27540  27550  27560  27570  

HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:Thomas Rhett Featuring Reba McEntire, Hillary Scott, Chris Tomlin & Keith Urban track:Be A Light', 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.


ERROR:   Thomas Rhett Featuring Reba McEntire, Hillary Scott, Chris Tomlin & Keith Urban Be A Light
27580  27590  27600  
27610  27620  27630  27640  27650  27660  27670  27680  27690  27700  
27710  27720  27730  27740  27750  27760  27770  27780  27790  27800  
27810  27820  27830  27840  27850  27860  27870  27880  27890  27900  
27910  27920  27930  27940  27950  27960  27970  27980  27990  28000  
28010  28020  28030  28040  28050  28060  28070  28080  28090  28100  
28110  28120  28130  28140  28150  28160  28170  28180  28190  28200  
28210  28220  28230  28240  28250  28260  28270  28280  28290  28300  
28310  28320  28330  28340  28350  28360  28370  28380  28390  28400  
28410  28420  28430  28440  28450  28460  28470  28480  

HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'artist:Tyler, The Creator Featuring Brent Faiyaz & Fana Hues track:Sweet / I Thought You Wanted To Dance', 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.


ERROR:   Tyler, The Creator Featuring Brent Faiyaz & Fana Hues Sweet / I Thought You Wanted To Dance
28490  28500  
28510  28520  28530  28540  28550  28560  28570  28580  28590  28600  
28610  28620  28630  28640  28650  28660  28670  28680  28690  28700  
28710  28720  28730  28740  28750  28760  28770  28780  28790  28800  
28810  28820  28830  28840  28850  28860  28870  28880  28890  28900  
28910  28920  28930  28940  28950  28960  28970  28980  28990  29000  
29010  29020  29030  29040  29050  29060  29070  29080  29090  29100  
29110  29120  29130  29140  29150  29160  29170  29180  29190  29200  
29210  29220  29230  29240  29250  29260  29270  29280  29290  29300  
29310  29320  29330  29340  29350  29360  29370  29380  29390  29400  
29410  29420  29430  29440  29450  29460  29470  29480  29490  29500  
29510  29520  29530  29540  29550  29560  29570  29580  29590  29600  
29610  29620  29630  29640  29650  29660  29670  

HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': "artist:will.i.am Featuring Miley Cyrus, French Montana, Wiz Khalifa & DJ Mustard track:Feelin' Myself", 'limit': 10, 'offset': 0, 'type': 'track', 'market': None} returned 404 due to Not found.


29680  ERROR:   will.i.am Featuring Miley Cyrus, French Montana, Wiz Khalifa & DJ Mustard Feelin' Myself


In [13]:
# how many id have we added
sum(df_billboard_songs.id != "")

24019

In [17]:
# check how many id match the metadata_ids
sum(df_billboard_songs.id.isin(metadata_ids))

# which have spotify ids, but don't match metadata_ids?
# consider rechecking with metadata_ids_SQL

5694

In [33]:
# check how many id match the ids from the SQLite database
metadata_ids_SQL = pd.read_csv('all_ids_sql.csv', header=None, names=['id'])
metadata_ids_SQL.shape

(8741672, 1)

In [35]:
metadata_ids_SQL = list(metadata_ids_SQL.id)

In [37]:
sum(df_billboard_songs.id.isin(metadata_ids_SQL))

14313

In [42]:
# separate matching from non-matching dataframes
df_missing = df_billboard_songs[~df_billboard_songs.id.isin(metadata_ids_SQL)]
df_missing = df_missing[(df_missing.id!='') & (df_missing.id!='ERROR')]
df_missing.head()

Unnamed: 0,song,artist,id
1,What Now My Love,"""Groove"" Holmes",11Aldbvo6UCcVhBzv4oUdw
4,Amish Paradise,"""Weird Al"" Yankovic",6nofbMbadUdrtZmIsBHyYE
5,Canadian Idiot,"""Weird Al"" Yankovic",77lZ5O8fojbuYiEskfqJpP
6,Eat It,"""Weird Al"" Yankovic",1jq2FwGP2nctKdPO0mHqMv
7,Fat,"""Weird Al"" Yankovic",0vftDsKyQN36cI2gfh2opK


In [43]:
df_missing.describe()

Unnamed: 0,song,artist,id
count,9650,9650,8040
unique,8936,4424,8009
top,I Need You,Glee Cast,1E8NzDXkY6X1ipKUzXrgFW
freq,8,43,2


In [45]:
# search again for df_missing using metadata_ids_SQL

def find_exact_id(track_title, artist_name, metadata_ids):
    """ 
    same as above, but returns 'MISSING' if not found in metadata_ids
    """
    track_info = spotify.search(q='artist:' + artist_name + ' track:' + track_title, type='track')
    
    if track_info['tracks']['items'] == []:  # if track doesn't exist on Spotify
        return 'MISSING'
    else:
        # check if there is a match
        number_of_results = len(track_info['tracks']['items'])
        
        for i in range(number_of_results):
            track_id = track_info['tracks']['items'][i]['id']
            if track_id in metadata_ids:
                return track_id
        
        # if we haven't returned a match, return 'MISSING'
        return 'MISSING'


In [44]:
# TEST
find_id("You Can't Turn Me Off (In The Middle Of Turning Me On)", 'High Inergy', metadata_ids)

'7CfCUjAZFndcpAWLogl9J3'

In [47]:
# TEST
find_exact_id("You can't find this song", 'not real artist', metadata_ids_SQL)

'MISSING'

In [50]:
# populate df_missing with ids, where available, 'MISSING' otherwise
for i, row in df_missing.iterrows():
    
    # show status
    if i%10 == 0:
        print(i, end='  ')
    if i%100 == 0:
        print()
        
    artist = row[1]
    song = row[0]
    try:
        # unless there is an error, will append the id or None
        df_missing['id'].iloc[i] = find_exact_id(song, artist, metadata_ids_SQL)
    except:  # any error needs to be dealt with manually
        # if there is an error, change id to 'ERROR'
        print('ERROR:  ', artist, song)
        df_missing['id'].iloc[i] = 'ERROR'
    # save every 1000 rows, if new
    if i%1000 == 0:  # less useful without reindexing (oops)
        df_missing.to_csv('df_missing_TEMP.csv', index=False)
        
# save final dataframe
df_missing.to_csv('df_missing.csv', index=False)

10  40  110  150  170  280  300  
330  410  450  490  510  630  690  710  720  780  790  800  
820  890  910  930  980  1010  1090  1120  1140  1160  1180  1190  1200  
1230  1260  1270  1360  1370  1390  1430  1470  1530  1590  1600  
1620  1640  1670  1690  1700  
1810  1850  1940  1950  1960  1970  2020  2110  2160  2190  2200  
2230  2250  2270  2300  
2330  2390  2420  2430  2510  2520  2670  2690  2790  2880  2930  3050  3070  3100  
3110  3120  3160  3180  3190  3200  
3220  3300  
3320  3350  3360  3390  3430  3460  3520  3540  3570  3600  
3660  3710  3730  3740  3790  3800  
3810  3820  3890  3910  3920  3930  4010  4020  4060  4120  4130  4160  4230  4250  4260  4290  4370  4490  4500  
4510  4570  4610  4620  4630  4670  4710  4740  4820  4900  
4940  4980  5060  5070  5110  5160  5200  
5220  5260  5270  5330  5340  5360  5370  5510  5530  5590  5600  
5640  5760  5780  5830  5920  5930  5950  5980  6000  
6020  6110  6150  6180  6190  6240  6280  6310  6320  6360  6380  6

IndexError: iloc cannot enlarge its target object

In [53]:
# confirm that it complete, maybe need to reindex


'MISSING'

In [56]:
# number of tracks missing from both datasets, but on Spotify
sum(df_missing.id == 'MISSING')

1265

In [65]:
df_missing.shape[0], sum(df_missing.id == 'MISSING') + sum(df_missing.id != 'MISSING')

(9650, 9650)

In [None]:
# revised_ids = 
df_missing[(df_missing.id != 'MISSING') & (df_missing.id != '')]

In [55]:
# how many tracks are in 1.2M but not 8M?



# what about from the billboard 100?




1265

In [None]:
# now merge with entire billboard list



In [None]:
# check how many matches in 1.2M songs, what percent? SQL dataset? manual scraping?



### 8+ M. Spotify Tracks, Genre, Audio Features (SQLite)
https://www.kaggle.com/datasets/maltegrosse/8-m-spotify-tracks-genre-audio-features

In [None]:
url_8M_sql = 'D:\RYERSON\820\Datasets\8+ M. Spotify Tracks, Genre, Audio Features\spotify.sqlite'
url_8M_csv = 'all_audio_features_sql.csv'  # .gitignore (very big)
