In [1]:
# import modules
from getpass import getpass

import numpy as np
import pandas as pd
import spotipy

# full-width display
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated (since IPython 7.14).
from IPython.display import display, HTML
# inject a CSS rule so the notebook container uses the full browser width
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas display settings
# Floats are shown with no decimals and '_' as the thousands separator.
# NOTE: underscore separators ('_') are preferred over commas (',') because
# a number copied from the output is still a valid Python literal.
pd.options.display.float_format = '{:_.0f}'.format
pd.options.display.max_columns = None  # None: do not truncate columns
pd.options.display.max_rows = 100

# spotipy
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# used for AUTHORIZATION WORKFLOW (abandoned)
# CONFIDENTIAL.py included in .gitignore, contains private API key
# use link above to get a client_id and client_secret from developer.spotify.com
from CONFIDENTIAL import client_id, client_secret, redirect_uri

### The Billboard 100

In [2]:
# Billboard Hot 100 historical chart data
# source:  https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs
# found via:  https://toolbox.google.com/datasetsearch
url_billboard = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'

# read the csv and convert the 'date' column to datetime in one chain
df_billboard = (
    pd.read_csv(url_billboard)
    .assign(date=lambda charts: pd.to_datetime(charts['date']))
)

### Unique Songs from The Billboard 100 Dataset

In [27]:
# One row per distinct (song, artist) pair found in df_billboard, ordered by
# artist and then song, with a fresh 0..n-1 index and an empty-string 'id'
# column that is filled in later with Spotify ids.
df_billboard_songs = (
    df_billboard[['song', 'artist']]
    .drop_duplicates()
    .sort_values(['artist', 'song'])
    .reset_index(drop=True)
    .assign(id='')
)

# (rows, columns) of the de-duplicated table
df_billboard_songs.shape

(29681, 3)

In [28]:
# spot-check ten songs; the fixed seed makes a re-run show the same rows
df_billboard_songs.sample(10, random_state=0)

Unnamed: 0,song,artist,id
3695,Tarkio Road,Brewer And Shipley,
21851,Somebody Have Mercy,Sam Cooke,
20127,Louisiana Man,Pozo Seco Singers,
26073,(You Can't Let The Boy Overpower) The Man In You,The Miracles,
5120,Land Of Make Believe,Chuck Mangione With The Hamilton Philharmonic ...,
27899,"Walls (From ""She's The One"")",Tom Petty And The Heartbreakers,
25944,Glad To Be Unhappy,The Mamas & The Papas,
10249,Teenage Dream,Glee Cast,
25657,Land Of A Thousand Dances,The J. Geils Band,
2850,Hush/I'm Alive,Blue Swede,


### 1.2M Songs with Metadata (csv)
https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs

In [4]:
# there are ~30k songs on the Billboard 100 list
# let's see how many are here:
# https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs
# via:  https://www.kaggle.com/datasets/
# Spotify 1.2M+ Songs
url_1M_songs = r'D:\RYERSON\820\Datasets\Spotify 1.2M+ Songs\tracks_features.csv'

# create the dataframe with the large number of songs metadata
df_1M_songs = pd.read_csv(url_1M_songs)

# Collect the song ids from the 1M dataset as a set: find_id() tests every
# search result with `in metadata_ids`, which is a hash lookup on a set but a
# full scan on a list of this size.
metadata_ids = set(df_1M_songs['id'])

### Get Spotify IDs using API
(to join with large datasets)

https://developer.spotify.com/terms/

##### Useful Spotify API Features

https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features

https://developer.spotify.com/documentation/web-api/reference/#/operations/get-recommendations

##### Other Links

my app:
https://developer.spotify.com/dashboard/applications/cd5ce2cb690543ff9967e817d4665543

a tutorial: 
https://www.youtube.com/watch?v=cU8YH2rhN6A&ab_channel=Elbert

In [None]:
# # AUTHORIZATION WORKFLOW 
# # (doesn't work anymore; I think I was throttled, but I can't get any error messages)

# # https://developer.spotify.com/documentation/general/guides/authorization/scopes/
# # i think this is the right scope, but nothing works when I connect to my app
# scope = 'user-read-private'

# oauth = spotipy.SpotifyOAuth(
#     client_id=client_id,
#     client_secret=client_secret,
#     redirect_uri=redirect_uri,
#     scope=scope
# )

# oauth = spotipy.SpotifyOAuth(
#     client_id=client_id,
#     client_secret=client_secret,
#     redirect_uri=redirect_uri
# )

# cached_token = oauth.get_cached_token()

# # create a spotify object
# spotify = spotipy.Spotify(auth=cached_token['access_token'])

### TEMPORARY TOKEN WORKFLOW

##### get a temporary authorization token from: https://developer.spotify.com/console/get-search-item

In [42]:
# Prompt for the temporary token without echoing it, so the secret is not
# stored in the notebook's saved cell output. Surrounding whitespace from a
# copy/paste is stripped.
TEMP_TOKEN = getpass('Enter token: ').strip()

Enter token: [REDACTED]


In [43]:
# create a spotify object authenticated with the temporary token read above;
# find_id() performs its searches through this module-level `spotify` client
spotify = spotipy.Spotify(auth=TEMP_TOKEN)

In [6]:
def find_id(track_title, artist_name, metadata_ids, client=None):
    """Return the Spotify track id for a song, or None if the search is empty.

    Runs a Spotify track search for ``artist:<artist_name> track:<track_title>``
    and picks an id from the returned items:

    * the first result whose id is contained in ``metadata_ids``, if any,
      so the id can be joined with the metadata dataset;
    * otherwise the id of the first search result;
    * ``None`` when the search returns no items.

    Parameters
    ----------
    track_title : str
        Song title used in the ``track:`` filter of the query.
    artist_name : str
        Artist name used in the ``artist:`` filter of the query.
    metadata_ids : container of str
        Known track ids; only membership (``in``) is used, so a ``set`` gives
        the fastest lookups but a list works as well.
    client : object, optional
        Object with a ``search(q=..., type=...)`` method. Defaults to the
        module-level ``spotify`` client.

    Returns
    -------
    str or None

    Notes
    -----
    Observations from manual testing:

    * for searches with multiple results, all ids were identical in the
      cases that were checked;
    * searches with no results were confirmed, by spot checks in the Spotify
      player, to be songs that are not on Spotify;
    * some tracks raise a 404 error even though they seem to exist on
      Spotify (cause unknown). Errors are not caught here; they propagate
      to the caller.
    """
    if client is None:
        client = spotify  # module-level client created in an earlier cell

    query = 'artist:' + artist_name + ' track:' + track_title
    track_info = client.search(q=query, type='track')
    items = track_info['tracks']['items']

    if not items:  # track doesn't exist on Spotify
        return None

    # Prefer a result that is present in the metadata dataset. Stop at the
    # first such result instead of letting a later one overwrite it.
    for item in items:
        if item['id'] in metadata_ids:
            return item['id']

    # no result is in the metadata dataset: fall back to the first result
    return items[0]['id']

In [44]:
# TEST: manual smoke test of find_id with a title that contains an apostrophe
# and parentheses; the cell output is the id that find_id returns
find_id("You Can't Turn Me Off (In The Middle Of Turning Me On)", 'High Inergy', metadata_ids)

'7CfCUjAZFndcpAWLogl9J3'

### Add Spotify IDs to billboard songs

In [32]:
# load saved csv as df (the file written at the end of the id-lookup loop)
# keep_default_na=False reads empty id cells as '' instead of NaN; the lookup
# loop compares ids against '' to decide which rows still need a lookup
df_billboard_songs = pd.read_csv('df_billboard_songs.csv', keep_default_na=False)

# preview the first rows
df_billboard_songs.head()

Unnamed: 0,song,artist,id
0,Misty,"""Groove"" Holmes",7DZsH0df0GuULl0FGwXMfd
1,What Now My Love,"""Groove"" Holmes",11Aldbvo6UCcVhBzv4oUdw
2,May The Bird Of Paradise Fly Up Your Nose,"""Little"" Jimmy Dickens",1WpoMGLjlEUHlWhilsOkJA
3,I Know I Know,"""Pookie"" Hudson",
4,Amish Paradise,"""Weird Al"" Yankovic",6nofbMbadUdrtZmIsBHyYE


In [8]:
# number of distinct values in the id column; the empty-string placeholder of
# rows without an id counts as one of those distinct values
df_billboard_songs.id.nunique()

8124

In [None]:
# Resume point: rows before position start_over_at - 1 are skipped because
# they were handled in an earlier run.
start_over_at = 10767

# populate df_billboard_songs with ids, where available
for i, row in df_billboard_songs.iterrows():

    # resume one row before start_over_at; rows that already have an id are
    # skipped further down, so re-visiting that row is harmless
    if i < start_over_at - 1:
        continue

    # show status: every 10th index, with a line break every 100th
    if i % 10 == 0:
        print(i, end='  ')
    if i % 100 == 0:
        print()

    # skip rows that were already filled in (id, None or 'ERROR')
    if df_billboard_songs.at[i, 'id'] != '':
        continue

    # access by column label; positional access on a labelled row is deprecated
    artist = row['artist']
    song = row['song']
    try:
        # Unless there is an error, this stores the id or None.
        # .at writes directly into the frame. The chained form
        # df['id'].iloc[i] = ... may write to a temporary copy instead.
        df_billboard_songs.at[i, 'id'] = find_id(song, artist, metadata_ids)
    except Exception as err:  # any error needs to be dealt with manually
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt can
        # still stop the loop instead of being recorded as an 'ERROR' row.
        print('ERROR:  ', artist, song, '->', repr(err))
        df_billboard_songs.at[i, 'id'] = 'ERROR'

    # checkpoint every 1000 rows, only when this row was newly processed
    if i % 1000 == 0:
        df_billboard_songs.to_csv('df_billboard_songs_TEMP.csv', index=False)

# save final dataframe
df_billboard_songs.to_csv('df_billboard_songs.csv', index=False)

In [None]:
# how many rows no longer hold the empty-string placeholder in 'id'
# (rows marked 'ERROR' are included in this count)
int((df_billboard_songs['id'] != "").sum())

In [None]:
# now merge with entire billboard list



In [None]:
# check how many matches in 1.2M songs, what percent? SQL dataset? manual scraping?



### 8+ M. Spotify Tracks, Genre, Audio Features (SQLite)
https://www.kaggle.com/datasets/maltegrosse/8-m-spotify-tracks-genre-audio-features

In [None]:
# raw string: the Windows backslashes must never be read as escape sequences
url_8M_sql = r'D:\RYERSON\820\Datasets\8+ M. Spotify Tracks, Genre, Audio Features\spotify.sqlite'

