In [3]:
# import modules
import getpass

import numpy as np
import pandas as pd
import spotipy

# jupyter notebook full-width display
# NOTE: `display` and `HTML` are imported from the public `IPython.display`
# module; the old `IPython.core.display` path is deprecated for these names.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas display settings
# Floats are rendered with no decimals and '_' as the thousands separator.
# NOTE: underscore separators ('_') are better than commas (',') because
# numbers with underscores can be pasted back into Python unchanged.
pd.options.display.float_format = '{:_.0f}'.format
# None removes the column limit; rows are capped at 100
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

### The Billboard 100
https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs

In [31]:
# Billboard Top 100 Historical Data
# via:  https://toolbox.google.com/datasetsearch
url_billboard = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'

# read the chart rows and convert the 'date' column to datetimes
df_billboard = pd.read_csv(url_billboard)
df_billboard['date'] = pd.to_datetime(df_billboard['date'])

# Unique Songs from The Billboard 100 Dataset
# one row per distinct (song, artist) pair, ordered by artist then song,
# with blank 'id' and 'MISSING' columns to be filled in by the Spotify lookup
df_billboard_songs = (
    df_billboard[['song', 'artist']]
    .drop_duplicates()
    .sort_values(['artist', 'song'])
    .reset_index(drop=True)
    .assign(id='', MISSING='')
)

df_billboard.shape, df_billboard_songs.shape

((330087, 7), (29681, 4))

### 1.2M Songs with Metadata (csv)
https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs

In [14]:
# Spotify 1.2M+ Songs
# via:  https://www.kaggle.com/datasets/
url_1M_songs = r'D:\RYERSON\820\Datasets\Spotify 1.2M+ Songs\tracks_features.csv'

# load the full song-metadata table from the csv
df_1M_songs = pd.read_csv(url_1M_songs)

# pull the 'id' column out as a plain Python list
metadata_ids_csv = df_1M_songs['id'].tolist()

### 8.7M Songs with Metadata (SQL)

In [6]:
# all ids from the SQLite database
# (read from a headerless csv whose column is named 'id', then flattened to a list)
metadata_ids_SQL = (
    pd.read_csv('all_ids_sql.csv', header=None, names=['id'])['id'].to_list()
)

### Combine Ids For Datasets with Metadata

In [30]:
# union of both id lists; a set is faster to search and holds no duplicates
all_metadata_ids = set(metadata_ids_csv).union(metadata_ids_SQL)
len(all_metadata_ids)

9592981

### Get Track IDs using API

### TEMPORARY TOKEN WORKFLOW

##### get a temporary authorization token from: https://developer.spotify.com/console/get-search-item

In [None]:
# input the temporary token
# getpass hides the typed/pasted value, so the token is not echoed into the
# cell output (and therefore not saved inside the notebook file)
TEMP_TOKEN = getpass.getpass('Enter token: ')

# create a spotify object authenticated with that token
spotify = spotipy.Spotify(auth=TEMP_TOKEN)

In [None]:
## TODO, add a 'MISSING' tag somewhere

# helper function
def find_id(track_title, artist_name, metadata_ids):
    """
    Search Spotify for a track and choose the result id to keep.

    Uses the module-level `spotify` client to run a track search with the
    query 'artist:<artist_name> track:<track_title>'.

    Parameters
        track_title:  song title to search for
        artist_name:  artist to search for
        metadata_ids: container of known ids, tested with `in`

    Returns a (track_id, status) tuple:
        (None, 'MISSING')     the search returned no items
        (id, 'matched')       the first returned id found in metadata_ids
        (first id, 'MISSING') items were returned but none is in metadata_ids

    Author's observations (from manual testing, not enforced here):
        for searches with multiple results, all ids were identical in the
            test cases that were run
        an empty result means the song is not on Spotify
            (confirmed by spot checks in the Spotify music player)
        some tracks raise a 404 error even though they seem to exist in
            Spotify; the cause is unknown and the error is not handled here
    """
    query = 'artist:' + artist_name + ' track:' + track_title
    results = spotify.search(q=query, type='track')['tracks']['items']

    # nothing came back: the track doesn't exist on Spotify
    if results == []:
        return None, 'MISSING'

    # prefer the first result whose id we already hold metadata for
    for item in results:
        candidate = item['id']
        if candidate in metadata_ids:
            return candidate, 'matched'

    # no result is in the metadata: fall back to the 0th id, flagged 'MISSING'
    return results[0]['id'], 'MISSING'

In [None]:
%%time
# TEST
# smoke test: look up a single (song, artist) pair against the combined id set;
# the %%time cell magic reports how long the whole cell took
find_id("You Can't Turn Me Off (In The Middle Of Turning Me On)", 'High Inergy', all_metadata_ids)

### Add Spotify IDs to billboard songs matched in the datasets

In [None]:
# # load saved csv if required
# df_billboard_songs = pd.read_csv('df_billboard_songs.csv', keep_default_na=False)
# df_billboard_songs.id.nunique()

In [None]:
# start over at
# rows with an index below start_over_at-1 are skipped without being inspected
start_over_at = 0

# populate df_billboard_songs with ids, where available
#
# Each row gets one Spotify lookup via find_id(); the result is written into
# the 'id' and 'MISSING' columns. Writes go through df.at[row, column] so they
# land in df_billboard_songs itself: chained forms such as
# df[['id', 'MISSING']].iloc[i] = ... assign into a temporary copy and are lost.
# Assumes the index holds unique integer labels (e.g. the default RangeIndex).
for i in df_billboard_songs.index:

    # start over at
    if i < start_over_at-1:
        continue

    # show status update
    if i%10 == 0:
        print(i, end='  ')
    if i%100 == 0:
        print()

    # start over where we finished (don't overwrite known ids)
    if df_billboard_songs.at[i, 'id'] != '':
        continue

    # read the search terms by column name rather than by position
    song = df_billboard_songs.at[i, 'song']
    artist = df_billboard_songs.at[i, 'artist']

    try:
        # unless there is an error, this yields (id or None, status)
        track_id, status = find_id(song, artist, all_metadata_ids)
    except Exception as exc:
        # all lookup errors are treated the same: flag the row, leave id blank.
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops the loop instead of being recorded as a row error.
        print('ERROR:  ', artist, song, repr(exc))
        df_billboard_songs.at[i, 'MISSING'] = 'ERROR'
    else:
        df_billboard_songs.at[i, 'id'] = track_id
        df_billboard_songs.at[i, 'MISSING'] = status

    # save every 1000 rows, if new
    if i%1000 == 0:
        df_billboard_songs.to_csv('df_billboard_songs_TEMP.csv', index=False)

# save final dataframe
df_billboard_songs.to_csv('df_billboard_songs.csv', index=False)

In [None]:
# how many id have we added
# A row counts only if its id is a real, non-empty value: None/NaN ids
# (e.g. find_id returned None, or a blank cell re-read from csv as NaN)
# would otherwise compare != "" as True and inflate the count.
id_column = df_billboard_songs['id']
int((id_column.notna() & id_column.ne('')).sum())

In [None]:
# check how many id match the metadata_ids
# (vectorised count of rows whose id is a member of all_metadata_ids)
int(df_billboard_songs['id'].isin(all_metadata_ids).sum())

In [None]:
# separate songs into
    # not on spotify (no id)
    # on spotify, but not either dataset (not in either metadata set)
    # matched with dataset (could distinguish using temp_revised)
    
    
 

In [None]:
# how many tracks are in spotify, but don't have data?




In [None]:
# check the percentage of songs accounted for




In [None]:
# now merge with entire billboard list



# re-count the percentage of songs accounted for in the entire set

### 8+ M. Spotify Tracks, Genre, Audio Features (SQLite)
https://www.kaggle.com/datasets/maltegrosse/8-m-spotify-tracks-genre-audio-features

In [None]:
# location of the 8+ M. tracks SQLite database
# raw string: the Windows path contains backslashes (\R, \8, \D, \s) that are
# invalid escape sequences in a normal string literal
url_8M_sql = r'D:\RYERSON\820\Datasets\8+ M. Spotify Tracks, Genre, Audio Features\spotify.sqlite'
url_8M_csv = 'all_audio_features_sql.csv'  # .gitignore (very big)


### Merge all Datasets into Billboard Features, 
export Billboard Features and All Features