# Import Modules

In [1]:
# import modules
import pandas as pd
import numpy as np
import spotipy

# jupyter notebook full-width display
# NOTE: `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting
pd.set_option('display.float_format', '{:_.2f}'.format)
# NOTE: underscore separators ('_') are better than commas (',') because
# numbers with underscores work in Python without any extra effort.
pd.set_option('display.max_columns', None)  # never truncate columns
pd.set_option('display.max_rows', 200)      # show at most 200 rows

# Import Starting Data

### The Billboard 100
https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs

In [None]:
# Billboard Top 100 Historical Data
# via:  https://toolbox.google.com/datasetsearch
url_billboard = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'

# full chart history, with the chart date parsed into a datetime column
df_billboard = pd.read_csv(url_billboard)
df_billboard['date'] = pd.to_datetime(df_billboard['date'])

# Unique Songs from The Billboard 100 Dataset

# one row per distinct (song, artist) pair, ordered by artist then song
df_billboard_songs = (
    df_billboard[['song', 'artist']]
    .drop_duplicates()
    .sort_values(['artist', 'song'])
    .reset_index(drop=True)
)

# blank placeholders: 'id' and 'MISSING' are filled in by later cells
for blank_column in ('id', 'MISSING'):
    df_billboard_songs[blank_column] = ''

df_billboard.shape, df_billboard_songs.shape

### 1.2M Songs with Metadata (csv)
https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs

In [None]:
# Spotify 1.2M+ Songs
# via:  https://www.kaggle.com/datasets/
url_1M_songs = r'D:\RYERSON\820\Datasets\Spotify 1.2M+ Songs\tracks_features.csv'

# dataframe holding the metadata of the large csv song collection
df_1M_songs = pd.read_csv(url_1M_songs)

# plain Python list of every id in the 1M dataset
metadata_ids_csv = df_1M_songs['id'].tolist()

### 8.7M Songs with Metadata (SQL)

In [None]:
# all ids from the SQLite database (headerless single-column csv export),
# read straight into a plain Python list
metadata_ids_SQL = pd.read_csv('all_ids_sql.csv', header=None, names=['id'])['id'].tolist()

# audio feature data not imported yet (very large)

### Combine Ids For Datasets with Metadata

In [None]:
# ids for all of our known metadata, as a set:
# faster membership tests than a list, and duplicates collapse automatically
all_metadata_ids = set(metadata_ids_csv).union(metadata_ids_SQL)
len(all_metadata_ids)

### Get Track IDs using API

### TEMPORARY TOKEN WORKFLOW

##### get a temporary authorization token from: https://developer.spotify.com/console/get-search-item

In [None]:
# input the temporary token
# getpass is used instead of input() so the token is not echoed into the
# cell output (and therefore not saved inside the notebook file)
from getpass import getpass

TEMP_TOKEN = getpass('Enter token: ')

# create a spotify object authenticated with that token
spotify = spotipy.Spotify(auth=TEMP_TOKEN)

In [None]:
# helper function
def find_id(track_title, artist_name, metadata_ids):
    """
    Search Spotify for a track and pick the most useful track id.

    Uses the module-level `spotify` client to run a track search for
    'artist:<artist_name> track:<track_title>'.

    Parameters
        track_title  : song title to search for
        artist_name  : artist name to search for
        metadata_ids : container of ids we already hold metadata for
                       (only membership tests are performed on it)

    Returns a tuple (track_id, status):
        ('', 'MISSING')         the search returned no items
        (<id>, 'matched')       first result whose id is in metadata_ids
        (<first id>, 'MISSING') results exist but none is in metadata_ids

    Notes from manual testing (author's observations):
        - for searches with multiple results, all ids were identical in the test cases run
        - searches with no results were confirmed (spot checks in the Spotify
          player) to be songs that are not on Spotify
        - some tracks raise a 404 error even though they seem to exist on Spotify;
          the error is not handled here and propagates to the caller
    """
    query = 'artist:' + artist_name + ' track:' + track_title
    candidates = spotify.search(q=query, type='track')['tracks']['items']

    # nothing found: the track doesn't exist on Spotify
    if candidates == []:
        return '', 'MISSING'

    # fall back to the first result unless a later one is already known to us
    fallback_id = candidates[0]['id']

    for candidate in candidates:
        candidate_id = candidate['id']
        if candidate_id in metadata_ids:
            return candidate_id, 'matched'

    return fallback_id, 'MISSING'

In [None]:
%%time
# TEST: time a single find_id lookup for one title/artist pair
# (returns the (track_id, status) tuple as the cell output)
find_id("You Can't Turn Me Off (In The Middle Of Turning Me On)", 'High Inergy', all_metadata_ids)

In [None]:
# TEST: presumably a made-up title/artist, to exercise the "no search results" path
# (find_id returns ('', 'MISSING') when the search has no items)
find_id("You Can't Find this (SONG)", 'Low Unergy', all_metadata_ids)

### Add Spotify IDs to billboard songs matched in the datasets

In [None]:
# load saved csv if required
# keep_default_na=False: blank cells stay '' (not NaN), which the `!= ''` checks rely on
df_billboard_songs = pd.read_csv(
    'df_billboard_songs.csv',
    keep_default_na=False,
)
df_billboard_songs['id'].nunique()

In [None]:
# Resume point: rows with index below (start_over_at - 1) are skipped, so an
# interrupted run can be continued without repeating earlier API calls.
start_over_at = 30000

# populate df_billboard_songs with ids, where available
# NOTE: assumes df_billboard_songs has a default 0..n-1 index (it is created with
# reset_index(drop=True) / reloaded from csv), so label `i` is also the row position.
for i, row in df_billboard_songs.iterrows():

    # start over at
    if i < start_over_at-1:
        continue

    # show status update
    if i%10 == 0:
        print(i, end='  ')
    if i%100 == 0:
        print()

    # start over where we finished (don't overwrite known ids)
    if df_billboard_songs.at[i, 'id'] != '':
        continue

    # access by column name rather than by position, so the lookup does not
    # depend on the column order of the dataframe
    artist = row['artist']
    song = row['song']
    try:
        track_id, status = find_id(song, artist, all_metadata_ids)
    except Exception:  # all API/lookup errors treated the same; Ctrl-C still interrupts
        # if there is an error, flag the row and leave id blank
        print('ERROR:  ', artist, song)
        df_billboard_songs.at[i, 'MISSING'] = 'ERROR'
    else:
        # .at writes directly into the dataframe; chained `df[col].iloc[i] = ...`
        # may write to a temporary copy instead
        df_billboard_songs.at[i, 'id'] = track_id
        df_billboard_songs.at[i, 'MISSING'] = status

    # save every 1000 rows, if new
    if i%1000 == 0:
        df_billboard_songs.to_csv('df_billboard_songs_TEMP.csv', index=False)

# save final dataframe
df_billboard_songs.to_csv('df_billboard_songs.csv', index=False)

### After Gathering all IDs

In [None]:
# reload df_billboard_songs if required
# keep_default_na=False keeps blank cells as '' instead of NaN,
# which the `id != ''` comparisons in the following cells rely on
df_billboard_songs = pd.read_csv('df_billboard_songs.csv', keep_default_na=False)

In [None]:
# how many ids have we added:
# (number of distinct id values, number of rows with a non-blank id)
# note: nunique() also counts the blank id '' as one distinct value
df_billboard_songs.id.nunique(), sum(df_billboard_songs.id != "")
# observed result: 83 duplicated ids

### Remove Duplicates / Errors

In [None]:
# rows that have an id ...
duplicates = df_billboard_songs.loc[df_billboard_songs['id'] != '']
# ... and share that id with at least one other row (keep=False marks every occurrence)
duplicates = duplicates.loc[duplicates['id'].duplicated(keep=False)]
# written with the original row index so the rows can be traced back
duplicates.to_csv('duplicated_ids.csv', index=True)

In [None]:
# # go through this manually and fix it
#     # in this case NEED TO FIX BOTH BILLBOARD DATAFRAMES

# # OR drop the corrupted rows
# # manually imputing is problematic, error prone, and time consuming

# duplicates.sort_values('id').head(200)

In [None]:
# set of duplicated ids (each id once; blank ids were filtered out when
# `duplicates` was built)
duplicated_ids = set(duplicates.id)

In [None]:
# number of rows whose id is one of the duplicated ids (before they are cleared)
sum(df_billboard_songs['id'].isin(duplicated_ids))

In [None]:
# drop from billboard list of known ids:
# rows whose id is shared with another song are flagged 'DUPLICATED' and their
# id is blanked, so they are treated like songs without a usable id.
# A single boolean-mask .loc assignment replaces the row-by-row loop; the
# chained form `df[col].iloc[i] = ...` may write to a temporary copy instead
# of the dataframe.
is_duplicated = df_billboard_songs['id'].isin(duplicated_ids)
df_billboard_songs.loc[is_duplicated, 'MISSING'] = 'DUPLICATED'
df_billboard_songs.loc[is_duplicated, 'id'] = ''

In [None]:
# re-count rows carrying a duplicated id; should now be 0 if the previous cell
# blanked them all
sum(df_billboard_songs['id'].isin(duplicated_ids))

In [None]:
# checkpoint: save the song list with duplicated ids blanked (index not written)
df_billboard_songs.to_csv('df_billboard_songs - duplicates removed.csv', index=False)

### What Songs Are Still Missing Audio Feature Data?

In [None]:
# reload the dataframe if required
# keep_default_na=False: blank ids are read back as '' rather than NaN
df_billboard_songs = pd.read_csv('df_billboard_songs - duplicates removed.csv', keep_default_na=False)

In [None]:
# how many ids have we added:
# (number of distinct id values, number of rows with a non-blank id)
df_billboard_songs.id.nunique(), sum(df_billboard_songs.id != '')
# the two numbers are off by one because '' counts as a unique id

In [None]:
# check how many ids match the metadata ids
# (rows whose id is present in all_metadata_ids)
sum(df_billboard_songs.id.isin(all_metadata_ids))

In [None]:
# songs on spotify that we don't have audio features for yet:
# the row has a (non-blank) id, but that id is not in any metadata dataset
need_audio_features = df_billboard_songs[
    (df_billboard_songs['id'] != '')
    & ~df_billboard_songs['id'].isin(all_metadata_ids)
]
need_audio_features.to_csv('need_audio_features.csv', index=False)

In [None]:
# number of songs whose audio features still have to be fetched
len(need_audio_features.id)

### Use API again to get missing audio features
https://developer.spotify.com/console/get-audio-features-track/

In [None]:
# input the temporary token
# getpass is used instead of input() so the token is not echoed into the
# cell output (and therefore not saved inside the notebook file)
from getpass import getpass

TEMP_TOKEN = getpass('Enter token: ')

# create a spotify object authenticated with that token
spotify = spotipy.Spotify(auth=TEMP_TOKEN)

In [None]:
# initialise dataframe

# audio feature columns to be fetched from the API
list_of_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo', 'duration_ms', 'time_signature',
]

# independent copy of the songs that need features; reset_index() keeps the
# old index as an 'index' column and gives the frame a fresh 0..n-1 index
with_audio_features = need_audio_features.copy().reset_index()

# every feature column starts out blank
for feature in list_of_features:
    with_audio_features[feature] = ''

with_audio_features.head()

In [None]:
# Resume point: rows with index below (start_over_at - 1) are skipped, so an
# interrupted run can be continued without repeating earlier API calls.
start_over_at = 1200

list_of_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature'
]

# NOTE: with_audio_features was built with reset_index(), so label `i` is also
# the row position.
for i, row in with_audio_features.iterrows():

    # start over at
    if i < start_over_at-1:
        continue

    # show status update
    if i%10 == 0:
        print(i, end='  ')
    if i%100 == 0:
        print()

    track_id = with_audio_features.at[i, 'id']
    temp_audio_features = spotify.audio_features(track_id)

    # the response is indexed with [0]; guard against an empty response or a
    # None entry (presumably returned when Spotify has no features for the
    # track - TODO confirm) instead of crashing the whole run
    track_features = temp_audio_features[0] if temp_audio_features else None
    if track_features is None:
        print('NO FEATURES:  ', track_id)  # feature columns stay blank for this row
    else:
        for key in list_of_features:
            # .at writes directly into the dataframe; chained
            # `df[key].iloc[i] = ...` may write to a temporary copy instead
            with_audio_features.at[i, key] = track_features[key]

    # checkpoint every 100 rows
    if i%100 == 0:
        with_audio_features.to_csv('audio_features_TEMP.csv', index=True)

# save final df
with_audio_features.to_csv('audio_features_FINAL.csv', index=True)

### OPTIONAL QA SPOTCHECKS

In [None]:
#### TODO: QA check ####
# should I confirm that i get the same audio features as from the other datasets??
# maybe do a small spotcheck

# audio feature columns to be fetched from the API
list_of_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo', 'duration_ms', 'time_signature',
]

# random sample of 100 songs whose id was matched in the metadata datasets
# (no random_state is set, so the sample differs from run to run)
QA_DATAFRAME = (
    df_billboard_songs[df_billboard_songs['MISSING'] == 'matched']
    .sample(100)
    .reset_index()
)

# every feature column starts out blank
for feature in list_of_features:
    QA_DATAFRAME[feature] = ''

QA_DATAFRAME.head()

In [None]:
# Resume point: rows with index below (start_over_at - 1) are skipped.
start_over_at = 0

list_of_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature'
]

# NOTE: QA_DATAFRAME was built with reset_index(), so label `i` is also the
# row position.
for i, row in QA_DATAFRAME.iterrows():

    # start over at
    if i < start_over_at-1:
        continue

    # show status update
    if i%10 == 0:
        print(i, end='  ')
    if i%100 == 0:
        print()

    track_id = QA_DATAFRAME.at[i, 'id']
    temp_audio_features = spotify.audio_features(track_id)

    # the response is indexed with [0]; guard against an empty response or a
    # None entry (presumably returned when Spotify has no features for the
    # track - TODO confirm) instead of crashing the whole run
    track_features = temp_audio_features[0] if temp_audio_features else None
    if track_features is None:
        print('NO FEATURES:  ', track_id)  # feature columns stay blank for this row
    else:
        for key in list_of_features:
            # .at writes directly into the dataframe; chained
            # `df[key].iloc[i] = ...` may write to a temporary copy instead
            QA_DATAFRAME.at[i, key] = track_features[key]

# save final df
QA_DATAFRAME.to_csv('QA_DATAFRAME.csv', index=True)

In [None]:
# check these later to confirm that the downloaded data is correct

# Finalize Dataset

### Reimport data from Billboard 100 and 1.2M Songs Dataset

In [2]:
# billboard songs (blank ids stay '' thanks to keep_default_na=False)
df_billboard_songs = pd.read_csv('df_billboard_songs - duplicates removed.csv', keep_default_na=False)

# billboard songs with audio features from API (missing from other datasets),
# reduced to the id / song / artist / audio-feature columns
df_api_features = pd.read_csv('audio_features_FINAL.csv', keep_default_na=False)[[
    'id', 'song', 'artist',
    'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness',
    'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence'
]]

# billboard time series, with the chart date parsed into a datetime column
url_billboard = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'
df_billboard = pd.read_csv(url_billboard)
df_billboard['date'] = pd.to_datetime(df_billboard['date'])

# 1.2M songs with metadata: columns renamed to the billboard naming, then
# reduced to the same column set as df_api_features
url_1M_songs = r'D:\RYERSON\820\Datasets\Spotify 1.2M+ Songs\tracks_features.csv'
df_1M_songs = (
    pd.read_csv(url_1M_songs)
    .rename(columns={'name': 'song', 'artists': 'artist', 'duration': 'duration_ms'})
    [[
        'id', 'song', 'artist',
        'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness',
        'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence'
    ]]
)

### 8+ M. Spotify Tracks, Genre, Audio Features (SQLite)
https://www.kaggle.com/datasets/maltegrosse/8-m-spotify-tracks-genre-audio-features

In [3]:
# raw string (r'...') like the other dataset paths: the Windows path contains
# backslash sequences (\R, \8, \D, \s) that are invalid string escapes otherwise
url_8M_sql = r'D:\RYERSON\820\Datasets\8+ M. Spotify Tracks, Genre, Audio Features\spotify.sqlite'
url_8M_csv = 'all_audio_features_sql.csv'  # .gitignore (very big)

# csv export of the SQLite data; malformed lines are skipped
df_8M_songs = pd.read_csv(url_8M_csv, on_bad_lines='skip')
# rename to the billboard naming ('name' -> artist, 'name:1' -> song)
df_8M_songs = df_8M_songs.rename(columns={'name': 'artist', 'name:1': 'song', 'duration': 'duration_ms'})
# already in alphabetical order

# drop fully identical rows (reassigned instead of mutated in place)
df_8M_songs = df_8M_songs.drop_duplicates()

### Merge Billboard 100 Songs with Metadata

In [4]:
# audio feature columns shared by every source
list_of_features = [
    'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness',
    'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence'
]

# every non-blank Spotify id attached to a Billboard song
set_billboard = set(df_billboard_songs['id']) - {''}

# df_api_features does not need to be filtered or sorted

# source 1: the 8M songs (SQL export), restricted to Billboard ids
# 930 duplicate ids, all appear to be collaborations (based on Google spotcheck ~ 10 songs)
# keeping the first row per id should work fine, but a secondary artist may be listed first
df1 = (
    df_8M_songs[df_8M_songs['id'].isin(set_billboard)]
    .drop_duplicates(subset='id', keep='first')
)
# Billboard ids still unaccounted for after source 1
set1 = set_billboard.difference(df1['id'])

# source 2: the 1.2M songs (csv), restricted to the ids source 1 did not cover
df2 = df_1M_songs[df_1M_songs['id'].isin(set1)]
# ids covered by neither dataset nor the API download, plus the blank id
set2 = set1.difference(df2['id'], df_api_features['id'])
set2.add('')

# source 3: songs without any audio features get np.nan in every feature column
df3a = df_billboard_songs[df_billboard_songs['id'].isin(set2)].reset_index(drop=True)
df3b = pd.DataFrame(data=np.nan, index=list(range(len(df3a))), columns=list_of_features)
df3 = pd.concat([df3a, df3b], axis=1)[['id', 'song', 'artist'] + list_of_features]

# stack all four sources into one dataframe
all_audio_features = pd.concat([df_api_features, df1, df2, df3]).reset_index(drop=True)

# 3 duplicated ids, look like errors (closely named tracks)
duplicate_errors = set(
    all_audio_features.loc[
        (all_audio_features['id'] != '') & all_audio_features['id'].duplicated(keep=False),
        'id',
    ]
)
# output for reference:  {'4kqOuBMioKeLNLkPpmxduf', '4oR2cCQGs0Yt0Mgr2diV6V', '5FVbvttjEvQ8r2BgUcJgNg'}

# NOTE(review): set_nan is not used anywhere else in this cell
set_nan = ['id', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness',
    'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence'
]

# drop every row carrying one of the erroneous duplicated ids, then order by artist / song
all_audio_features = (
    all_audio_features[~all_audio_features['id'].isin(duplicate_errors)]
    .sort_values(['artist', 'song'])
    .reset_index(drop=True)
)

In [5]:
# replace song and artist with the exact text from df_billboard_songs,
# so both dataframes use identical (song, artist) strings for the same id

billboard_ids = set(df_billboard_songs['id'])
billboard_ids.discard('')
billboard_ids = list(billboard_ids)  # faster to iterate over list

counter = 0

for counter, idx in enumerate(billboard_ids, start=1):

    # show status update (every 10th id, line break every 100th)
    if counter % 10 == 0:
        print(counter, end='  ')
    if counter % 100 == 0:
        print()

    # copy the first matching billboard value into every row with this id
    for column in ('song', 'artist'):
        all_audio_features.loc[all_audio_features['id'] == idx, column] = (
            df_billboard_songs.loc[df_billboard_songs['id'] == idx, column].values[0]
        )
    

10  20  30  40  50  60  70  80  90  100  
110  120  130  140  150  160  170  180  190  200  
210  220  230  240  250  260  270  280  290  300  
310  320  330  340  350  360  370  380  390  400  
410  420  430  440  450  460  470  480  490  500  
510  520  530  540  550  560  570  580  590  600  
610  620  630  640  650  660  670  680  690  700  
710  720  730  740  750  760  770  780  790  800  
810  820  830  840  850  860  870  880  890  900  
910  920  930  940  950  960  970  980  990  1000  
1010  1020  1030  1040  1050  1060  1070  1080  1090  1100  
1110  1120  1130  1140  1150  1160  1170  1180  1190  1200  
1210  1220  1230  1240  1250  1260  1270  1280  1290  1300  
1310  1320  1330  1340  1350  1360  1370  1380  1390  1400  
1410  1420  1430  1440  1450  1460  1470  1480  1490  1500  
1510  1520  1530  1540  1550  1560  1570  1580  1590  1600  
1610  1620  1630  1640  1650  1660  1670  1680  1690  1700  
1710  1720  1730  1740  1750  1760  1770  1780  1790  1800  
1810  1820

13110  13120  13130  13140  13150  13160  13170  13180  13190  13200  
13210  13220  13230  13240  13250  13260  13270  13280  13290  13300  
13310  13320  13330  13340  13350  13360  13370  13380  13390  13400  
13410  13420  13430  13440  13450  13460  13470  13480  13490  13500  
13510  13520  13530  13540  13550  13560  13570  13580  13590  13600  
13610  13620  13630  13640  13650  13660  13670  13680  13690  13700  
13710  13720  13730  13740  13750  13760  13770  13780  13790  13800  
13810  13820  13830  13840  13850  13860  13870  13880  13890  13900  
13910  13920  13930  13940  13950  13960  13970  13980  13990  14000  
14010  14020  14030  14040  14050  14060  14070  14080  14090  14100  
14110  14120  14130  14140  14150  14160  14170  14180  14190  14200  
14210  14220  14230  14240  14250  14260  14270  14280  14290  14300  
14310  14320  14330  14340  14350  14360  14370  14380  14390  14400  
14410  14420  14430  14440  14450  14460  14470  14480  14490  14500  
14510 

In [10]:
# re-sort, re-index, and save the dataframe to csv
# some of the tracks re-sort because the names contain odd characters
# eg,  Milord	Ã‰dith Piaf should be Milord	Edith Piaf
# eg2, Safaera	Ã‘engo Flow should be Safaera	Bad Bunny, Jowell & Randy & Nengo Flow

all_audio_features = all_audio_features.sort_values(['artist', 'song']).reset_index(drop=True)
all_audio_features.to_csv('all_audio_features_billboard_100_songs.csv', index=False)

### Merge Billboard 100 Timeseries with Songs + Metadata

In [21]:
# reload merged song dataset
# (default NA handling: blank ids/features come back as NaN)
all_audio_features = pd.read_csv('all_audio_features_billboard_100_songs.csv')

# attach the audio features to every chart entry by matching (song, artist),
# then order the result by chart date, artist and song
# use ids from df_billboard_songs or match(song, artist)
all_audio_features_billboard_100 = (
    df_billboard
    .merge(all_audio_features, on=['song', 'artist'])
    .sort_values(['date', 'artist', 'song'])
    .reset_index(drop=True)
)

all_audio_features_billboard_100.to_csv('all_audio_features_billboard_100.csv')

In [22]:
# preview the first 20 rows of the merged time series
all_audio_features_billboard_100.head(20)

Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board,id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,1958-08-04,31,Chantilly Lace,Big Bopper,,31,1,07GtDOCxmye5KDWsTSACPk,0.84,0.49,145_266.00,0.81,0.0,3.0,0.07,-6.05,1.0,0.09,172.27,4.0,0.94
1,1958-08-04,82,Blip Blop,Bill Doggett,,82,1,328wGzwVquTqX5m3t1czL0,0.46,0.7,166_826.00,0.73,0.47,10.0,0.08,-4.58,1.0,0.05,126.63,4.0,0.85
2,1958-08-04,99,I'll Get By (As Long As I Have You),Billy Williams,,99,1,,,,,,,,,,,,,,
3,1958-08-04,3,Splish Splash,Bobby Darin,,3,1,40fD7ct05FvQHLdQTgJelG,0.38,0.64,131_719.00,0.94,0.0,0.0,0.37,-1.53,1.0,0.04,147.77,4.0,0.96
4,1958-08-04,60,Over And Over,Bobby Day,,60,1,3ixHQiAUk6F6ZU1tipromq,0.7,0.64,143_320.00,0.55,0.0,0.0,0.09,-13.08,1.0,0.12,80.23,4.0,0.56
5,1958-08-04,35,Rock-in Robin,Bobby Day,,35,1,2DAgYTzfPYqKJu0uItsNMd,0.58,0.53,154_627.00,0.79,0.0,7.0,0.17,-7.09,1.0,0.19,171.55,4.0,0.89
6,1958-08-04,78,Betty Lou Got A New Pair Of Shoes,Bobby Freeman,,78,1,7h7U3OYppI7HpgFqCc2VZ3,0.22,0.43,149_200.00,0.62,0.0,2.0,0.15,-13.57,1.0,0.05,165.84,4.0,0.93
7,1958-08-04,20,Do You Want To Dance,Bobby Freeman,,20,1,4wXPZBafMKbTtdOB7BVGcp,0.52,0.62,165_693.00,0.44,0.0,0.0,0.15,-12.0,1.0,0.05,155.53,4.0,0.96
8,1958-08-04,40,Crazy Eyes For You,Bobby Hamilton,,40,1,,,,,,,,,,,,,,
9,1958-08-04,61,Itchy Twitchy Feeling,Bobby Hendricks,,61,1,36CvDUCDMDbp3dZidKxTds,0.81,0.54,149_200.00,0.71,0.0,2.0,0.06,-5.42,1.0,0.1,73.16,4.0,0.96


### Merge All Songs With Metadata

In [41]:
# TODO
# df_8M_songs, append
    # df_1M_songs
    # df_billboard_songs
    # remove duplicates, sort, reset index

# ids of the songs in the billboard 100 list (blank id removed)
set_billboard_all = set(all_audio_features['id']) - {''}

# 8M songs not in the billboard 100 songs
df1 = df_8M_songs[~df_8M_songs['id'].isin(set_billboard_all)]
# every id covered so far: billboard ids plus the 8M ids just selected
set1 = set_billboard_all.union(df1['id'])

# 1.2M songs (csv) whose id is not covered yet
df2 = df_1M_songs[~df_1M_songs['id'].isin(set1)]

# only billboard songs with known spotify ids
df0 = all_audio_features[all_audio_features['id'].notnull()]

# concat all three sources, then keep the first row for every id
every_song_with_data = (
    pd.concat([df0, df1, df2])
    .reset_index(drop=True)
    .drop_duplicates(subset='id', keep='first')
)


In [44]:
# save the combined dataset (index=False is not passed, so the index is written as the first column)
every_song_with_data.to_csv('every_song_with_data.csv')

In [45]:
# (rows, columns) of the combined dataset
every_song_with_data.shape

(9595992, 16)

### Check How Many Songs Are Accounted For:  75%

In [48]:
# check the percentage of songs accounted for 
# from songs list:  
# tuple shown: (songs with a non-null id, all songs, fraction of songs with an id)

(
    all_audio_features[all_audio_features.id.notnull()].shape[0], 
    all_audio_features.shape[0], 
    all_audio_features[all_audio_features.id.notnull()].shape[0] / all_audio_features.shape[0]
)

(22189, 29681, 0.7475826286176341)

In [47]:
# check the percentage of songs accounted for
# from billboard timeseries: all_audio_features_billboard_100
# tuple shown: (chart rows with a non-null id, all chart rows, fraction of rows with an id)

(
    all_audio_features_billboard_100[all_audio_features_billboard_100.id.notnull()].shape[0], 
    all_audio_features_billboard_100.shape[0], 
    all_audio_features_billboard_100[all_audio_features_billboard_100.id.notnull()].shape[0] / all_audio_features_billboard_100.shape[0]
)

(253254, 329930, 0.7675991877064832)

# QA check vs QA dataframe

In [None]:
# file names for the QA comparison (not loaded in this cell)
check_vs = 'every_song_with_data.csv'  # combined dataset written by an earlier cell
qa_check = 'QA_DATAFRAME.csv'          # API spot-check sample written by the QA cell

