In [1]:
# import modules
import pandas as pd
import numpy as np
import spotipy

# jupyter notebook full-width display
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting
# floats are shown with no decimals and '_' as the thousands separator
pd.set_option('display.float_format', '{:_.0f}'.format)
# NOTE: underscore separators ('_') are better than commas (',') because 
# numbers with underscores work in Python without any extra effort.
pd.set_option('display.max_columns', None)  # never truncate columns
pd.set_option('display.max_rows', 100)

### The Billboard 100
https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs

In [4]:
# Billboard Top 100 Historical Data
# via:  https://toolbox.google.com/datasetsearch
url_billboard = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'

# read the chart history and convert the 'date' column to datetimes in one chain
df_billboard = pd.read_csv(url_billboard).assign(
    date=lambda charts: pd.to_datetime(charts['date'])
)

### Unique Songs from The Billboard 100 Dataset

In [5]:
# one row per distinct (song, artist) pair from the chart history,
# ordered by artist then song, with a blank 'id' column to be filled in later
df_billboard_songs = (
    df_billboard[['song', 'artist']]
    .drop_duplicates()
    .sort_values(['artist', 'song'])
    .reset_index(drop=True)
    .assign(id='')
)

df_billboard_songs.shape

(29681, 3)

### ID Scraping STEP 1. 1.2M Songs with Metadata (csv)
https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs

In [6]:
# Spotify 1.2M+ Songs
# via:  https://www.kaggle.com/datasets/
url_1M_songs = r'D:\RYERSON\820\Datasets\Spotify 1.2M+ Songs\tracks_features.csv'

# create the dataframe with the large number of songs metadata
df_1M_songs = pd.read_csv(url_1M_songs)

# collect the song ids from the 1M dataset
# A set is used because these ids are tested with `in` once per search result
# in the scraping loops; on a list each of those tests scans every element.
# `Series.isin` accepts a set as well, so the later count cells are unaffected.
metadata_ids = set(df_1M_songs.id)

### Get Spotify IDs using API
(to join with large datasets)

https://developer.spotify.com/terms/

##### Useful Spotify API Features

https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-audio-features

https://developer.spotify.com/documentation/web-api/reference/#/operations/get-recommendations

##### Other Links

my app:
https://developer.spotify.com/dashboard/applications/cd5ce2cb690543ff9967e817d4665543

a tutorial: 
https://www.youtube.com/watch?v=cU8YH2rhN6A&ab_channel=Elbert

### TEMPORARY TOKEN WORKFLOW

##### get a temporary authorization token from: https://developer.spotify.com/console/get-search-item

In [None]:
# input the temporary token
# getpass prompts without echoing the typed value, so the token does not end
# up in the saved cell output the way it does with input()
from getpass import getpass
TEMP_TOKEN = getpass('Enter token: ')

# create a spotify object
spotify = spotipy.Spotify(auth=TEMP_TOKEN)

In [None]:
# helper function
def find_id(track_title, artist_name, metadata_ids, client=None):
    """
    Search Spotify for a track and return one Spotify track id, or None.

    Parameters
    ----------
    track_title : str
        Song title, placed after 'track:' in the search query.
    artist_name : str
        Artist name, placed after 'artist:' in the search query.
    metadata_ids : container of str
        Track ids available in the local metadata dataset. Only `in` is used
        on it, so a set gives the fastest lookups.
    client : object, optional
        Anything exposing `search(q=..., type=...)` that returns the
        {'tracks': {'items': [{'id': ...}, ...]}} structure read below.
        Defaults to the notebook-level `spotify` object.

    Returns
    -------
    str or None
        None when the search returns no items. Otherwise the id of the first
        result that is present in `metadata_ids`; if no result is present
        there, the id of the first result.

    Notes (from manual testing by the original author)
    --------------------------------------------------
    - for searches with multiple results, all ids were identical in the
      test cases that were run
    - searches with no results were confirmed by spot checks in the Spotify
      player to be songs that are not on Spotify
    - some tracks raise a 404 error even though they seem to exist in
      Spotify; the cause is unknown and the error is left to the caller
    """
    if client is None:
        client = spotify

    track_info = client.search(q='artist:' + artist_name + ' track:' + track_title, type='track')
    results = track_info['tracks']['items']

    if not results:  # track doesn't exist on Spotify
        return None

    # prefer the earliest result we also have metadata for, stopping at the
    # first hit (the same rule find_exact_id uses) instead of letting later
    # results overwrite earlier ones
    for result in results:
        if result['id'] in metadata_ids:
            return result['id']

    # no result is in the metadata: fall back to the first result
    return results[0]['id']

In [None]:
# TEST
# smoke test: runs one search for a title containing an apostrophe and
# parentheses and displays the id that find_id returns
find_id("You Can't Turn Me Off (In The Middle Of Turning Me On)", 'High Inergy', metadata_ids)

### Add Spotify IDs to billboard songs using the 1.2M song dataset

In [None]:
# # load saved csv if required
# df_billboard_songs = pd.read_csv('df_billboard_songs.csv', keep_default_na=False)
# df_billboard_songs.id.nunique()

In [None]:
# start over at
start_over_at = 22953

# populate df_billboard_songs with ids, where available
for i, row in df_billboard_songs.iterrows():
    
    # start over at
    if i < start_over_at-1:
        continue
    
    # show status
    if i%10 == 0:
        print(i, end='  ')
    if i%100 == 0:
        print()
        
    # start over where we finished: rows that already hold a value are skipped
    # NOTE(review): a None result is written to csv as an empty field, so after
    # a reload "not on Spotify" and "not searched yet" both look like ''
    if df_billboard_songs.at[i, 'id'] != '':
        continue

    # read the fields by column label rather than by position
    artist = row['artist']
    song = row['song']
    try:
        # unless there is an error, this stores the id or None.
        # .at writes straight into the dataframe; the chained form
        # df['id'].iloc[i] = ... can end up assigning to a temporary copy
        df_billboard_songs.at[i, 'id'] = find_id(song, artist, metadata_ids)
    except Exception:  # any error needs to be dealt with manually
        # Exception (not a bare except) so that interrupting the kernel still
        # stops the loop instead of being recorded as an 'ERROR' row
        print('ERROR:  ', artist, song)
        df_billboard_songs.at[i, 'id'] = 'ERROR'
    # save every 1000 rows, if new
    if i%1000 == 0:
        df_billboard_songs.to_csv('df_billboard_songs_TEMP.csv', index=False)
        
# save final dataframe
df_billboard_songs.to_csv('df_billboard_songs.csv', index=False)

In [None]:
# number of rows whose id is no longer the blank placeholder
int((df_billboard_songs['id'] != "").sum())

In [None]:
# number of rows whose id also appears in metadata_ids
int(df_billboard_songs['id'].isin(metadata_ids).sum())

# which have spotify ids, but don't match metadata_ids?
# consider rechecking with metadata_ids_SQL

### ID Scraping STEP 2.  Add Spotify IDs to billboard songs using the 8.7M song SQL database

In [10]:
# reload the ids saved by step 1; keep_default_na=False keeps blank ids as ''
# rather than NaN, which the later `id != ''` filters rely on
df_billboard_songs = pd.read_csv('df_billboard_songs.csv', keep_default_na=False)

In [7]:
# check how many id match the ids from the SQLite database
# the file is read without a header row and its single column is named 'id'
metadata_ids_SQL = pd.read_csv('all_ids_sql.csv', header=None, names=['id'])
metadata_ids_SQL.shape

(8741672, 1)

In [8]:
# keep only the ids, as a set: find_exact_id tests `track_id in metadata_ids`
# for every search result, and on a list of this size each test scans every
# element. `Series.isin` accepts a set too.
# NOTE: this rebinds the name from a DataFrame to a set, so the cell cannot be
# re-run without first re-running the cell that reads all_ids_sql.csv
metadata_ids_SQL = set(metadata_ids_SQL.id)

In [11]:
# number of rows whose id also appears in metadata_ids_SQL
int(df_billboard_songs['id'].isin(metadata_ids_SQL).sum())

14313

In [14]:
# songs that have a Spotify id which appears in neither metadata dataset
df_missing = (
    df_billboard_songs
    # not in the SQL database ids
    .loc[lambda songs: ~songs.id.isin(metadata_ids_SQL)]
    # not in the step 1 ids either (if in metadata_ids, we have it from step 1)
    .loc[lambda songs: ~songs.id.isin(metadata_ids)]
    # drop rows with no id and rows whose lookup errored
    .loc[lambda songs: (songs.id != '') & (songs.id != 'ERROR')]
    # the old index becomes an 'index' column, kept for merging later
    .reset_index()
)
df_missing.shape

(8040, 4)

In [13]:
# preview: the 'index' column holds each row's position in df_billboard_songs
df_missing.head()

Unnamed: 0,index,song,artist,id
0,1,What Now My Love,"""Groove"" Holmes",11Aldbvo6UCcVhBzv4oUdw
1,24,Music Of My Heart,'N Sync & Gloria Estefan,5smPCwVq1pp9b9mnhKRxQj
2,78,Watch Out,2 Chainz,0w2MsTyHDjPxNqqXY5o8wb
3,95,El Trago (The Drink),2 In A Room,14gC7IS8BFlzabcoqR1iNW
4,97,Walk Tall,2 Of Clubs,6byQuxXOHoos4pB8AnzuM3


In [None]:
# new helper function
# search again for df_missing using metadata_ids_SQL

def find_exact_id(track_title, artist_name, metadata_ids, client=None):
    """
    Search Spotify for a track and return an id only if it is in `metadata_ids`.

    Parameters
    ----------
    track_title : str
        Song title, placed after 'track:' in the search query.
    artist_name : str
        Artist name, placed after 'artist:' in the search query.
    metadata_ids : container of str
        Track ids available in the local metadata dataset. Only `in` is used
        on it, so a set gives the fastest lookups.
    client : object, optional
        Anything exposing `search(q=..., type=...)` that returns the
        {'tracks': {'items': [{'id': ...}, ...]}} structure read below.
        Defaults to the notebook-level `spotify` object.

    Returns
    -------
    str
        The id of the first search result that is present in `metadata_ids`,
        or the string 'MISSING' when the search returns nothing or none of
        the results is present in `metadata_ids`.
    """
    if client is None:
        client = spotify

    track_info = client.search(q='artist:' + artist_name + ' track:' + track_title, type='track')

    # an empty result list simply skips the loop and falls through to 'MISSING'
    for result in track_info['tracks']['items']:
        if result['id'] in metadata_ids:
            return result['id']

    # track doesn't exist on Spotify, or no result is in metadata_ids
    return 'MISSING'


In [None]:
# TEST
# same call as the step 1 test (find_id against metadata_ids), shown next to
# the find_exact_id tests below for comparison
find_id("You Can't Turn Me Off (In The Middle Of Turning Me On)", 'High Inergy', metadata_ids)

In [None]:
# TEST
# same song, but only an id present in metadata_ids_SQL is accepted;
# otherwise the result is 'MISSING'
find_exact_id("You Can't Turn Me Off (In The Middle Of Turning Me On)", 'High Inergy', metadata_ids_SQL)

In [None]:
# TEST
# made-up song and artist: intended to exercise the 'MISSING' return path
find_exact_id("You can't find this song", 'not real artist', metadata_ids_SQL)

In [None]:
# start over at
start_over_at = 0

# populate df_missing with ids, where available, 'MISSING' otherwise
for i, row in df_missing.iterrows():
        
    # start over at
    if i < start_over_at-1:
        continue
        
    # show status
    if i%10 == 0:
        print(i, end='  ')
    if i%100 == 0:
        print()
        
    # read the fields by column label, so this keeps working whether or not
    # reset_index() kept the old index as a leading 'index' column
    artist = row['artist']
    song = row['song']
    
    try:
        # unless there is an error, this stores the matched id or 'MISSING'.
        # .at writes straight into the dataframe; the chained form
        # df['id'].iloc[i] = ... can end up assigning to a temporary copy
        df_missing.at[i, 'id'] = find_exact_id(song, artist, metadata_ids_SQL)
    except Exception:  # any error needs to be dealt with manually
        # Exception (not a bare except) so that interrupting the kernel still
        # stops the loop instead of being recorded as an 'ERROR' row
        print('ERROR:  ', artist, song)
        df_missing.at[i, 'id'] = 'ERROR'
    
    # save every 1000 rows
    if i%1000 == 0:
        df_missing.to_csv('df_missing_TEMP.csv', index=False)
        
# save final dataframe
df_missing.to_csv('df_missing.csv', index=False)

### Check Data and Find out What is Still Missing

In [None]:
# number of tracks on Spotify that were found in neither dataset
int((df_missing['id'] == 'MISSING').sum())

# OPTIONAL: could use API to get these sound features

In [None]:
# (rows marked 'MISSING', all other rows)
int((df_missing['id'] == 'MISSING').sum()), int((df_missing['id'] != 'MISSING').sum())
# there are 1940 better ids to be updated in df_billboard_songs
# there are 5188 songs on Spotify, that we don't have audio feature data for yet

In [None]:
# revised_ids includes better ids than df_billboard_songs
# update df_billboard_songs with this, match using the index column
# keep only rows that received a real replacement id: 'MISSING' means no match
# was found and 'ERROR' means the lookup failed, so neither is a revised id
revised_ids = df_missing[~df_missing.id.isin(['MISSING', 'ERROR'])]
revised_ids.to_csv('revised_ids.csv', index=False)
revised_ids.head()

### merging data

In [None]:
# reload the SQL database ids as a set of id strings.
# Passing the DataFrame itself to `Series.isin` would compare against its
# column labels (just 'id'), so no song id would ever match.
metadata_ids_SQL = set(pd.read_csv('all_ids_sql.csv', header=None, names=['id'])['id'])

In [None]:
# recombine df_missing with df_billboard_songs

# reload from csv
temp_missing = pd.read_csv('df_missing.csv', keep_default_na=False)
temp_missing = temp_missing[temp_missing.id == 'MISSING']
temp_revised = pd.read_csv('revised_ids.csv', keep_default_na=False)
temp_billboard = pd.read_csv('df_billboard_songs.csv', keep_default_na=False)

df_billboard_songs = temp_billboard.copy()
df_billboard_songs['MISSING'] = ''

temp_missing.shape, temp_revised.shape, temp_billboard.shape

In [None]:
# songs that have a Spotify id which appears in neither metadata dataset
df_missing = (
    df_billboard_songs
    # not in the SQL database ids
    .loc[lambda songs: ~songs.id.isin(metadata_ids_SQL)]
    # not in the step 1 ids: 9650 without this, 7128 with (2500 in smaller dataset)
    .loc[lambda songs: ~songs.id.isin(metadata_ids)]
    # drop rows with no id and rows whose lookup errored
    .loc[lambda songs: (songs.id != '') & (songs.id != 'ERROR')]
    # the old index becomes an 'index' column, kept for merging later
    .reset_index()
)
df_missing.shape

In [None]:
# spot check ten random rows (no random_state is set, so the rows differ per run)
df_missing.sample(10)

In [None]:
# values of the first column of each reloaded frame, as plain lists
# presumably this is the 'index' column created by reset_index() before the
# frames were saved, i.e. row positions in df_billboard_songs — TODO confirm
missing = temp_missing.iloc[:,0].tolist()
revised = temp_revised.iloc[:,0].tolist()

# sizes of each list and their total
len(missing), len(revised), len(missing) + len(revised)

In [None]:
# peek at the 'MISSING' status of the second row
df_billboard_songs.iloc[1]['MISSING']

In [None]:
# revise id based on index
# assumes the first column of temp_missing / temp_revised is the 'index' column
# written by reset_index(), i.e. the row label in df_billboard_songs — TODO confirm
missing_rows = set(missing)

# row label -> replacement id taken from revised_ids.csv; rows whose lookup
# failed ('ERROR') are skipped so a marker never overwrites a real id
revised_id_by_row = {
    row_label: revised_id
    for row_label, revised_id in zip(temp_revised.iloc[:, 0], temp_revised['id'])
    if revised_id != 'ERROR'
}

for i in df_billboard_songs.index:
    
    # .at writes into the dataframe itself; df.iloc[i][col] = ... assigns to a
    # temporary row and may leave the dataframe unchanged
    if i in missing_rows:
        df_billboard_songs.at[i, 'MISSING'] = 'MISSING'
    elif i in revised_id_by_row:
        # store the revised id (previously the current id was written back
        # onto itself, so nothing was ever revised)
        df_billboard_songs.at[i, 'id'] = revised_id_by_row[i]
        df_billboard_songs.at[i, 'MISSING'] = 'revised'


In [None]:
# rows flagged 'MISSING' by the merge step
error_check = df_billboard_songs[df_billboard_songs.MISSING == 'MISSING']

In [None]:
# flagged rows that have a blank id; df_missing is filtered to non-blank ids
# when it is built, so any rows shown here are unexpected
error_check[error_check.id == '']

In [None]:
# separate songs into
    # not on spotify (no id)
    # on spotify, but not either dataset (not in either metadata set)
    # matched with dataset (could distinguish using temp_revised)
    
    
 

In [None]:
# how many tracks are in spotify, but don't have data?




In [None]:
# check the percentage of songs accounted for




In [None]:
# now merge with entire billboard list



# re-count the percentage of songs accounted for in the entire set

### 8+ M. Spotify Tracks, Genre, Audio Features (SQLite)
https://www.kaggle.com/datasets/maltegrosse/8-m-spotify-tracks-genre-audio-features

In [None]:
# raw string, like the other Windows paths in this notebook: without the r
# prefix, sequences such as '\R', '\8' and '\s' are invalid escape sequences
url_8M_sql = r'D:\RYERSON\820\Datasets\8+ M. Spotify Tracks, Genre, Audio Features\spotify.sqlite'
url_8M_csv = 'all_audio_features_sql.csv'  # .gitignore (very big)


### Merge all Datasets into Billboard Features, export Billboard Features and All Features