# Import Modules

In [1]:
# import modules
import getpass

import pandas as pd
import numpy as np
import spotipy

# jupyter notebook full-width display
# NOTE: `display` and `HTML` are imported from the public `IPython.display`
# module; importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting: show floats as whole numbers with '_' thousands separators
pd.set_option('display.float_format', '{:_.0f}'.format)
# NOTE: underscore separators ('_') are better than commas (',') because 
# numbers with underscores work in Python without any extra effort.
pd.set_option('display.max_columns', None)  # never truncate columns
pd.set_option('display.max_rows', 200)

# Import Starting Data

### The Billboard 100
https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs

In [None]:
# Billboard Top 100 Historical Data
# via:  https://toolbox.google.com/datasetsearch
url_billboard = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'

# read the chart history and convert the 'date' column to datetimes
df_billboard = pd.read_csv(url_billboard).assign(
    date=lambda charts: pd.to_datetime(charts['date'])
)

# Unique Songs from The Billboard 100 Dataset

# one row per (song, artist) pair, ordered by artist then song,
# with blank 'id' and 'MISSING' columns to be filled in later
df_billboard_songs = (
    df_billboard[['song', 'artist']]
    .drop_duplicates()
    .sort_values(['artist', 'song'])
    .reset_index(drop=True)
    .assign(id='', MISSING='')
)

df_billboard.shape, df_billboard_songs.shape

### 1.2M Songs with Metadata (csv)
https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs

In [None]:
# Spotify 1.2M+ Songs
# via:  https://www.kaggle.com/datasets/
url_1M_songs = r'D:\RYERSON\820\Datasets\Spotify 1.2M+ Songs\tracks_features.csv'

# dataframe holding the metadata for the large song collection
df_1M_songs = pd.read_csv(url_1M_songs)

# plain Python list of every song id found in the 1M dataset
metadata_ids_csv = df_1M_songs['id'].tolist()

### 8.7M Songs with Metadata (SQL)

In [None]:
# all ids from the SQLite database, read from a headerless one-column csv
# and flattened straight into a Python list
metadata_ids_SQL = pd.read_csv(
    'all_ids_sql.csv',
    header=None,
    names=['id'],
)['id'].tolist()

# audio feature data not imported yet (very large)

### Combine Ids For Datasets with Metadata

In [None]:
# ids for all of our known metadata, as a set:
# membership tests are fast and duplicates across the two sources collapse
all_metadata_ids = set(metadata_ids_csv).union(metadata_ids_SQL)
len(all_metadata_ids)

### Get Track IDs using API

### TEMPORARY TOKEN WORKFLOW

##### get a temporary authorization token from: https://developer.spotify.com/console/get-search-item

In [None]:
# input the temporary token
# getpass (instead of input) keeps the token from being echoed into the
# notebook's saved cell output
TEMP_TOKEN = getpass.getpass('Enter token: ')

# create a spotify object authenticated with that token
spotify = spotipy.Spotify(auth=TEMP_TOKEN)

In [None]:
# helper function
def find_id(track_title, artist_name, metadata_ids, client=None):
    """
    Search Spotify for a track and return its id plus a match status.

    Parameters
    ----------
    track_title : str
        Song title, inserted into the query after 'track:'.
    artist_name : str
        Artist name, inserted into the query after 'artist:'.
    metadata_ids : container of str
        Ids we already have metadata for; only tested with `in`
        (a set keeps these lookups fast).
    client : optional
        Object exposing a spotipy-style `search(q=..., type=...)` method.
        Defaults to the notebook-level `spotify` client.

    Returns
    -------
    (track_id, status) : tuple of str
        ('', 'MISSING')         the search returned no items
        (id, 'matched')         the first result whose id is in metadata_ids
        (first id, 'MISSING')   results exist but none is in metadata_ids

    Notes
    -----
    for searches with multiple results, all id were identical for the test cases I ran
    some searches return no results, in this case the song is not on spotify
        confirmed by spot checks in the spotify music player
    some tracks give a 404 error,
        these seem to exist in Spotify but 404 anyway
        not sure why the API does this; the error is not handled here and
        propagates to the caller
    """
    if client is None:
        client = spotify  # fall back to the client created in the token cell

    track_info = client.search(q='artist:' + artist_name + ' track:' + track_title, type='track')
    items = track_info['tracks']['items']

    if not items:  # if track doesn't exist on Spotify
        return '', 'MISSING'

    # prefer the first result that we already hold metadata for
    for item in items:
        if item['id'] in metadata_ids:
            return item['id'], 'matched'  # immediately return it if it's found

    # no result matched our metadata: note 'MISSING' and return the 0th id
    return items[0]['id'], 'MISSING'

In [None]:
%%time
# TEST
# time a single find_id lookup; this performs a live Spotify search request
find_id("You Can't Turn Me Off (In The Middle Of Turning Me On)", 'High Inergy', all_metadata_ids)

In [None]:
# TEST
# presumably a made-up song/artist to exercise the no-results path,
# which returns ('', 'MISSING') -- TODO confirm
find_id("You Can't Find this (SONG)", 'Low Unergy', all_metadata_ids)

### Add Spotify IDs to billboard songs matched in the datasets

In [None]:
# load saved csv if required
# keep_default_na=False: blank cells are read back as '' instead of NaN,
# which the `!= ''` checks in later cells rely on
df_billboard_songs = pd.read_csv('df_billboard_songs.csv', keep_default_na=False)
# number of distinct values in the id column (the blank '' counts as one value)
df_billboard_songs.id.nunique()

In [None]:
# start over at: rows with an index below start_over_at-1 are skipped
start_over_at = 30000

# populate df_billboard_songs with ids, where available
for i, row in df_billboard_songs.iterrows():
    
    # start over at
    if i < start_over_at - 1:
        continue
    
    # show status update
    if i % 10 == 0:
        print(i, end='  ')
    if i % 100 == 0:
        print()
    
    # start over where we finished (don't overwrite known ids)
    if df_billboard_songs.at[i, 'id'] != '':
        continue
    
    # read the columns by name rather than by position, so the lookup does
    # not depend on column order
    artist = row['artist']
    song = row['song']
    try:
        track_id, status = find_id(song, artist, all_metadata_ids)
    except Exception as error:  # all API errors treated the same; Ctrl-C still interrupts
        # if there is an error, flag the row and leave the id blank so that
        # a later run retries it
        print('ERROR:  ', artist, song, repr(error))
        df_billboard_songs.at[i, 'MISSING'] = 'ERROR'
    else:
        # write through .at on the dataframe itself; chained assignment such
        # as df[col].iloc[i] = value can end up modifying a temporary copy
        df_billboard_songs.at[i, 'id'] = track_id
        df_billboard_songs.at[i, 'MISSING'] = status
    
    # save every 1000 rows, if new
    if i % 1000 == 0:
        df_billboard_songs.to_csv('df_billboard_songs_TEMP.csv', index=False)

# save final dataframe
df_billboard_songs.to_csv('df_billboard_songs.csv', index=False)

### After Gathering all IDs

In [None]:
# reload df_billboard_songs if required
# keep_default_na=False keeps blank ids as '' (not NaN) so the `!= ''` filters below work
df_billboard_songs = pd.read_csv('df_billboard_songs.csv', keep_default_na=False)

In [None]:
# how many id have we added
# (number of distinct id values, number of rows with a non-blank id)
df_billboard_songs.id.nunique(), sum(df_billboard_songs.id != "")
# 83 duplicated ids

### Remove Duplicates / Errors

In [None]:
# rows that have an id which is shared with at least one other row;
# blank ids are excluded first, and keep=False marks every member of a
# duplicated group rather than only the repeats
duplicates = (
    df_billboard_songs
    .loc[lambda songs: songs['id'] != '']
    .loc[lambda songs: songs['id'].duplicated(keep=False)]
)

# write them out (with the original row index) for manual inspection
duplicates.to_csv('duplicated_ids.csv', index=True)

In [None]:
# # go through this manually and fix it
#     # in this case NEED TO FIX BOTH BILLBOARD DATAFRAMES

# # OR drop the corrupted rows
# # manually imputing is problematic, error prone, and time consuming

# duplicates.sort_values('id').head(200)

In [None]:
# set of duplicated ids
# (distinct id values taken from the `duplicates` rows)
duplicated_ids = set(duplicates.id)

In [None]:
# number of rows whose id is one of the duplicated ids
sum(df_billboard_songs['id'].isin(duplicated_ids))

In [None]:
# drop from billboard list of known ids 
# Every row whose id is shared with another row is flagged 'DUPLICATED' and
# has its id blanked. The mask is computed once, before either column is
# changed, and written with .loc on the dataframe itself; chained assignment
# (df[col].iloc[i] = value) can end up modifying a temporary copy.
has_duplicated_id = df_billboard_songs['id'].isin(duplicated_ids)
df_billboard_songs.loc[has_duplicated_id, 'MISSING'] = 'DUPLICATED'
df_billboard_songs.loc[has_duplicated_id, 'id'] = ''

In [None]:
# re-count rows whose id is in duplicated_ids; 0 once those ids have been blanked
sum(df_billboard_songs['id'].isin(duplicated_ids))

In [None]:
# save the current song list (without the index column) under a separate file name
df_billboard_songs.to_csv('df_billboard_songs - duplicates removed.csv', index=False)

### Which Songs Are Still Missing Audio Feature Data?

In [None]:
# reload the dataframe if required
# keep_default_na=False keeps blank ids as '' (not NaN) so the `!= ''` filters below work
df_billboard_songs = pd.read_csv('df_billboard_songs - duplicates removed.csv', keep_default_na=False)

In [None]:
# how many id have we added
# (number of distinct id values, number of rows with a non-blank id)
df_billboard_songs.id.nunique(), sum(df_billboard_songs.id != '')
# off by one because '' counts as a unique id

In [None]:
# check how many id match the metadata_ids
# (rows whose id appears in the combined all_metadata_ids set)
sum(df_billboard_songs.id.isin(all_metadata_ids))

In [None]:
# songs on spotify that we don't have audio features for yet:
# the row has a non-blank id, and that id is absent from all_metadata_ids
need_audio_features = df_billboard_songs.loc[
    lambda songs: ~songs['id'].isin(all_metadata_ids) & (songs['id'] != '')
]
need_audio_features.to_csv('need_audio_features.csv', index=False)

In [None]:
# number of songs that still need audio features
len(need_audio_features.id)

### Use API again to get missing audio features
https://developer.spotify.com/console/get-audio-features-track/

In [None]:
# input the temporary token
# getpass (instead of input) keeps the token from being echoed into the
# notebook's saved cell output
TEMP_TOKEN = getpass.getpass('Enter token: ')

# create a spotify object authenticated with that token
spotify = spotipy.Spotify(auth=TEMP_TOKEN)

In [None]:
# initialise dataframe

# audio feature fields that will be copied from the API response
list_of_features = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'time_signature',
]

# copy of the songs that need features, re-indexed from 0 (the previous index
# is kept as an 'index' column), with one blank column per audio feature
with_audio_features = (
    need_audio_features
    .copy()
    .reset_index()
    .assign(**{feature: '' for feature in list_of_features})
)

with_audio_features.head()

In [None]:
# start over at: rows with an index below start_over_at-1 are skipped
start_over_at = 1200

# audio feature fields copied from the API response into the dataframe
list_of_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature'
]

for i in with_audio_features.index:
    
    # start over at
    if i < start_over_at - 1:
        continue
    
    # show status update
    if i % 10 == 0:
        print(i, end='  ')
    if i % 100 == 0:
        print()
    
    track_id = with_audio_features.at[i, 'id']
    temp_audio_features = spotify.audio_features(track_id)
    
    # the response is a list with one entry per requested id; the entry can be
    # None when the API has no audio features for that track, so guard against
    # it instead of crashing and leave the feature columns blank
    features = temp_audio_features[0] if temp_audio_features else None
    if features is None:
        print('NO FEATURES:  ', track_id)
    else:
        for key in list_of_features:
            # write through .at on the dataframe itself; chained assignment
            # (df[col].iloc[i] = value) can end up modifying a temporary copy
            with_audio_features.at[i, key] = features[key]
    
    # checkpoint every 100 rows
    if i % 100 == 0:
        with_audio_features.to_csv('audio_features_TEMP.csv', index=True)
    
# save final df
with_audio_features.to_csv('audio_features_FINAL.csv', index=True)

### OPTIONAL QA SPOTCHECKS

In [None]:
#### TODO: QA check ####
# should I confirm that i get the same audio features as from the other datasets??
# maybe do a small spotcheck

# draw up to 100 songs whose id was matched in the existing datasets;
# random_state makes the spot-check sample reproducible across runs, and
# min() avoids an error when fewer than 100 songs are matched
matched_songs = df_billboard_songs[df_billboard_songs.MISSING == 'matched']
QA_DATAFRAME = matched_songs.sample(min(100, len(matched_songs)), random_state=42).reset_index()

# audio feature fields that will be copied from the API response
list_of_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature'
]

# one blank column per audio feature, to be filled in by the API loop
for feature in list_of_features:
    QA_DATAFRAME[feature] = ''

QA_DATAFRAME.head()

In [None]:
# start over at: rows with an index below start_over_at-1 are skipped
start_over_at = 0

# audio feature fields copied from the API response into the dataframe
list_of_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature'
]

for i in QA_DATAFRAME.index:
    
    # start over at
    if i < start_over_at - 1:
        continue
    
    # show status update
    if i % 10 == 0:
        print(i, end='  ')
    if i % 100 == 0:
        print()
    
    track_id = QA_DATAFRAME.at[i, 'id']
    temp_audio_features = spotify.audio_features(track_id)
    
    # the response is a list with one entry per requested id; the entry can be
    # None when the API has no audio features for that track, so guard against
    # it instead of crashing and leave the feature columns blank
    features = temp_audio_features[0] if temp_audio_features else None
    if features is None:
        print('NO FEATURES:  ', track_id)
    else:
        for key in list_of_features:
            # write through .at on the dataframe itself; chained assignment
            # (df[col].iloc[i] = value) can end up modifying a temporary copy
            QA_DATAFRAME.at[i, key] = features[key]
    
# save final df
QA_DATAFRAME.to_csv('QA_DATAFRAME.csv', index=True)

In [None]:
# check these later to confirm that the downloaded data is correct

# Finalize Dataset

### Reimport data from Billboard 100 and 1.2M Songs Dataset

In [5]:
# source file locations for the two raw datasets
url_billboard = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'
url_1M_songs = r'D:\RYERSON\820\Datasets\Spotify 1.2M+ Songs\tracks_features.csv'

# billboard songs (blank cells stay '' rather than NaN)
df_billboard_songs = pd.read_csv(
    'df_billboard_songs - duplicates removed.csv',
    keep_default_na=False,
)

# billboard songs with audio features from API (missing from other datasets)
df_api_features = pd.read_csv(
    'audio_features_FINAL.csv',
    keep_default_na=False,
)

# billboard time series, with the 'date' column converted to datetimes
df_billboard = pd.read_csv(url_billboard).assign(
    date=lambda charts: pd.to_datetime(charts['date'])
)

# 1.2M songs with metadata
df_1M_songs = pd.read_csv(url_1M_songs)

### 8+ M. Spotify Tracks, Genre, Audio Features (SQLite)
https://www.kaggle.com/datasets/maltegrosse/8-m-spotify-tracks-genre-audio-features

In [10]:
# raw string, like the other Windows paths in this notebook: without the r
# prefix the backslashes form invalid escape sequences (\R, \8, \D, \s),
# which Python warns about
url_8M_sql = r'D:\RYERSON\820\Datasets\8+ M. Spotify Tracks, Genre, Audio Features\spotify.sqlite'
url_8M_csv = 'all_audio_features_sql.csv'  # .gitignore (very big)

# on_bad_lines='skip' drops rows that cannot be parsed instead of raising
df_8M_songs = pd.read_csv(url_8M_csv, encoding = "ISO-8859-1", on_bad_lines='skip')

In [11]:
# (rows, columns) of the csv export as loaded
df_8M_songs.shape

(11839778, 16)

In [12]:
# eyeball 20 random rows of the loaded csv export
df_8M_songs.sample(20)
# back to SQL

Unnamed: 0,1dizvxctg9dHEyaYTFufVi,Nancy Fletcher,Gz And Hustlas (feat. Nancy Fletcher),0.164000004529953,0.6520000100135803,275893,0.8140000104904175,0,1,0.36000001430511475,-4.901000022888184,1.1,0.3100000023841858,91.88800048828125,4,0.7879999876022339
7156496,4oWlDu2x7F8GQ9w98GjWN0,Heavy Laden,Psalm 32,0,0,216216,1,0,0,0,-7,0,0,180,4,0
11513901,0Gx7Bx7MVPyLFPKT69rFfV,Sienna Dahlen,Warm Lake,1,0,262320,0,0,2,0,-15,1,0,87,4,0
9193084,4G5l7I6puco5ggGlpD0g6W,Sound Effects Library,Hand Saw on Wood,1,0,12701,1,0,7,1,-20,1,0,0,0,0
9613771,6qAD8laTICCMtr2upG9yR0,Yotto,Ghost Signal,0,0,240773,0,1,7,0,-10,0,0,142,3,0
3403209,32euzRTdKd1NYTZGgagXQ3,May Honorato,Canção de Partida,1,1,200404,0,0,3,0,-12,1,0,100,4,0
8257295,75eDGvQ2b21KZk3sI5kXI8,Cutty Ranks,Love Me Have Fi Get,0,1,227640,1,0,0,0,-13,1,0,164,4,1
1814241,4W9ztl1or5gCf1Q2OW3NOY,Sufjan Stevens,Stevens / Arr Atkinson: Suite from Run Rabbit ...,1,0,317040,0,1,6,0,-19,1,0,77,4,0
6373913,7vNhvaNU1vzZgiIPSWrCwd,Pamela Helen Stephen,The World of the Spirit (arr. P. Hindmarsh): P...,1,0,245520,0,0,0,0,-24,1,0,73,3,0
8342392,4xpetoDXnL2CTbNQSFbGUI,Gordon Macrae,Wanting You,1,0,183946,0,0,3,0,-13,1,0,92,4,0
628918,7vykAL3lFb9uNp2bjsGHFu,Eximinds,One Thing About You - Eximinds Remix,0,1,340004,1,0,1,0,-4,0,0,132,4,0


### Merge Billboard 100 with Metadata

In [None]:
# TODO
# STEP 1
# df_billboard_songs, append
    # df_8M_songs
    # df_1M_songs
    # df_api_features



In [None]:
# STEP 2
# df_billboard, append
    # df_billboard_songs
# set missing audio = np.nan
    # create dataframe of billboard songs not on spotify
    # create dataframe of billboard songs not on spotify
# sort, reset index




### Merge All Songs With Metadata

In [None]:
# TODO
# df_8M_songs, append
    # df_1M_songs
    # df_billboard_songs
    # remove duplicates, sort, reset index
    
    

In [None]:
# separate songs into
    # not on spotify (no id)
    # matched with audio features
    
    
 

In [None]:
# check the percentage of songs accounted for




In [None]:
# now merge with entire billboard list



# re-count the percentage of songs accounted for in the entire set

### Merge all Datasets into Billboard Features; Export Billboard Features and All Features

In [None]:
# QA check vs QA dataframe


