# Imports

In [8]:
# import modules
import pandas as pd
import numpy as np
import re
from ast import literal_eval
import spotipy

# jupyter notebook full-width display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting
pd.set_option('display.float_format', '{:_.2f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

In [28]:
# helper functions

def find_all_tracks(track_title, artist_name):
    """ returns list of lists ['id', 'song', 'artist'] """
    track_info = spotify.search(q='artist:' + artist_name + ' track:' + track_title, type='track')
    
    if track_info['tracks']['items'] == []:  # if track doesn't exist on Spotify
        return 'MISSING'
    else:
        all_tracks = []
        number_of_results = len(track_info['tracks']['items'])
        
        # check if there is a better match
        for i in range(number_of_results):
            track_id = track_info['tracks']['items'][i]['id']
            artist_name = track_info['tracks']['items'][i]['artists'][0]['name']
            song_name = track_info['tracks']['items'][i]['name']
            all_tracks.append([track_id, song_name, artist_name])
        
        # if we made it through the loop without returning, note 'MISSING' and return the 0th id
        return all_tracks


def remove_punctuation(text_input):
    text_input = str(text_input)  # avoid float errors in applymap()
    text_input = re.sub(r'&', 'and', text_input)  # replaces & with 'and'
    text_input = re.compile(r'[^a-zA-Z 0-9]').sub('', text_input)
    return text_input.lower().strip()


def clean_text(text_input):
    text_input = str(text_input)  # avoid float errors in applymap()
    text_input = text_input.strip().lower()
    text_input = re.sub(r'&', 'and', text_input)  # replaces & with 'and'
    text_input = re.sub(r'and.+', '', text_input)  # removes text after the 'and'
    text_input = re.compile(r'the').sub('', text_input)  # remove all 'the' (maybe just need the 1st word?)
    text_input = re.sub(r',.+', '', text_input)  # removes all misc artists, after comma 
    text_input = re.sub(r'(?:feat).+', '', text_input)  # removes all misc artists, after 'feat' 
    text_input = re.sub(r'\(.+', '', text_input)  # removes text after first bracket
    text_input = re.sub(r'\-.+', '', text_input)  # removes text after first dash
    text_input = re.compile(r'[^a-zA-Z 0-9]').sub('', text_input)  # remove punctuation
    text_input = re.sub(' +', ' ', text_input)  # remove multiple spaces
    return text_input.strip()

In [10]:
%%time
##### SQL 8+M
"""
    SELECT * FROM tracks
    JOIN r_track_artist ON tracks.id = r_track_artist.track_id
    JOIN artists ON r_track_artist.artist_id = artists.id
    JOIN audio_features ON audio_features.id = tracks.audio_feature_id
"""
dtypes = {
    'key': 'Int16', 'mode': 'Int16', 'time_signature': 'Int16', 'tempo': 'float32', 
    'acousticness': 'float32', 'danceability': 'float32', 'duration_ms': 'Int64',  
    'energy': 'float32', 'instrumentalness': 'float32', 'liveness': 'float32', 
    'loudness': 'float32', 'speechiness': 'float32', 'valence': 'float32'
} 
df_SQL = pd.read_csv('all_audio_features_sql.csv', dtype=dtypes)

# import genre and release date data
"""
    SELECT tracks.id AS id, release_date, genre_id as genre FROM tracks
    JOIN r_albums_tracks ON tracks.id = r_albums_tracks.track_id
    JOIN albums ON r_albums_tracks.album_id = albums.id
    JOIN r_track_artist ON tracks.id = r_track_artist.track_id
    JOIN r_artist_genre ON r_track_artist.artist_id = r_artist_genre.artist_id
"""
df_genre = pd.read_csv('SQL_track_release_date_and_genre.csv')
df_genre['genre_count'] = df_genre.groupby('genre')['genre'].transform('count')  # add a count column
df_TEMP = df_genre.copy()  # create temp df, sort by most common genre, merge with SQL data
df_TEMP = df_TEMP.drop_duplicates(['id']).sort_values('genre_count', ascending=False).drop_duplicates(['id']).reset_index(drop=True)
df_SQL = df_SQL.merge(df_TEMP, on='id')

# now format and save df_genre
# NOTE: slightly different then old method, counts multiple instances of a song with multiple artists (for each artist)
df_genre = df_genre[['genre', 'genre_count']].sort_values('genre_count', ascending=False).drop_duplicates(['genre']).reset_index(drop=True)

# formatting
df_SQL['release_date'] = pd.to_datetime(df_SQL['release_date'], unit='ms', origin='unix', errors = 'coerce')
df_SQL = df_SQL.rename({
    'name:1': 'artist',
    'name': 'song',
    'duration:1': 'duration_ms'
}, axis=1)[[
    'id', 'song', 'artist', 'genre', 'release_date',
    'acousticness', 'danceability', 'duration_ms', 
    'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 
    'mode', 'speechiness', 'tempo','time_signature', 'valence'
]].reset_index(drop=True)

# NOTE: DO NOT DROP DUPLICATES YET, NEED FOR LOOKUP WITH B100

# save files as pickle
df_genre.to_pickle('df_genre.pickle')
df_SQL.to_pickle('df_SQL.pickle')

Wall time: 3min 14s


In [11]:
%%time
##### Spotify 1.2M+ Songs
url_1M_songs = r'D:\RYERSON\820\Datasets\Spotify 1.2M+ Songs\tracks_features.csv'
df_1M_songs = pd.read_csv(url_1M_songs)

# doesn't have genre (would take 1000+ to get using API, will stay NA)
df_1M_songs['genre'] = pd.NA

# explode artists
df_1M_songs['artists'] = df_1M_songs['artists'].apply(literal_eval) #convert to list type
df_1M_songs = df_1M_songs.explode('artists', ignore_index=True)

# formatting
df_1M_songs['release_date'] = pd.to_datetime(df_1M_songs['release_date'], errors = 'coerce')
df_1M_songs = df_1M_songs.rename({
    'name': 'song',
    'artists': 'artist'
}, axis=1)[[
    'id', 'song', 'artist', 'genre', 'release_date',
    'acousticness', 'danceability', 'duration_ms', 
    'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 
    'mode', 'speechiness', 'tempo','time_signature', 'valence'
]].reset_index(drop=True)

# NOTE: DO NOT DROP DUPLICATES YET, NEED FOR LOOKUP WITH B100

# save files as pickle
df_1M_songs.to_pickle('df_1M_songs.pickle')

Wall time: 21.3 s


In [36]:
%%time
##### Billboard Top 100 Historical Data
url_B100 = r'D:\RYERSON\820\Datasets\Billboard The Hot 100 Songs\charts.csv'
dtypes_timeseries = {
    'rank': 'Int16', 'last-week': 'Int16', 'peak-rank': 'Int16', 'weeks-on-board': 'Int16'
}
df_B100 = pd.read_csv(url_B100, dtype=dtypes_timeseries)
df_B100['date'] = pd.to_datetime(df_B100['date'])

# Unique Songs from The Billboard 100 Dataset
df_B100_songs = df_B100[['song', 'artist']].drop_duplicates().sort_values(['artist', 'song']).reset_index(drop=True)

# save files as pickle
df_B100.to_pickle('df_B100.pickle')
df_B100_songs.to_pickle('df_B100_songs.pickle')

Wall time: 806 ms


# Merge into Working Datasets

In [149]:
%%time
# reload data
df_genre = pd.read_pickle('df_genre.pickle')
df_SQL = pd.read_pickle('df_SQL.pickle')
df_1M_songs = pd.read_pickle('df_1M_songs.pickle')
df_B100 = pd.read_pickle('df_B100.pickle')
df_B100_songs = pd.read_pickle('df_B100_songs.pickle')

Wall time: 4.7 s


In [100]:
%%time
# merge SQL and CSV AF to get df_10M
ids_SQL = set(df_SQL['id'].to_list())
df_TEMP = df_1M_songs.copy()  
df_TEMP = df_TEMP[~df_TEMP.id.isin(ids_SQL)]  # drop songs already in SQL before merge
df_10M = pd.concat([df_SQL, df_TEMP]).reset_index(drop=True)

# drop duplicate song/artist (exact matches only)
df_10M = df_10M.sort_values('release_date').drop_duplicates(['song', 'artist']).reset_index(drop=True)

Wall time: 29.7 s


In [102]:
%%time
# add approx song and artist columns for approx match
df_10M['approx_song'] = df_10M[['song']].applymap(clean_text)
df_10M['approx_artist'] = df_10M[['artist']].applymap(clean_text)

Wall time: 2min 20s


In [150]:
%%time
# add approx song and artist columns to B100 for merging
df_B100_songs['approx_song'] = df_B100_songs[['song']].applymap(clean_text)
df_B100_songs['approx_artist'] = df_B100_songs[['artist']].applymap(clean_text)

Wall time: 417 ms


In [151]:
df_B100_songs.shape

(29681, 4)

In [152]:
%%time
# merge B100 with 10M to get AF, where available
df_B100_songs = df_B100_songs.merge(df_10M, on=['approx_song', 'approx_artist'], how='left')

Wall time: 11 s


In [153]:
df_B100_songs.shape

(297732, 22)

In [154]:
%%time
# drop blank artist+song
df_B100_songs = df_B100_songs[~((df_B100_songs.approx_song == '') & (df_B100_songs.approx_artist == ''))]

# sort by exact matches, then release_date
df_B100_songs['EXACT_MATCH'] = (df_B100_songs['song_x'] == df_B100_songs['song_y'])*1 + (df_B100_songs['artist_x'] == df_B100_songs['artist_y'])*1

# drop duplicates (keep exact matches, or oldest song)
df_B100_songs = (
    df_B100_songs.sort_values('release_date')
    .sort_values('EXACT_MATCH', ascending=False)
    .dropna(subset=['song_x', 'artist_x'], axis='rows')
    .drop_duplicates(['song_x', 'artist_x'])
    .reset_index(drop=True)
)

Wall time: 295 ms


In [155]:
df_B100_songs.shape

(29677, 23)

In [156]:
df_B100_songs.head()

Unnamed: 0,song_x,artist_x,approx_song,approx_artist,id,song_y,artist_y,genre,release_date,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,EXACT_MATCH
0,Fairweather Friend,Johnny Gill,fairwear friend,johnny gill,7a35kVTnsXzlplOrxOIN9W,Fairweather Friend,Johnny Gill,funk,1990-01-01,0.01,0.72,276_427.00,0.8,0.0,11,0.06,-8.58,1,0.05,105.12,4.0,0.79,2
1,Parking Lot Party,Lee Brice,parking lot party,lee brice,4HFNGbVmsMV5lHUvDuXLkl,Parking Lot Party,Lee Brice,contemporary country,2012-04-24,0.01,0.54,207_893.00,0.86,0.0,1,0.63,-5.76,1,0.08,171.72,4.0,0.86,2
2,Love Me All The Way,Kim Weston,love me all way,kim weston,70Ku5YKVxSyueEFiLSWDVW,Love Me All The Way,Kim Weston,classic soul,1991-01-01,0.26,0.26,172_093.00,0.65,0.0,2,0.28,-7.86,1,0.03,186.01,3.0,0.77,2
3,All True Man,Alexander O'Neal,all true man,alex,6nhCjIwKCpZCol6O4ciUTu,All True Man,Alexander O'Neal,disco,1991-01-01,0.11,0.76,304_427.00,0.7,0.0,7,0.09,-8.52,1,0.04,104.77,4.0,0.85,2
4,I'm A Fool To Care,Joe Barry,im a fool to care,joe barry,22bP0ZXqaIwaRH5aERblrj,I'm A Fool To Care,Joe Barry,swamp pop,2012-04-27,0.54,0.42,141_307.00,0.47,0.0,5,0.09,-10.63,1,0.03,87.92,4.0,0.85,2


In [123]:
(df_B100_songs['song_x'] == df_B100_songs['song_y'])*1 + (df_B100_songs['artist_x'] == df_B100_songs['artist_y'])*1

0              1
19890          2
23839          1
23840          1
23841          1
              ..
16068          0
16069          0
16070          0
16071          0
EXACT_MATCH    0
Length: 29678, dtype: int32

In [124]:
df_B100_songs.shape

(29678, 23)

In [122]:
df_B100_songs.EXACT_MATCH.describe()

count   29_677.00
mean         0.00
std          0.00
min          0.00
25%          0.00
50%          0.00
75%          0.00
max          0.00
Name: EXACT_MATCH, dtype: float64

In [80]:
# drop blank artist+song
df_B100_songs = df_B100_songs[~((df_B100_songs.approx_song == '') & (df_B100_songs.approx_artist == ''))]

# sort by exact matches, then release_date
df_B100_songs.loc['EXACT_MATCH'] = (df_B100_songs['song_x'] == df_B100_songs['song_y'])*1 + (df_B100_songs['artist_x'] == df_B100_songs['artist_y'])*1
df_B100_songs = df_B100_songs.sort_values(['EXACT_MATCH', 'release_date'])

# drop duplicates (keep exact matches, or oldest song)
df_B100_songs = df_B100_songs.sort_values('release_date').sort_values('EXACT_MATCH', ascending=False).dropna(subset=['song_x', 'artist_x'], axis='rows').drop_duplicates(['song_x', 'artist_x'])

Unnamed: 0,song_x,artist_x,approx_song,approx_artist,id,song_y,artist_y,genre,release_date,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,EXACT_MATCH
262984,Fairweather Friend,Johnny Gill,fairwear friend,johnny gill,7a35kVTnsXzlplOrxOIN9W,Fairweather Friend,Johnny Gill,funk,1990-01-01,0.01,0.72,276_427.00,0.80,0.00,11,0.06,-8.58,1,0.05,105.12,4.00,0.79,2.00
266211,Parking Lot Party,Lee Brice,parking lot party,lee brice,4HFNGbVmsMV5lHUvDuXLkl,Parking Lot Party,Lee Brice,contemporary country,2012-04-24,0.01,0.54,207_893.00,0.86,0.00,1,0.63,-5.76,1,0.08,171.72,4.00,0.86,2.00
265229,Love Me All The Way,Kim Weston,love me all way,kim weston,70Ku5YKVxSyueEFiLSWDVW,Love Me All The Way,Kim Weston,classic soul,1991-01-01,0.26,0.26,172_093.00,0.65,0.00,2,0.28,-7.86,1,0.03,186.01,3.00,0.77,2.00
1449,All True Man,Alexander O'Neal,all true man,alex,6nhCjIwKCpZCol6O4ciUTu,All True Man,Alexander O'Neal,disco,1991-01-01,0.11,0.76,304_427.00,0.70,0.00,7,0.09,-8.52,1,0.04,104.77,4.00,0.85,2.00
262005,I'm A Fool To Care,Joe Barry,im a fool to care,joe barry,22bP0ZXqaIwaRH5aERblrj,I'm A Fool To Care,Joe Barry,swamp pop,2012-04-27,0.54,0.42,141_307.00,0.47,0.00,5,0.09,-10.63,1,0.03,87.92,4.00,0.85,2.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275797,Fun,Pitbull Featuring Chris Brown,fun,pitbull,6Zo3wgDE076K2AuiGZF3CS,Fun (feat. Chris Brown),Pitbull,dance pop,2014-11-21,0.16,0.79,202_307.00,0.77,0.00,0,0.19,-3.67,0,0.05,113.97,4.00,0.55,0.00
248084,Hey Mama,"David Guetta Featuring Nicki Minaj, Bebe Rexha...",hey mama,david guetta,285HeuLxsngjFn4GGegGNm,"Hey Mama (feat. Nicki Minaj, Bebe Rexha & Afro...",David Guetta,dance pop,2014-11-21,0.24,0.60,192_560.00,0.73,0.00,9,0.32,-4.09,1,0.15,85.98,4.00,0.52,0.00
295515,I Don't Mind,Usher Featuring Juicy J,i dont mind,usher,7aXuop4Qambx5Oi3ynsKQr,I Don't Mind (feat. Juicy J),Usher,atl hip hop,2014-11-21,0.20,0.87,251_989.00,0.46,0.00,4,0.09,-8.34,1,0.18,112.97,4.00,0.46,0.00
285229,Straight From The Heart,The Allman Brothers Band,straight from heart,allman brors band,2CTXzkxa6KnziErTeFnXa6,Straight from the Heart,Allman Brothers Band,album rock,1981-08-01,0.13,0.67,225_853.00,0.84,0.00,9,0.05,-9.13,1,0.03,131.50,4.00,0.85,0.00


In [79]:
(df_B100_songs['song_x'] == df_B100_songs['song_y'])*1 + (df_B100_songs['artist_x'] == df_B100_songs['artist_y'])*1

0         0
1         0
2         1
3         0
4         0
         ..
297727    0
297728    1
297729    0
297730    1
297731    1
Length: 297732, dtype: int32

In [83]:
df_B100_songs.head()

Unnamed: 0,song_x,artist_x,approx_song,approx_artist,id,song_y,artist_y,genre,release_date,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,EXACT_MATCH
0,Misty,"""Groove"" Holmes",misty,groove holmes,,,,,NaT,,,,,,,,,,,,,,0.0
1,What Now My Love,"""Groove"" Holmes",what now my love,groove holmes,,,,,NaT,,,,,,,,,,,,,,0.0
2,May The Bird Of Paradise Fly Up Your Nose,"""Little"" Jimmy Dickens",may bird of paradise fly up your nose,little jimmy dickens,1WpoMGLjlEUHlWhilsOkJA,May The Bird Of Paradise Fly Up Your Nose,Little Jimmy Dickens,country gospel,1954-01-01,0.74,0.63,145_640.00,0.67,0.0,4.0,0.47,-13.38,1.0,0.09,104.33,4.0,0.9,1.0
3,May The Bird Of Paradise Fly Up Your Nose,"""Little"" Jimmy Dickens",may bird of paradise fly up your nose,little jimmy dickens,4KRLWRl1bFjnXhY5MgZWrM,May the Bird of Paradise Fly up Your Nose,Little Jimmy Dickens,country gospel,1965-01-01,0.74,0.66,151_693.00,0.8,0.0,4.0,0.63,-8.45,1.0,0.12,104.37,4.0,0.87,0.0
4,I Know I Know,"""Pookie"" Hudson",i know i know,pookie hudson,,,,,NaT,,,,,,,,,,,,,,0.0


In [66]:
df_B100_songs.shape

(297732, 22)

In [71]:
df_B100_songs.sample(20)

Unnamed: 0,song_x,artist_x,approx_song,approx_artist,id,song_y,artist_y,genre,release_date,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
20989,(Love Is) Thicker Than Water,Andy Gibb,,,1dRwycStiOjBAGfq6Up8BL,如果沒有你 (Without You),龍飄飄,c-pop,2010-11-01,0.42,0.61,202_137.00,0.38,0.0,11.0,0.13,-12.29,1.0,0.05,110.59,4.0,0.64
271913,I'm Not Gonna Cry Anymore,Nancy Brooks,im not gonna cry anymore,nancy brooks,,,,,NaT,,,,,,,,,,,,,
23507,(Love Is) Thicker Than Water,Andy Gibb,,,5YswYBFwXnR7e7G1gus1Dd,赤道直下型の誘惑,渡辺桂子,idol kayo,2012-02-08,0.25,0.67,189_693.00,0.87,0.0,4.0,0.14,-3.38,0.0,0.04,156.55,4.0,0.88
133547,(Where Do I Begin) Love Story,Andy Williams,,,7vKUoCZ74zknlYgnI04W2r,Волшебник Изумрудного города: Нас не пугает тр...,Оперно-симфонический оркестр п/у Юрия Арановича,detskie pesni,2007-01-01,0.87,0.53,244_827.00,0.11,0.0,5.0,0.43,-17.75,0.0,0.5,79.93,3.0,0.41
34940,(Love Is) Thicker Than Water,Andy Gibb,,,3i2ggrziUiJJguNCXynBjr,愛人緊回頭,小鳳鳳,hokkien pop,2015-07-20,0.39,0.51,228_800.00,0.64,0.0,10.0,0.17,-4.08,0.0,0.03,165.72,4.0,0.76
51267,(Love Is) Thicker Than Water,Andy Gibb,,,5hkaFqz1HiBdN6Vwxyj09G,夢なんかじゃ飯は喰えないと誰かのせいにして,藤川千愛,anime,2019-05-05,0.04,0.65,277_107.00,0.77,0.0,7.0,0.09,-2.37,1.0,0.03,118.03,4.0,0.62
66663,(Our Love) Don't Throw It All Away,Andy Gibb,,,0g8lk8AuE5NxnJ2w8Lf7UE,若水,南京華夏樂團,chinese instrumental,1995-04-24,0.91,0.61,533_667.00,0.12,0.83,2.0,0.08,-16.76,1.0,0.03,120.06,4.0,0.31
163512,(Where Do I Begin) Love Story,Andy Williams,,,17uK8pQDXKMMY0UqBcIpOs,Никогда,МЫ,russian alt pop,2018-04-06,0.17,0.68,213_043.00,0.62,0.0,5.0,0.16,-10.04,0.0,0.05,138.03,4.0,0.28
106365,(Our Love) Don't Throw It All Away,Andy Gibb,,,429fZokkOwcjqKQcJjNCcB,נעליים מארח את מילקי,שגב,israeli trap,2018-09-06,0.24,0.69,177_000.00,0.71,0.0,0.0,0.13,-8.66,1.0,0.05,160.0,4.0,0.66
36243,(Love Is) Thicker Than Water,Andy Gibb,,,3hrgME9LJ1gRssEBFyccv0,Πατέρα πονεμένε,Κώστας Καραπαναγιωτίδης,pontian folk,2015-12-15,0.7,0.3,156_573.00,0.5,0.0,0.0,0.39,-7.22,0.0,0.06,178.63,3.0,0.43


# Spotify API - GET missing data

##### get a temporary authorization token from: https://developer.spotify.com/console/get-search-item

In [None]:
# input the temporary token
TEMP_TOKEN = input('Enter token: ')

# create a spotify object
spotify = spotipy.Spotify(auth=TEMP_TOKEN)

In [None]:
%%time
# loop to get id where missing

counter = 0

for i, row in df_B100_songs.iterrows():
    
    if i < 0:  # where we timed out last time
        continue
        
    if counter % 10 == 0:
        print(counter, end=' ')
    if counter % 100 == 0:
        print()
        df_B100_songs.to_pickle('df_B100_songs_AF_TEMP.pickle') # save temp file
    
    counter += 1
    
    if df_B100_songs.loc[df_B100_songs.index[i], 'id'] not None:
        continue    
    
    # these are the actual song and artist from the Billboard Hot 100
    song = df_B100_songs.loc[df_B100_songs.index[i], 'song']
    artist = df_B100_songs.loc[df_B100_songs.index[i], 'artist']
    
    # get all track info from Spotify API matching 'song' and 'artist'
    all_track_info = find_all_tracks(song, artist)
    
    # restart loop if there are no results
    if all_track_info == 'MISSING':
        continue
    
    is_exact_match = False
    
    # first subloop - check for direct matched
    for track_info in all_track_info:
        temp_id = track_info[0]
        temp_song = track_info[1]
        temp_artist = track_info[2]
        
        # if there is an exact text match
        is_exact_match = remove_punctuation(temp_song) == remove_punctuation(song) and \
                         remove_punctuation(temp_artist) == remove_punctuation(artist)
        
        # continue subloop
        if is_exact_match:
            df_B100_songs.loc[df_B100_songs.index[i], 'CORRECT_ID'] = temp_id
            df_B100_songs.loc[df_B100_songs.index[i], 'id_status'] = 'CONFIRMED'
            continue
    
    # if we confirmed, go to the next row item
    if df_B100_songs.loc[df_B100_songs.index[i], 'id_status'] != 'CONFIRMED':
            
        # second subloop - check for indirect matches (shouldn't run if direct match occurs)
        for track_info in all_track_info:
            temp_id = track_info[0]
            temp_song = track_info[1]
            temp_artist = track_info[2]

            # if there is a probable text match
            is_probable_match = clean_text(temp_song) == clean_text(song) and \
                                clean_text(temp_artist) == clean_text(artist)

            if is_probable_match:
                df_B100_songs.loc[df_QA2.index[i], 'PROBABLE_ID'] = temp_id
                df_B100_songs.loc[df_QA2.index[i], 'id_status'] = 'LIKELY'
                continue
            

df_B100_songs.to_pickle('df_B100_songs_AF_COMPLETE.pickle')

In [None]:
%%time
# loop to get genre, release_date, and audio features

how_many_passes = 0

for i, row in df_need_AF.iterrows():
    # not sure if this is too many GETs...
    
    # show status update
    if i%10 == 0:
        print(i, end='  ')
    if i%100 == 0:
        print()

    # Get Audio Features - 1st GET request
    list_of_features = [
        'acousticness', 'danceability', 'duration_ms', 'energy',
        'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
        'speechiness', 'tempo', 'time_signature', 'valence'
    ]
    
    track_id = df_need_AF['id'].iloc[i]
    temp_audio_features = spotify.audio_features(track_id)
    for key in list_of_features:
        df_need_AF.loc[i, key] = temp_audio_features[0][key]

    # Get Release Date - 2nd GET request
    track_info = spotify.track(track_id)
    df_need_AF.loc[i, 'release_date'] = track_info['album']['release_date']

    # Get Release Date Genre - 3rd GET request
    artist_id = track_info['artists'][0]['id']
    artist_info = spotify.artist(artist_id)
    list_of_artist_genres = artist_info['genres']

    try:
        most_common_genre = list_of_artist_genres[0] # default to first genre
        if len(list_of_artist_genres) == 1:
            pass
        else:
            for genre in list_of_ordered_genres:
                if genre in list_of_artist_genres:
                    most_common_genre = genre
                    break
        df_need_AF.loc[i, 'genre'] = most_common_genre
    except:
        how_many_passes += 1
        pass  # didn't have any genres, move on
