In [57]:
import pandas as pd
import requests
import json
from urllib.parse import urlencode, quote
import asyncio
import aiohttp
import time

In [154]:
# Read the Spotify API credentials from the JSON secrets file one directory up.
with open('../secrets.json') as f:
    raw_secrets = f.read()
secrets = json.loads(raw_secrets)

# Both keys are required; a missing one raises KeyError here rather than later.
CLIENT_ID = secrets['SPOTIFY_CLIENT_ID']
CLIENT_SECRET = secrets['SPOTIFY_CLIENT_SECRET']

In [309]:
# Exchange the client credentials for an app-level bearer token.
auth_url = 'https://accounts.spotify.com/api/token'
response = requests.post(
    auth_url,
    data={
        'grant_type': 'client_credentials',
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
    },
    # requests has no default timeout; without one a stalled connection hangs the cell forever.
    timeout=30,
)
# Surface bad credentials / server errors as an HTTPError instead of an
# unrelated KeyError on 'access_token' below.
response.raise_for_status()

access_token = response.json()['access_token']
# Sent with every Web API request made later in the notebook.
headers = {'Authorization': f'Bearer {access_token}'}

In [283]:
# Root of the Spotify Web API; endpoint paths such as /v1/search are appended to it.
BASE_URL = 'https://api.spotify.com'

In [361]:
# Load the chart history and reduce it to one row per distinct (Song, Artist) pair.
original_df = pd.read_csv('../datasets/hot_100_rank_1.csv')
unique_df = original_df[['Song', 'Artist']].drop_duplicates()

# query_df holds simplified strings used only for searching; unique_df keeps the
# original spellings (same index) so results can be joined back onto original_df.
query_df = unique_df.copy()

# --- Song titles ---
# NOTE(review): Series.replace with regex=False matches whole cell values only, so this
# does not strip '...' or '?' embedded in a longer title (the '?' in
# "Are You Lonesome To-night?" survives). Left as-is because the manual-fix cell
# further down matches on " To-night?" — confirm intent before changing.
query_df['Song'] = query_df['Song'].replace({'...': '', '?': ''}, regex=False)
# With regex=True these are substring replacements: drop ';' and turn '/' into a space.
query_df['Song'] = query_df['Song'].replace({';': '', '/': ' '}, regex=True)
# Remove parenthesised text such as "(Volare)".
query_df['Song'] = query_df['Song'].str.replace(r'\(.*?\)', '', regex=True).str.rstrip()
query_df['Song'] = query_df['Song'].str.strip()

# --- Artist names ---
# Keep only the text before a credit keyword (Feat/Featuring/With/And/Starring ...).
# The \b anchors make the keywords match whole words only; without them "And"/"and"/"With"
# also matched inside names (e.g. "Andy ..." -> "", "... Band" -> "... B", "Withers" -> "").
# Feat\w* keeps longer spellings such as "Featuring" working.
query_df['Artist'] = query_df['Artist'].str.split(
    r'\s*\b(?:Feat\w*|feat\w*|With|with|And|and|Starring)\b.*', regex=True
).str[0]
query_df['Artist'] = query_df['Artist'].str.replace(r'\(.*?\)', '', regex=True).str.rstrip()
# NOTE(review): with regex=True the '$' key is the end-of-string anchor, so it does not
# remove literal dollar signs; use r'\$' if stripping them is actually wanted. Left
# unchanged because altering it would change the search queries already validated below.
query_df['Artist'] = query_df['Artist'].replace({' x ': ' ', '¥': '', ':': '', '$': '', '&': ' ', 'Duet': ''}, regex=True)
query_df['Artist'] = query_df['Artist'].str.strip()

In [300]:
async def get_song_uri_match(song, artist, session, retries=2, delay=2):
    """Search Spotify for a track and return the URI of the first match.

    The query is built from at most the first four space-separated words of
    ``song`` plus ``artist``, and asks the search endpoint for a single track.

    Args:
        song: Song title to search for.
        artist: Artist name to search for.
        session: Open ``aiohttp.ClientSession`` (or compatible object) used to
            send the request.
        retries: Maximum number of requests to send while the API keeps
            answering HTTP 429 (rate limited).
        delay: Seconds to wait after a 429 when the response carries no usable
            ``Retry-After`` header.

    Returns:
        The ``uri`` of the first returned track, or ``None`` when the search
        has no results or every attempt was rate limited.

    Raises:
        Whatever ``response.raise_for_status()`` raises for a non-429 error
        status (``aiohttp.ClientResponseError`` with a real session).
    """
    # Built separately so the f-string below has no nested same-type quotes,
    # which only parse on Python 3.12+.
    title_prefix = ' '.join(song.split(' ')[:4])
    params = {
        'q': f'track:{title_prefix} artist:{artist}',
        'type': 'track',
        'limit': 1,
    }
    url = f"{BASE_URL}/v1/search?{urlencode(params)}"

    # Bounded loop: the previous while-loop never advanced its attempt counter,
    # so a persistent 429 retried forever and `retries` had no effect.
    for attempt in range(1, retries + 1):
        async with session.get(url, headers=headers) as response:
            if response.status != 429:
                response.raise_for_status()
                data = await response.json()
                tracks = data.get('tracks', {}).get('items', [])
                return tracks[0]['uri'] if tracks else None

            # Rate limited: prefer the server-provided wait, fall back to `delay`.
            retry_after = response.headers.get('Retry-After')
            try:
                wait = float(retry_after) if retry_after is not None else delay
            except ValueError:
                wait = delay

        # Sleep after leaving the `async with` so the response is released
        # while waiting; skip the pointless sleep after the final attempt.
        if attempt < retries:
            await asyncio.sleep(max(wait, 0))

    return None

In [364]:
async def fetch_all_uris(qdf, max_concurrency=10):
    """Look up a Spotify track URI for every row of ``qdf``.

    Args:
        qdf: DataFrame with ``Song`` and ``Artist`` columns.
        max_concurrency: Upper bound on the number of searches in flight at
            the same time. Launching one request per row all at once makes
            HTTP 429 (rate limit) responses far more likely.

    Returns:
        A list with one entry per row of ``qdf``, in row order, holding
        whatever ``get_song_uri_match`` returned for that row.
    """
    limiter = asyncio.Semaphore(max_concurrency)

    async def lookup(row, session):
        # Wait for a free slot before sending this row's request.
        async with limiter:
            return await get_song_uri_match(row.Song, row.Artist, session)

    async with aiohttp.ClientSession() as session:
        lookups = [lookup(row, session) for row in qdf.itertuples(index=False)]
        # gather returns results in the order the awaitables were passed in.
        uris = await asyncio.gather(*lookups)
    return uris

uris = await fetch_all_uris(query_df)
query_df['Track URI'] = uris
nulls = query_df[query_df['Track URI'].isna()]
nulls

Unnamed: 0,Song,Artist,Track URI
120,Are You Lonesome To-night?,Elvis Presley,
612,American Woman No Sugar Tonight,The Guess Who,
645,My Sweet Lord Isn't It A Pity,George Harrison,
845,Then Came You,Dionne Warwicke Spinners,
862,Pick Up The Pieces,AWB,
884,The Hustle,Van McCoy The Soul City Symphony,
1382,Careless Whisper,Wham!,
2204,Independent Women Part I,Destiny's Child,


In [365]:
# Manually fix missing value lookups

pd.options.mode.chained_assignment = None

nulls['Song'] = nulls['Song'].str.replace(" To-night?", " Tonight", regex=False)
nulls['Song'] = nulls['Song'].str.replace(" No Sugar Tonight", "", regex=False)
nulls['Song'] = nulls['Song'].str.replace(" Isn't It A Pity", "", regex=False)
nulls['Song'] = nulls['Song'].str.replace(" Part I", "", regex=False)

nulls['Artist'] = nulls['Artist'].str.replace("Warwicke", "Warwick", regex=False)
nulls['Artist'] = nulls['Artist'].str.replace(" Spinners", "", regex=False)
nulls['Artist'] = nulls['Artist'].str.replace("AWB", "Average White Band", regex=False)
nulls['Artist'] = nulls['Artist'].str.replace(" The Soul City Symphony", "", regex=False)
nulls['Artist'] = nulls['Artist'].str.replace("Wham!", "George Michael", regex=False)

missing_uris = await fetch_all_uris(nulls)
nulls['Track URI'] = missing_uris
nulls

Unnamed: 0,Song,Artist,Track URI
120,Are You Lonesome Tonight,Elvis Presley,spotify:track:4xUqqie4bBKufHtlMuZS3k
612,American Woman,The Guess Who,spotify:track:0emHuukZSuaOzOlsAWHj2W
645,My Sweet Lord,George Harrison,spotify:track:0KZodeWxqxd88F9wY1cqgs
845,Then Came You,Dionne Warwick,spotify:track:2ARBXxaM4pzXhGpFoZPAg7
862,Pick Up The Pieces,Average White Band,spotify:track:2x1LQq8lsUzAA2wNj8yjC9
884,The Hustle,Van McCoy,spotify:track:6hYT9vkr0xMjhBlaLsYq9T
1382,Careless Whisper,George Michael,spotify:track:5WDLRQ3VCdVrKw0njWe5E5
2204,Independent Women,Destiny's Child,spotify:track:69XUpOpjzDKcfdxqZebGiI


In [366]:
# Fill the gaps in query_df with the URIs recovered by the manual retry; the
# replacement values are matched to rows by index label.
uri_is_missing = query_df['Track URI'].isna()
query_df['Track URI'] = query_df['Track URI'].mask(uri_is_missing, nulls['Track URI'])

# Should display no rows once every song has a URI.
query_df.loc[query_df['Track URI'].isna()]

Unnamed: 0,Song,Artist,Track URI


In [385]:
# Copy the resolved URIs onto the rows that still carry the original spellings
# (query_df was copied from unique_df, so the two share an index).
unique_df['Track URI'] = query_df['Track URI']

# Attach the URI to every weekly chart row via its (Song, Artist) pair and save.
uri_lookup = unique_df[['Song', 'Artist', 'Track URI']]
merged_df = pd.merge(original_df, uri_lookup, how='left', on=['Song', 'Artist'])
merged_df.to_csv('../datasets/hot_100_rank_1_with_uri.csv', index=False)
merged_df.head()

Unnamed: 0,Date,Song,Artist,Rank,Last Week,Peak Position,Weeks in Charts,Image URL,Track URI
0,1958-08-06,Poor Little Fool,Ricky Nelson,1,1,1,2,#,spotify:track:5ayybTSXNwcarDtxQKqvWX
1,1958-08-13,Nel Blu Dipinto Di Blu (Volare),Domenico Modugno,1,2,1,3,https://charts-static.billboard.com/img/1958/0...,spotify:track:006Ndmw2hHxvnLbJsBFnPx
2,1958-08-20,Little Star,The Elegants,1,2,1,4,https://charts-static.billboard.com/img/1958/0...,spotify:track:6xupOaBWORbDmakCdQwMRG
3,1958-08-27,Nel Blu Dipinto Di Blu (Volare),Domenico Modugno,1,2,1,5,https://charts-static.billboard.com/img/1958/0...,spotify:track:006Ndmw2hHxvnLbJsBFnPx
4,1958-09-03,Nel Blu Dipinto Di Blu (Volare),Domenico Modugno,1,1,1,6,https://charts-static.billboard.com/img/1958/0...,spotify:track:006Ndmw2hHxvnLbJsBFnPx


In [386]:
# Inner-join the Spotify track dataset with the chart rows on Track URI, so only
# chart rows whose track appears in top_10000_spotify.csv are kept.
spotify_df = pd.read_csv('../datasets/top_10000_spotify.csv')
spotify_merged_df = spotify_df.merge(merged_df, how='inner', on='Track URI', suffixes=('', '_drop'))

# Columns present in both frames: keep the Spotify-side copy and discard the
# chart-side duplicate, which the merge tagged with the '_drop' suffix.
duplicate_cols = [col for col in spotify_merged_df.columns if col.endswith('_drop')]
spotify_merged_df = spotify_merged_df.drop(columns=duplicate_cols)

spotify_merged_df.to_csv('../datasets/spotify_hot_100_rank_1.csv', index=False)

# Show every column of the wide merged frame in the preview.
pd.set_option('display.max_columns', None)
spotify_merged_df.head()

Unnamed: 0,Track URI,Song,Artist URI(s),Artist,Album URI,Album Name,Album Artist URI(s),Album Artist Name(s),Album Release Date,Album Image URL,Disc Number,Track Number,Track Duration (ms),Track Preview URL,Explicit,Popularity,ISRC,Added By,Added At,Artist Genres,Danceability,Energy,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature,Album Genres,Label,Copyrights,Date,Rank,Last Week,Peak Position,Weeks in Charts,Image URL
0,spotify:track:6dGWq08LXuIw6T2oUeHECh,Convoy,spotify:artist:0iTkPxRldzi5lmS6qZ70JV,C.W. McCall,spotify:album:6XbZ0Hc2LQaLJuUaFrHfTY,C.W. McCall's Greatest Hits,spotify:artist:0iTkPxRldzi5lmS6qZ70JV,C.W. McCall,1990-01-01,https://i.scdn.co/image/ab67616d0000b273183cad...,1,1,230600,https://p.scdn.co/mp3-preview/6a093f1c5ce5e657...,False,64,USPR37500034,spotify:user:bradnumber1,2021-08-08T09:26:31Z,"classic country pop,country rock,truck-driving...",0.68,0.543,5.0,-14.876,1.0,0.141,0.857,0.000116,0.0881,0.796,111.62,4.0,,Island Mercury,"C © 1990 UMG Recordings, Inc., P This Compilat...",1976-01-07,1,6,1,6,https://charts-static.billboard.com/img/1975/1...
1,spotify:track:0k2GOhqsrxDTAbFFSdNJjT,Temperature,spotify:artist:3Isy6kedDrgPYoTS1dazA9,Sean Paul,spotify:album:32Bu3ETQhR1PFCj3ndDlYf,The Trinity,spotify:artist:3Isy6kedDrgPYoTS1dazA9,Sean Paul,2005-09-26,https://i.scdn.co/image/ab67616d0000b27369ba68...,1,11,218573,https://p.scdn.co/mp3-preview/fd0b2beda757e9b8...,False,84,USAT20505520,spotify:user:bradnumber1,2021-08-08T09:26:31Z,"dance pop,dancehall,pop,pop rap",0.951,0.6,0.0,-4.675,0.0,0.0685,0.106,0.0,0.0712,0.822,125.04,4.0,,Atlantic Records,C © 2005 Atlantic Recording Corporation for th...,2006-03-29,1,2,1,11,https://charts-static.billboard.com/img/2005/1...
2,spotify:track:12q3V8ShACq2PSWINMc2rC,It's Too Late,spotify:artist:319yZVtYM9MBGqmSQnMyY6,Carole King,spotify:album:12n11cgnpjXKLeqrnIERoS,Tapestry,spotify:artist:319yZVtYM9MBGqmSQnMyY6,Carole King,1971,https://i.scdn.co/image/ab67616d0000b27323350f...,1,3,233173,https://p.scdn.co/mp3-preview/65a393c68256deea...,False,72,USSM17100509,spotify:user:bradnumber1,2021-08-08T09:26:31Z,"brill building pop,classic rock,folk,folk rock...",0.45,0.442,9.0,-12.718,0.0,0.0353,0.493,0.00564,0.134,0.812,208.282,4.0,,Ode/Epic/Legacy,P Originally released 1971. All rights reserve...,1971-06-16,1,6,1,7,https://charts-static.billboard.com/img/1962/0...
3,spotify:track:12q3V8ShACq2PSWINMc2rC,It's Too Late,spotify:artist:319yZVtYM9MBGqmSQnMyY6,Carole King,spotify:album:12n11cgnpjXKLeqrnIERoS,Tapestry,spotify:artist:319yZVtYM9MBGqmSQnMyY6,Carole King,1971,https://i.scdn.co/image/ab67616d0000b27323350f...,1,3,233173,https://p.scdn.co/mp3-preview/65a393c68256deea...,False,72,USSM17100509,spotify:user:bradnumber1,2021-08-08T09:26:31Z,"brill building pop,classic rock,folk,folk rock...",0.45,0.442,9.0,-12.718,0.0,0.0353,0.493,0.00564,0.134,0.812,208.282,4.0,,Ode/Epic/Legacy,P Originally released 1971. All rights reserve...,1971-06-23,1,1,1,8,https://charts-static.billboard.com/img/1962/0...
4,spotify:track:12q3V8ShACq2PSWINMc2rC,It's Too Late,spotify:artist:319yZVtYM9MBGqmSQnMyY6,Carole King,spotify:album:12n11cgnpjXKLeqrnIERoS,Tapestry,spotify:artist:319yZVtYM9MBGqmSQnMyY6,Carole King,1971,https://i.scdn.co/image/ab67616d0000b27323350f...,1,3,233173,https://p.scdn.co/mp3-preview/65a393c68256deea...,False,72,USSM17100509,spotify:user:bradnumber1,2021-08-08T09:26:31Z,"brill building pop,classic rock,folk,folk rock...",0.45,0.442,9.0,-12.718,0.0,0.0353,0.493,0.00564,0.134,0.812,208.282,4.0,,Ode/Epic/Legacy,P Originally released 1971. All rights reserve...,1971-06-30,1,1,1,9,https://charts-static.billboard.com/img/1962/0...
