In [1]:
import pandas as pd
import requests
import json
from urllib.parse import urlencode, quote
import asyncio
import aiohttp
import time
import sys
sys.path.append('..')
from spotify_api import SpotifyAPI

In [14]:
# Load the chart rows and keep one row per distinct (Song, Artist) pair.
original_df = pd.read_csv('../datasets/hot_100_rank_1.csv')
unique_df = original_df[['Song', 'Artist']].drop_duplicates()

# A bracketed / parenthesised / braced aside, plus any whitespace in front of it.
BRACKETED_ASIDE = r'\s*[\(\[\{].*?[\)\]\}]'


def replace_literals(values, replacements):
    """Apply literal substring replacements to a string Series, in order.

    `Series.replace({...}, regex=False)` only swaps cells whose WHOLE value
    equals a key, and with `regex=True` keys such as '$' or '|' are read as
    regex metacharacters. `Series.str.replace(..., regex=False)` does the
    plain substring replacement that is wanted here.
    """
    for old, new in replacements:
        values = values.str.replace(old, new, regex=False)
    return values


def first_four_words(values):
    """Collapse runs of whitespace and keep at most the first four words."""
    return values.str.split().str[:4].str.join(' ').str.strip()


def build_song_query(songs):
    """Normalise song titles into short, lowercase search strings."""
    cleaned = songs.str.lower()
    cleaned = cleaned.str.replace(BRACKETED_ASIDE, '', regex=True).str.rstrip()
    cleaned = replace_literals(cleaned, [(' - ', ' '), (';', ''), ('/', ' '), ('"', '')])
    cleaned = cleaned.str.replace(r'[+|?!]|\.{3}', ' ', regex=True)
    return first_four_words(cleaned)


def build_artist_query(artists):
    """Normalise artist credits into short, lowercase search strings."""
    cleaned = artists.str.lower()
    cleaned = cleaned.str.replace(BRACKETED_ASIDE, '', regex=True).str.rstrip()
    # Keep only the text before a "feat..." / "starring..." credit.
    cleaned = cleaned.str.split(r'\s*(feat|starring).*').str[0]
    cleaned = replace_literals(
        cleaned,
        [(' and ', ' '), (' x ', ' '), (' with ', ' '), ('duet', ''), (' - ', ' ')],
    )
    cleaned = cleaned.str.replace(r'[+|?]|\.{3}', ' ', regex=True)
    cleaned = replace_literals(
        cleaned,
        [('¥', ''), (':', ''), ('$', ''), ('&', ' '), ('|', ''), ('/', ' '), ('"', '')],
    )
    return first_four_words(cleaned)


# query_df keeps unique_df's index so lookup results can be aligned back to it later.
query_df = unique_df.assign(
    Song=build_song_query(unique_df['Song']),
    Artist=build_artist_query(unique_df['Artist']),
)

query_df.to_csv('../datasets/hot_100_rank_1_unique_query.csv')

In [4]:
api = SpotifyAPI(secrets_path='../secrets.json')
uris = await api.get_matching_tracks_uris(songs=query_df['Song'], artists=query_df['Artist'], delay=3, retries=3)
uris[-5:]

Generated Access Token


['spotify:track:6AI3ezQ4o3HUoP6Dhudph3',
 'spotify:track:5IZXB5IKAD2qlvTPJYDCFB',
 'spotify:track:2tHwzyyOLoWSFqYNjeVMzj',
 'spotify:track:2FQrifJ1N335Ljm3TjTVVf',
 'spotify:track:7hR22TOX3RorxJPcsz5Wbo']

In [5]:
# Attach the lookup results to the query frame (assigned positionally, one per row).
query_df['Track URI'] = uris

# Rows whose lookup returned nothing. Take an explicit copy: this frame is edited
# afterwards, and editing a boolean-mask slice of query_df is a chained assignment
# (SettingWithCopyWarning, and ambiguous as to whether query_df is modified).
nulls = query_df[query_df['Track URI'].isna()].copy()
nulls

Unnamed: 0,Song,Artist,Track URI
120,Are You Lonesome To-night?,Elvis Presley,
612,American Woman No Sugar,The Guess Who,
645,My Sweet Lord Isn't,George Harrison,
845,Then Came You,Dionne Warwicke Spinners,
862,Pick Up The Pieces,AWB,
884,The Hustle,Van McCoy The Soul City Symphony,
1382,Careless Whisper,Wham!,
2204,Independent Women Part I,Destiny's Child,


In [6]:
# Manually fix missing value lookups

pd.options.mode.chained_assignment = None

nulls['Song'] = nulls['Song'].str.replace(" To-night?", " Tonight", regex=False)
nulls['Song'] = nulls['Song'].str.replace(" No Sugar", "", regex=False)
nulls['Song'] = nulls['Song'].str.replace(" Isn't It A Pity", "", regex=False)
nulls['Song'] = nulls['Song'].str.replace(" Part I", "", regex=False)
nulls['Song'] = nulls['Song'].str.replace(" Isn't", "", regex=False)

nulls['Artist'] = nulls['Artist'].str.replace("Warwicke", "Warwick", regex=False)
nulls['Artist'] = nulls['Artist'].str.replace(" Spinners", "", regex=False)
nulls['Artist'] = nulls['Artist'].str.replace("AWB", "Average White Band", regex=False)
nulls['Artist'] = nulls['Artist'].str.replace(" The Soul City Symphony", "", regex=False)
nulls['Artist'] = nulls['Artist'].str.replace("Wham!", "George Michael", regex=False)

nulls['Track URI'] = await api.get_matching_tracks_uris(songs=nulls['Song'], artists=nulls['Artist'], delay=3, retries=3)
nulls

Unnamed: 0,Song,Artist,Track URI
120,Are You Lonesome Tonight,Elvis Presley,spotify:track:4xUqqie4bBKufHtlMuZS3k
612,American Woman,The Guess Who,spotify:track:0emHuukZSuaOzOlsAWHj2W
645,My Sweet Lord,George Harrison,spotify:track:0KZodeWxqxd88F9wY1cqgs
845,Then Came You,Dionne Warwick,spotify:track:2ARBXxaM4pzXhGpFoZPAg7
862,Pick Up The Pieces,Average White Band,spotify:track:2x1LQq8lsUzAA2wNj8yjC9
884,The Hustle,Van McCoy,spotify:track:6hYT9vkr0xMjhBlaLsYq9T
1382,Careless Whisper,George Michael,spotify:track:5WDLRQ3VCdVrKw0njWe5E5
2204,Independent Women,Destiny's Child,spotify:track:69XUpOpjzDKcfdxqZebGiI


In [7]:
# Fill the gaps in query_df with the manually fixed lookups; the two Series are
# aligned on the shared row index, so only the previously missing rows change.
uri_missing = query_df['Track URI'].isna()
query_df['Track URI'] = query_df['Track URI'].mask(uri_missing, nulls['Track URI'])

# Anything still listed here has no URI even after the manual fixes.
query_df.loc[query_df['Track URI'].isna()]

Unnamed: 0,Song,Artist,Track URI


In [21]:
# Derive the bare track ID from each URI; aligned to unique_df by row index.
track_ids = query_df['Track URI'].str.replace('spotify:track:', '', regex=False)

# unique_df normally holds only Song / Artist (plus columns added here), so the
# chart columns listed below may be absent. errors='ignore' drops them when
# present instead of raising KeyError when they are not.
unique_df = (
    unique_df
    .assign(**{'Track ID': track_ids})
    .drop(
        columns=['Track URI', 'Rank', 'Last Week', 'Peak Position', 'Image URL'],
        errors='ignore',
    )
)
unique_df.to_csv('../datasets/hot_100_rank_1_unique.csv', index=False)

# The merged frame overwrites the CSV that original_df was read from. Drop any
# 'Track ID' left by a previous run first, so re-running does not produce
# 'Track ID_x' / 'Track ID_y' columns.
merged_df = (
    original_df
    .drop(columns=['Track ID'], errors='ignore')
    .merge(unique_df[['Song', 'Artist', 'Track ID']], on=['Song', 'Artist'], how='left')
)
merged_df.to_csv('../datasets/hot_100_rank_1.csv', index=False)
merged_df.head()

Unnamed: 0,Date,Song,Artist,Weeks in Charts,Track ID
0,1958-08-06,Poor Little Fool,Ricky Nelson,2,5ayybTSXNwcarDtxQKqvWX
1,1958-08-13,Nel Blu Dipinto Di Blu (Volare),Domenico Modugno,3,006Ndmw2hHxvnLbJsBFnPx
2,1958-08-20,Little Star,The Elegants,4,6xupOaBWORbDmakCdQwMRG
3,1958-08-27,Nel Blu Dipinto Di Blu (Volare),Domenico Modugno,5,006Ndmw2hHxvnLbJsBFnPx
4,1958-09-03,Nel Blu Dipinto Di Blu (Volare),Domenico Modugno,6,006Ndmw2hHxvnLbJsBFnPx
