# Fetch Tracks Features from Spotify API

In [1]:
import json
from pathlib import Path
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm import tqdm
from multiprocessing.pool import ThreadPool
import numpy as np

## Load Data

In [2]:
data_folder = Path("../data/genre_playlists/").glob('**/*')
files = [x for x in data_folder if x.is_file()]

## Fetch Unique Tracks and Their Embeddings

### Read and load tracks from each playlist

In [3]:
def getSongURIs(playlist, songs):
    for track in playlist['tracks']:
        songs[track['name']] = (track['artists'][0]['name'], track['uri'])
        

In [4]:
%%time

#Create a dictinary of {track name: (artist name, track uri)}
tracks = dict()
totalTracks = 0
totalPlaylists = 0

#Read files and update tracks
for file in files:
    with open(file) as f:
        data = json.load(f)
        for playlist in data:
            totalTracks += playlist['num_tracks']
            totalPlaylists += 1
            getSongURIs(playlist, tracks)

print('Total unique songs:', len(tracks))
print('Total songs:', totalTracks)
print('Total playlists:', totalPlaylists)


Total unique songs: 337130
Total songs: 492862
Total playlists: 5035
CPU times: user 3.66 s, sys: 359 ms, total: 4.02 s
Wall time: 4.03 s


### Fetch song features using uri

In [5]:
SPOTIPY_CLIENT_ID = "f96a1f0e67bf41afa9c09b2da29d8c7c"
SPOTIPY_CLIENT_SECRET = "3c726b5c74c643de83771882afc67d2f"

spotify = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(SPOTIPY_CLIENT_ID,SPOTIPY_CLIENT_SECRET),
    requests_timeout=5)

In [6]:
#Divid tracks into chunks
chunk_indices = []

for i in range(0, len(tracks) - len(tracks) % 50000, 50000):
    chunk_indices.append((i, i + 50000))

chunk_indices.append((len(tracks) - len(tracks) % 50000, len(tracks)))

In [7]:
#Fetch song features by batch
track_items = list(tracks.items())
failed_tracks = []

#Create a list of track features
track_features = []
for i in tqdm(range(len(track_items))):
    item = track_items[i]
    try:
        features = spotify.audio_features(item[1][1])
        new_features = []
        new_features.append(item[0])   # track name
        new_features.append(item[1][0]) # artist name
        new_features.append(features[0]['danceability'])  # danceability
        new_features.append(features[0]['energy'])    # energy
        new_features.append(features[0]['key'])   # key
        new_features.append(features[0]['loudness'])    # loudness
        new_features.append(features[0]['mode'])    # mode
        new_features.append(features[0]['speechiness'])    # speechiness
        new_features.append(features[0]['acousticness'])    # acousticness
        new_features.append(features[0]['instrumentalness'])    # instrumentalness
        new_features.append(features[0]['liveness'])    # liveness
        new_features.append(features[0]['valence']) # valence
        new_features.append(features[0]['tempo'])    # tempo
        new_features.append(features[0]['duration_ms']) # duration_ms
        track_features.append(new_features)
    except:
        failed_tracks.append(item)

100%|██████████| 337130/337130 [9:18:45<00:00, 10.06it/s]


In [8]:
print(failed_tracks)

[('Chaser', ('Ill Vision', 'spotify:track:66lnY4uOKVwQ5RS4VpdZYJ')), ('سورة البقرة', ('Sheikh Mohamed al Tablawi', 'spotify:track:0kbMIeWpfvpcHPSz01qELI')), ('Midnight With Bird-Voiced Treefrogs and Cattle', ('David Michael', 'spotify:track:7Dzc44TMgUkuT4vJgzrmF0'))]


### Saving to CSV

In [12]:
#Divide data chunks
chunk_indices = []

for i in range(0, len(track_features) - len(track_features) % 10000, 10000):
    chunk_indices.append((i, i + 10000))

chunk_indices.append((len(track_features) - len(track_features) % 10000, len(track_features)))


In [13]:
#Write to CSV
import csv

for t in chunk_indices:
    with open('../data/track_features/features.{}-{}.csv'.format(t[0], t[1]), 'w+') as csvFile:
        csvWriter = csv.writer(csvFile, delimiter=',')
        csvWriter.writerow(['track_name', 'artist_name', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms'])
        csvWriter.writerows(track_features[t[0]:t[1]])