# Unique Artists

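Read the playlist data files, collect the unique artists (name and URI) that appear in them, fetch each artist's genres from the Spotify API, and save the result to JSON files.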

In [1]:
import json
from pathlib import Path
from operator import itemgetter

## Read all data files

In [2]:
#Recursively walk the data folder; glob('**/*') also yields directories, so keep only files
data_folder = Path("../data/data/").glob('**/*')
#glob() returns entries in arbitrary filesystem order; sort so that the batches
#built from slices of this list are identical from run to run
files = sorted(x for x in data_folder if x.is_file())

In [3]:
def readBatch(batchOfFiles):
    """Read several playlist JSON files and collect the artists they mention.

    Every file must contain a top-level 'playlists' list; each playlist has a
    'tracks' list, and each track has 'artist_name' and 'artist_uri' keys.

    @Param batchOfFiles - iterable of paths (str or Path), one per JSON file
    @Returns listOfArtists - dict mapping artist name -> artist uri

    Artists are keyed by name, so when the same name appears with different
    uris only the first uri encountered is kept.
    Raises KeyError if a file does not follow the structure described above.
    """
    listOfArtists = dict()

    #Get all tracks from the batchOfFiles
    for file in batchOfFiles:
        #Decode explicitly as UTF-8 so non-ASCII artist names are read
        #correctly regardless of the platform's default encoding
        with open(file, encoding='utf-8') as f:
            data = json.load(f)

        #The file is fully parsed at this point, so it no longer needs to stay open
        for playlist in data['playlists']:
            for track in playlist['tracks']:
                if track['artist_name'] not in listOfArtists:
                    listOfArtists[track['artist_name']] = track['artist_uri']
    return listOfArtists
    

In [4]:
def combineBatches(batch1, batch2):
    """Merge batch2 into batch1 in place.

    Keys already present in batch1 keep their current value; only keys that
    are missing from batch1 are copied over. batch2 is not modified and
    nothing is returned.
    """
    for key, value in batch2.items():
        #setdefault writes only when the key is absent, so batch1 entries win
        batch1.setdefault(key, value)

In [5]:
%%time

#Reads the files in batches
numOfBatches = 10
totalFiles = 1000
x = round(totalFiles / numOfBatches)

artistsSet = dict()

for batch in range(numOfBatches):
    startFile = (batch * x)
    endFile = ((batch + 1) * x)
    artists = readBatch(files[startFile:endFile])
    origTotal = len(artistsSet)
    combineBatches(artistsSet, artists)
    print("Finished Reading files:",startFile,'-',endFile - 1, \
        " | Artists Added:", len(artistsSet) - origTotal)
    

Finished Reading files: 0 - 99  | Artists Added: 107167
Finished Reading files: 100 - 199  | Artists Added: 39520
Finished Reading files: 200 - 299  | Artists Added: 28413
Finished Reading files: 300 - 399  | Artists Added: 23334
Finished Reading files: 400 - 499  | Artists Added: 19160
Finished Reading files: 500 - 599  | Artists Added: 17427
Finished Reading files: 600 - 699  | Artists Added: 14728
Finished Reading files: 700 - 799  | Artists Added: 13320
Finished Reading files: 800 - 899  | Artists Added: 12647
Finished Reading files: 900 - 999  | Artists Added: 12026
CPU times: user 2min 22s, sys: 36.8 s, total: 2min 59s
Wall time: 6min


In [6]:
#Total number of distinct artist names collected across all batches
print(len(artistsSet))

287742


## Fetch genres from Spotify API

In [7]:
import os
import sys

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

#Credentials must never be stored in the notebook: read them from the
#environment instead. Any client secret that was previously committed in this
#notebook should be treated as leaked and rotated in the Spotify dashboard.
try:
    SPOTIPY_CLIENT_ID = os.environ["SPOTIPY_CLIENT_ID"]
    SPOTIPY_CLIENT_SECRET = os.environ["SPOTIPY_CLIENT_SECRET"]
except KeyError as missing:
    #Fail early with an actionable message instead of an authentication error later
    raise RuntimeError(
        "Environment variable {} is not set; export SPOTIPY_CLIENT_ID and "
        "SPOTIPY_CLIENT_SECRET before running this cell.".format(missing.args[0])
    ) from None

#Client-credentials flow (no user login); each HTTP request gives up after 5 seconds
spotify = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(SPOTIPY_CLIENT_ID,SPOTIPY_CLIENT_SECRET),
    requests_timeout=5)

In [8]:
%%time

#Convert the list of artists to json
#Fetches genres from api and save to artist obj
uniqueArtists = []
totalArtists = len(artistsSet)

for i, artist in enumerate(artistsSet):
    obj = dict()
    obj['artist_name'] = artist
    obj['artist_uri'] = artistsSet[artist]
    try:
        obj['genres'] = spotify.artist(artistsSet[artist])['genres']
    except:
        #If request times out
        obj['genres'] = None
    uniqueArtists.append(obj)

    if (i + 1) % 10000 == 0:
        print('{} artists added.'.format(i+1))



10000 artists added.
20000 artists added.
30000 artists added.
40000 artists added.
50000 artists added.
60000 artists added.
70000 artists added.
80000 artists added.
90000 artists added.
100000 artists added.
110000 artists added.
120000 artists added.
130000 artists added.
140000 artists added.
150000 artists added.
160000 artists added.
170000 artists added.
180000 artists added.
190000 artists added.
200000 artists added.
210000 artists added.
220000 artists added.
230000 artists added.
240000 artists added.
250000 artists added.
260000 artists added.
270000 artists added.
280000 artists added.
CPU times: user 13min 53s, sys: 2min 5s, total: 15min 58s
Wall time: 7h 55min 29s


In [9]:
#Sanity check: one record is appended per collected artist, so this should equal len(artistsSet)
len(uniqueArtists)

287742

In [10]:
%%time

noGenres = list(filter(lambda x: x['genres'] is None or len(x['genres']) == 0, uniqueArtists))

nulls = list(filter(lambda x: x['genres'] is None, noGenres))
print("Total nulls:", len(nulls))

empties = list(filter(lambda x: x['genres'] is not None and len(x['genres']) == 0, noGenres))
print("Total empties:", len(empties))

Total nulls: 3
Total empties: 166409
CPU times: user 125 ms, sys: 0 ns, total: 125 ms
Wall time: 113 ms


## Save data to json files

In [12]:
#Save artists json data to json files, each file contains at most ARTISTS_PER_FILE artists.
#Only artists with at least one genre are written.
ARTISTS_PER_FILE = 1000

#Create the output folder if needed so the first write cannot fail on a fresh checkout
outputDir = Path('../data/artists')
outputDir.mkdir(parents=True, exist_ok=True)

artistsGenres = list(filter(lambda x: x['genres'] is not None and len(x['genres']) > 0, uniqueArtists))

newTotal = len(artistsGenres)

#One pass handles full files and the final partial file alike:
# - 'num_artists' is the real length of the chunk (the old 'newTotal % 1000'
#   reported 0 when newTotal was an exact multiple of 1000)
# - the file name ends with the inclusive index of the last artist in the file
#   (the old code ended the last file name with the unfiltered artist count)
# - nothing is written when there are no artists with genres
#NOTE: a last file produced by the earlier version of this cell carries a
#different name and is not overwritten; remove it before re-running.
for start in range(0, newTotal, ARTISTS_PER_FILE):
    chunk = artistsGenres[start : start + ARTISTS_PER_FILE]

    batch = dict()
    batch['num_artists'] = len(chunk)
    batch['artists'] = chunk

    fileName = outputDir / 'artists.{}-{}.json'.format(start, start + len(chunk) - 1)
    with open(fileName, 'w') as outfile:
        json.dump(batch, outfile, indent=4)

In [13]:
#Number of artists that have at least one genre
print(newTotal)

121330
