In [1]:
import json
from pathlib import Path
from operator import itemgetter

## Read all data files

https://stackoverflow.com/questions/39909655/listing-of-all-files-in-directory

----------------------------------------
#### Stats given by dataset

number of playlists 1000000 <br>
number of tracks 66346428 <br>
number of unique tracks 2262292 <br>
number of unique albums 734684 <br>
number of unique artists 295860 <br>
number of unique titles 92944 <br>
number of playlists with descriptions 18760 <br>
number of unique normalized titles 17381 <br>
avg playlist length 66.346428


In [2]:
# Recursively collect every regular file below the two input directories.
# Directory traversal order depends on the OS/filesystem, so both listings are
# sorted: later cells slice `files` by position and name output files by index,
# which is only reproducible when the order is stable between runs.
data_folder = Path("../data/data/").glob('**/*')
files = sorted(x for x in data_folder if x.is_file())

artists_folder = Path("../data/artists").glob('**/*')
artists_files = sorted(x for x in artists_folder if x.is_file())

## Get all unique tracks and their frequency


### Read and grab all the data from the files
The data is separated into batches

In [3]:
def readBatch(batchOfFiles):
    """Tally how often each track occurs across a batch of playlist JSON files.

    @Param batchOfFiles - iterable of paths to JSON files; each file must hold a
        top-level 'playlists' list whose entries provide 'num_tracks' and 'tracks'
    @Returns listOfTracks - dict whose keys are (track_name, artist_name) and whose
        values are the number of times that track appears in the batch
    @Returns totalTracks - sum of the 'num_tracks' field over every playlist read
    """
    listOfTracks = dict()
    totalTracks = 0

    for file in batchOfFiles:
        # JSON text is UTF-8; stating it explicitly keeps non-ASCII track and
        # artist names from being decoded with a locale-dependent default.
        with open(file, encoding="utf-8") as f:
            data = json.load(f)

        # The file is fully parsed at this point, so the handle is already closed
        # while the (slower) tallying runs.
        for playlist in data['playlists']:
            totalTracks += playlist['num_tracks']
            simplifyList(listOfTracks, playlist['tracks'])

    return listOfTracks, totalTracks

In [4]:
def simplifyList(listOfTracks, playlist):
    """Add the tracks of one playlist to a running frequency table.

    @Param listOfTracks - dict of track frequencies,
        {(track_name, artist_name): frequency, ...}; updated in place
    @Param playlist - list of track dicts, each providing 'track_name' and 'artist_name'
    """
    for entry in playlist:
        trackKey = (entry['track_name'], entry['artist_name'])
        # A key that has not been seen yet starts from 0 before being incremented.
        listOfTracks[trackKey] = listOfTracks.get(trackKey, 0) + 1

In [5]:
%%time

#Reads the data files in batches
#Each batch yields its own {(track_name, artist_name): frequency} dictionary
#The per-batch dictionaries are kept separately in batchesOfTracks

##The recorded run took about 7.5 minutes of wall time; this depends on the environment
#The number of batches has not been tuned
#Possible improvement: read the batches concurrently

numOfBatches = 10
#Use the real number of files instead of assuming there are exactly 1000
totalFiles = len(files)
#Ceiling division: a remainder must not be dropped from the final batch
x = -(-totalFiles // numOfBatches)

#Stores one frequency dictionary per batch
#[{batch_1}, {batch_2}, ..., {batch_X}]
batchesOfTracks = []
totalTracks = 0

for batch in range(numOfBatches):
    startFile = batch * x
    #Clamp so the reported range never runs past the last file
    endFile = min((batch + 1) * x, totalFiles)
    if startFile >= endFile:
        #Fewer files than batches: nothing is left to read
        break

    tracksInBatch, amtOfTracks = readBatch(files[startFile:endFile])
    print("Finished Reading files:",startFile,'-',endFile - 1, " | Tracks Added:", amtOfTracks)

    totalTracks = totalTracks + amtOfTracks

    batchesOfTracks.append(tracksInBatch)

Finished Reading files: 0 - 99  | Tracks Added: 6685101
Finished Reading files: 100 - 199  | Tracks Added: 6622724
Finished Reading files: 200 - 299  | Tracks Added: 6616212
Finished Reading files: 300 - 399  | Tracks Added: 6597661
Finished Reading files: 400 - 499  | Tracks Added: 6597376
Finished Reading files: 500 - 599  | Tracks Added: 6594992
Finished Reading files: 600 - 699  | Tracks Added: 6667422
Finished Reading files: 700 - 799  | Tracks Added: 6672749
Finished Reading files: 800 - 899  | Tracks Added: 6645424
Finished Reading files: 900 - 999  | Tracks Added: 6646767
CPU times: user 2min 57s, sys: 39.3 s, total: 3min 36s
Wall time: 7min 23s


In [6]:
#Sanity check: the running total gathered while reading the batches should
#equal the "number of tracks" figure published with the dataset
#Target is 66346428

print(totalTracks)

66346428


### Combine the batches of data into one list

In [7]:
def combineBatchesandSimplify(batch1, batch2):
    """Merge the track frequencies of batch2 into batch1, in place.

    @param batch1 - dict {(track_name, artist_name): frequency} that receives the merged counts
    @param batch2 - dict of the same shape whose counts are added to batch1
    @Return nothing; batch1 is modified directly
    """
    for key, freq in batch2.items():
        # Keys already present are summed; new keys are copied over unchanged.
        batch1[key] = (batch1[key] + freq) if key in batch1 else freq

In [8]:
%%time

#Combine the per-batch frequency dictionaries into one dictionary
#allTracks maps (track_name, artist_name) -> total frequency over every batch

allTracks = dict()

for batch in batchesOfTracks:
    combineBatchesandSimplify(allTracks,batch)


CPU times: user 4.94 s, sys: 46.9 ms, total: 4.98 s
Wall time: 5.02 s


In [9]:
#Sanity check: add up the frequency of every entry in the merged table
# Target == 66346428

count = sum(allTracks.values())

print(count)

66346428


In [10]:
### NOT SURE WHY ITS LOWER
#NOTE(review): entries are keyed by (track_name, artist_name), so distinct tracks
#that share both a name and an artist collapse into a single entry; this may
#explain the gap if the dataset's figure counts tracks by a unique ID - TODO confirm
#Target is 2262292

print(len(allTracks))

2189699


# Loop through all tracks to get songs to delete
# and then delete Songs from the playlists

In [11]:
#A set is used so that membership tests are O(1)
#Holds the (track_name, artist_name) tuple of every track whose total frequency is below 3
tracksToDel = {key for key, freq in allTracks.items() if freq < 3}
deleteAmt = len(tracksToDel) ### For testing

In [12]:
#deleteAmt is the number of distinct (track_name, artist_name) keys marked for
#deletion, not the number of track occurrences that will be removed from playlists
print("Amount of Tracks to delete:", deleteAmt)

Amount of Tracks to delete: 1339273


## Get list of artists with genres

This is a list of unique artists based on the dataset after removing songs that occur fewer than 3 times and playlists with fewer than 10 songs or smaller than 30% of their original size

Songs with artists that don't have genre labels from Spotify API will be removed

In [14]:
#Collect the distinct artist names found across all artist JSON files
#Each file must hold a top-level 'artists' list whose entries provide 'artist_name'
artistsList = set()
totalArtists = 0
for file in artists_files:
    #JSON text is UTF-8; stating it explicitly keeps non-ASCII artist names
    #from being decoded with a locale-dependent default
    with open(file, encoding="utf-8") as f:
        data = json.load(f)
    #A generator avoids building a throwaway list for every file
    artistsList.update(artist['artist_name'] for artist in data['artists'])

print("Total artists:", len(artistsList))

Total artists: 121330


In [15]:
def delSongs(data, songsToDel, artistsList):
    """Remove unwanted tracks from every playlist in data, then drop unusable playlists.

    A track is removed when its (track_name, artist_name) pair is in songsToDel
    or when its artist_name is not in artistsList. After filtering, a playlist is
    dropped when it has fewer than 10 tracks, fewer than round(30%) of its
    original 'num_tracks', or more than 5000 tracks.

    @param data - parsed JSON dict with a 'playlists' list; modified in place
        (each playlist's 'tracks' and 'num_tracks' are rewritten, and dropped
        playlists are removed from data['playlists'])
    @param songsToDel - container of (track_name, artist_name) tuples to delete
    @param artistsList - container of artist names whose tracks may be kept
    @return (playlistsDel, tracksDeleted) - number of playlists dropped, and number
        of tracks filtered out (original 'num_tracks' minus the remaining count,
        summed over all playlists)
    """
    tracksDeleted = 0 # for testing
    playlistsDel = 0 # for testing
    playlistsToDel = set()

    for playlist in data['playlists']:
        # Single pass: keep a track only if it is not marked for deletion and
        # its artist is one of the known artists.
        playlist['tracks'] = [
            track for track in playlist['tracks']
            if (track['track_name'], track['artist_name']) not in songsToDel
            and track['artist_name'] in artistsList
        ]

        newNumTracks = len(playlist['tracks'])
        origNumTracks = playlist['num_tracks']
        tracksDeleted += origNumTracks - newNumTracks
        playlist['num_tracks'] = newNumTracks

        if newNumTracks < 10 or newNumTracks < round(origNumTracks * .3) \
                or newNumTracks > 5000:
            playlistsDel += 1
            playlistsToDel.add(playlist['pid'])

    # Rebuild the playlist list once, after every playlist has been inspected.
    # Doing this inside the loop re-filtered the whole list on each iteration
    # (quadratic work per file) while producing the same final result.
    data['playlists'] = [
        playlist for playlist in data['playlists']
        if playlist['pid'] not in playlistsToDel
    ]

    return playlistsDel, tracksDeleted #For testing | Might return new json data to save


# Delete songs

In [16]:
%%time

## The recorded run took about 23 minutes of wall time

#Runs delSongs on every data file: removes the tracks in tracksToDel and the tracks
#whose artist is not in artistsList
#If a playlist ends up below 30% of its original size or below 10 tracks
#                   -> the playlist is deleted too
#The filtered content of each file is written to ../data/filtered_data

TracksDeleted = 0 #for testing
playlistsDel = 0 #for testing

## Optimize with concurrency with/or splitting into batches

#Create the output directory up front so the first write cannot fail on a missing folder
Path('../data/filtered_data').mkdir(parents=True, exist_ok=True)

for i, file in enumerate(files):
    #JSON text is UTF-8; stating it explicitly avoids a locale-dependent default
    with open(file, encoding="utf-8") as f:
        data = json.load(f)
    #The input handle is closed here; filtering and writing do not need it

    if i % 50 == 0:
        print("Files up to", i, "cleaned")
    #x = playlists dropped in this file, y = tracks removed in this file
    x,y = delSongs(data,tracksToDel,artistsList)

    #NOTE(review): the name assumes the i-th entry of files holds playlists
    #i*1000 .. (i+1)*1000-1 - confirm against the ordering of files
    newFileName = '../data/filtered_data/filter.{}-{}.json'.format(i*1000, (i+1)*1000-1)
    with open(newFileName, 'w', encoding="utf-8") as outfile:
        json.dump(data, outfile, indent=4)

    playlistsDel = playlistsDel + x
    TracksDeleted = TracksDeleted + y

#deleteAmt counts distinct tracks while TracksDeleted counts removed occurrences
#(including tracks removed because of their artist), so the two numbers differ
print("Assumed Amt of Tracks to delete:", deleteAmt)
print("Tracks Deleted:", TracksDeleted)
print("Playlists Deleted:", playlistsDel)


        
        

Files up to 0 cleaned
Files up to 50 cleaned
Files up to 100 cleaned
Files up to 150 cleaned
Files up to 200 cleaned
Files up to 250 cleaned
Files up to 300 cleaned
Files up to 350 cleaned
Files up to 400 cleaned
Files up to 450 cleaned
Files up to 500 cleaned
Files up to 550 cleaned
Files up to 600 cleaned
Files up to 650 cleaned
Files up to 700 cleaned
Files up to 750 cleaned
Files up to 800 cleaned
Files up to 850 cleaned
Files up to 900 cleaned
Files up to 950 cleaned
Assumed Amt of Tracks to delete: 1339273
Tracks Deleted: 3651848
Playlists Deleted: 48727
CPU times: user 14min 52s, sys: 1min 7s, total: 16min
Wall time: 23min
