In [1]:
import json
from pathlib import Path

from operator import itemgetter
from itertools import islice

###For TESTING
import copy 

## Open and Read Dataset

Data Paths- How to read files from different path.

https://medium.com/@ageitgey/python-3-quick-tip-the-easy-way-to-deal-with-file-paths-on-windows-mac-and-linux-11a072b58d5f

In [2]:
data_folder = Path("spotify_million_playlist_dataset/data/")

file_to_open = data_folder / "mpd.slice.0-999.json"

with open(file_to_open) as f:
    data = json.load(f)

In [3]:
def countOfPlaylists(data):
    return len(data['playlists'])

In [4]:
print("Count:", countOfPlaylists(data))    

for playlist in data['playlists']:
    print(playlist['name'])

Count: 1000
Throwbacks
Awesome Playlist
korean 
mat
90s
Wedding
I Put A Spell On You
2017
BOP
old country 
abby 
VIBE
relax
sleep
90's 
New Songs
slow hands
Mom's playlist
SARAH
melancholy
mixtape
Sad Songs
fall '17
✔️
Twenty one pilots
run it
Winter 2014
smooth 
Yeet
groovy
Garage Rock
Running 2.0
acoustic
morning
good good
Good Vibes
 indie rock
Hits
tb
Country
2020
Belters
Road Trippin'
Country
taylor swift
angst
Running 2
Spring 2016
Country
Gospel
SB
Kevin
Christmas
NB
Smooth Jazz
Tennessee 
funky
spring 16
Gates
Favorite Songs
Ski
Country
hype
Awesome Mix
Elizabeth
PlayStation
Ashley
chill
xx
art class
Random Tunes
boat
spanish jams
new
CR
Run!!
winter '17
Workout
Traps 
Dope
Chill
rap
Gospel
Florida
Workout2
Gym
Zoned
relax
KPOP
80's
For the Road
R & B
Tom Petty
Catchy Songs
relax
roadtrippin
christmas
2012
Dave
bang bang. 
Disneyland
#boostyourrun
LUV
latin
JAMS
~Rando~
Party Rock!!
party people
The Piano Guys
Worship
Brasileiras
country
4/20
2016
Willie
beach
Gold Digger
Fall 

# Data Filter:
1. Remove if there are less than 10 songs in the playlist
2. Tracks occurring in less than 3 playlists are discarded.
3. Playlists with the less than 30% of tracks left after this are also removed. 

All duplicate tracks from playlists are removed. And finally, only playlists with lengths in
the range [10-5000] are retained and the rest are discarded. This leaves us with a total of 745,543 unique playlists and
2,470,756 unique tracks and 2680 unique genres.

In [5]:
test = copy.deepcopy(data)

## 1. Remove if there are less than 10 songs in the playlist

In [6]:
#Remove playlist if there are less than the minimum amount of songs in it
def delSmallPlaylist(data, minimum):
    data['playlists'] = list(filter(lambda playlist: playlist['num_tracks'] >= minimum, data['playlists']))
        

In [7]:
#Remove if less than 10 songs in the playlist:

print("Data Count:", countOfPlaylists(data))   
print("Copy Count:", countOfPlaylists(test))   

# delSmallPlaylist(test,10)

print("Data Count:", countOfPlaylists(data))   
print("Copy Count:", countOfPlaylists(test))  

Data Count: 1000
Copy Count: 1000
Data Count: 1000
Copy Count: 1000


## 2a. Get All tracks from all playlists and their frequency 
## listOfTracks = [track, artist, frequency] 

In [8]:
#Get All tracks from all playlists and their frequency [track, artist, frequency] 

#Creates a dictionary in which (track, artist) tuples are keys and frequencies are values
dictTracks = dict()
totalSongCount = 0
for playlist in test['playlists']:
    for track in playlist['tracks']:
        totalSongCount += 1
        key = (track['track_name'], track['artist_name'])
        if key not in dictTracks:
            dictTracks[key] = 1
        else:
            dictTracks[key] = dictTracks[key] + 1

for key in dictTracks:
    print(key, ':', dictTracks[key])

) : 1
('Blast Ya (feat. Barrington Levy)', 'Borgore') : 1
('Law', 'Yo Gotti') : 1
('The Illest', 'Far East Movement') : 1
('Stranger - Skrillex Remix with Tennyson & White Sea', 'Skrillex') : 1
('Jungle', 'Andre Nickatina') : 1
('Realest In the City', 'Preme') : 2
('Rewind', 'Kid Ink') : 1
('Erbody But Me', 'Tech N9ne') : 1
('Memories (Part II)', 'Big Sean') : 1
('Bling Blaww Burr (feat. Young Dolph)', 'Gucci Mane') : 1
('Gimmie Got Shot', 'YG') : 1
('About Mine', 'Kid Ink') : 1
('2 Of Amerikaz Most Wanted - Album Version (Edited)', '2Pac') : 1
('Tear It Up - Dirty', 'Yung Wun') : 1
('Pere', 'DaVido') : 1
('Ready Set Go Remix (feat. Big Boi & T.I.)', 'Killer Mike') : 1
('Know No Better (feat. Travis Scott & Quavo) - Bad Bunny Remix', 'Major Lazer') : 1
('Montreal', 'The Weeknd') : 2
('Deserve (feat. Travis Scott)', 'Kris Wu') : 1
('Take A Look Around', 'Limp Bizkit') : 1
('Wherever You Go', 'Ron Pope') : 1
('In My Bones', 'Ron Pope') : 1
('Tightrope', 'Ron Pope') : 1
('Let Me Go', 'Ron

In [9]:
print("Total Amount of Tracks:", totalSongCount)

Total Amount of Tracks: 67503


## 2b. Tracks occurring in less than 3 playlists are discarded.

In [10]:
#Check every playlist for unique tracks(tracks under 3 frequency) and "deletes" them
#If the playlist results in less than 30% of its original size or less than 10 tracks
#                   -> delete the playlist too

#Get tracks that have frequencies under 3
removeTracks = dict(filter(lambda x: x[1] < 3, dictTracks.items()))

numToDel = sum([removeTracks[k] for k in removeTracks])
numToDelUni = len(removeTracks)

print('Number of tracks to delete:', numToDel)
print('Number of unique tracks to delete:', numToDelUni)

tracksToDel = removeTracks.keys()
actualDel = 0
playlistsDel = 0
playlistsToDel = []

#Iterates through the playlists to delete tracks under 3 frequency
#Pushes the playlist pid to playlistsToDel if the new playlist needs to be deleted
for playlist in test['playlists']:
    playlist['tracks'] = list(filter(lambda track: \
        (track['track_name'], track['artist_name']) not in tracksToDel, playlist['tracks']))

    newNumTracks = len(playlist['tracks'])
    origNumTracks = playlist['num_tracks']
    actualDel += origNumTracks - newNumTracks
    playlist['num_tracks'] = newNumTracks

    if newNumTracks < 10 or newNumTracks < round(origNumTracks * .3) or newNumTracks > 5000:
        playlistsDel += 1
        playlistsToDel.append(playlist['pid'])

#Delete playlists based on playlistsToDel
test['playlists'] = list(filter(lambda playlist: playlist['pid'] not in playlistsToDel, test['playlists']))

print('Number of tracks actually deleted:', actualDel)
print('Number of playlists deleted:', len(playlistsToDel))

Number of tracks to delete: 33183
Number of unique tracks to delete: 28884
Number of tracks actually deleted: 33183
Number of playlists deleted: 353


In [11]:
print("Count:", countOfPlaylists(data))   
print("Count:", countOfPlaylists(test))  


Count: 1000
Count: 647
