In [1]:
import numpy as np
import pandas as pd

# Check
Checks to see that the MPD is correct

In [None]:
# %load "Arxiu comprimit/src/check.py"
"""
   Checks to see that the MPD is correct

   Usage:
    python check.py path-to-mpd-data/

"""
import sys
import json
import string
import datetime
import os


min_tracks_per_playlist = 5
max_tracks_per_playlist = 250
min_artists_per_playlist = 3
min_albums_per_playlist = 2
max_files_for_quick_processing = 10
latest_add_ts = int(datetime.datetime(2017, 11, 1).strftime('%S')) * 1000
pids = set()

artist_names = {}
album_names = {}
track_names = {}

gstats = {
    'errors': 0
}

def process_mpd(path):
    count = 0
    filenames = os.listdir(path)
    for filename in sorted(filenames):
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            f = open(fullpath)
            js = f.read()
            f.close()
            slice = json.loads(js)
            process_info(slice['info'])
            for playlist in slice['playlists']:
                process_playlist(playlist)
            count += 1
            if quick and count > max_files_for_quick_processing:
                break

    show_summary()

def show_summary():
    tassert(len(pids) ==  1000000, "mismatched pids %d %d", len(pids), 1000000)
    missing = set()
    for pid in range(0, 1000000):
        if pid not in pids:
            missing.add(pid)
    tassert(len(missing) == 0, "missing %d pids", len(missing))

    for k,v in gstats.items():
        print (k, v)


required_fields = set(['name', 'collaborative', 'pid', 'modified_at', 'num_albums', 'num_tracks', 'num_followers',
'num_tracks', 'num_edits', 'duration_ms', 'num_artists', 'tracks'])
optional_fields = set(['description'])

required_track_fields = set(['pos', 'artist_name', 'artist_uri', 'album_uri', 'album_name', 'track_uri', 'track_name', 'duration_ms'])

def process_playlist(playlist):

    tassert(playlist['pid'] not in pids, "duplicate pid %d", playlist['pid'])
    pids.add(playlist['pid'])

    tassert(len(playlist['name']) > 0, "zero length playlist title")
    tassert(len(playlist['tracks']) >= min_tracks_per_playlist, "min tracks per playlist < %d", min_tracks_per_playlist)
   
    for field in playlist:
        tassert(field in required_fields or field in  optional_fields, "extra field %s", field) 

    for field in required_fields:
        tassert(field in playlist, "missing field %s", field)

    tassert(playlist['num_followers'] >= 1,  "too few followers %d", playlist['num_followers'])
    tassert(playlist['num_edits'] > 0,  "too few edits %d", playlist['num_edits'])
    tassert(playlist['modified_at'] <= latest_add_ts,  "modified_at too late %d", playlist['modified_at'])

    albums = set()
    artists = set()

    total_duration = 0
    for i, track in enumerate(playlist['tracks']):


        for field in track:
            tassert(field in required_track_fields, "extra track field %s", field)

        for field in required_track_fields:
            tassert(field in track, "missing track field %s", field)

        tassert(i == track['pos'], "out of order %d %d", i, track['pos'])
        artists.add(track['artist_uri'])
        albums.add(track['album_uri'])
        total_duration += track['duration_ms']

        if track['artist_uri'] not in artist_names:
            artist_names[track['artist_uri']] = track['artist_name']
        tassert(track['artist_name'] == artist_names[track['artist_uri']], 'mismatch artist name %s %s', track['artist_name'], artist_names[track['artist_uri']])

        if track['album_uri'] not in album_names:
            album_names[track['album_uri']] = track['album_name']
        tassert(track['album_name'] == album_names[track['album_uri']], 'mismatch album name %s %s', track['album_name'], album_names[track['album_uri']])

        if track['track_uri'] not in track_names:
            track_names[track['track_uri']] = track['track_name']
        tassert(track['track_name'] == track_names[track['track_uri']], 'mismatch track name %s %s', track['track_name'], track_names[track['track_uri']])

        tassert(is_track_uri(track['track_uri']), "invalid track uri %s", track['track_uri'])
        tassert(is_album_uri(track['album_uri']), "invalid album uri %s", track['album_uri'])
        tassert(is_artist_uri(track['artist_uri']), "invalid artst uri %s", track['artist_uri'])

    tassert(len(artists) >= min_artists_per_playlist, 'too few artists %d', len(artists))
    tassert(len(albums) >= min_albums_per_playlist, 'too few albums %d', len(albums))
    tassert(len(albums) == playlist['num_albums'], 'nalbum mismatch %d %d', len(albums), playlist['num_albums'])
    tassert(len(artists) == playlist['num_artists'], 'nartist mismatch %d %d', len(artists), playlist['num_artists'])
    tassert(len(playlist['tracks']) == playlist['num_tracks'], 'ntracks mismatch %d %d', len(playlist['tracks']), playlist['num_tracks'])
    tassert(total_duration == playlist['duration_ms'], "mismiatch duration %d %d", total_duration, playlist['duration_ms'])



required_info_fields = ['generated_on', 'slice', 'version']
def process_info(info):
    for field in info:
        tassert(field in required_info_fields, "extra info field %s", field)

    for field in required_info_fields:
        tassert(field in info, "missing info field %s", field)


def is_track_uri(uri):
    return uri.startswith("spotify:track:")

def is_album_uri(uri):
    return uri.startswith("spotify:album:")

def is_artist_uri(uri):
    return uri.startswith("spotify:artist:")

def tassert(condition, fmtstring, *args):
    if not condition:
        gstats['errors'] += 1
        print (fmtstring % args)
        
if __name__ == '__main__':
    path = sys.argv[1]
    quick = False
    if len(sys.argv) > 2 and sys.argv[2] == '--quick':
        quick = True
    #process_mpd(path)
    process_mpd(os.getcwd())


# Deeper Stats
* number of playlists 
* number of tracks 
* number of unique tracks 
* number of unique albums 
* number of unique artists 
* number of unique titles 
* number of unique normalized titles 
* avg playlist length 
<br><br>
* full playlist titles
* top tracks
* top artists


In [None]:
# %load "Arxiu comprimit/src/deeper_stats.py"
"""
    shows deep stats for the MPD

    usage:

        python deeper_stats.py path-to-mpd-data/
"""
import sys
import json
import re
import collections
import os

total_playlists = 0
total_tracks = 0
tracks = set()
artists = set()
albums = set()
titles = set()
ntitles = set()
full_title_histogram = collections.Counter()
title_histogram = collections.Counter()
artist_histogram = collections.Counter()
track_histogram = collections.Counter()

quick = False
max_files_for_quick_processing = 50


def process_mpd(path):
    count = 0
    filenames = os.listdir(path)
    for filename in sorted(filenames):
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            f = open(fullpath)
            js = f.read()
            f.close()
            mpd_slice = json.loads(js)
            process_info(mpd_slice['info'])
            for playlist in mpd_slice['playlists']:
                process_playlist(playlist)
            count += 1

            if quick and count > max_files_for_quick_processing:
                break

    show_summary()


def show_summary():
    print
    print ("number of playlists", total_playlists)
    print ("number of tracks", total_tracks)
    print ("number of unique tracks", len(tracks))
    print ("number of unique albums", len(albums))
    print ("number of unique artists", len(artists))
    print ("number of unique titles", len(titles))
    print ("number of unique normalized titles", len(ntitles))
    print ("avg playlist length", float(total_tracks) / total_playlists)
    print()
    print ("full playlist titles")
    for title, count in full_title_histogram.most_common():
        print ("%7d %s" % (count, title))
    print()

    print ("top playlist titles")
    for title, count in title_histogram.most_common():
        print ("%7d %s" % (count, title))
    print()

    print ("top tracks")
    for track, count in track_histogram.most_common(10000):
        print ("%7d %s" % (count, track))

    print()
    print ("top artists")
    for artist, count in artist_histogram.most_common(10000):
        print ("%7d %s" % (count, artist))


def normalize_name(name):
    name = name.lower()
    name = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@]", ' ', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name


def process_playlist(playlist):
    global total_playlists, total_tracks

    total_playlists += 1
    # print playlist['playlist_id'], playlist['name']

    titles.add(playlist['name'])
    nname = normalize_name(playlist['name'])
    ntitles.add(nname)
    title_histogram[nname] += 1
    full_title_histogram[playlist['name'].lower()] += 1

    for track in playlist['tracks']:
        total_tracks += 1
        albums.add(track['album_uri'])
        tracks.add(track['track_uri'])
        artists.add(track['artist_uri'])

        full_name = track['track_name'] + " by " + track['artist_name']
        artist_histogram[track['artist_name']] += 1
        track_histogram[full_name] += 1


def process_info(info):
    for k, v in info.items():
        print ("%-20s %s" % (k + ":", v))
    print()


if __name__ == '__main__':
    quick = True
    path = sys.argv[1]
    if len(sys.argv) > 2 and sys.argv[2] == '--quick':
        quick = True
    #process_mpd(path)
    process_mpd(os.getcwd()+"\MPD")


# Descriptions
Descriptions of the playlists that have descriptions

In [None]:
# %load "Arxiu comprimit/src/descriptions.py"

"""
    iterates over the million playlist dataset and outputs info
    about what is in there.

    Usage:

        python stats.py path-to-mpd-data
"""
import sys
import json
import re
import collections
import os
import datetime

quick = False
max_files_for_quick_processing = 5
descriptions  = collections.Counter()
ndescriptions  = collections.Counter()

def process_mpd(path):
    count = 0
    filenames = os.listdir(path)
    for filename in sorted(filenames):
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            f = open(fullpath)
            js = f.read()
            f.close()
            mpd_slice = json.loads(js)
            process_info(mpd_slice['info'])
            for playlist in mpd_slice['playlists']:
                process_playlist(playlist)
            count += 1

            if quick and count > max_files_for_quick_processing:
                break

    show_summary()

def show_summary():
    print ("descriptions")
    for k,v in descriptions.most_common():
        print (v, k)

    print ()
    print ("normalized descriptions")
    for k,v in ndescriptions.most_common():
        print (v, k)

def normalize_name(name):
    name = name.lower()
    name = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@]", ' ', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name


def process_playlist(playlist):
    if 'description' in playlist:
        desc = playlist['description']
        ndesc = normalize_name(desc)
        descriptions[desc] += 1
        ndescriptions[ndesc] += 1

def process_info(_):
    pass

if __name__ == '__main__':
    path = sys.argv[1]
    if len(sys.argv) > 2 and sys.argv[2] == '--quick':
        quick = True
    #process_mpd(path)
    process_mpd(os.getcwd())
    


# Print
Prints everything


In [None]:
# %load "Arxiu comprimit/src/print.py"
"""
    pretty prints the MPD

    usage:
        python print.py path-mpd/
"""
import sys
import json
import time
import os


def process_playlists(path):
    filenames = os.listdir(path)
    for filename in sorted(filenames):
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            f = open(fullpath)
            js = f.read()
            f.close()
            mpd_slice = json.loads(js)
            for playlist in mpd_slice['playlists']:
                print_playlist(playlist)


def print_playlist(playlist):
    print ("=====", playlist['pid'], '====')
    print ("name:          ", playlist['name'])
    ts = time.strftime('%Y-%m-%d %H:%M:%S',
                       time.localtime(playlist['modified_at'] / 1000))

    print ("last_modified: ", ts)
    print ("num edits: ", playlist['num_edits'])
    print ("num followers: ", playlist['num_followers'])
    print ("num artists: ", playlist['num_artists'])
    print ("num albums: ", playlist['num_albums'])
    print ("num tracks: ", playlist['num_tracks'])
    print()
    for i, track in enumerate(playlist['tracks']):
        print ("   %3d %s - %s - %s" % (i + 1, track['track_name'],
                                       track['album_name'],
                                       track['artist_name']))
    print()


if __name__ == '__main__':
    path = sys.argv[1]
    #process_playlists(path)
    process_playlists(os.getcwd())


# Show
Pretty printer

In [None]:
# %load "Arxiu comprimit/src/show.py"

"""
    pretty printer for the MPD

    Usage:
        python show.py - show all the playlists in the MPD
        python show.py 1008 1120 4356 - show the playlists with the given pids
        python show.py 1000-1020 1989 99870-99999 - show the playlists in the given range

"""


import sys
import json
import codecs
import datetime


pretty = True
compact = False
cache = {}

def print_playlist(playlist):
    if pretty:
        print ('===', playlist['pid'], '===')
        print (playlist['name'])
        print ("  followers", playlist['num_followers'])
        print ("  modified", datetime.datetime.fromtimestamp(playlist['modified_at']).strftime("%Y-%m-%d"))
        print ("  edits", playlist['num_edits'])
        print()
        if not compact:
            for track in playlist['tracks']:
                print ("%3d %s - %s" %(track['pos'], track['track_name'], track['album_name']))
            print()
    else:
        print (json.dumps(playlist, indent=4))


def show_playlist(pid):
    if pid >=0 and pid < 1000000:
        low = 1000 * int(pid / 1000)
        high = low + 999
        offset = pid - low
        path = "mpd.slice." + str(low) + '-' + str(high) + ".json"
        if not path in cache:
            f = codecs.open(path, 'r', 'utf-8')
            js = f.read()
            f.close()
            playlist = json.loads(js)
            cache[path] = playlist

        playlist = cache[path]['playlists'][offset]
        print_playlist(playlist)


def show_playlists_in_range(start, end):
    try:
        istart = int(start)
        iend = int(end)
        if istart <= iend and istart >= 0 and iend <= 1000000:
            for pid in xrange(istart, iend):
                show_playlist(pid)
    except:
        raise
        print ()# %load "Arxiu comprimit/src/descriptions.py""bad pid")

        '''
if __name__ == '__main__':
    for arg in sys.argv[1:]:
        if arg == '--pretty':
            pretty = True
        elif arg == '--compact':
            compact = True
        elif arg == '--raw':
            pretty = False
        elif '-' in arg:
            fields = arg.split('-')
            if len(fields) == 2:
                start = fields[0]
                end = fields[1]
                show_playlists_in_range(start, end)
        else:
            pid = int(arg)
            show_playlist(pid)
'''
if __name__ == '__main__':
    show_playlist(0)

# Stats
* number of playlists 1000
* number of tracks 67503
* number of unique tracks 34443
* number of unique albums 19261
* number of unique artists 9754
* number of unique titles 869
* number of playlists with descriptions 20
* number of unique normalized titles 754
* avg playlist length 67.503
<br><br>
* top playlist titles
* top tracks
* top artists
* numedits histogram
* last modified histogram
* playlist length histogram
* num followers histogram

In [3]:
# %load "Arxiu comprimit/src/stats.py"
"""
    iterates over the million playlist dataset and outputs info
    about what is in there.

    Usage:

        python stats.py path-to-mpd-data
"""
import sys
import json
import re
import collections
import os
import datetime

total_playlists = 0
total_tracks = 0
tracks = set()
artists = set()
albums = set()
titles = set()
total_descriptions = 0
ntitles = set()
title_histogram = collections.Counter()
artist_histogram = collections.Counter()
track_histogram = collections.Counter()
last_modified_histogram = collections.Counter()
num_edits_histogram = collections.Counter()
playlist_length_histogram = collections.Counter()
num_followers_histogram = collections.Counter()

quick = True
max_files_for_quick_processing = 1

def process_mpd(path):
    count = 0
    filenames = os.listdir(path)
    for filename in sorted(filenames):
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            f = open(fullpath)
            js = f.read()
            f.close()
            mpd_slice = json.loads(js)
            process_info(mpd_slice['info'])
            for playlist in mpd_slice['playlists']:
                process_playlist(playlist)
            count += 1

            if quick and count > max_files_for_quick_processing:
                break

    show_summary()


def show_summary():
    print()
    print ("number of playlists", total_playlists)
    print ("number of tracks", total_tracks)
    print ("number of unique tracks", len(tracks))
    print ("number of unique albums", len(albums))
    print ("number of unique artists", len(artists))
    print ("number of unique titles", len(titles))
    print ("number of playlists with descriptions", total_descriptions)
    print ("number of unique normalized titles", len(ntitles))
    print ("avg playlist length", float(total_tracks) / total_playlists)
    print()
    print ("top playlist titles")
    for title, count in title_histogram.most_common(20):
        print ("%7d %s" % (count, title))

    print()
    print ("top tracks")
    for track, count in track_histogram.most_common(20):
        print ("%7d %s" % (count, track))

    print()
    print ("top artists")
    for artist, count in artist_histogram.most_common(20):
        print ("%7d %s" % (count, artist))

    print()
    print ("numedits histogram")
    for num_edits, count in num_edits_histogram.most_common(20):
        print ("%7d %d" % (count, num_edits))

    print()
    print ("last modified histogram")
    for ts, count in last_modified_histogram.most_common(20):
        print ("%7d %s" % (count, to_date(ts)))

    print()
    print ("playlist length histogram")
    for length, count in playlist_length_histogram.most_common(20):
        print ("%7d %d" % (count, length))

    print()
    print ("num followers histogram")
    for followers, count in num_followers_histogram.most_common(20):
        print ("%7d %d" % (count, followers))


def normalize_name(name):
    name = name.lower()
    name = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@]", ' ', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name


def to_date(epoch):
    return datetime.datetime.fromtimestamp(epoch).strftime("%Y-%m-%d")


def process_playlist(playlist):
    global total_playlists, total_tracks, total_descriptions

    total_playlists += 1
    # print playlist['playlist_id'], playlist['name']

    if 'description' in playlist:
        total_descriptions += 1

    titles.add(playlist['name'])
    nname = normalize_name(playlist['name'])
    ntitles.add(nname)
    title_histogram[nname] += 1

    playlist_length_histogram[playlist['num_tracks']] += 1
    last_modified_histogram[playlist['modified_at']] += 1
    num_edits_histogram[playlist['num_edits']] += 1
    num_followers_histogram[playlist['num_followers']] += 1

    for track in playlist['tracks']:
        total_tracks += 1
        albums.add(track['album_uri'])
        tracks.add(track['track_uri'])
        artists.add(track['artist_uri'])

        full_name = track['track_name'] + " by " + track['artist_name']
        artist_histogram[track['artist_name']] += 1
        track_histogram[full_name] += 1


def process_info(_):
    pass


if __name__ == '__main__':
    path = sys.argv[1]
    if len(sys.argv) > 2 and sys.argv[2] == '--quick':
        quick = True
    #process_mpd(path)
    process_mpd(os.getcwd()+"\MPD")



number of playlists 2000
number of tracks 134125
number of unique tracks 57884
number of unique albums 30754
number of unique artists 14973
number of unique titles 1573
number of playlists with descriptions 40
number of unique normalized titles 1309
avg playlist length 67.0625

top playlist titles
     48 country
     31 chill
     18 workout
     18 rock
     16 christmas
     16 rap
     15 oldies
     13 worship
     13 party
     12 disney
     11 new
     10 feels
      9 summer
      8 lit
      8 dance
      7 relax
      7 sleep
      7 throwback
      7 pregame
      7 random

top tracks
     98 One Dance by Drake
     96 HUMBLE. by Kendrick Lamar
     87 Broccoli (feat. Lil Yachty) by DRAM
     86 Congratulations by Post Malone
     84 Closer by The Chainsmokers
     84 Bad and Boujee (feat. Lil Uzi Vert) by Migos
     79 Caroline by Aminé
     75 Roses by The Chainsmokers
     73 XO TOUR Llif3 by Lil Uzi Vert
     72 Mask Off by Future
     72 Panda by Desiigner
     70 goo