**Collects stats about the 1,000,000 playlists**  
Produces lists of the unique tracks, unique artists, and unique albums, along with frequency histograms of tracks, artists, and playlist names.

In [20]:
import sys
import json
import re
import collections
import os
import datetime
import pandas as pd

In [2]:
# Running totals, incremented by process_playlist.
total_playlists = total_tracks = total_descriptions = 0

# Distinct URIs seen across all playlist track entries.
tracks, artists, albums = set(), set(), set()

# Distinct playlist names: as given, and after normalization.
titles, ntitles = set(), set()

# Frequency tables keyed by normalized playlist name, artist name,
# and "<track name> by <artist name>" label respectively.
title_histogram = collections.Counter()
artist_histogram = collections.Counter()
track_histogram = collections.Counter()

# Frequency tables keyed by per-playlist metadata values.
playlist_length_histogram = collections.Counter()
last_modified_histogram = collections.Counter()
num_edits_histogram = collections.Counter()
num_followers_histogram = collections.Counter()


In [3]:
def normalize_name(name):
    """Return a canonical form of a playlist name, used for grouping.

    The name is lower-cased, every character from a fixed punctuation set
    is replaced by a space, runs of whitespace are collapsed into a single
    space, and leading/trailing whitespace is removed.
    """
    depunctuated = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@]", " ", name.lower())
    return re.sub(r"\s+", " ", depunctuated).strip()

In [4]:
def process_playlist(playlist):
    """Fold one playlist record into the module-level totals, sets and histograms.

    Reads the "name", "num_tracks", "modified_at", "num_edits",
    "num_followers" and "tracks" keys of *playlist* (and checks for an
    optional "description" key). Each entry of "tracks" must provide
    "album_uri", "track_uri", "artist_uri", "track_name" and "artist_name".
    Returns nothing; all results are accumulated in module-level state.
    """
    global total_playlists, total_tracks, total_descriptions

    total_playlists += 1
    if "description" in playlist:
        total_descriptions += 1

    # Record the playlist name both verbatim and in normalized form.
    raw_title = playlist["name"]
    titles.add(raw_title)
    normalized_title = normalize_name(raw_title)
    ntitles.add(normalized_title)
    title_histogram[normalized_title] += 1

    # Each of these histograms counts playlists by one metadata field.
    per_playlist_histograms = (
        (playlist_length_histogram, "num_tracks"),
        (last_modified_histogram, "modified_at"),
        (num_edits_histogram, "num_edits"),
        (num_followers_histogram, "num_followers"),
    )
    for histogram, field in per_playlist_histograms:
        histogram[playlist[field]] += 1

    for entry in playlist["tracks"]:
        total_tracks += 1
        albums.add(entry["album_uri"])
        tracks.add(entry["track_uri"])
        artists.add(entry["artist_uri"])

        # Tracks are counted under a "<track name> by <artist name>" label,
        # artists under their display name (not their URI).
        track_title = entry["track_name"]
        performer = entry["artist_name"]
        label = track_title + " by " + performer
        artist_histogram[performer] += 1
        track_histogram[label] += 1

In [21]:
def process_mpd(path):
    """Process every slice file found directly inside *path*.

    Only files named ``mpd.slice.*.json`` are considered, and they are
    visited in sorted filename order. Each entry of a slice's "playlists"
    list is handed to ``process_playlist``.

    Args:
        path: Directory containing the slice files.

    Returns:
        The number of slice files that were processed.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
        json.JSONDecodeError: If a slice file is not valid JSON.
        KeyError: If a slice has no "playlists" key.
    """
    count = 0
    for filename in sorted(os.listdir(path)):
        if not (filename.startswith("mpd.slice.") and filename.endswith(".json")):
            continue
        fullpath = os.path.join(path, filename)
        # Decode explicitly as UTF-8 rather than the platform default so
        # non-ASCII names load identically everywhere; `with` guarantees the
        # handle is closed even if reading or parsing fails.
        with open(fullpath, encoding="utf-8") as f:
            mpd_slice = json.load(f)
        for playlist in mpd_slice["playlists"]:
            process_playlist(playlist)
        count += 1
    return count

In [22]:
# Directory (relative path) that holds the mpd.slice.*.json files.
path_to_slices='../data/'
# Populates the module-level sets and histograms as a side effect.
process_mpd(path_to_slices)

# Ran in 11m 2.2s

In [39]:
# Build a single-column table of the unique track URIs.
# Set iteration order for strings changes between interpreter runs (hash
# randomization), so sort first to make the exported file reproducible.
tracks_df = pd.DataFrame(sorted(tracks), columns=['track_URI'])

# Write one URI per line: no index column and no header row.
output_file = 'tracks_URI.csv'
tracks_df.to_csv(output_file, index=False, header=False)

In [40]:
# Build a single-column table of the unique album URIs.
# Set iteration order for strings changes between interpreter runs (hash
# randomization), so sort first to make the exported file reproducible.
albums_df = pd.DataFrame(sorted(albums), columns=['album_URI'])

# Write one URI per line: no index column and no header row.
output_file = 'albums_URI.csv'
albums_df.to_csv(output_file, index=False, header=False)

In [41]:
# Build a single-column table of the unique artist URIs.
# Set iteration order for strings changes between interpreter runs (hash
# randomization), so sort first to make the exported file reproducible.
artists_df = pd.DataFrame(sorted(artists), columns=['artist_URI'])

# Write one URI per line: no index column and no header row.
output_file = 'artists_URI.csv'
artists_df.to_csv(output_file, index=False, header=False)

In [23]:
# Number of distinct track URIs collected.
len(tracks)

2262292

In [24]:
# Number of distinct artist URIs collected.
len(artists)

295860

In [25]:
# Number of distinct album URIs collected.
len(albums)

734684

In [29]:
# Display the Counter mapping "<track name> by <artist name>" to its occurrence count.
track_histogram

Counter({'HUMBLE. by Kendrick Lamar': 46626,
         'One Dance by Drake': 43502,
         'Broccoli (feat. Lil Yachty) by DRAM': 41359,
         'Closer by The Chainsmokers': 41125,
         'Congratulations by Post Malone': 40031,
         'Caroline by Aminé': 35237,
         'iSpy (feat. Lil Yachty) by KYLE': 35177,
         'Bad and Boujee (feat. Lil Uzi Vert) by Migos': 35037,
         'Location by Khalid': 35023,
         'XO TOUR Llif3 by Lil Uzi Vert': 34959,
         'Bounce Back by Big Sean': 33738,
         'Ignition - Remix by R. Kelly': 32416,
         'No Role Modelz by J. Cole': 32370,
         'Mask Off by Future': 32097,
         'No Problem (feat. Lil Wayne & 2 Chainz) by Chance The Rapper': 31523,
         "I'm the One by DJ Khaled": 31400,
         'Jumpman by Drake': 31158,
         'goosebumps by Travis Scott': 31141,
         'Fake Love by Drake': 30707,
         'Despacito - Remix by Luis Fonsi': 30514,
         'Panda by Desiigner': 30450,
         'Roses by T

In [33]:
# Destination for the track ranking.
output_file = 'tracks_histogram.csv'

# One row per "<track name> by <artist name>" label, most frequent first.
track_histogram_df = pd.DataFrame.from_records(
    track_histogram.most_common(),
    columns=['Track', 'count'],
)

# Write the ranking with a header row and without the index column.
track_histogram_df.to_csv(output_file, index=False)

# Preview the first rows in the notebook.
track_histogram_df.head()

Unnamed: 0,Track,count
0,HUMBLE. by Kendrick Lamar,46626
1,One Dance by Drake,43502
2,Broccoli (feat. Lil Yachty) by DRAM,41359
3,Closer by The Chainsmokers,41125
4,Congratulations by Post Malone,40031


In [34]:
# Destination for the artist ranking.
output_file = 'artist_histogram.csv'

# One row per artist name, most frequent first.
artist_histogram_df = pd.DataFrame.from_records(
    artist_histogram.most_common(),
    columns=['Artist', 'count'],
)

# Write the ranking with a header row and without the index column.
artist_histogram_df.to_csv(output_file, index=False)

# Preview the first rows in the notebook.
artist_histogram_df.head()

Unnamed: 0,Artist,count
0,Drake,848099
1,Kanye West,413712
2,Kendrick Lamar,354009
3,Rihanna,339920
4,The Weeknd,316899


In [36]:
# Destination for the playlist-name ranking.
output_file = 'playlist_name_histogram.csv'

# One row per normalized playlist name, most frequent first.
title_histogram_df = pd.DataFrame.from_records(
    title_histogram.most_common(),
    columns=['Playlist_Name', 'count'],
)

# Write the ranking with a header row and without the index column.
title_histogram_df.to_csv(output_file, index=False)

# Preview the first rows in the notebook.
title_histogram_df.head()

Unnamed: 0,Playlist_Name,count
0,country,10025
1,chill,10013
2,rap,8501
3,workout,8489
4,oldies,8154


In [37]:
# Number of rows in the playlist-name histogram, i.e. distinct normalized names.
len(title_histogram_df)

17381