In [None]:
import json
import pandas as pd
import os
from glob import glob

# Folder containing all mpd.slice.*.json files
input_dir = r"C:\Users\jverc\OneDrive\02.DataScienceOD\data\mpd\data"

# Folder to save Parquet files
output_dir = r"C:\Users\jverc\OneDrive\02.DataScienceOD\exports"
os.makedirs(output_dir, exist_ok=True)


def parse_collaborative(value):
    """Normalise one playlist's ``collaborative`` flag to a real boolean.

    Parameters
    ----------
    value : object
        The raw value read from the playlist. The strings "true"/"false"
        (any letter case, surrounding whitespace ignored) and real booleans
        are accepted.

    Returns
    -------
    bool or None
        ``True``/``False`` for a recognised value, ``None`` otherwise so the
        caller can detect and report it instead of it vanishing silently.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        return {"true": True, "false": False}.get(value.strip().lower())
    return None


def flatten_slice(data, file_name):
    """Flatten one loaded slice into one record per (playlist, track) pair.

    Parameters
    ----------
    data : dict
        Parsed JSON of one slice file. Must contain a "playlists" list; the
        "info" section and its fields are optional and default to "".
    file_name : str
        Name of the source file, stored on every record for provenance.

    Returns
    -------
    list of dict
        One dict per track, with the file/info columns first, then the
        playlist columns, then the track columns. Empty when the slice has
        no tracks.

    Raises
    ------
    KeyError
        If "playlists", or a required playlist/track field, is missing.
    """
    # Tolerate a missing "info" section: its fields are optional anyway.
    info = data.get("info", {})
    slice_fields = {
        "file_name": file_name,
        "info_generated_on": info.get("generated_on", ""),
        "info_slice": info.get("slice", ""),
        "info_version": info.get("version", ""),
    }

    rows = []
    for playlist in data["playlists"]:
        # Built once per playlist, then shared by all of its tracks.
        playlist_fields = {
            "playlist_id": playlist["pid"],
            "playlist_name": playlist["name"],
            "collaborative": parse_collaborative(playlist["collaborative"]),
            "modified_at": playlist["modified_at"],
            "num_tracks_in_playlist": playlist["num_tracks"],
            "num_albums_in_playlist": playlist["num_albums"],
            "num_followers": playlist["num_followers"],
        }
        for track in playlist["tracks"]:
            # Dict unpacking keeps insertion order, so the column order is
            # file/info -> playlist -> track.
            rows.append({
                **slice_fields,
                **playlist_fields,
                "track_position": track["pos"],
                "track_name": track["track_name"],
                "track_uri": track["track_uri"],
                "artist_name": track["artist_name"],
                "artist_uri": track["artist_uri"],
                "album_name": track["album_name"],
                "album_uri": track["album_uri"],
                "track_duration_ms": track["duration_ms"],
            })
    return rows


# Find all JSON files
json_files = sorted(glob(os.path.join(input_dir, "mpd.slice.*.json")))

print(f"Found {len(json_files)} JSON files to process.")

# Loop through each file
for idx, json_path in enumerate(json_files, 1):
    file_name = os.path.basename(json_path)
    # Swap only the trailing extension. str.replace would rewrite every
    # ".json" occurrence and leave a differently-cased extension untouched.
    parquet_file_name = os.path.splitext(file_name)[0] + ".parquet"
    parquet_output_path = os.path.join(output_dir, parquet_file_name)

    # Load JSON
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    records = flatten_slice(data, file_name)

    if not records:
        print(f"[{idx}/{len(json_files)}] ⚠️ No tracks in {file_name}")
        continue

    df = pd.DataFrame(records)

    # Surface flags that could not be interpreted; they are stored as nulls.
    unknown_flags = int(df["collaborative"].isna().sum())
    if unknown_flags:
        print(
            f"[{idx}/{len(json_files)}] ⚠️ {unknown_flags} rows in {file_name} "
            "have an unrecognised 'collaborative' value (stored as null)"
        )

    # Save Parquet
    df.to_parquet(parquet_output_path, index=False)

    print(f"[{idx}/{len(json_files)}] ✅ Saved {len(df)} rows to {parquet_output_path}")

print("🎉 All files processed!")

✅ Export complete! Parquet saved to:
C:\Users\jverc\OneDrive\02.DataScienceOD\exports\mpd_slice_0_999_tracks.parquet


In [6]:
import pandas as pd

# Path to the exported Parquet file (read with pd.read_parquet below, not a CSV).
# NOTE(review): the export cell above names its outputs after each input slice
# (mpd.slice.*.parquet), so this file name looks like it came from an earlier
# version of that cell — confirm the file exists before a fresh Run All.
file_path = r"C:\Users\jverc\OneDrive\02.DataScienceOD\exports\mpd_slice_0_999_tracks.parquet"

# Load into DataFrame
df = pd.read_parquet(file_path)

# Preview the first rows as the cell's rich output
df.head()

Unnamed: 0,file_name,info_generated_on,info_slice,info_version,playlist_id,playlist_name,collaborative,modified_at,num_tracks_in_playlist,num_albums_in_playlist,num_followers,track_position,track_name,track_uri,artist_name,artist_uri,album_name,album_uri,track_duration_ms
0,mpd.slice.0-999.json,2017-12-03 08:41:42.057563,0-999,v1,0,Throwbacks,False,1493424000,52,47,1,0,Lose Control (feat. Ciara & Fat Man Scoop),spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,spotify:artist:2wIVse2owClT7go1WT98tk,The Cookbook,spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863
1,mpd.slice.0-999.json,2017-12-03 08:41:42.057563,0-999,v1,0,Throwbacks,False,1493424000,52,47,1,1,Toxic,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,spotify:artist:26dSoYclwsYLMAKD3tpOr4,In The Zone,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,198800
2,mpd.slice.0-999.json,2017-12-03 08:41:42.057563,0-999,v1,0,Throwbacks,False,1493424000,52,47,1,2,Crazy In Love,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Dangerously In Love (Alben für die Ewigkeit),spotify:album:25hVFAxTlDvXbx2X2QkUkE,235933
3,mpd.slice.0-999.json,2017-12-03 08:41:42.057563,0-999,v1,0,Throwbacks,False,1493424000,52,47,1,3,Rock Your Body,spotify:track:1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,spotify:artist:31TPClRtHm23RisEBtV3X7,Justified,spotify:album:6QPkyl04rXwTGlGlcYaRoW,267266
4,mpd.slice.0-999.json,2017-12-03 08:41:42.057563,0-999,v1,0,Throwbacks,False,1493424000,52,47,1,4,It Wasn't Me,spotify:track:1lzr43nnXAijIGYnCT8M8H,Shaggy,spotify:artist:5EvFsr3kj42KNv97ZEnqij,Hot Shot,spotify:album:6NmFmPX56pcLBOFMhIiKvF,227600


In [7]:
# Summarise the loaded frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67503 entries, 0 to 67502
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   file_name               67503 non-null  object
 1   info_generated_on       67503 non-null  object
 2   info_slice              67503 non-null  object
 3   info_version            67503 non-null  object
 4   playlist_id             67503 non-null  int64 
 5   playlist_name           67503 non-null  object
 6   collaborative           67503 non-null  object
 7   modified_at             67503 non-null  int64 
 8   num_tracks_in_playlist  67503 non-null  int64 
 9   num_albums_in_playlist  67503 non-null  int64 
 10  num_followers           67503 non-null  int64 
 11  track_position          67503 non-null  int64 
 12  track_name              67503 non-null  object
 13  track_uri               67503 non-null  object
 14  artist_name             67503 non-null  object
 15  ar