In [5]:
from pathlib import Path
import zipfile

# Location of the downloaded dataset archive (machine-specific path).
ZIP_PATH = Path(r"C:\Users\Kiera\Downloads\archive (3).zip")

# Read only the archive's table of contents; nothing is extracted here.
with zipfile.ZipFile(ZIP_PATH) as archive:
    all_files = archive.namelist()

# Report how many entries the archive holds, then preview the first 50 names
# in the order the archive lists them.
print("Total entries in ZIP:", len(all_files))
print("First 50 entries:")
preview = all_files[:50]
for entry in preview:
    print(f"  {entry}")


Total entries in ZIP: 1010
First 50 entries:
  README.md
  data/mpd.slice.0-999.json
  data/mpd.slice.1000-1999.json
  data/mpd.slice.10000-10999.json
  data/mpd.slice.100000-100999.json
  data/mpd.slice.101000-101999.json
  data/mpd.slice.102000-102999.json
  data/mpd.slice.103000-103999.json
  data/mpd.slice.104000-104999.json
  data/mpd.slice.105000-105999.json
  data/mpd.slice.106000-106999.json
  data/mpd.slice.107000-107999.json
  data/mpd.slice.108000-108999.json
  data/mpd.slice.109000-109999.json
  data/mpd.slice.11000-11999.json
  data/mpd.slice.110000-110999.json
  data/mpd.slice.111000-111999.json
  data/mpd.slice.112000-112999.json
  data/mpd.slice.113000-113999.json
  data/mpd.slice.114000-114999.json
  data/mpd.slice.115000-115999.json
  data/mpd.slice.116000-116999.json
  data/mpd.slice.117000-117999.json
  data/mpd.slice.118000-118999.json
  data/mpd.slice.119000-119999.json
  data/mpd.slice.12000-12999.json
  data/mpd.slice.120000-120999.json
  data/mpd.slice.121000-1

In [6]:
from pathlib import Path
import zipfile
import shutil

# --- Configuration -----------------------------------------------------------
# The project root is taken to be the parent of the current working directory.
PROJECT_ROOT = Path("..").resolve()
DATA_RAW = PROJECT_ROOT / "data" / "raw"
MPD_DIR = DATA_RAW / "mpd"
# NOTE: machine-specific absolute path; adjust when running on another machine.
ZIP_PATH = Path(r"C:\Users\Kiera\Downloads\archive (3).zip")
# How many slice files to pull out of the archive.
N_SLICES = 100

print("Project root:", PROJECT_ROOT)
print("MPD target dir:", MPD_DIR)
print("Zip file:", ZIP_PATH)

# Verify the archive exists BEFORE wiping the target directory; otherwise a
# missing or mistyped ZIP path would delete previously extracted data and
# then fail, leaving nothing behind.
if not ZIP_PATH.is_file():
    raise FileNotFoundError(f"ZIP archive not found: {ZIP_PATH}")

# Start from an empty MPD directory so stale files from earlier runs cannot
# be mixed in with this extraction.
if MPD_DIR.exists():
    print("Deleting existing MPD directory...")
    shutil.rmtree(MPD_DIR)
MPD_DIR.mkdir(parents=True, exist_ok=True)
print("Created clean MPD directory:", MPD_DIR)

with zipfile.ZipFile(ZIP_PATH, 'r') as z:
    all_files = sorted(z.namelist())
    print(f"Total entries in ZIP: {len(all_files)}")

    # Match any path that *contains* "mpd.slice." and ends with .json
    slice_files = [f for f in all_files if "mpd.slice." in f and f.endswith(".json")]
    print(f"Total slice-like files in ZIP: {len(slice_files)}")

    # Take the first N_SLICES by path sort order.
    # NOTE: the sort is lexicographic on the entry name, so e.g.
    # "mpd.slice.10000-10999.json" sorts before "mpd.slice.2000-2999.json";
    # the selection is therefore NOT the numerically first N_SLICES slices.
    files_to_extract = slice_files[:N_SLICES]
    print(f"Extracting {len(files_to_extract)} slices...\n")

    for filename in files_to_extract:
        print("Extracting:", filename)
        # Extract into MPD_DIR, stripping any folder structure.
        # ZipFile.extract() would recreate the member's full path (e.g.
        # MPD_DIR/data/...), so copy the member's bytes to a flat target
        # named after the entry's basename instead.
        # Entries sharing a basename would overwrite each other here.
        target_path = MPD_DIR / Path(filename).name
        with z.open(filename) as src, open(target_path, "wb") as dst:
            shutil.copyfileobj(src, dst)

print("\nDone extracting candidate slices.")

# Sanity check: count what actually landed on disk (the recursive glob also
# finds files in subfolders, should any exist).
extracted_files = sorted(MPD_DIR.rglob("mpd.slice.*.json"))
print(f"\nFound {len(extracted_files)} mpd.slice JSON files under {MPD_DIR}")
for f in extracted_files[:10]:
    print(" ", f.relative_to(MPD_DIR))


Project root: C:\Users\Kiera\Music_Recommender
MPD target dir: C:\Users\Kiera\Music_Recommender\data\raw\mpd
Zip file: C:\Users\Kiera\Downloads\archive (3).zip
Deleting existing MPD directory...
Created clean MPD directory: C:\Users\Kiera\Music_Recommender\data\raw\mpd
Total entries in ZIP: 1010
Total slice-like files in ZIP: 1000
Extracting 100 slices...

Extracting: data/mpd.slice.0-999.json
Extracting: data/mpd.slice.1000-1999.json
Extracting: data/mpd.slice.10000-10999.json
Extracting: data/mpd.slice.100000-100999.json
Extracting: data/mpd.slice.101000-101999.json
Extracting: data/mpd.slice.102000-102999.json
Extracting: data/mpd.slice.103000-103999.json
Extracting: data/mpd.slice.104000-104999.json
Extracting: data/mpd.slice.105000-105999.json
Extracting: data/mpd.slice.106000-106999.json
Extracting: data/mpd.slice.107000-107999.json
Extracting: data/mpd.slice.108000-108999.json
Extracting: data/mpd.slice.109000-109999.json
Extracting: data/mpd.slice.11000-11999.json
Extracting: d