# Preprocess PercePiano Data

Run the ORIGINAL PercePiano preprocessing to create train/valid/test splits
with the exact 78-dimension features used by the PercePiano authors.

The resulting data is compatible with the original PercePiano training script.

## Step 1: Setup Environment

In [1]:
# Run this notebook with the project venv:
# cd /Users/jdhiman/Documents/crescendai/model
# source .venv/bin/activate
# jupyter notebook notebooks/preprocess_percepiano_data.ipynb

import sys
import os
from pathlib import Path

# Ensure we're using the right Python
print(f"Python: {sys.executable}")
print(f"Version: {sys.version}")

# Check if we're in the venv
if '.venv' not in sys.executable:
    print("\nWARNING: Not running in project venv!")
    print("Please activate: source .venv/bin/activate")

# Paths
# Every location below hangs off a single project root. It defaults to the
# original checkout location; set CRESCENDAI_MODEL_ROOT to run the notebook
# from a different machine or directory without editing this cell.
PROJECT_ROOT = Path(
    os.environ.get('CRESCENDAI_MODEL_ROOT', '/Users/jdhiman/Documents/crescendai/model')
).expanduser()

PERCEPIANO_ROOT = PROJECT_ROOT / 'data' / 'raw' / 'PercePiano'
VIRTUOSO_PATH = PERCEPIANO_ROOT / 'virtuoso' / 'virtuoso'
PYSCOREPARSER_PATH = VIRTUOSO_PATH / 'pyScoreParser'
DATA_PATH = PERCEPIANO_ROOT / 'virtuoso' / 'data'
LABELS_PATH = PERCEPIANO_ROOT / 'labels'

# Label files (they exist in both locations)
LABEL_FILE = PERCEPIANO_ROOT / 'label_2round_mean_reg_19_with0_rm_highstd0.json'
LABEL_STD_FILE = PERCEPIANO_ROOT / 'label_2round_std_reg_19_with0_rm_highstd0.json'

# Output path (created up front so later cells can write into it)
OUTPUT_PATH = PROJECT_ROOT / 'data' / 'preprocessed' / 'percepiano_original'
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

print(f"\nPercePiano root: {PERCEPIANO_ROOT}")
print(f"Data path: {DATA_PATH}")
print(f"Output path: {OUTPUT_PATH}")
print(f"Label file exists: {LABEL_FILE.exists()}")
print(f"Label std file exists: {LABEL_STD_FILE.exists()}")

Python: /Users/jdhiman/Documents/crescendai/model/.venv/bin/python
Version: 3.11.14 (main, Oct 14 2025, 21:33:50) [Clang 20.1.4 ]

PercePiano root: /Users/jdhiman/Documents/crescendai/model/data/raw/PercePiano
Data path: /Users/jdhiman/Documents/crescendai/model/data/raw/PercePiano/virtuoso/data
Output path: /Users/jdhiman/Documents/crescendai/model/data/preprocessed/percepiano_original
Label file exists: True
Label std file exists: True


In [3]:
# Install dependencies
# Shell escape that runs `uv pip install`; the environment it targets is
# reported in the command's own output.
# NOTE(review): no versions are pinned, so a later re-run may resolve
# different releases than the ones this notebook was executed with.
!uv pip install pandas pretty_midi mido music21 scipy tqdm

[2mUsing Python 3.11.14 environment at: /Users/jdhiman/Documents/crescendai/model/.venv[0m
[2K[2mResolved [1m30 packages[0m [2min 400ms[0m[0m                                        [0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/4)                                                   
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/4)--------------[0m[0m     0 B/68.03 KiB           [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/4)--------------[0m[0m     0 B/68.03 KiB           [1A
[2mmore-itertools      [0m [32m[2m------------------------------[0m[0m     0 B/68.03 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/4)--------------[0m[0m     0 B/194.71 KiB          [2A
[2mmore-itertools      [0m [32m--------[2m----------------------[0m[0m 16.00 KiB/68.03 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/4)--------------[0m[0m     0 B/194.71 KiB          [2A
[2mmore-itertools      [0m [32m--------[2m----------------------[0

In [4]:
# Verify data exists: count each kind of input file under DATA_PATH
performance_dir = DATA_PATH / 'all_2rounds'

# Alignment files and performance MIDIs live side by side in all_2rounds
dat_files = [path for path in performance_dir.glob('*.dat')]
mid_files = [path for path in performance_dir.glob('*.mid')]
# Scores are stored separately, as MusicXML and as MIDI
xml_files = [path for path in (DATA_PATH / 'score_xml').glob('*.musicxml')]
score_midis = [path for path in (DATA_PATH / 'score_midi').glob('*.mid')]

file_counts = (
    ("Alignment files (.dat)", dat_files),
    ("Performance MIDIs", mid_files),
    ("Score XMLs", xml_files),
    ("Score MIDIs", score_midis),
)
for description, found in file_counts:
    print(f"{description}: {len(found)}")

Alignment files (.dat): 1201
Performance MIDIs: 1202
Score XMLs: 261
Score MIDIs: 261


In [5]:
# Check label file
import json

# Find label file
label_files = list(LABELS_PATH.glob('*.json')) if LABELS_PATH.exists() else []
print(f"Label files in {LABELS_PATH}: {[f.name for f in label_files]}")

# The main label file: prefer the copy in the PercePiano root and fall back to
# the labels/ directory. When neither exists LABEL_FILE is left pointing at the
# fallback location, as before.
label_candidates = [
    PERCEPIANO_ROOT / 'label_2round_mean_reg_19_with0_rm_highstd0.json',
    LABELS_PATH / 'label_2round_mean_reg_19_with0_rm_highstd0.json',
]
LABEL_FILE = next(
    (candidate for candidate in label_candidates if candidate.exists()),
    label_candidates[-1],
)

if LABEL_FILE.exists():
    with open(LABEL_FILE) as f:
        labels = json.load(f)
    print(f"\nLoaded {len(labels)} labels from {LABEL_FILE.name}")
    print(f"Sample keys: {list(labels.keys())[:3]}")
else:
    print("Label file not found!")
    # Report every location that was tried, not just the last one
    for candidate in label_candidates:
        print(f"Searched: {candidate}")

Label files in /Users/jdhiman/Documents/crescendai/model/data/raw/PercePiano/labels: ['label_2round_std_reg_19_with0_rm_highstd0.json', 'label_2round_mean_reg_19_with0_rm_highstd0.json']

Loaded 1202 labels from label_2round_mean_reg_19_with0_rm_highstd0.json
Sample keys: ['Schubert_D935_no.3_4bars_1_25', 'Beethoven_WoO80_var27_8bars_1_15', 'Beethoven_WoO80_var15_8bars_Score_10']


## Step 2: Patch Preprocessing Script

In [6]:
# Load the upstream preprocessing script so it can be inspected and patched
preprocess_script = PYSCOREPARSER_PATH / 'm2pf_dataset_compositionfold.py'
original_code = preprocess_script.read_text()

# Show every source line that still references the '/root/v2/muzic' prefix
print("Hardcoded paths in original script:")
hardcoded_lines = [
    source_line.strip()
    for source_line in original_code.split('\n')
    if '/root/v2/muzic' in source_line
]
for stripped_line in hardcoded_lines:
    print(f"  {stripped_line}")

Hardcoded paths in original script:
  perform_lists = [p for p in perform_lists if ".".join(os.path.basename(p).split(".")[:-1]) in json.load(open("/root/v2/muzic/virtuosonet/label_2round_mean_reg_19_with0_rm_highstd0.json")).keys()]
  dataset = M2PFSet(path = "/root/v2/muzic/virtuosonet/data", split = "all_2rounds", save=False)
  all_data = [filename for filename in os.listdir("/root/v2/muzic/virtuosonet/data/all_2rounds") if ".mid" in filename]
  domain = json.load(open("/root/v2/muzic/virtuosonet/label_2round_mean_reg_19_with0_rm_highstd0.json")).keys()


In [7]:
# Create patched version of the preprocessing script
patched_code = original_code

# Replace all hardcoded paths
replacements = [
    ('/root/v2/muzic/virtuosonet/data', str(DATA_PATH)),
    ('/root/v2/muzic/virtuosonet/label_2round_mean_reg_19_with0_rm_highstd0.json', str(LABEL_FILE)),
    ('/root/v2/muzic/virtuosonet/label_2round_std_reg_19_with0_rm_highstd0.json', str(LABEL_STD_FILE)),
]

# str.replace is a silent no-op when the target is absent, so count the matches
# first; the report at the end then shows what was actually patched.
replacement_counts = []
for old, new in replacements:
    replacement_counts.append(patched_code.count(old))
    patched_code = patched_code.replace(old, new)

# Change output folder to our path
# Original: f"m2pf_allround/composition{num_folds}fold/{fold}"
# Replace with absolute path
save_call_original = (
    'pair_data.save_features_for_virtuosoNet(f"m2pf_allround/composition{num_folds}fold/{fold}"'
)
# repr() turns the path into a valid Python string literal even if it contains
# quotes or backslashes, so the generated script cannot be broken by the path.
save_call_patched = (
    f'pair_data.save_features_for_virtuosoNet(str(Path({str(OUTPUT_PATH)!r}) / f"fold{{fold}}")'
)
if save_call_original not in patched_code:
    # Without this patch the script would write to a relative m2pf_allround/
    # folder and the verification cells would find nothing under OUTPUT_PATH.
    raise RuntimeError(
        "Could not find the save_features_for_virtuosoNet(...) call to patch in "
        f"{preprocess_script}; the upstream script may have changed."
    )
patched_code = patched_code.replace(save_call_original, save_call_patched)

# Also need to add Path import at the top if not present
if 'from pathlib import Path' not in patched_code:
    patched_code = 'from pathlib import Path\n' + patched_code

# Save patched script
patched_script = PYSCOREPARSER_PATH / 'm2pf_dataset_compositionfold_patched.py'
with open(patched_script, 'w') as f:
    f.write(patched_code)

print(f"Patched script saved to: {patched_script}")
print(f"\nReplacements made:")
for (old, new), hits in zip(replacements, replacement_counts):
    print(f"  {old[:50]}... -> {new[:50]}... ({hits} occurrence(s))")

Patched script saved to: /Users/jdhiman/Documents/crescendai/model/data/raw/PercePiano/virtuoso/virtuoso/pyScoreParser/m2pf_dataset_compositionfold_patched.py

Replacements made:
  /root/v2/muzic/virtuosonet/data... -> /Users/jdhiman/Documents/crescendai/model/data/raw...
  /root/v2/muzic/virtuosonet/label_2round_mean_reg_1... -> /Users/jdhiman/Documents/crescendai/model/data/raw...
  /root/v2/muzic/virtuosonet/label_2round_std_reg_19... -> /Users/jdhiman/Documents/crescendai/model/data/raw...


## Step 3: Run Preprocessing

In [8]:
# Make the PercePiano modules importable from this kernel.
# Each directory is pushed to the front in turn, so pyScoreParser ends up
# ahead of the virtuoso package directory on sys.path.
extra_import_dirs = (VIRTUOSO_PATH, PYSCOREPARSER_PATH)
for import_dir in extra_import_dirs:
    sys.path.insert(0, str(import_dir))

print("Added to path:")
for import_dir in extra_import_dirs:
    print(f"  {import_dir}")

Added to path:
  /Users/jdhiman/Documents/crescendai/model/data/raw/PercePiano/virtuoso/virtuoso
  /Users/jdhiman/Documents/crescendai/model/data/raw/PercePiano/virtuoso/virtuoso/pyScoreParser


In [9]:
# Also patch dataset.py (it has hardcoded paths too)
dataset_script = VIRTUOSO_PATH / 'dataset.py'
dataset_script_backup = VIRTUOSO_PATH / 'dataset_original.py'

# Always start from the unpatched source. After the first run dataset.py no
# longer contains the '/root/...' path, so re-patching the live file would be
# a silent no-op and a changed LABEL_STD_FILE would never be applied.
# NOTE(review): if upstream dataset.py is updated, delete dataset_original.py
# so a fresh backup is taken.
pristine_source = dataset_script_backup if dataset_script_backup.exists() else dataset_script
with open(pristine_source) as f:
    dataset_code = f.read()

# Keep a one-time backup of the original file before overwriting it
if not dataset_script_backup.exists():
    with open(dataset_script_backup, 'w') as f:
        f.write(dataset_code)
    print(f"Original dataset.py backed up to: {dataset_script_backup}")

# Patch the label_stds path
original_std_path = '/root/v2/muzic/virtuosonet/label_2round_std_reg_19_with0_rm_highstd0.json'
if original_std_path not in dataset_code:
    print(f"WARNING: '{original_std_path}' not found in {pristine_source}; nothing to patch")
dataset_code_patched = dataset_code.replace(original_std_path, str(LABEL_STD_FILE))

# Save patched dataset.py
with open(dataset_script, 'w') as f:
    f.write(dataset_code_patched)
print(f"Patched dataset.py")

Original dataset.py backed up to: /Users/jdhiman/Documents/crescendai/model/data/raw/PercePiano/virtuoso/virtuoso/dataset_original.py
Patched dataset.py


In [10]:
# Run the patched preprocessing script
# This will take a while as it extracts features from all performances
import subprocess
import os

env = os.environ.copy()
# Join with the platform's path separator and keep any PYTHONPATH the kernel
# already had, so the child process does not lose entries it inherited.
pythonpath_entries = [str(VIRTUOSO_PATH), str(PYSCOREPARSER_PATH)]
if env.get('PYTHONPATH'):
    pythonpath_entries.append(env['PYTHONPATH'])
env['PYTHONPATH'] = os.pathsep.join(pythonpath_entries)

print("Running preprocessing (this may take 10-30 minutes)...")
print(f"Script: {patched_script}")
print(f"Working dir: {PYSCOREPARSER_PATH}")
print("="*60)

# Output is captured and only shown once the process has finished
result = subprocess.run(
    [sys.executable, str(patched_script)],
    cwd=str(PYSCOREPARSER_PATH),
    env=env,
    capture_output=True,
    text=True
)

# A negative-start slice already returns the whole string when it is shorter
# than the limit, so no explicit length check is needed to show the tail.
print("STDOUT:")
print(result.stdout[-5000:])

if result.returncode != 0:
    print("\nSTDERR:")
    print(result.stderr[-3000:])
    print(f"\nReturn code: {result.returncode}")
else:
    print("\nPreprocessing completed successfully!")

Running preprocessing (this may take 10-30 minutes)...
Script: /Users/jdhiman/Documents/crescendai/model/data/raw/PercePiano/virtuoso/virtuoso/pyScoreParser/m2pf_dataset_compositionfold_patched.py
Working dir: /Users/jdhiman/Documents/crescendai/model/data/raw/PercePiano/virtuoso/virtuoso/pyScoreParser
STDOUT:
perform list length 1201
list index out of range
/Users/jdhiman/Documents/crescendai/model/data/raw/PercePiano/virtuoso/data/all_2rounds/Schubert_D935_no.3_4bars_1_50.mid
list index out of range
/Users/jdhiman/Documents/crescendai/model/data/raw/PercePiano/virtuoso/data/all_2rounds/Schubert_D935_no.3_4bars_13_50.mid
list index out of range
/Users/jdhiman/Documents/crescendai/model/data/raw/PercePiano/virtuoso/data/all_2rounds/Schubert_D935_no.3_4bars_3_50.mid
list index out of range
/Users/jdhiman/Documents/crescendai/model/data/raw/PercePiano/virtuoso/data/all_2rounds/Schubert_D935_no.3_4bars_9_50.mid
list index out of range
/Users/jdhiman/Documents/crescendai/model/data/raw/Per

## Step 4: Verify Output

In [11]:
# Check output: per fold and split, report the pickle count and stat.pkl contents
import pickle

for fold in range(4):
    fold_path = OUTPUT_PATH / f'fold{fold}'
    if not fold_path.exists():
        continue

    print(f"\nFold {fold}:")
    for split in ('train', 'valid', 'test'):
        split_path = fold_path / split
        if not split_path.exists():
            continue

        # The count covers every *.pkl in the split directory
        pkl_files = [entry for entry in split_path.glob('*.pkl')]
        print(f"  {split}: {len(pkl_files)} files")

        # Check stat.pkl
        stat_file = split_path / 'stat.pkl'
        if not stat_file.exists():
            continue

        # NOTE: pickle.load executes arbitrary code; only use on files produced locally
        with open(stat_file, 'rb') as stat_handle:
            stats = pickle.load(stat_handle)
        print(f"    stat.pkl keys: {list(stats.keys())}")
        if 'key_to_dim' in stats:
            print(f"    key_to_dim has {len(stats['key_to_dim'])} entries")


Fold 0:
  train: 740 files
    stat.pkl keys: ['stats', 'input_keys', 'output_keys', 'measure_keys', 'key_to_dim']
    key_to_dim has 4 entries
  valid: 267 files
    stat.pkl keys: ['stats', 'input_keys', 'output_keys', 'measure_keys', 'key_to_dim']
    key_to_dim has 4 entries
  test: 188 files
    stat.pkl keys: ['stats', 'input_keys', 'output_keys', 'measure_keys', 'key_to_dim']
    key_to_dim has 4 entries

Fold 1:
  train: 771 files
    stat.pkl keys: ['stats', 'input_keys', 'output_keys', 'measure_keys', 'key_to_dim']
    key_to_dim has 4 entries
  valid: 236 files
    stat.pkl keys: ['stats', 'input_keys', 'output_keys', 'measure_keys', 'key_to_dim']
    key_to_dim has 4 entries
  test: 188 files
    stat.pkl keys: ['stats', 'input_keys', 'output_keys', 'measure_keys', 'key_to_dim']
    key_to_dim has 4 entries

Fold 2:
  train: 760 files
    stat.pkl keys: ['stats', 'input_keys', 'output_keys', 'measure_keys', 'key_to_dim']
    key_to_dim has 4 entries
  valid: 247 files
    

In [12]:
# Check a sample file: load one feature pickle from fold0/train and summarise it
import numpy as np

fold0_train = OUTPUT_PATH / 'fold0' / 'train'
if fold0_train.exists():
    # stat.pkl is not a sample, so leave it out of the candidates
    sample_files = list(
        filter(lambda candidate: candidate.name != 'stat.pkl', fold0_train.glob('*.pkl'))
    )
    if len(sample_files) > 0:
        first_sample = sample_files[0]
        with open(first_sample, 'rb') as sample_handle:
            sample = pickle.load(sample_handle)

        print(f"Sample file: {first_sample.name}")
        print(f"Keys: {list(sample.keys())}")

        if 'input' in sample:
            inp = np.array(sample['input'])
            print(f"Input shape: {inp.shape}")

        if 'note_location' in sample:
            location_keys = list(sample['note_location'].keys())
            print(f"note_location keys: {location_keys}")

Sample file: all_2rounds_Schubert_D960_mv2_8bars_2_13.mid.pkl
Keys: ['input', 'output', 'beat', 'meas', 'note_location', 'align_matched', 'articulation_loss_weight', 'graph', 'score_path', 'perform_path']
Input shape: (143, 101)
note_location keys: ['beat', 'measure', 'voice', 'section']


## Step 5: Upload to Google Drive

In [13]:
# Upload to GDrive for Thunder Compute
import subprocess

GDRIVE_PATH = 'gdrive:crescendai_data/percepiano_original'

print(f"Uploading {OUTPUT_PATH} to {GDRIVE_PATH}...")
# Output is deliberately not captured so rclone's --progress display streams
# into the cell while the transfer runs.
result = subprocess.run(
    ['rclone', 'copy', str(OUTPUT_PATH), GDRIVE_PATH, '--progress'],
    capture_output=False
)

# subprocess.run does not raise on a non-zero exit status by default, so check
# the return code before claiming success.
if result.returncode == 0:
    print("\nUpload complete!")
else:
    print(f"\nUpload FAILED (rclone exit code {result.returncode})")

Uploading /Users/jdhiman/Documents/crescendai/model/data/preprocessed/percepiano_original to gdrive:crescendai_data/percepiano_original...
Transferred:   	          0 B / 789.159 MiB, 0%, 0 B/s, ETA -
Checks:                 0 / 0, -, Listed 4796
Transferred:            0 / 4780, 0%
Elapsed time:         0.4s
Transferring:
 * fold2/test/all_2rounds…15_8bars_11_10.mid.pkl:  0% /205.401Ki, 0/s, -
 * fold2/test/all_2rounds…15_8bars_14_10.mid.pkl:  0% /207.770Ki, 0/s, -
 * fold2/test/all_2rounds…15_8bars_18_10.mid.pkl:  0% /205.127Ki, 0/s, -
 * fold2/test/all_2rounds…15_8bars_19_10.mid.pkl:  0% /208.243Ki, 0/s, -Transferred:   	          0 B / 789.159 MiB, 0%, 0 B/s, ETA -
Checks:                 0 / 0, -, Listed 4796
Transferred:            0 / 4780, 0%
Elapsed time:         0.9s
Transferring:
 * fold2/test/all_2rounds…15_8bars_11_10.mid.pkl:  0% /205.401Ki, 0/s, -
 * fold2/test/all_2rounds…15_8bars_14_10.mid.pkl:  0% /207.770Ki, 0/s, -
 * fold2/test/all_2rounds…15_8bars_18_10.mid.pkl:  0