# Piano Performance Evaluation - 3-Way Model Comparison (Colab)

Trains 3 models to demonstrate the advantage of multi-modal fusion:
1. Audio-Only (MERT only)
2. MIDI-Only (MIDIBert only)
3. Fusion (MERT + MIDIBert)

- **Dimensions**: 3 core (note_accuracy, rhythmic_precision, tone_quality)
- **Sample size**: 114,246 training samples (full dataset)
- **Expected time**: 3-4 hours training + 10-15 min setup
- **Goal**: Prove fusion beats both baselines by 15-20%

## Setup

In [1]:
import os

# Remove any Hugging Face token inherited from the environment before logging in.
for _token_var in ("HF_TOKEN", "HUGGINGFACEHUB_API_TOKEN"):
    os.environ.pop(_token_var, None)

from huggingface_hub import login, HfApi

try:
    import getpass as gp

    # Hidden prompt: the token is not echoed into the cell output.
    raw = gp.getpass("Paste your Hugging Face token (input hidden): ")
    if isinstance(raw, (bytes, bytearray)):
        token = raw.decode()
    else:
        token = raw
    if not isinstance(token, str):
        raise TypeError(f"Unexpected token type: {type(token).__name__}")
    token = token.strip()
    if token == "":
        raise ValueError("Empty token provided")

    login(token=token, add_to_git_credential=False)
    # Confirm the token works by asking the Hub who it belongs to.
    who = HfApi().whoami(token=token)
    print(f"✓ Logged in as: {who.get('name') or who.get('email') or 'OK'}")
except Exception as e:
    # Any failure in the prompt-based flow falls back to the interactive widget.
    print(f"[HF Login] getpass flow failed: {e}")
    print("Falling back to interactive login widget...")
    login()
    try:
        who = HfApi().whoami()
    except Exception as e2:
        # Verification is best-effort; the login itself is not undone.
        print(f"[HF Login] Verification skipped: {e2}")
    else:
        print(f"✓ Logged in as: {who.get('name') or who.get('email') or 'OK'}")

In [None]:
import os
from google.colab import drive

# Checkpoints are written below the Drive mount point.
CHECKPOINT_ROOT = '/content/drive/MyDrive/crescendai_checkpoints'

drive.mount('/content/drive')
os.makedirs(CHECKPOINT_ROOT, exist_ok=True)

print(f"✓ Checkpoint directory ready: {CHECKPOINT_ROOT}")
for _note_line in (
    "\nNote: Training data will be downloaded from Hugging Face Hub",
    "      (No longer using Drive for data - much faster!)",
):
    print(_note_line)

In [None]:
!nvidia-smi

import torch
if not torch.cuda.is_available():
    print("\n⚠️  NO GPU! Enable GPU: Runtime → Change runtime type → T4 GPU")
    raise RuntimeError("GPU required")

print(f"\n✓ GPU: {torch.cuda.get_device_name(0)}")
print(f"✓ Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# Clone repo
# Remove any earlier checkout first so the clone below always starts clean.
!rm -rf /content/crescendai
!git clone https://github.com/Jai-Dhiman/crescendai.git /content/crescendai
# Later cells call train.py and scripts/... by relative path, so work from model/.
%cd /content/crescendai/model
# Print the checked-out commit so the run can be traced back to a revision.
!git log -1 --oneline

In [None]:
# Install uv (fast Python package manager)
!curl -LsSf https://astral.sh/uv/install.sh | sh

# Add to PATH for this session
import os
os.environ['PATH'] = f"{os.environ['HOME']}/.cargo/bin:{os.environ['PATH']}"

print("\n✓ uv installed")

In [None]:
# Install dependencies
!uv pip install --system -e .

# Suppress warnings
import warnings
warnings.filterwarnings('ignore', message='divide by zero')
warnings.filterwarnings('ignore', category=SyntaxWarning)  # pydub regex warnings
warnings.filterwarnings('ignore', category=UserWarning, module='torchaudio')
warnings.filterwarnings('ignore', category=UserWarning, module='torchmetrics')

import torch
import pytorch_lightning as pl
print(f"PyTorch: {torch.__version__}")
print(f"Lightning: {pl.__version__}")
print("✓ Dependencies installed")

!python scripts/setup_colab_environment.py

In [None]:
# Download MERT model (cached after first download)
from transformers import AutoModel

MERT_MODEL_ID = "m-a-p/MERT-v1-95M"

print("Downloading MERT-95M (~380MB)...")
# The instance is only loaded to trigger the download; it is discarded right after.
model = AutoModel.from_pretrained(MERT_MODEL_ID, trust_remote_code=True)
print("✓ MERT-95M cached")

# `torch` comes from an earlier cell's import.
del model
torch.cuda.empty_cache()

In [None]:
# Download Data from Hugging Face Hub
# Downloads the dataset archive, unpacks it under /tmp/crescendai_data/, deletes
# the archive, and checks that the expected directories exist.
# Raises RuntimeError when an expected directory is missing after extraction.
print("="*70)
print("STEP 2: DOWNLOAD DATA FROM HUGGING FACE HUB")
print("="*70)
print("\nThis is 10-100x faster and more reliable than Google Drive!")
print("Download time: 7-15 minutes (one-time per session)\n")

from huggingface_hub import hf_hub_download
import tarfile
import os

HF_REPO_ID = "Jai-D/crescendai-data"

print(f"Downloading from: {HF_REPO_ID}")
print("Archive size: ~20-25 GB compressed\n")

# Download archive
print("1. Downloading archive...")
archive_path = hf_hub_download(
    repo_id=HF_REPO_ID,
    filename="crescendai_data.tar.gz",
    repo_type="model",
    local_dir="/tmp/",
    local_dir_use_symlinks=False,
)
print(f"   ✓ Downloaded to: {archive_path}")

# Extract
print("\n2. Extracting archive...")
# Use the 'data' extraction filter when this Python provides it: it refuses
# members that would be written outside the destination directory. Older
# interpreters without filter support keep the previous behaviour.
extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}
with tarfile.open(archive_path, 'r:gz') as tar:
    members = tar.getmembers()
    print(f"   Extracting {len(members):,} files...")
    tar.extractall('/tmp/crescendai_data/', **extract_kwargs)

print("   ✓ Extracted to: /tmp/crescendai_data/")

# Clean up archive to save space
print("\n3. Cleaning up...")
os.remove(archive_path)
print("   ✓ Removed archive file")

# Verify structure
print("\n4. Verifying data structure...")
expected_paths = [
    '/tmp/crescendai_data/data/all_segments',
    '/tmp/crescendai_data/data/annotations',
]

all_good = True
for path in expected_paths:
    # isdir (not exists) so a stray file with this name is reported as missing
    # instead of crashing os.listdir.
    if not os.path.isdir(path):
        print(f"   ✗ {path}: NOT FOUND")
        all_good = False
        continue

    if 'all_segments' in path:
        num_files = sum(1 for f in os.listdir(path) if f.endswith('.wav'))
        print(f"   ✓ {path}: {num_files:,} audio files")
    else:
        num_files = sum(1 for f in os.listdir(path) if f.endswith('.jsonl'))
        print(f"   ✓ {path}: {num_files} annotation files")

if all_good:
    print("\n" + "="*70)
    print("✓ DATA DOWNLOAD COMPLETE")
    print("="*70)
    print("\nData ready at: /tmp/crescendai_data/")
    print("Training will be 10-30x faster than reading from Drive!")
else:
    print("\n✗ Data structure verification failed!")
    print("   Check that your archive has the correct structure")
    raise RuntimeError("Data download verification failed")

In [None]:
# Alternative: Extract MAESTRO from Google Drive
# Unpacks the raw MAESTRO zip from Drive into /tmp and checks that audio, MIDI
# and the metadata CSV are present. Raises FileNotFoundError when the zip is
# missing and RuntimeError when the extracted tree is incomplete.
print("="*70)
print("ALTERNATIVE: EXTRACT MAESTRO FROM GOOGLE DRIVE")
print("="*70)

import zipfile
from pathlib import Path
import shutil

# UPDATE THIS PATH to where you uploaded maestro-v3.0.0.zip in your Drive
MAESTRO_ZIP_PATH = "/content/drive/MyDrive/crescendai_data/maestro-v3.0.0.zip"

# Check if zip file exists
maestro_zip = Path(MAESTRO_ZIP_PATH)
if not maestro_zip.exists():
    print(f"\n✗ MAESTRO zip not found at: {MAESTRO_ZIP_PATH}")
    for _help_line in (
        "\nPlease:",
        "1. Download MAESTRO v3.0.0 from https://magenta.tensorflow.org/datasets/maestro#v300",
        "2. Upload maestro-v3.0.0.zip to your Google Drive",
        "3. Update MAESTRO_ZIP_PATH in this cell",
    ):
        print(_help_line)
    raise FileNotFoundError(f"MAESTRO zip not found: {MAESTRO_ZIP_PATH}")

print(f"\n✓ Found MAESTRO zip: {MAESTRO_ZIP_PATH}")
print(f"   Size: {Path(MAESTRO_ZIP_PATH).stat().st_size / 1e9:.1f} GB")

# Extract to /tmp for fast access
extract_dir = Path("/tmp/maestro-v3.0.0")
if extract_dir.exists():
    # Start from a clean tree so stale files cannot satisfy the checks below.
    print(f"\nRemoving existing extraction at {extract_dir}...")
    shutil.rmtree(extract_dir)

print("\nExtracting MAESTRO to /tmp (this may take 5-10 minutes)...")
with zipfile.ZipFile(MAESTRO_ZIP_PATH, 'r') as zip_ref:
    # Extracted into /tmp/; the checks below expect the maestro-v3.0.0 folder there.
    zip_ref.extractall("/tmp/")

print(f"✓ Extracted to: {extract_dir}")

# Verify extraction
csv_file = extract_dir / "maestro-v3.0.0.csv"
audio_files = list(extract_dir.glob("**/*.wav"))
midi_files = list(extract_dir.glob("**/*.midi"))

print("\nVerifying extraction:")
print(f"  Audio files: {len(audio_files):,}")
print(f"  MIDI files: {len(midi_files):,}")
print(f"  CSV file: {'✓' if csv_file.exists() else '✗'}")

extraction_complete = bool(audio_files and midi_files and csv_file.exists())
if not extraction_complete:
    raise RuntimeError("MAESTRO extraction incomplete!")

print("\n" + "="*70)
print("✓ MAESTRO DATASET READY")
print("="*70)
print(f"\nDataset location: {extract_dir}")
print("\nNote: You'll need to run the preprocessing scripts to create")
print("      annotation files and segments from this raw MAESTRO data.")

## Alternative: Use Original MAESTRO Dataset from Google Drive

**Skip the MAESTRO extraction cell above if you're using the Hugging Face Hub data.**

This alternative approach uses the original MAESTRO v3.0.0 dataset that you can upload to Google Drive. This avoids any corrupted MIDI files from the processed archive.

**Setup Instructions:**
1. Download MAESTRO v3.0.0 from https://magenta.tensorflow.org/datasets/maestro#v300
2. Upload `maestro-v3.0.0.zip` to your Google Drive (in a folder like `crescendai_data/`)
3. Update `MAESTRO_ZIP_PATH` in the MAESTRO extraction cell above to point to your uploaded zip file
4. Run the MAESTRO extraction cell instead of the Hugging Face download cell

In [None]:
# Fix annotation paths (Drive → Local SSD)
print("="*70)
print("STEP 2.5: FIX ANNOTATION PATHS")
print("="*70)
print("\nUpdating annotation files to use local SSD paths...\n")

# Runs the repo script relative to the current working directory; what it
# rewrites is defined in scripts/fix_annotation_paths.py, not in this notebook.
!python scripts/fix_annotation_paths.py

# NOTE(review): this message is printed without checking the script's exit
# status in this cell — confirm success from the script output above.
print("\n✓ Annotation paths updated for local SSD access")

In [None]:
# Quick verification - check a few sample paths
# Reads the first training annotation and checks that the audio/MIDI files it
# points to exist on disk. Only prints warnings; it never raises on a missing file.
print("Verifying data paths...")

import json
from pathlib import Path

# Check a sample annotation (first line of the JSONL file only)
with open('/tmp/crescendai_data/data/annotations/synthetic_train.jsonl') as f:
    first_record = f.readline()
sample = json.loads(first_record)

print("\nSample annotation:")
print(f"  Audio: {sample['audio_path']}")
print(f"  MIDI:  {sample['midi_path']}")

# Verify files exist
audio_exists = Path(sample['audio_path']).exists()
if sample['midi_path']:
    midi_exists = Path(sample['midi_path']).exists()
else:
    # An empty/None MIDI path is recorded as "not existing" but is not an error.
    midi_exists = False

print("\nFile existence check:")
print(f"  Audio exists: {'✓' if audio_exists else '✗'}")
print(f"  MIDI exists:  {'✓' if midi_exists else '✗ (may be OK if path is None)'}")

# A MIDI file only counts as missing when the annotation actually names one.
midi_missing = bool(sample['midi_path']) and not midi_exists

if audio_exists and not midi_missing:
    print("\n✓ Sample files verified - data structure looks correct!")
elif not audio_exists:
    print("\n⚠️  WARNING: Audio file not found!")
    print("     Check that data extraction completed correctly")
    print(f"     Expected: {sample['audio_path']}")
else:
    print("\n⚠️  WARNING: MIDI file not found!")
    print(f"     Expected: {sample['midi_path']}")

In [None]:
# Preflight Check
print("="*70)
print("STEP 3: PREFLIGHT CHECK")
print("="*70)
print("\nVerifying training environment and data...\n")

# Runs the repo's preflight script against the same config file the training
# cells pass to train.py; the checks themselves live in the script.
!python scripts/preflight_check.py --config configs/experiment_full.yaml


## Experiment 1: Audio-Only (~60-80 min)

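Training with audio features only (MERT-95M encoder)

**Note:** the code cell directly below evaluates an existing audio-only checkpoint; the audio-only training command is in the code cell after it.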

In [None]:
%%time
# Evaluate a saved audio-only checkpoint from Drive on the synthetic test annotations.
# NOTE(review): the checkpoint filename embeds an epoch and val_loss from one
# specific run — update --checkpoint to match the file produced by your own training.
!python evaluate_audio_only.py \
  --checkpoint /content/drive/MyDrive/crescendai_checkpoints/audio_full/audio-epoch=02-val_loss=7.9935.ckpt \
  --test-data /tmp/crescendai_data/data/annotations/synthetic_test.jsonl \
  --batch-size 16 \
  --num-workers 4

## Evaluate Audio-Only Model

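Test the completed audio-only checkpoint

**Note:** the evaluation command is in the code cell above this heading; the code cell directly below runs audio-only training.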

In [None]:
%%time
# Run train.py in "audio" mode with the full-experiment config; %%time reports the cell's wall time.
!python train.py --config configs/experiment_full.yaml --mode audio

## Experiment 2: MIDI-Only (~45-60 min)

Training with MIDI features only (MIDIBert encoder)

In [None]:
%%time
# Run train.py in "midi" mode with the full-experiment config; %%time reports the cell's wall time.
!python train.py --config configs/experiment_full.yaml --mode midi

## Experiment 3: Fusion (~90-120 min)

Training with both audio and MIDI features (multi-modal fusion)

In [None]:
%%time
# Run train.py in "fusion" mode with the full-experiment config; %%time reports the cell's wall time.
!python train.py --config configs/experiment_full.yaml --mode fusion

## Compare Results

Load all 3 trained models and compare performance on test set

In [None]:
# Compare the three trained models on the test set.
# Loads one checkpoint per mode from Drive, runs trainer.test on each, and prints
# per-dimension Pearson r plus the fusion gain over the better single-modality
# baseline. Gains are only computed when all three models report the metric, so
# a missing checkpoint can no longer masquerade as r = 0.
import pytorch_lightning as pl
from src.models.lightning_module import PerformanceEvaluationModel
from src.data.dataset import create_dataloaders
from pathlib import Path

MODES = ['audio', 'midi', 'fusion']
DIMENSIONS = ['note_accuracy', 'rhythmic_precision', 'tone_quality']

# Load all 3 models
models = {}
for mode in MODES:
    ckpt_dir = Path(f'/content/drive/MyDrive/crescendai_checkpoints/{mode}_full')
    ckpts = list(ckpt_dir.glob('*.ckpt'))
    if ckpts:
        # Picks the checkpoint whose filename sorts last.
        latest = sorted(ckpts)[-1]
        print(f"Loading {mode}: {latest.name}")
        models[mode] = PerformanceEvaluationModel.load_from_checkpoint(str(latest))
        models[mode].eval()
        models[mode] = models[mode].cuda()
    else:
        print(f"⚠️  No checkpoint found for {mode}")

# Create test dataloader (using local SSD paths)
_, _, test_loader = create_dataloaders(
    train_annotation_path='/tmp/crescendai_data/data/annotations/synthetic_train.jsonl',
    val_annotation_path='/tmp/crescendai_data/data/annotations/synthetic_val.jsonl',
    test_annotation_path='/tmp/crescendai_data/data/annotations/synthetic_test.jsonl',
    dimension_names=DIMENSIONS,
    batch_size=8,
    num_workers=4,  # Can use parallel loading with local data
    augmentation_config=None,
    audio_sample_rate=24000,
    max_audio_length=240000,
    max_midi_events=512,
)

# Evaluate each model
trainer = pl.Trainer(accelerator='auto', devices='auto', precision=16)
results = {}

for mode, model in models.items():
    print(f"\nEvaluating {mode}...")
    test_results = trainer.test(model, dataloaders=test_loader, verbose=False)
    results[mode] = test_results[0]

print("\n" + "="*70)
print("COMPARISON")
print("="*70)
print(f"{'Dimension':<25} {'Audio r':<12} {'MIDI r':<12} {'Fusion r':<12} {'Gain'}")
print("-"*70)

NOT_AVAILABLE = f"{'n/a':>11}"

gains = []
for dim in DIMENSIONS:
    metric = f'test_pearson_{dim}'
    # None marks a model that was not evaluated or did not report this metric.
    scores = {mode: results.get(mode, {}).get(metric) for mode in MODES}

    if any(score is None for score in scores.values()):
        gain = None
    else:
        gain = scores['fusion'] - max(scores['audio'], scores['midi'])
        gains.append(gain)

    score_cells = [
        NOT_AVAILABLE if scores[mode] is None else f"{scores[mode]:>11.3f}"
        for mode in MODES
    ]
    gain_cell = NOT_AVAILABLE if gain is None else f"{gain:>+11.3f}"
    print(f"{dim:<25} {' '.join(score_cells)} {gain_cell}")

print("-"*70)
if len(gains) == len(DIMENSIONS):
    avg_gain = sum(gains) / len(gains)
    print(f"Average fusion gain: {avg_gain:+.3f} ({avg_gain*100:+.1f}%)")
else:
    # An average over partial data would be misleading, so none is reported.
    avg_gain = None
    print("Average fusion gain: n/a (results missing for at least one model or dimension)")
print("="*70)

if avg_gain is None:
    print("\n⚠️  WARNING: Comparison incomplete. Train and evaluate all 3 models first.")
elif avg_gain > 0.05:
    print("\n✓ SUCCESS: Fusion shows clear multi-modal advantage!")
else:
    print("\n⚠️  WARNING: Fusion gain is marginal. Check fusion implementation.")