# MIDI-Only Piano Performance Evaluation - PercePiano Training

**Goal**: Train a MIDI-only model on the PercePiano expert labels to validate symbolic-only analysis.

**Target**: R^2 >= 0.185 (match PercePiano Bi-LSTM baseline)

**Stretch Goal**: R^2 >= 0.30 (near best published result of 0.397)

## What You Need on Google Drive

Upload this folder to your Google Drive root:
- `gdrive:percepiano_data/` containing:
  - `percepiano_train.json`
  - `percepiano_val.json`
  - `percepiano_test.json`
  - `PercePiano/` (the cloned repository with MIDI files)

## Step 1: Setup Environment

In [None]:
# Report whether a CUDA device is visible and, if so, which one.
import torch

cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    # Device 0 is the only device this notebook uses.
    device_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {device_props.total_memory / 1e9:.1f} GB")

In [None]:
# Install uv and rclone
!curl -LsSf https://astral.sh/uv/install.sh | sh
!curl -fsSL https://rclone.org/install.sh | sudo bash 2>&1 | grep -E "(successfully|already)" || echo "rclone installed"

import os
os.environ['PATH'] = f"{os.environ['HOME']}/.cargo/bin:{os.environ['PATH']}"

# Clone repository
if not os.path.exists('/tmp/crescendai'):
    !git clone https://github.com/Jai-Dhiman/crescendai.git /tmp/crescendai

%cd /tmp/crescendai/model
!git pull
!git log -1 --oneline

# Install dependencies
!uv pip install --system -e .
!pip install tensorboard rich

import torch
import pytorch_lightning as pl
print(f"\nPyTorch: {torch.__version__}")
print(f"Lightning: {pl.__version__}")

## Step 2: Configure rclone (Run Once)

Run this in a terminal:
```bash
rclone config
```

Follow the prompts to create a Google Drive remote named exactly `gdrive`; the cells below look for a remote with that name.

In [None]:
import os
from pathlib import Path
import subprocess

# Paths
CHECKPOINT_ROOT = '/tmp/checkpoints/midi_only_percepiano'
GDRIVE_CHECKPOINT_PATH = 'gdrive:crescendai_checkpoints/midi_only_percepiano'
GDRIVE_DATA_PATH = 'gdrive:percepiano_data'
DATA_ROOT = Path('/tmp/percepiano_data')

print("="*70)
print("SETUP: CHECKPOINTS AND DATA")
print("="*70)

# Create directories
os.makedirs(CHECKPOINT_ROOT, exist_ok=True)
DATA_ROOT.mkdir(parents=True, exist_ok=True)

# Check rclone
print("\nChecking rclone configuration...")
try:
    result = subprocess.run(['rclone', 'listremotes'], capture_output=True, text=True)
    # Compare whole lines rather than searching for a substring, so that a
    # remote such as "mygdrive:" is not mistaken for "gdrive:".
    configured_remotes = [line.strip() for line in result.stdout.splitlines()]
except FileNotFoundError:
    # The rclone binary is not on PATH; treat it the same as "not configured"
    # instead of aborting the cell with a traceback.
    print("  rclone executable: NOT FOUND (re-run the install cell)")
    configured_remotes = []

# Later cells read this flag to decide whether to sync with Google Drive.
RCLONE_AVAILABLE = 'gdrive:' in configured_remotes

if RCLONE_AVAILABLE:
    print("  rclone 'gdrive' remote: CONFIGURED")

    # Restore existing checkpoints
    print("\nRestoring checkpoints from Google Drive (if any)...")
    restore = subprocess.run(
        ['rclone', 'copy', GDRIVE_CHECKPOINT_PATH, CHECKPOINT_ROOT, '--progress'],
        capture_output=False
    )
    if restore.returncode != 0:
        # Not fatal: training can start from scratch. Surface it so a broken
        # remote is not mistaken for "no checkpoints yet".
        print(f"  WARNING: restore exited with code {restore.returncode}; "
              "continuing without restored checkpoints")
else:
    print("  rclone 'gdrive' remote: NOT CONFIGURED")
    print("  Run 'rclone config' in terminal to set up Google Drive")

print(f"\nCheckpoint directory: {CHECKPOINT_ROOT}")
print(f"rclone available: {RCLONE_AVAILABLE}")

## Step 3: Download PercePiano Data

In [None]:
from pathlib import Path
import subprocess
import json

DATA_ROOT = Path('/tmp/percepiano_data')
DATA_ROOT.mkdir(parents=True, exist_ok=True)

# Everything the later cells need: the three split files and the MIDI folder.
SPLITS = ['train', 'val', 'test']
split_files = {split: DATA_ROOT / f'percepiano_{split}.json' for split in SPLITS}
midi_dir = DATA_ROOT / 'PercePiano' / 'virtuoso' / 'data' / 'all_2rounds'

# Check if data already exists. All required pieces are checked (not only the
# train split) so that an interrupted download is repaired on the next run.
missing = [p for p in [*split_files.values(), midi_dir] if not p.exists()]
if not missing:
    print(f"Data already exists at {DATA_ROOT}")
else:
    print("Downloading PercePiano data from Google Drive...")
    try:
        # GDRIVE_DATA_PATH is defined by the rclone setup cell earlier in the notebook.
        result = subprocess.run(
            ['rclone', 'copy', GDRIVE_DATA_PATH, str(DATA_ROOT), '--progress'],
            capture_output=False
        )
    except FileNotFoundError:
        print("ERROR: rclone executable not found; run the install cell first")
    else:
        if result.returncode != 0:
            print(f"ERROR: rclone exited with code {result.returncode}; data may be incomplete")

# Verify data
for split, path in split_files.items():
    if path.exists():
        with open(path) as f:
            data = json.load(f)
        print(f"{split}: {len(data)} samples")
    else:
        print(f"ERROR: {path} not found!")

# Check MIDI files
if midi_dir.exists():
    # Count lazily; the file list itself is not needed.
    midi_count = sum(1 for _ in midi_dir.glob('*.mid'))
    print(f"\nMIDI files: {midi_count}")
else:
    print(f"\nERROR: MIDI directory not found at {midi_dir}")
    print("Make sure to upload the full PercePiano repository")

## Step 4: Update MIDI Paths in JSON Files

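The `midi_path` entries in the JSON files still point to locations on the machine where the files were created. Rewrite them so they point to the downloaded copy under `/tmp/percepiano_data` on this machine (e.g. a Thunder Compute instance).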

In [None]:
import json
from pathlib import Path

DATA_ROOT = Path('/tmp/percepiano_data')

# Directory that holds the MIDI files inside the downloaded PercePiano repository.
midi_root = DATA_ROOT / 'PercePiano' / 'virtuoso' / 'data' / 'all_2rounds'

# Rewrite each split file in place, keeping only the MIDI file name from the
# stored path and re-rooting it under midi_root.
for split in ('train', 'val', 'test'):
    split_json = DATA_ROOT / f'percepiano_{split}.json'

    with open(split_json) as src:
        data = json.load(src)

    for entry in data:
        entry['midi_path'] = str(midi_root / Path(entry['midi_path']).name)

    with open(split_json, 'w') as dst:
        json.dump(data, dst, indent=2)

    print(f"Updated {split}: {len(data)} samples")

# Spot-check: the first training sample should now resolve to a real file.
with open(DATA_ROOT / 'percepiano_train.json') as src:
    data = json.load(src)
sample_path = Path(data[0]['midi_path'])
print(f"\nSample MIDI path: {sample_path}")
print(f"Exists: {sample_path.exists()}")

## Step 5: Training Configuration

In [None]:
import torch

# Let float32 matmuls use the reduced-precision (Tensor Core) path.
torch.set_float32_matmul_precision('medium')

# Single source of settings read by the data, model and trainer cells.
CONFIG = dict(
    # --- Data ---
    data_dir='/tmp/percepiano_data',

    # --- Model architecture (matches configs/midi_only_percepiano.yaml) ---
    midi_hidden_dim=256,
    midi_num_layers=6,
    midi_num_heads=8,
    max_seq_length=1024,
    lstm_hidden=256,
    lstm_layers=2,
    attention_heads=4,
    shared_hidden=256,
    task_hidden=128,
    dropout=0.1,

    # --- Training ---
    batch_size=16,  # T4: 16, A100: 32
    learning_rate=1e-4,
    weight_decay=0.01,
    max_epochs=30,
    early_stopping_patience=7,
    gradient_clip_val=1.0,
    precision='16-mixed',

    # --- Checkpoints ---
    checkpoint_dir='/tmp/checkpoints/midi_only_percepiano',
    gdrive_checkpoint='gdrive:crescendai_checkpoints/midi_only_percepiano',
)

# Echo the configuration, one setting per line.
print("Training Configuration:")
for setting, value in CONFIG.items():
    print(f"  {setting}: {value}")

## Step 6: Create DataLoaders

In [None]:
from pathlib import Path
from src.data.percepiano_dataset import create_dataloaders

# Build the train/val/test loaders from the settings in CONFIG.
loaders = create_dataloaders(
    data_dir=Path(CONFIG['data_dir']),
    batch_size=CONFIG['batch_size'],
    max_seq_length=CONFIG['max_seq_length'],
    num_workers=4,
)
train_loader, val_loader, test_loader = loaders

for label, loader in (("Train", train_loader), ("Val", val_loader), ("Test", test_loader)):
    print(f"{label} batches: {len(loader)}")

# Pull one training batch as a smoke test and show the tensor shapes.
batch = next(iter(train_loader))
print("\nBatch shapes:")
for field in ('midi_tokens', 'attention_mask', 'scores'):
    print(f"  {field}: {batch[field].shape}")

## Step 7: Create Model

In [None]:
from src.models.midi_only_module import MIDIOnlyModule

# CONFIG entries that are forwarded unchanged as MIDIOnlyModule keyword arguments.
model_arg_names = (
    'midi_hidden_dim',
    'midi_num_layers',
    'midi_num_heads',
    'max_seq_length',
    'lstm_hidden',
    'lstm_layers',
    'attention_heads',
    'shared_hidden',
    'task_hidden',
    'learning_rate',
    'weight_decay',
    'dropout',
)
model = MIDIOnlyModule(**{name: CONFIG[name] for name in model_arg_names})

# Total number of elements across all parameters.
param_count = sum(param.numel() for param in model.parameters())
print(f"Model parameters: {param_count:,}")
print(f"\nDimensions: {model.dimensions}")

## Step 8: Setup Training

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import subprocess
from pathlib import Path
import os

# Validation metric that drives both checkpoint selection and early stopping.
# NOTE(review): assumes MIDIOnlyModule logs a metric under exactly this key - confirm.
MONITOR_METRIC = 'val/mean_r'

# Checkpoint callback: keep the 3 best epochs by MONITOR_METRIC plus the latest one.
# The filename placeholder must use the logged metric key ('val/mean_r'); a
# placeholder that names no logged metric is filled with 0, which made every
# file read "val_mean_r=0.000". auto_insert_metric_name is disabled because
# the key contains '/', which would otherwise end up in the file name as a
# path separator; the "name=" prefixes are written out by hand instead.
checkpoint_callback = ModelCheckpoint(
    dirpath=CONFIG['checkpoint_dir'],
    filename='midi_only-epoch={epoch:02d}-val_mean_r={val/mean_r:.3f}',
    auto_insert_metric_name=False,
    monitor=MONITOR_METRIC,
    mode='max',
    save_top_k=3,
    save_last=True,
)

# Early stopping: stop when the monitored metric has not improved for
# `early_stopping_patience` validation checks.
early_stopping = EarlyStopping(
    monitor=MONITOR_METRIC,
    patience=CONFIG['early_stopping_patience'],
    mode='max',
)

# LR monitor: log the learning rate at every optimizer step
lr_monitor = LearningRateMonitor(logging_interval='step')

# Logger: TensorBoard event files under /tmp/logs/midi_only_percepiano
logger = TensorBoardLogger(
    save_dir='/tmp/logs',
    name='midi_only_percepiano',
)

# Trainer: single GPU, settings taken from CONFIG
trainer = pl.Trainer(
    max_epochs=CONFIG['max_epochs'],
    accelerator='gpu',
    devices=1,
    precision=CONFIG['precision'],
    gradient_clip_val=CONFIG['gradient_clip_val'],
    callbacks=[checkpoint_callback, early_stopping, lr_monitor],
    logger=logger,
    log_every_n_steps=10,
    deterministic=True,
)

print("Trainer configured!")
print(f"  Precision: {CONFIG['precision']}")
print(f"  Max epochs: {CONFIG['max_epochs']}")
print(f"  Early stopping patience: {CONFIG['early_stopping_patience']}")

## Step 9: Train!

In [None]:
# Set seed for reproducibility
# NOTE(review): when the cells are run top to bottom, this seed is set after
# `model` and the data loaders have already been created in earlier cells, so
# it presumably does not cover weight initialisation - consider seeding before
# the model is built if fully reproducible runs are needed.
pl.seed_everything(42, workers=True)

# Train
# Fits `model` on train_loader and validates on val_loader, using the trainer
# (callbacks, logger, precision) configured in the previous cell.
print("Starting training...")
trainer.fit(model, train_loader, val_loader)

In [None]:
# Sync checkpoints to Google Drive
import subprocess

if RCLONE_AVAILABLE:
    print("Syncing checkpoints to Google Drive...")
    sync = subprocess.run(
        ['rclone', 'copy', CONFIG['checkpoint_dir'], CONFIG['gdrive_checkpoint'], '--progress'],
        capture_output=False
    )
    # Only claim success when rclone actually succeeded; a failed upload on an
    # ephemeral machine means the checkpoints exist nowhere else.
    if sync.returncode == 0:
        print("Sync complete!")
    else:
        print(f"ERROR: sync failed (rclone exit code {sync.returncode}); "
              "checkpoints are NOT backed up")
else:
    print("Skipping sync: rclone 'gdrive' remote is not configured")

## Step 10: Evaluate on Test Set

In [None]:
# Evaluate the test split using the checkpoint the callback ranked best.
print("\nRunning test with best checkpoint...")
best_path = checkpoint_callback.best_model_path
print(f"Best checkpoint: {best_path}")

# best_path is falsy when no checkpoint was recorded; nothing is run then.
if best_path:
    test_results = trainer.test(model, test_loader, ckpt_path=best_path)
    print("\nTest Results:")
    # trainer.test returns a list; the first entry holds this loader's metrics.
    first_loader_metrics = test_results[0]
    for metric_name, metric_value in first_loader_metrics.items():
        print(f"  {metric_name}: {metric_value:.4f}")

## Step 11: Analyze Results

In [None]:
import torch
import numpy as np
from scipy import stats

# Restore the best checkpoint and switch it to inference mode on the GPU.
best_model = MIDIOnlyModule.load_from_checkpoint(checkpoint_callback.best_model_path)
best_model.eval()
best_model.cuda()

# Run the whole test set through the model, gathering outputs on the CPU.
all_preds, all_targets = [], []

with torch.no_grad():
    for batch in test_loader:
        # Tensors go to the GPU; any non-tensor entries are passed through untouched.
        batch = {
            name: (value.cuda() if isinstance(value, torch.Tensor) else value)
            for name, value in batch.items()
        }
        outputs = best_model(batch['midi_tokens'], batch['attention_mask'])
        all_preds.append(outputs['predictions'].cpu())
        all_targets.append(batch['scores'].cpu())

all_preds = torch.cat(all_preds).numpy()
all_targets = torch.cat(all_targets).numpy()

heavy_rule = "="*60
light_rule = "-"*60

print("Per-Dimension Results:")
print(heavy_rule)
print(f"{'Dimension':<20} {'Pearson r':<12} {'R^2':<12} {'MAE':<12}")
print(light_rule)

dimensions = best_model.dimensions
results = {}

# Column i of predictions/targets belongs to dimensions[i].
for col, dim in enumerate(dimensions):
    preds = all_preds[:, col]
    targets = all_targets[:, col]

    # Pearson correlation (the p-value is not reported)
    r, _p_value = stats.pearsonr(preds, targets)

    # R-squared = 1 - SS_res / SS_tot; defined as 0 when the targets have no variance
    ss_res = np.sum((targets - preds) ** 2)
    ss_tot = np.sum((targets - np.mean(targets)) ** 2)
    if ss_tot > 0:
        r2 = 1 - ss_res / ss_tot
    else:
        r2 = 0

    # Mean absolute error
    mae = np.mean(np.abs(preds - targets))

    results[dim] = {'r': r, 'r2': r2, 'mae': mae}
    print(f"{dim:<20} {r:<12.4f} {r2:<12.4f} {mae:<12.4f}")

# Unweighted averages over dimensions
per_dim = list(results.values())
mean_r = np.mean([entry['r'] for entry in per_dim])
mean_r2 = np.mean([entry['r2'] for entry in per_dim])
mean_mae = np.mean([entry['mae'] for entry in per_dim])

print(light_rule)
print(f"{'MEAN':<20} {mean_r:<12.4f} {mean_r2:<12.4f} {mean_mae:<12.4f}")
print(heavy_rule)

# Put the result next to the published PercePiano numbers
print("\nComparison to PercePiano Baselines:")
print("  Bi-LSTM baseline: R^2 = 0.185")
print("  MidiBERT:         R^2 = 0.313")
print("  Best (HAN):       R^2 = 0.397")
print(f"  Our model:        R^2 = {mean_r2:.3f}")

# Thresholds mirror the target (0.185) and stretch goal (0.30) stated at the top.
if mean_r2 >= 0.185:
    print("\n  SUCCESS: Meets baseline target!")
if mean_r2 >= 0.30:
    print("  SUCCESS: Meets stretch goal!")

## Step 12: Save Final Model and Sync

In [None]:
import subprocess
import torch
from pathlib import Path

# Save final model for inference.
# The artifact is kept to tensors and plain Python values so it can be loaded
# on other machines without surprises:
#   * weights are moved to CPU, so loading does not require a CUDA device;
#   * metric values are converted from numpy scalars to Python floats, so
#     unpickling does not depend on numpy objects (relevant for
#     torch.load(..., weights_only=True)).
# NOTE(review): 'hparams' and 'dimensions' are stored as-is - confirm they
# contain only plain Python values.
final_path = Path(CONFIG['checkpoint_dir']) / 'midi_scorer_final.pt'
torch.save({
    'state_dict': {
        name: tensor.detach().cpu()
        for name, tensor in best_model.state_dict().items()
    },
    'hparams': dict(best_model.hparams),
    'dimensions': best_model.dimensions,
    'results': {
        dim: {metric: float(value) for metric, value in per_dim.items()}
        for dim, per_dim in results.items()
    },
    'mean_r2': float(mean_r2),
    'mean_r': float(mean_r),
}, final_path)
print(f"Saved final model to {final_path}")

# Final sync to Google Drive
if RCLONE_AVAILABLE:
    print("\nFinal sync to Google Drive...")
    sync = subprocess.run(
        ['rclone', 'copy', CONFIG['checkpoint_dir'], CONFIG['gdrive_checkpoint'], '--progress'],
        capture_output=False
    )
    # Report success only when rclone exited cleanly.
    if sync.returncode == 0:
        print("Sync complete!")
        print(f"Checkpoints available at: {CONFIG['gdrive_checkpoint']}")
    else:
        print(f"ERROR: sync failed (rclone exit code {sync.returncode}); "
              f"the final model exists only at {final_path}")
else:
    print("\nSkipping sync: rclone 'gdrive' remote is not configured")