# PercePiano Replica Training - Clean Run

Train the PercePiano HAN (Hierarchical Attention Network) model directly, without debugging infrastructure.

Goal: establish a clean baseline using our implementation.

## Attribution

> **PercePiano: Piano Performance Evaluation Dataset with Multi-level Perceptual Features**  
> Park, Kim et al.  
> Nature Scientific Reports 2024  
> Paper: https://pmc.ncbi.nlm.nih.gov/articles/PMC11450231/  
> GitHub: https://github.com/JonghoKimSNU/PercePiano

## Target: R2 = 0.397 (Paper SOTA)

## Step 1: Environment Setup

In [None]:
# Report whether a CUDA device is visible and, if so, its name and total memory
import torch

cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")
    # total_memory is divided by 1e9 so the figure is shown in GB
    gpu_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"Memory: {gpu_mem_gb:.1f} GB")

In [None]:
# Install rclone
!curl -fsSL https://rclone.org/install.sh | sudo bash 2>&1 | grep -E "(successfully|already)" || echo "rclone installed"

In [None]:
# Install uv and clone repository
!curl -LsSf https://astral.sh/uv/install.sh | sh

import os
os.environ['PATH'] = f"{os.environ['HOME']}/.cargo/bin:{os.environ['PATH']}"

# Clone repository
if not os.path.exists('/tmp/crescendai'):
    !git clone https://github.com/Jai-Dhiman/crescendai.git /tmp/crescendai

%cd /tmp/crescendai/model
!git pull
!git log -1 --oneline

# Clone original PercePiano for comparison (needed for data diagnostics)
PERCEPIANO_PATH = '/tmp/crescendai/model/data/raw/PercePiano'
if not os.path.exists(PERCEPIANO_PATH):
    print("\nCloning original PercePiano repository...")
    !git clone https://github.com/JonghoKimSNU/PercePiano.git {PERCEPIANO_PATH}
else:
    print(f"\nPercePiano already present at {PERCEPIANO_PATH}")

# Install dependencies
!uv pip install --system -e .
!pip install tensorboard rich

import torch
import pytorch_lightning as pl
print(f"\nPyTorch: {torch.__version__}")
print(f"Lightning: {pl.__version__}")

## Step 2: Configure Paths and Check rclone

In [None]:
import os
import subprocess
import shutil
from pathlib import Path

# Local working directories and their Google Drive (rclone) counterparts
DATA_ROOT = Path('/tmp/percepiano_vnet_84dim')
CHECKPOINT_ROOT = Path('/tmp/checkpoints/percepiano_kfold')
LOG_ROOT = Path('/tmp/logs/percepiano_kfold')
GDRIVE_DATA_PATH = 'gdrive:crescendai_data/percepiano_vnet_84dim'
GDRIVE_CHECKPOINT_PATH = 'gdrive:crescendai_checkpoints/percepiano_kfold'

# Training control
RESTART_TRAINING = True  # When True, existing checkpoints and logs are deleted below

print("="*60)
print("PERCEPIANO REPLICA TRAINING (4-FOLD CV)")
print("="*60)

# Clear checkpoints if restarting
if RESTART_TRAINING and CHECKPOINT_ROOT.exists():
    print(f"\nRESTART_TRAINING=True: Clearing checkpoints at {CHECKPOINT_ROOT}")
    shutil.rmtree(CHECKPOINT_ROOT)
    print("  Checkpoints cleared!")

if RESTART_TRAINING and LOG_ROOT.exists():
    print(f"RESTART_TRAINING=True: Clearing logs at {LOG_ROOT}")
    shutil.rmtree(LOG_ROOT)
    print("  Logs cleared!")

# Create directories
CHECKPOINT_ROOT.mkdir(parents=True, exist_ok=True)
LOG_ROOT.mkdir(parents=True, exist_ok=True)
DATA_ROOT.mkdir(parents=True, exist_ok=True)

# Check rclone. A missing executable is treated as "not configured" rather
# than crashing the cell, so RCLONE_AVAILABLE is always defined afterwards.
try:
    result = subprocess.run(['rclone', 'listremotes'], capture_output=True, text=True)
    configured_remotes = result.stdout.split()
except FileNotFoundError:
    configured_remotes = []
    print("\nrclone executable not found on PATH")

# Exact match on the remote name: a substring test would also accept
# remotes such as 'my-gdrive:'
RCLONE_AVAILABLE = 'gdrive:' in configured_remotes

if RCLONE_AVAILABLE:
    print("\nrclone 'gdrive' remote: CONFIGURED")
else:
    print("\nrclone 'gdrive' remote: NOT CONFIGURED")
    print("Run 'rclone config' in terminal to set up Google Drive")

print(f"\nData directory: {DATA_ROOT}")
print(f"Checkpoint directory: {CHECKPOINT_ROOT}")
print(f"Log directory: {LOG_ROOT}")
print(f"\nRESTART_TRAINING: {RESTART_TRAINING}")

## Step 3: Download Data from Google Drive

In [None]:
import subprocess

if not RCLONE_AVAILABLE:
    raise RuntimeError("rclone not configured. Run 'rclone config' first.")

# Download preprocessed data
print("Downloading preprocessed VirtuosoNet features from Google Drive...")
# check=True: a failed transfer raises CalledProcessError here instead of
# letting the verification below run against missing or partial data
subprocess.run(
    ['rclone', 'copy', GDRIVE_DATA_PATH, str(DATA_ROOT), '--progress'],
    check=True,
)

# Verify data: count the pickled samples found in each split directory
print("\n" + "="*60)
print("DATA VERIFICATION")
print("="*60)

total_samples = 0
for split in ['train', 'val', 'test']:
    split_dir = DATA_ROOT / split
    if split_dir.exists():
        count = len(list(split_dir.glob('*.pkl')))
        total_samples += count
        print(f"  {split}: {count} samples")
    else:
        print(f"  {split}: MISSING!")

print(f"  Total: {total_samples} samples")

stat_file = DATA_ROOT / 'stat.pkl'
print(f"  stat.pkl: {'present' if stat_file.exists() else 'MISSING!'}")

# A missing fold file is not an error at this point; it is only reported
fold_file = DATA_ROOT / 'fold_assignments.json'
print(f"  fold_assignments.json: {'present' if fold_file.exists() else 'will be created'}")

## Step 4: Create Fold Assignments

In [None]:
from src.percepiano.data.kfold_split import (
    create_piece_based_folds,
    save_fold_assignments,
    load_fold_assignments,
    print_fold_statistics,
)

# Split parameters; the messages printed below are derived from these values
FOLD_FILE = DATA_ROOT / 'fold_assignments.json'
N_FOLDS = 4
TEST_RATIO = 0.15
SEED = 42

print("="*60)
print("FOLD ASSIGNMENT CREATION")
print("="*60)

# Force regeneration to use corrected methodology
# - Test set: select pieces until ~TEST_RATIO of SAMPLES (PercePiano methodology)
# - CV folds: greedy bin-packing for balanced sample counts (improvement over round-robin)
FORCE_REGENERATE = True

if FOLD_FILE.exists() and not FORCE_REGENERATE:
    print(f"\nLoading existing fold assignments from {FOLD_FILE}")
    fold_assignments = load_fold_assignments(FOLD_FILE)
else:
    if FOLD_FILE.exists():
        print("\nRemoving old fold assignments (regenerating with balanced methodology)...")
        FOLD_FILE.unlink()

    print(f"\nCreating new {N_FOLDS}-fold piece-based splits...")
    # Percentage is formatted from TEST_RATIO so the message cannot drift from the setting
    print(f"  Test set: select pieces until ~{TEST_RATIO:.0%} of SAMPLES")
    print("  CV folds: greedy bin-packing for balanced sample counts")
    fold_assignments = create_piece_based_folds(
        data_dir=DATA_ROOT,
        n_folds=N_FOLDS,
        test_ratio=TEST_RATIO,
        seed=SEED,
    )
    save_fold_assignments(fold_assignments, FOLD_FILE)

# Print statistics
print_fold_statistics(fold_assignments, n_folds=N_FOLDS)

## Step 5: Training Configuration

In [None]:
import os

import torch
torch.set_float32_matmul_precision('medium')

# CUDA_LAUNCH_BLOCKING=1 forces synchronous kernel launches. It is a debugging
# aid for locating device-side errors and slows training, so this clean run
# leaves it off unless explicitly requested here.
DEBUG_CUDA_SYNC = False
if DEBUG_CUDA_SYNC:
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Import only what we need
from src.percepiano.training.kfold_trainer import KFoldTrainer, MODEL_TYPE_HAN

# Clean configuration - matches paper exactly
CONFIG = {
    # K-Fold settings
    'n_folds': N_FOLDS,
    'test_ratio': TEST_RATIO,
    # Data
    'data_dir': str(DATA_ROOT),
    'checkpoint_dir': str(CHECKPOINT_ROOT),
    'log_dir': str(LOG_ROOT),
    # Model architecture (matches paper)
    'input_size': 79,
    'hidden_size': 256,
    'note_layers': 2,
    'voice_layers': 2,
    'beat_layers': 2,
    'measure_layers': 1,
    'num_attention_heads': 8,
    # Training (matches paper)
    'learning_rate': 2.5e-5,
    'weight_decay': 1e-5,
    'dropout': 0.2,
    'batch_size': 8,
    'max_epochs': 200,
    'early_stopping_patience': 40,  # Paper uses more patience
    'gradient_clip_val': 2.0,
    'precision': '32',
    'max_notes': 5000,
    'slice_len': 5000,
    'num_workers': 4,
    'augment_train': False,
    # Disable diagnostics for cleaner output
    'enable_diagnostics': False,
}

# Echo the key settings so the run log records what was actually used
print("="*60)
print("CLEAN HAN TRAINING - MATCHING PAPER CONFIG")
print("="*60)
print("\nTarget: R2 = 0.397 (Paper SOTA)")
print("\nArchitecture:")
print(f"  hidden_size: {CONFIG['hidden_size']}")
print(f"  layers: note={CONFIG['note_layers']}, voice={CONFIG['voice_layers']}, beat={CONFIG['beat_layers']}, measure={CONFIG['measure_layers']}")
print(f"  attention_heads: {CONFIG['num_attention_heads']}")
print("\nTraining:")
print(f"  lr: {CONFIG['learning_rate']}, batch_size: {CONFIG['batch_size']}")
print(f"  max_epochs: {CONFIG['max_epochs']}, patience: {CONFIG['early_stopping_patience']}")
print(f"  diagnostics: {CONFIG['enable_diagnostics']}")
print("="*60)
print("="*60)

## Step 6: Initialize HAN Trainer

Train the full Hierarchical Attention Network (HAN) model directly.

In [None]:
import pytorch_lightning as pl

# Set seed for reproducibility. Reuses the notebook-wide SEED (the value also
# passed to the fold split) so the run is controlled by a single seed setting.
pl.seed_everything(SEED, workers=True)

# Train on Fold 2 (longest pieces, best for hierarchy)
FOLD_ID = 2

print("="*60)
print("HAN TRAINER INITIALIZATION")
print("="*60)
print(f"\nTraining Fold: {FOLD_ID} (longest pieces)")

# Initialize trainer; checkpoints and logs go into "han" subdirectories
han_trainer = KFoldTrainer(
    config=CONFIG,
    fold_assignments=fold_assignments,
    data_dir=DATA_ROOT,
    checkpoint_dir=CHECKPOINT_ROOT / "han",
    log_dir=LOG_ROOT / "han",
    n_folds=N_FOLDS,
    model_type=MODEL_TYPE_HAN,
)

print(f"\nCheckpoint dir: {han_trainer.checkpoint_dir}")
print("="*60)

In [None]:
# Ready to train - no debug mode needed for clean run

## Step 7: Train HAN Model

Train the full HAN model. Target: R2 = 0.397 (Paper SOTA)

In [None]:
"""
HAN training run (single fold)

Fits the full Hierarchical Attention Network on the selected fold, saves the
trainer results, and prints the validation metrics.
Target: R2 = 0.397 (Paper SOTA)
"""

_rule = "=" * 70

# Header banner
for _header_line in (
    _rule,
    "HAN MODEL TRAINING",
    _rule,
    f"\nFold: {FOLD_ID}",
    "Target: R2 = 0.397 (Paper SOTA)",
    _rule,
):
    print(_header_line)

# Run training for the chosen fold from scratch (no checkpoint resume)
han_metrics = han_trainer.train_fold(
    fold_id=FOLD_ID,
    verbose=True,
    resume_from_checkpoint=False,
)
han_trainer.save_results()

# Keep the model and metrics around for the analysis cells
trained_model = han_trainer.get_trained_model(FOLD_ID)
trained_metrics = han_metrics

# Results summary
print("\n" + _rule)
print("TRAINING COMPLETE")
print(_rule)
print(
    f"\n  Val R2: {han_metrics.val_r2:+.4f}",
    f"  Val Loss: {han_metrics.val_loss:.6f}",
    f"  Best Epoch: {han_metrics.best_epoch}",
    f"  Epochs Trained: {han_metrics.epochs_trained}",
    "\n  Target: R2 = 0.397 (Paper SOTA)",
    sep="\n",
)

# Verdict table, highest threshold first; the first floor reached wins and
# anything below the last floor falls through to the [ISSUE] message
_verdict_floors = (
    (0.35, "  [SUCCESS] Approaching SOTA!"),
    (0.30, "  [GOOD] Strong performance"),
    (0.20, "  [PARTIAL] Reasonable but below target"),
)
_verdict = next(
    (_text for _floor, _text in _verdict_floors if han_metrics.val_r2 >= _floor),
    "  [ISSUE] Below expected - needs investigation",
)
print(_verdict)

print(_rule)

## Step 8: Per-Dimension Analysis

In [None]:
"""
Per-dimension R2 analysis

Prints the R2 of every PercePiano dimension, best first, followed by counts
of positive and strong (>= 0.2) dimensions.
"""

from src.percepiano.models.percepiano_replica import PERCEPIANO_DIMENSIONS

print("="*60)
print("PER-DIMENSION R2 ANALYSIS")
print("="*60)

if hasattr(trained_metrics, 'per_dim_r2') and trained_metrics.per_dim_r2:
    print(f"\n  {'Dimension':<25} {'R2':>10}")
    print(f"  {'-'*25} {'-'*10}")

    # Sort by R2 (best first); dimensions absent from the metrics count as 0
    dim_data = [(dim, trained_metrics.per_dim_r2.get(dim, 0)) for dim in PERCEPIANO_DIMENSIONS]
    dim_data.sort(key=lambda x: x[1], reverse=True)

    for dim, r2 in dim_data:
        status = "[OK]" if r2 >= 0.2 else "[LOW]" if r2 >= 0 else "[NEG]"
        print(f"  {dim:<25} {r2:>+10.4f} {status}")

    # Summary; the denominator comes from the dimension list itself so it
    # stays correct if that list changes
    n_dims = len(dim_data)
    positive = sum(1 for _, r2 in dim_data if r2 > 0)
    strong = sum(1 for _, r2 in dim_data if r2 >= 0.2)
    print(f"\n  Positive R2: {positive}/{n_dims}")
    print(f"  Strong R2 (>=0.2): {strong}/{n_dims}")
else:
    print("\n  [Per-dimension R2 not available]")

print("="*60)

## Step 9: Sync to Google Drive

In [None]:
"""
Sync checkpoints to Google Drive

Copies the HAN checkpoint directory and the fold-assignment file to the
configured rclone remote and reports whether every transfer succeeded.
"""

import subprocess

print("="*60)
print("SYNC CHECKPOINTS TO GOOGLE DRIVE")
print("="*60)

if RCLONE_AVAILABLE:
    # Transfers whose rclone exit code was non-zero; used for the final banner
    sync_failures = []

    ckpt_dir = han_trainer.checkpoint_dir
    if ckpt_dir.exists():
        gdrive_path = f"{GDRIVE_CHECKPOINT_PATH}/{ckpt_dir.name}"
        print("\nSyncing HAN checkpoints...")
        completed = subprocess.run(
            ['rclone', 'copy', str(ckpt_dir), gdrive_path, '--progress']
        )
        if completed.returncode != 0:
            sync_failures.append(f"checkpoints (rclone exit code {completed.returncode})")
    else:
        print(f"\nNo checkpoint directory at {ckpt_dir} - nothing to sync")

    # Sync fold assignments
    print("\nSyncing fold assignments...")
    completed = subprocess.run(
        ['rclone', 'copy', str(FOLD_FILE), GDRIVE_DATA_PATH, '--progress']
    )
    if completed.returncode != 0:
        sync_failures.append(f"fold assignments (rclone exit code {completed.returncode})")

    # Only claim success when every rclone invocation exited cleanly
    print("\n" + "="*60)
    if sync_failures:
        print("SYNC FAILED: " + ", ".join(sync_failures))
    else:
        print("SYNC COMPLETE")
    print("="*60)
else:
    print("\nrclone not available - checkpoints saved locally")
    print(f"Location: {han_trainer.checkpoint_dir}")