# Piano Performance Evaluation - 3-Way Model Comparison (Thunder Compute)

Trains three models to test whether multi-modal fusion outperforms either single modality:
1. Audio-Only (MERT only)
2. MIDI-Only (MIDIBert only)
3. Fusion (MERT + MIDIBert)

**Dimensions**: 6 technical (note_accuracy, rhythmic_precision, tone_quality, dynamics_control, articulation, pedaling)
**Sample size**: 114,246 training samples (full dataset)
**Expected time**: 3-4 hours training + 10-15 min setup
**Goal**: Identify which dimensions are learnable to guide expert annotation strategy

In [None]:
import torch
# Print True/False depending on whether PyTorch can use CUDA in this runtime.
print(torch.cuda.is_available())

In [None]:
%pip install -q huggingface_hub

import os
os.environ.pop("HF_TOKEN", None)
os.environ.pop("HUGGINGFACEHUB_API_TOKEN", None)

from huggingface_hub import login, HfApi

try:
    import getpass as gp
    raw = gp.getpass("Paste your Hugging Face token (input hidden): ")
    token = raw.decode() if isinstance(raw, (bytes, bytearray)) else raw
    if not isinstance(token, str):
        raise TypeError(f"Unexpected token type: {type(token).__name__}")
    token = token.strip()
    if not token:
        raise ValueError("Empty token provided")
    login(token=token, add_to_git_credential=False)
    who = HfApi().whoami(token=token)
    print(f"✓ Logged in as: {who.get('name') or who.get('email') or 'OK'}")
except Exception as e:
    print(f"[HF Login] getpass flow failed: {e}")
    print("Falling back to interactive login widget...")
    login()
    try:
        who = HfApi().whoami()
        print(f"✓ Logged in as: {who.get('name') or who.get('email') or 'OK'}")
    except Exception as e2:
        print(f"[HF Login] Verification skipped: {e2}")

In [None]:
import os

print("Installing rclone if needed...")
!curl -fsSL https://rclone.org/install.sh | sudo bash 2>&1 | grep -E "(successfully|already)" || echo "rclone installation status unknown"

print("\n" + "="*70)
print("COPYING CHECKPOINTS FROM GOOGLE DRIVE")
print("="*70)

# Local checkpoint directory
CHECKPOINT_ROOT = '/tmp/crescendai_checkpoints'
os.makedirs(CHECKPOINT_ROOT, exist_ok=True)

# Check if rclone is configured
import subprocess
result = subprocess.run(['rclone', 'listremotes'], capture_output=True, text=True)
if 'gdrive:' not in result.stdout:
    print("\n⚠️  rclone not configured!")
    print("Run 'rclone config' in terminal to set up 'gdrive' remote")
    print("Follow the OAuth flow for remote server authentication")
    raise RuntimeError("rclone gdrive remote not configured")

print("\nCopying checkpoints from Google Drive...")
print("This may take a few minutes depending on checkpoint size...\n")

# Copy each model's checkpoints
for mode in ['audio_full', 'midi_full', 'fusion_full']:
    print(f"Copying {mode}...")
    !rclone copy gdrive:crescendai_checkpoints/{mode} {CHECKPOINT_ROOT}/{mode} -P --transfers 4

print("\n" + "="*70)
print(f"✓ Checkpoints copied to: {CHECKPOINT_ROOT}")
print("="*70)

# List what was copied
print("\nCheckpoint contents:")
!ls -lh {CHECKPOINT_ROOT}/*/*.ckpt 2>/dev/null || echo "No .ckpt files found"

In [None]:
# Remove any previous checkout, then clone the repository afresh.
!rm -rf /tmp/crescendai
!git clone https://github.com/Jai-Dhiman/crescendai.git /tmp/crescendai
# Switch the notebook's working directory to the repo's model/ folder;
# later cells use paths relative to it (scripts/, configs/, train.py).
%cd /tmp/crescendai/model
# Show the checked-out commit on one line.
!git log -1 --oneline

In [None]:
!curl -LsSf https://astral.sh/uv/install.sh | sh

# Add to PATH for this session
import os
os.environ['PATH'] = f"{os.environ['HOME']}/.cargo/bin:{os.environ['PATH']}"

print("\n✓ uv installed")

In [None]:
# Install the project in the current directory in editable mode into the
# system Python environment.
!uv pip install --system -e .

import torch
import pytorch_lightning as pl
# Print the versions that are importable after the install.
print(f"\nPyTorch: {torch.__version__}")
print(f"Lightning: {pl.__version__}")
print("✓ Dependencies installed")

# Run the repository's environment setup script.
!python scripts/setup_colab_environment.py

In [None]:
import os
import shutil

from huggingface_hub import snapshot_download

MERT_REPO_ID = "m-a-p/MERT-v1-95M"
# Default Hub cache folder for this repo. Kept for reference; the snapshot path
# actually used is the one returned by snapshot_download() below.
MERT_CACHE_DIR = os.path.expanduser("~/.cache/huggingface/hub/models--m-a-p--MERT-v1-95M")

print("="*70)
print("DOWNLOADING MERT-95M MODEL")
print("="*70)

print("\nDownloading MERT-95M (~380MB) if not already cached...")
print("This may take 2-5 minutes...\n")

# snapshot_download() writes the Hub cache layout that
# from_pretrained(..., local_files_only=True) resolves. A plain `git clone`
# into the cache folder does not create that layout, so the offline load below
# could not find the model. Files already in the cache are not fetched again.
mert_snapshot_path = snapshot_download(repo_id=MERT_REPO_ID)
print(f"\n✓ MERT-95M cached at: {mert_snapshot_path}")

# Verify the model can be loaded from the local cache only
print("\nVerifying model...")
from transformers import AutoModel
model = AutoModel.from_pretrained(MERT_REPO_ID, trust_remote_code=True, local_files_only=True)
print(f"✓ Model loaded successfully: {model.config.model_type}")

# Drop the verification copy so it does not hold memory during training.
del model
import torch
torch.cuda.empty_cache()

print("\n" + "="*70)
print("✓ MERT-95M READY")
print("="*70)

In [None]:
print("="*70)
print("STEP 2: DOWNLOAD DATA FROM HUGGING FACE HUB")
print("="*70)
print("\nThis is 10-100x faster and more reliable than Google Drive!")
print("Download time: 7-15 minutes (one-time per session)\n")

from huggingface_hub import hf_hub_download
import tarfile
import os

HF_REPO_ID = "Jai-D/crescendai-data"
DATA_ROOT = '/tmp/crescendai_data/'

print(f"Downloading from: {HF_REPO_ID}")
print("Archive size: ~20-25 GB compressed\n")

# Download archive
print("1. Downloading archive...")
archive_path = hf_hub_download(
    repo_id=HF_REPO_ID,
    filename="crescendai_data.tar.gz",
    repo_type="model",
    local_dir="/tmp/",
    local_dir_use_symlinks=False,
)
print(f"   ✓ Downloaded to: {archive_path}")

# Extract
print("\n2. Extracting archive...")

# Use the 'data' extraction filter where this Python provides it. It refuses
# members that would be written outside DATA_ROOT and raises if the archive
# contains such entries.
extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

# Members are counted while they stream through extractall(). Calling
# getmembers() first would scan the whole gzip stream once just to list it and
# then decompress it a second time to extract.
extract_progress = {'members': 0}


def _count_members(archive):
    """Yield the members of `archive` in order, counting each one."""
    for member in archive:
        extract_progress['members'] += 1
        yield member


with tarfile.open(archive_path, 'r:gz') as tar:
    tar.extractall(DATA_ROOT, members=_count_members(tar), **extract_kwargs)

print(f"   Extracted {extract_progress['members']:,} archive members")
print("   ✓ Extracted to: /tmp/crescendai_data/")

# Clean up archive to save space
print("\n3. Cleaning up...")
os.remove(archive_path)
print("   ✓ Removed archive file")

# Verify structure
print("\n4. Verifying data structure...")
expected_paths = [
    os.path.join(DATA_ROOT, 'data', 'all_segments'),
    os.path.join(DATA_ROOT, 'data', 'annotations'),
]

all_good = True
for path in expected_paths:
    if os.path.exists(path):
        if 'all_segments' in path:
            # Audio folder: count .wav files
            num_files = len([f for f in os.listdir(path) if f.endswith('.wav')])
            print(f"   ✓ {path}: {num_files:,} audio files")
        else:
            # Annotation folder: count .jsonl files
            num_files = len([f for f in os.listdir(path) if f.endswith('.jsonl')])
            print(f"   ✓ {path}: {num_files} annotation files")
    else:
        print(f"   ✗ {path}: NOT FOUND")
        all_good = False

if all_good:
    print("\n" + "="*70)
    print("✓ DATA DOWNLOAD COMPLETE")
    print("="*70)
    print("\nData ready at: /tmp/crescendai_data/")
    print("Training will be 10-30x faster than reading from Drive!")
else:
    print("\n✗ Data structure verification failed!")
    print("   Check that your archive has the correct structure")
    raise RuntimeError("Data download verification failed")

In [None]:
# Banner for this step.
print("="*70)
print("STEP 2.5: FIX ANNOTATION PATHS")
print("="*70)
print("\nUpdating annotation files to use local SSD paths...\n")

# Run the repository script (relative to the current working directory).
# NOTE(review): the success line below prints regardless of the script's exit
# status — confirm the script output above it before continuing.
!python scripts/fix_annotation_paths.py

print("\n✓ Annotation paths updated for local SSD access")

In [None]:
print("Verifying data paths...")

import json
from pathlib import Path

# Spot check: parse only the first record of the training annotations.
with open('/tmp/crescendai_data/data/annotations/synthetic_train.jsonl') as f:
    first_record = f.readline()
    sample = json.loads(first_record)

print("\nSample annotation:")
audio_path = sample['audio_path']
print(f"  Audio: {audio_path}")
midi_path = sample['midi_path']
print(f"  MIDI:  {midi_path}")

# An empty or None MIDI path counts as "does not exist" without touching the disk.
audio_exists = Path(audio_path).exists()
midi_exists = bool(midi_path) and Path(midi_path).exists()

audio_mark = '✓' if audio_exists else '✗'
midi_mark = '✓' if midi_exists else '✗ (may be OK if path is None)'
print("\nFile existence check:")
print(f"  Audio exists: {audio_mark}")
print(f"  MIDI exists:  {midi_mark}")

# A missing MIDI file is only a problem when the annotation names one.
if audio_exists and (midi_exists or not midi_path):
    print("\n✓ Sample files verified - data structure looks correct!")
elif not audio_exists:
    print("\n⚠️  WARNING: Audio file not found!")
    print("     Check that data extraction completed correctly")
    print(f"     Expected: {audio_path}")
else:
    print("\n⚠️  WARNING: MIDI file not found!")
    print(f"     Expected: {midi_path}")

# Preflight check: print the step banner, then run the repository's preflight
# script against the full-experiment config (paths are relative to the current
# working directory).
print("="*70)
print("STEP 3: PREFLIGHT CHECK")
print("="*70)
print("\nVerifying training environment and data...\n")

!python scripts/preflight_check.py --config configs/experiment_full.yaml

## Experiment 1: Audio-Only

Training with audio features only (MERT-95M encoder)

In [None]:
import warnings
warnings.filterwarnings('ignore', message='divide by zero')
warnings.filterwarnings('ignore', category=SyntaxWarning)  # pydub regex warnings
warnings.filterwarnings('ignore', category=UserWarning, module='torchaudio')
warnings.filterwarnings('ignore', category=UserWarning, module='torchmetrics')

%%time
!python train.py --config configs/experiment_full.yaml --mode audio

## Experiment 2: MIDI-Only

Training with MIDI features only (MIDIBert encoder)

In [None]:
import warnings
warnings.filterwarnings('ignore', message='divide by zero')
warnings.filterwarnings('ignore', category=SyntaxWarning)  # pydub regex warnings
warnings.filterwarnings('ignore', category=UserWarning, module='torchaudio')
warnings.filterwarnings('ignore', category=UserWarning, module='torchmetrics')

%%time
!python train.py --config configs/experiment_full.yaml --mode midi

## Experiment 3: Fusion

Training with both audio and MIDI features (multi-modal fusion)

In [None]:
import warnings
warnings.filterwarnings('ignore', message='divide by zero')
warnings.filterwarnings('ignore', category=SyntaxWarning)  # pydub regex warnings
warnings.filterwarnings('ignore', category=UserWarning, module='torchaudio')
warnings.filterwarnings('ignore', category=UserWarning, module='torchmetrics')

%%time
!python train.py --config configs/experiment_full.yaml --mode fusion

## Compare Results

Load all three trained models and compare their performance on the test set.

In [None]:
import pytorch_lightning as pl
from src.models.lightning_module import PerformanceEvaluationModel
from src.data.dataset import create_dataloaders
from pathlib import Path
import numpy as np
import torch
from scipy import stats

print("="*80)
print("COMPREHENSIVE 3-WAY MODEL EVALUATION")
print("="*80)

# Load all 3 models (use CHECKPOINT_ROOT from earlier cell)
models = {}
for mode in ['audio', 'midi', 'fusion']:
    ckpt_dir = Path(f'{CHECKPOINT_ROOT}/{mode}_full')
    ckpts = list(ckpt_dir.glob('*.ckpt'))
    if ckpts:
        # Choose the most recently modified checkpoint. Sorting by file name is
        # lexicographic and would, for example, rank "epoch=9" after "epoch=10".
        # The name is a tie-breaker so the choice is deterministic.
        latest = max(ckpts, key=lambda ckpt: (ckpt.stat().st_mtime, ckpt.name))
        print(f"Loading {mode}: {latest.name}")
        models[mode] = PerformanceEvaluationModel.load_from_checkpoint(str(latest))
        models[mode].eval()
        models[mode] = models[mode].cuda()
    else:
        # A missing checkpoint is reported here; the mode is left out of `models`.
        print(f"⚠️  No checkpoint found for {mode}")

print(f"\nLoaded {len(models)}/3 models")

# Build the data loaders from the local annotation files; only the third
# returned loader (test) is kept.
annotation_dir = '/tmp/crescendai_data/data/annotations'
loader_kwargs = dict(
    train_annotation_path=f'{annotation_dir}/synthetic_train.jsonl',
    val_annotation_path=f'{annotation_dir}/synthetic_val.jsonl',
    test_annotation_path=f'{annotation_dir}/synthetic_test.jsonl',
    dimension_names=[
        'note_accuracy',
        'rhythmic_precision',
        'tone_quality',
        'dynamics_control',
        'articulation',
        'pedaling',
    ],
    batch_size=8,
    num_workers=4,
    augmentation_config=None,
    audio_sample_rate=24000,
    max_audio_length=240000,
    max_midi_events=512,
)
_, _, test_loader = create_dataloaders(**loader_kwargs)

num_test_samples = len(test_loader.dataset)
print(f"Test set size: {num_test_samples} samples")

# Evaluate each model: first through the Lightning test loop (logged metrics),
# then with a manual pass that keeps raw predictions for the analyses below.
trainer = pl.Trainer(accelerator='auto', devices='auto', precision=16)
results = {}
predictions = {}

# Device for the manual pass; falls back to CPU when CUDA is unavailable.
eval_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for mode, model in models.items():
    print(f"\nEvaluating {mode}...")
    test_results = trainer.test(model, dataloaders=test_loader, verbose=False)
    results[mode] = test_results[0]

    # Lightning's teardown after trainer.test() can leave the module on the CPU,
    # so put it back on the evaluation device before feeding it batches.
    model = model.to(eval_device)

    # Collect predictions for deeper analysis
    model.eval()
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for batch in test_loader:
            # Move model inputs to the evaluation device
            audio_waveform = batch['audio_waveform'].to(eval_device)
            midi_tokens = batch.get('midi_tokens', None)
            if midi_tokens is not None:
                midi_tokens = midi_tokens.to(eval_device)
            # Targets are only converted to NumPy, so they need no device transfer.
            targets = batch['scores']

            # Forward pass with proper arguments
            output = model(
                audio_waveform=audio_waveform,
                midi_tokens=midi_tokens,
            )

            # Skip if batch was None (all MIDI failed in MIDI-only mode);
            # the matching targets are skipped too, so preds/targets stay aligned.
            if output is None:
                continue

            preds = output['scores']
            all_preds.append(preds.cpu().numpy())
            all_targets.append(targets.cpu().numpy())

    # Report the real cause instead of letting np.concatenate fail on an empty list.
    if not all_preds:
        raise RuntimeError(f"No predictions were collected for {mode}: every test batch was skipped")

    predictions[mode] = {
        'preds': np.concatenate(all_preds, axis=0),
        'targets': np.concatenate(all_targets, axis=0)
    }

dimensions = [
    'note_accuracy',
    'rhythmic_precision',
    'tone_quality',
    'dynamics_control',
    'articulation',
    'pedaling',
]

print("\n" + "="*80)
print("1. PER-DIMENSION CORRELATION (Pearson r)")
print("="*80)
print(f"{'Dimension':<25} {'Audio':<12} {'MIDI':<12} {'Fusion':<12} {'Best':<12} {'Improvement'}")
print("-"*80)


def _test_pearson(mode, dim_name):
    """Return the logged test Pearson r for a mode/dimension, or 0 when absent."""
    return results.get(mode, {}).get(f'test_pearson_{dim_name}', 0)


for dim_idx, dim in enumerate(dimensions):
    audio_r = _test_pearson('audio', dim)
    midi_r = _test_pearson('midi', dim)
    fusion_r = _test_pearson('fusion', dim)

    # Best single-modality score and which modality produced it (ties go to Audio).
    best_single = max(audio_r, midi_r)
    if audio_r >= midi_r:
        best_modality = 'Audio'
    else:
        best_modality = 'MIDI'

    # Relative gain of fusion over the best single modality; 0 when the
    # baseline is not positive.
    if best_single > 0:
        improvement = (fusion_r - best_single) / best_single * 100
    else:
        improvement = 0

    print(f"{dim:<25} {audio_r:>11.3f} {midi_r:>11.3f} {fusion_r:>11.3f} {best_modality:<12} {improvement:>+6.1f}%")

print("-"*80)

print("\n" + "="*80)
print("2. PER-DIMENSION MAE (Mean Absolute Error, 0-100 scale)")
print("="*80)
print(f"{'Dimension':<25} {'Audio':<12} {'MIDI':<12} {'Fusion':<12} {'Best':<12} {'Reduction'}")
print("-"*80)


def _column_mae(mode, column):
    """Mean absolute error between one prediction column and its targets."""
    collected = predictions[mode]
    return np.mean(np.abs(collected['preds'][:, column] - collected['targets'][:, column]))


for dim_idx, dim in enumerate(dimensions):
    audio_mae = _column_mae('audio', dim_idx)
    midi_mae = _column_mae('midi', dim_idx)
    fusion_mae = _column_mae('fusion', dim_idx)

    # Lower is better; ties go to Audio.
    best_single_mae = min(audio_mae, midi_mae)
    if audio_mae <= midi_mae:
        best_modality = 'Audio'
    else:
        best_modality = 'MIDI'

    # Relative error reduction of fusion versus the best single modality.
    if best_single_mae > 0:
        reduction = (best_single_mae - fusion_mae) / best_single_mae * 100
    else:
        reduction = 0

    print(f"{dim:<25} {audio_mae:>11.2f} {midi_mae:>11.2f} {fusion_mae:>11.2f} {best_modality:<12} {reduction:>+6.1f}%")

print("-"*80)

print("\n" + "="*80)
print("3. PER-DIMENSION RMSE (Root Mean Squared Error, 0-100 scale)")
print("="*80)
print(f"{'Dimension':<25} {'Audio':<12} {'MIDI':<12} {'Fusion':<12} {'Best':<12} {'Reduction'}")
print("-"*80)


def _column_rmse(mode, column):
    """Root mean squared error between one prediction column and its targets."""
    collected = predictions[mode]
    return np.sqrt(np.mean((collected['preds'][:, column] - collected['targets'][:, column])**2))


for dim_idx, dim in enumerate(dimensions):
    audio_rmse = _column_rmse('audio', dim_idx)
    midi_rmse = _column_rmse('midi', dim_idx)
    fusion_rmse = _column_rmse('fusion', dim_idx)

    # Lower is better; ties go to Audio.
    best_single_rmse = min(audio_rmse, midi_rmse)
    if audio_rmse <= midi_rmse:
        best_modality = 'Audio'
    else:
        best_modality = 'MIDI'

    # Relative error reduction of fusion versus the best single modality.
    if best_single_rmse > 0:
        reduction = (best_single_rmse - fusion_rmse) / best_single_rmse * 100
    else:
        reduction = 0

    print(f"{dim:<25} {audio_rmse:>11.2f} {midi_rmse:>11.2f} {fusion_rmse:>11.2f} {best_modality:<12} {reduction:>+6.1f}%")

print("-"*80)

print("\n" + "="*80)
print("4. OVERALL PERFORMANCE (Averaged Across All Dimensions)")
print("="*80)
print(f"{'Metric':<35} {'Audio':<12} {'MIDI':<12} {'Fusion':<12} {'Winner'}")
print("-"*80)


def _mean_test_pearson(mode):
    """Average the logged per-dimension test Pearson r for one model."""
    return np.mean([results[mode][f'test_pearson_{name}'] for name in dimensions])


def _mean_column_mae(mode):
    """Average of the per-dimension mean absolute errors for one model."""
    p, t = predictions[mode]['preds'], predictions[mode]['targets']
    return np.mean([np.mean(np.abs(p[:, col] - t[:, col])) for col in range(len(dimensions))])


def _mean_column_rmse(mode):
    """Average of the per-dimension root mean squared errors for one model."""
    p, t = predictions[mode]['preds'], predictions[mode]['targets']
    return np.mean([np.sqrt(np.mean((p[:, col] - t[:, col])**2)) for col in range(len(dimensions))])


def _winner_label(audio_value, midi_value, best_value):
    """Name the model whose value equals the best one (checked Audio, MIDI, then Fusion)."""
    if audio_value == best_value:
        return 'Audio'
    if midi_value == best_value:
        return 'MIDI'
    return 'Fusion'


# Mean Pearson (higher is better)
audio_mean_r = _mean_test_pearson('audio')
midi_mean_r = _mean_test_pearson('midi')
fusion_mean_r = _mean_test_pearson('fusion')
best_r = max(audio_mean_r, midi_mean_r, fusion_mean_r)
winner_r = _winner_label(audio_mean_r, midi_mean_r, best_r)
print(f"{'Mean Pearson Correlation':<35} {audio_mean_r:>11.3f} {midi_mean_r:>11.3f} {fusion_mean_r:>11.3f} {winner_r}")

# Mean MAE (lower is better)
audio_mean_mae = _mean_column_mae('audio')
midi_mean_mae = _mean_column_mae('midi')
fusion_mean_mae = _mean_column_mae('fusion')
best_mae = min(audio_mean_mae, midi_mean_mae, fusion_mean_mae)
winner_mae = _winner_label(audio_mean_mae, midi_mean_mae, best_mae)
print(f"{'Mean Absolute Error':<35} {audio_mean_mae:>11.2f} {midi_mean_mae:>11.2f} {fusion_mean_mae:>11.2f} {winner_mae}")

# Mean RMSE (lower is better)
audio_mean_rmse = _mean_column_rmse('audio')
midi_mean_rmse = _mean_column_rmse('midi')
fusion_mean_rmse = _mean_column_rmse('fusion')
best_rmse = min(audio_mean_rmse, midi_mean_rmse, fusion_mean_rmse)
winner_rmse = _winner_label(audio_mean_rmse, midi_mean_rmse, best_rmse)
print(f"{'Root Mean Squared Error':<35} {audio_mean_rmse:>11.2f} {midi_mean_rmse:>11.2f} {fusion_mean_rmse:>11.2f} {winner_rmse}")

print("-"*80)

print("\n" + "="*80)
print("5. FUSION PERFORMANCE GAIN")
print("="*80)


def _relative_percent(delta, baseline):
    """Express `delta` as a percentage of `baseline`; 0 when the baseline is not positive."""
    if baseline > 0:
        return delta / baseline * 100
    return 0


# Correlation: higher is better, so the gain is fusion minus the best single model.
best_single_r = max(audio_mean_r, midi_mean_r)
r_improvement = _relative_percent(fusion_mean_r - best_single_r, best_single_r)
print(f"Pearson r improvement:  {r_improvement:+.1f}% over best single-modal")

# Errors: lower is better, so the gain is the best single model minus fusion.
best_single_mae = min(audio_mean_mae, midi_mean_mae)
mae_reduction = _relative_percent(best_single_mae - fusion_mean_mae, best_single_mae)
print(f"MAE reduction:          {mae_reduction:+.1f}% over best single-modal")

best_single_rmse = min(audio_mean_rmse, midi_mean_rmse)
rmse_reduction = _relative_percent(best_single_rmse - fusion_mean_rmse, best_single_rmse)
print(f"RMSE reduction:         {rmse_reduction:+.1f}% over best single-modal")

print("\n" + "="*80)
print("6. STATISTICAL SIGNIFICANCE (Fusion vs Best Single-Modal)")
print("="*80)
print(f"{'Dimension':<25} {'Best Single':<15} {'p-value':<12} {'Significant?'}")
print("-"*80)

for dim_idx, dim in enumerate(dimensions):
    # Per-sample absolute errors of each model on this dimension
    audio_errors = np.abs(predictions['audio']['preds'][:, dim_idx] - predictions['audio']['targets'][:, dim_idx])
    midi_errors = np.abs(predictions['midi']['preds'][:, dim_idx] - predictions['midi']['targets'][:, dim_idx])
    fusion_errors = np.abs(predictions['fusion']['preds'][:, dim_idx] - predictions['fusion']['targets'][:, dim_idx])

    # Compare fusion vs best single modal (paired t-test on MAE); ties go to Audio
    audio_is_best = np.mean(audio_errors) <= np.mean(midi_errors)
    best_single_errors = audio_errors if audio_is_best else midi_errors
    best_single_name = 'Audio' if audio_is_best else 'MIDI'

    # A paired test needs one error per sample from both models. The prediction
    # pass may skip batches for a model, so the arrays can differ in length; in
    # that case report n/a rather than crash inside ttest_rel.
    # NOTE(review): equal lengths do not by themselves prove that both models
    # skipped the same samples — confirm if skipping ever happens in practice.
    if best_single_errors.shape != fusion_errors.shape:
        print(f"{dim:<25} {best_single_name:<15} {'n/a':>11} {'n/a':>12}")
        continue

    t_stat, p_value = stats.ttest_rel(best_single_errors, fusion_errors)
    # Significant only when p < 0.05 AND fusion has the lower mean error.
    is_significant = p_value < 0.05 and np.mean(fusion_errors) < np.mean(best_single_errors)

    print(f"{dim:<25} {best_single_name:<15} {p_value:>11.4f} {'Yes' if is_significant else 'No':>12}")

print("-"*80)

print("\n" + "="*80)
print("7. DIMENSION LEARNABILITY CATEGORIZATION")
print("="*80)

# Best single-modality Pearson r per dimension (a missing entry counts as 0).
best_single_r_by_dim = {
    name: max(
        results.get('audio', {}).get(f'test_pearson_{name}', 0),
        results.get('midi', {}).get(f'test_pearson_{name}', 0),
    )
    for name in dimensions
}

# Bucket the dimensions by that score: >0.4 strong, 0.25-0.4 moderate, <0.25 weak.
strong = [name for name in dimensions if best_single_r_by_dim[name] > 0.4]
moderate = [name for name in dimensions if 0.25 <= best_single_r_by_dim[name] <= 0.4]
weak = [name for name in dimensions if best_single_r_by_dim[name] < 0.25]

strong_text = ', '.join(strong) if strong else 'None'
moderate_text = ', '.join(moderate) if moderate else 'None'
weak_text = ', '.join(weak) if weak else 'None'

print(f"Strong learners (r > 0.4):     {strong_text}")
print(f"Moderate learners (0.25-0.4):  {moderate_text}")
print(f"Weak learners (r < 0.25):      {weak_text}")

print("\n" + "="*80)
print("8. MVP TARGET ASSESSMENT")
print("="*80)
print("Technical dimension target: r = 0.50-0.65 (Pearson with expert)")
print("Interpretive dimension target: r = 0.35-0.50")
print("MAE target: 10-15 points on 0-100 scale\n")

technical_dims = dimensions  # All 6 are technical in this experiment

# Mean fusion Pearson r over the technical dimensions (a missing metric counts as 0).
fusion_metrics = results.get('fusion', {})
technical_r_values = [fusion_metrics.get(f'test_pearson_{name}', 0) for name in technical_dims]
technical_mean_r = np.mean(technical_r_values)

# Pass/fail against the lower r bound and the upper MAE bound printed above.
meets_r_target = technical_mean_r >= 0.50
meets_mae_target = fusion_mean_mae <= 15

r_verdict = '(PASS)' if meets_r_target else '(FAIL - below 0.50 target)'
mae_verdict = '(PASS)' if meets_mae_target else '(FAIL - above 15 target)'
print(f"Fusion technical r:     {technical_mean_r:.3f} {r_verdict}")
print(f"Fusion overall MAE:     {fusion_mean_mae:.2f} {mae_verdict}")

if not (meets_r_target and meets_mae_target):
    print("\nMVP TARGETS NOT MET - Consider architecture improvements or data augmentation")
else:
    print("\nMVP TARGETS MET - Ready to proceed with expert annotation")

print("\n" + "="*80)
print("9. EXPERT ANNOTATION RECOMMENDATION")
print("="*80)

# Per-dimension cost figure used for both estimates below.
COST_PER_DIMENSION = 3000

# Dimensions worth labelling: everything categorised as strong or moderate.
recommended = strong + moderate

# Print 'None' for an empty group, as the learnability section does, instead
# of leaving the line blank.
print(f"Include in expert labels:  {', '.join(recommended) if recommended else 'None'}")
print(f"Consider skipping:         {', '.join(weak) if weak else 'None'}")
print(f"\nEstimated cost savings: ${len(weak) * COST_PER_DIMENSION:,} by excluding weak dimensions")
print(f"Recommended budget:     ${len(recommended) * COST_PER_DIMENSION:,} for {len(recommended)} dimensions")
print("="*80)