# Aligned Audio + Fusion Experiments

Uses PercePiano fold assignments for proper apples-to-apples comparison.

## Why This Notebook Exists
Previous experiments used different fold assignments for audio vs symbolic models,
causing 60-80% data leakage in symbolic predictions. This notebook fixes that by:
- Training audio model on PercePiano's original fold splits
- Using symbolic predictions from PercePiano models on their correct validation sets
- Running fusion on properly aligned predictions

## Experiments
- **Audio**: MERT layers 7-12 + MLP (best config from Phase 2)
- **Symbolic**: PercePiano HAN predictions (from existing checkpoints)
- **Fusion**: Simple average, weighted, ridge stacking, confidence-weighted

## Requirements
- A100 GPU (80GB VRAM)
- rclone configured with `gdrive:` remote

In [None]:
# Set CUDA deterministic mode
# The cuBLAS workspace variable is exported before `import torch` so that it is
# already present in the environment when CUDA is first used.
# NOTE(review): this variable alone does not switch PyTorch into deterministic
# mode, and no call to torch.use_deterministic_algorithms(True) is visible in
# this notebook -- confirm whether one is intended.
import os
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

import torch
# Report whether a GPU is visible and, if so, its name and total memory in GB.
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# Install rclone via the upstream install script (runs with sudo).
# NOTE(review): the `|| echo "rclone ok"` fallback fires whenever grep finds
# neither "success" nor "already" in the installer output, so "rclone ok" is
# also printed when the installation itself failed.
!curl -fsSL https://rclone.org/install.sh | sudo bash 2>&1 | grep -E "(success|already)" || echo "rclone ok"

In [None]:
# Install the third-party packages needed by the cells below.
!pip install transformers librosa soundfile pytorch_lightning scipy scikit-learn --quiet

# Clone the project repository on first run; on later runs pull `main` into
# the existing checkout instead.
REPO_DIR = '/tmp/crescendai'
if os.path.exists(REPO_DIR):
    !cd {REPO_DIR} && git pull origin main
else:
    !git clone https://github.com/jai-dhiman/crescendai.git {REPO_DIR}
print(f"Repo: {REPO_DIR}")

In [None]:
import sys
sys.path.insert(0, f'{REPO_DIR}/model/src')

import json
import subprocess
import warnings
from pathlib import Path
import numpy as np
import pytorch_lightning as pl

from audio_experiments import PERCEPIANO_DIMENSIONS, BASE_CONFIG, SEED
from audio_experiments.models import BaseMERTModel
from audio_experiments.training import (
    run_4fold_mert_experiment,
    should_run_experiment, sync_experiment_to_gdrive,
    get_completed_experiments, print_experiment_status,
    run_bootstrap_experiment, run_paired_tests_experiment,
    run_multiple_correction_experiment, run_simple_fusion_experiment,
    run_weighted_fusion_experiment, run_ridge_fusion_experiment,
    run_confidence_fusion_experiment, run_weight_stability_experiment,
    run_category_fusion_experiment, run_error_correlation_experiment,
    save_fusion_experiment,
)

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')
# Allow reduced-precision float32 matmuls (see torch.set_float32_matmul_precision).
torch.set_float32_matmul_precision('medium')
# Seed the RNGs with the project-wide SEED; workers=True extends this to
# dataloader worker processes.
pl.seed_everything(SEED, workers=True)
print("Imports: OK")

In [None]:
# Paths
# Every local artefact of this notebook lives below DATA_ROOT.
DATA_ROOT = Path('/tmp/aligned_fusion')
AUDIO_DIR = DATA_ROOT / 'audio'
LABEL_DIR = DATA_ROOT / 'labels'
MERT_CACHE = DATA_ROOT / 'mert_cache'
CHECKPOINT_ROOT = DATA_ROOT / 'checkpoints'
RESULTS_DIR = DATA_ROOT / 'results'
LOG_DIR = DATA_ROOT / 'logs'

# GDrive paths
# rclone locations on the `gdrive:` remote; inputs are read from the first
# five, results are written to GDRIVE_RESULTS.
GDRIVE_AUDIO = 'gdrive:crescendai_data/audio_baseline/percepiano_rendered'
GDRIVE_LABELS = 'gdrive:crescendai_data/percepiano_labels'
GDRIVE_FOLDS = 'gdrive:crescendai_data/percepiano_fold_assignments.json'  # PercePiano splits!
GDRIVE_MERT = 'gdrive:crescendai_data/audio_baseline/mert_embeddings'
GDRIVE_SYMBOLIC_CKPTS = 'gdrive:crescendai_data/checkpoints/percepiano_original'
GDRIVE_RESULTS = 'gdrive:crescendai_data/checkpoints/aligned_fusion'

# Create the local directory tree up front; exist_ok makes the cell re-runnable.
for d in [DATA_ROOT, AUDIO_DIR, LABEL_DIR, MERT_CACHE, CHECKPOINT_ROOT, RESULTS_DIR, LOG_DIR]:
    d.mkdir(parents=True, exist_ok=True)

def rclone(cmd, desc):
    """Run a command (normally an rclone invocation) and report failures.

    Failures are deliberately non-fatal: a warning containing the first 200
    characters of stderr is printed and execution continues.

    Args:
        cmd: Argument list handed to ``subprocess.run`` (no shell involved).
        desc: Short human-readable label printed before the command starts.

    Returns:
        True if the command exited with status 0, otherwise False. Callers
        that do not care may ignore the value.
    """
    print(f"{desc}...")
    result = subprocess.run(cmd, capture_output=True, text=True)
    succeeded = result.returncode == 0
    if not succeeded:
        print(f"Warning: {result.stderr[:200]}")
    return succeeded

# Check rclone
# Abort early when the `gdrive:` remote is not listed by rclone.
r = subprocess.run(['rclone', 'listremotes'], capture_output=True, text=True)
if 'gdrive:' not in r.stdout:
    raise RuntimeError("rclone gdrive not configured")

# Download data
# `copy` mirrors the label directory; `copyto` stores the single fold file
# under the local name folds.json.
rclone(['rclone', 'copy', GDRIVE_LABELS, str(LABEL_DIR)], "Labels")
rclone(['rclone', 'copyto', GDRIVE_FOLDS, str(DATA_ROOT / 'folds.json')], "PercePiano folds")

# Load labels and folds
# NOTE: rclone() only warns on failure, so a failed download shows up here as
# an error from open().
with open(LABEL_DIR / 'label_2round_mean_reg_19_with0_rm_highstd0.json') as f:
    LABELS = json.load(f)
with open(DATA_ROOT / 'folds.json') as f:
    FOLD_ASSIGNMENTS_RAW = json.load(f)

# Sanity report: number of labelled samples and the size of each split
# (a missing split is reported as 0).
print(f"Labels: {len(LABELS)}")
print("Folds:", [f"fold_{i}: {len(FOLD_ASSIGNMENTS_RAW.get(f'fold_{i}', []))}" for i in range(4)])
print(f"Test: {len(FOLD_ASSIGNMENTS_RAW.get('test', []))}")

---
## Part 1: Audio Model Training

Train MERT layers 7-12 + MLP on PercePiano fold assignments.

In [None]:
# MERT Embeddings Setup
from audio_experiments.extractors import extract_mert_for_layer_range

# Download audio files for MERT extraction
print("Downloading audio files...")
rclone(['rclone', 'copy', GDRIVE_AUDIO, str(AUDIO_DIR), '--progress'], "Audio files")
print(f"Audio files: {len(list(AUDIO_DIR.glob('*.wav')))}")

# Get all keys we need
# Only keys that are both labelled and assigned to one of fold_0..fold_3 are
# kept; the 'test' split is not included here.
all_fold_keys = set()
for fold_id in range(4):
    all_fold_keys.update(FOLD_ASSIGNMENTS_RAW.get(f"fold_{fold_id}", []))
ALL_KEYS = sorted(set(LABELS.keys()) & all_fold_keys)
print(f"Total samples needed: {len(ALL_KEYS)}")

# Extract L7-12 embeddings
# NOTE(review): (7, 13) is presumably an end-exclusive layer range, i.e.
# layers 7-12 -- confirm against extract_mert_for_layer_range.
extract_mert_for_layer_range(7, 13, AUDIO_DIR, MERT_CACHE, ALL_KEYS)
print(f"MERT L7-12 embeddings: {len(list(MERT_CACHE.glob('*.pt')))}")

In [None]:
# Check for existing audio experiments
# ALL_RESULTS collects the result dict of every experiment, keyed by
# experiment id, for the rest of the notebook.
ALL_RESULTS = {}
AUDIO_EXP_ID = 'audio_mert_L7-12'

# Look up which experiments are already recorded under GDRIVE_RESULTS and
# print the status of the audio experiment.
COMPLETED_CACHE = get_completed_experiments(GDRIVE_RESULTS)
print_experiment_status([AUDIO_EXP_ID], COMPLETED_CACHE)

In [None]:
# Train audio model on PercePiano folds
# Training runs only when should_run_experiment says the experiment is not
# done yet; otherwise previously saved results are loaded from RESULTS_DIR.
if should_run_experiment(AUDIO_EXP_ID, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    print("Training audio model on PercePiano fold assignments...")
    
    def make_mert_model(cfg):
        """Build a BaseMERTModel from the hyperparameters in ``cfg``.

        Pooling is fixed to 'mean' and the loss to 'mse'; all other settings
        are read from ``cfg``.
        """
        return BaseMERTModel(
            input_dim=cfg['input_dim'], hidden_dim=cfg['hidden_dim'],
            dropout=cfg['dropout'], learning_rate=cfg['learning_rate'],
            weight_decay=cfg['weight_decay'], pooling='mean',
            loss_type='mse', max_epochs=cfg['max_epochs'],
        )
    
    # The model factory is passed in together with the cached embeddings,
    # labels, fold assignments and BASE_CONFIG.
    ALL_RESULTS[AUDIO_EXP_ID] = run_4fold_mert_experiment(
        AUDIO_EXP_ID, 'MERT L7-12 + MLP (PercePiano folds)',
        make_mert_model, MERT_CACHE, LABELS, FOLD_ASSIGNMENTS_RAW,
        BASE_CONFIG, CHECKPOINT_ROOT, RESULTS_DIR, LOG_DIR
    )
    sync_experiment_to_gdrive(
        AUDIO_EXP_ID, ALL_RESULTS[AUDIO_EXP_ID],
        RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS
    )
else:
    # Load existing results
    # NOTE(review): when the local JSON is absent, ALL_RESULTS gets no entry
    # for this experiment and the message below is still printed.
    result_file = RESULTS_DIR / f'{AUDIO_EXP_ID}.json'
    if result_file.exists():
        with open(result_file) as f:
            ALL_RESULTS[AUDIO_EXP_ID] = json.load(f)
    print(f"Audio experiment already complete")

---
## Part 2: Generate Predictions

Generate audio and symbolic predictions on validation sets.

In [None]:
# Generate audio predictions from trained checkpoints
def generate_audio_predictions(fold_assignments, checkpoint_dir, mert_cache, device,
                               max_frames=1000, num_folds=4):
    """Generate CV predictions using held-out fold for each sample.

    For every fold whose checkpoint exists, the fold's model predicts the
    samples listed under ``fold_<id>`` in ``fold_assignments``. Checkpoints are
    looked up at ``checkpoint_dir / AUDIO_EXP_ID / fold<id>_best.ckpt``
    (AUDIO_EXP_ID is a notebook-level global).

    Args:
        fold_assignments: Mapping with keys ``fold_0`` .. ``fold_<n-1>``, each
            a list of sample keys. Missing folds are treated as empty.
        checkpoint_dir: Path-like root that holds the experiment checkpoints.
        mert_cache: Path-like directory containing ``<key>.pt`` embeddings.
        device: Device the models and inputs are moved to.
        max_frames: Upper bound on the length of an embedding along its first
            dimension; longer embeddings are truncated.
        num_folds: Number of folds to iterate over.

    Returns:
        Dict mapping sample key to the model output as a (nested) Python list.
        Folds without a checkpoint and samples without a cached embedding are
        left out; both cases are reported with a warning.
    """
    predictions = {}

    # Load all fold models, warning about (rather than hiding) missing ones.
    models = {}
    for fold in range(num_folds):
        ckpt_path = checkpoint_dir / AUDIO_EXP_ID / f"fold{fold}_best.ckpt"
        if not ckpt_path.exists():
            print(f"Warning: checkpoint not found: {ckpt_path} (fold {fold} skipped)")
            continue
        model = BaseMERTModel.load_from_checkpoint(ckpt_path)
        models[fold] = model.to(device).eval()
        print(f"Loaded fold {fold} model")

    # Generate predictions for each fold's validation set. `models` was filled
    # in ascending fold order, so iteration follows that order as well.
    for fold_id, model in models.items():
        fold_keys = fold_assignments.get(f"fold_{fold_id}", [])
        predicted = 0
        missing = 0

        for key in fold_keys:
            embed_path = mert_cache / f"{key}.pt"
            if not embed_path.exists():
                missing += 1
                continue

            # Load onto the CPU first so tensors saved from a GPU session can
            # still be read on a machine without CUDA.
            embeddings = torch.load(embed_path, map_location='cpu', weights_only=True)
            embeddings = embeddings[:max_frames]

            # Add a leading batch dimension of size 1; the mask marks every
            # position as valid because nothing is padded.
            embeddings = embeddings.unsqueeze(0).to(device)
            attention_mask = torch.ones(1, embeddings.shape[1], dtype=torch.bool, device=device)

            with torch.no_grad():
                pred = model(embeddings, attention_mask)

            predictions[key] = pred.squeeze(0).cpu().numpy().tolist()
            predicted += 1

        print(f"Fold {fold_id}: {predicted} predictions")
        if missing:
            print(f"Warning: fold {fold_id}: {missing} samples skipped (no cached embedding)")

    return predictions

# Generate audio predictions
# Predictions are cached as JSON under DATA_ROOT; when the file exists it is
# loaded instead of re-running inference.
# NOTE(review): the cache is written even when few or no predictions were
# produced (e.g. checkpoints missing); delete the file to force regeneration.
audio_preds_file = DATA_ROOT / 'audio_predictions.json'
if not audio_preds_file.exists():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    audio_predictions = generate_audio_predictions(
        FOLD_ASSIGNMENTS_RAW, CHECKPOINT_ROOT, MERT_CACHE, device
    )
    with open(audio_preds_file, 'w') as f:
        json.dump(audio_predictions, f)
    print(f"Saved {len(audio_predictions)} audio predictions")
else:
    with open(audio_preds_file) as f:
        audio_predictions = json.load(f)
    print(f"Loaded {len(audio_predictions)} audio predictions")

In [None]:
# Setup for PercePiano symbolic model
from types import ModuleType

# Clone the PercePiano repository once; later runs reuse the checkout.
PERCEPIANO_ROOT = Path('/tmp/PercePiano')
if not PERCEPIANO_ROOT.exists():
    !git clone https://github.com/JonghoKimSNU/PercePiano.git /tmp/PercePiano

PERCEPIANO_PATH = PERCEPIANO_ROOT / 'virtuoso' / 'virtuoso'
!pip install omegaconf --quiet

# Patch numpy 2.0 compatibility
# When np.lib has no `arraysetops`, register a stand-in module that exposes
# only `isin`, both in sys.modules and as an attribute of np.lib.
if not hasattr(np.lib, 'arraysetops'):
    arraysetops = ModuleType('numpy.lib.arraysetops')
    arraysetops.isin = np.isin
    sys.modules['numpy.lib.arraysetops'] = arraysetops
    np.lib.arraysetops = arraysetops

# Put the PercePiano sources at the front of sys.path so they take precedence
# on import (PERCEPIANO_PATH ends up first, pyScoreParser second).
sys.path.insert(0, str(PERCEPIANO_PATH / 'pyScoreParser'))
sys.path.insert(0, str(PERCEPIANO_PATH))
print(f"PercePiano path: {PERCEPIANO_PATH}")

In [None]:
# Download PercePiano data and checkpoints
# Both directories sit directly under DATA_ROOT, which an earlier cell created.
PP_DATA_ROOT = DATA_ROOT / 'percepiano_data'
PP_CKPT_ROOT = DATA_ROOT / 'percepiano_ckpts'
PP_DATA_ROOT.mkdir(exist_ok=True)
PP_CKPT_ROOT.mkdir(exist_ok=True)

rclone(['rclone', 'copy', 'gdrive:crescendai_data/percepiano_original', str(PP_DATA_ROOT)], "PercePiano data")
rclone(['rclone', 'copy', GDRIVE_SYMBOLIC_CKPTS, str(PP_CKPT_ROOT)], "PercePiano checkpoints")

# Quick check of what arrived: fold directories and *.pt checkpoint files.
print(f"PercePiano folds: {len(list(PP_DATA_ROOT.glob('fold*')))}")
print(f"PercePiano checkpoints: {len(list(PP_CKPT_ROOT.glob('*.pt')))}")

In [None]:
# Generate symbolic predictions (correctly aligned with PercePiano folds)
import pickle
from torch.nn.utils.rnn import pack_sequence
from model_m2pf import VirtuosoNetMultiLevel
from omegaconf import OmegaConf
import yaml

def extract_label_key(filename):
    """Derive the label key that corresponds to a sample file name.

    Every occurrence of '.pkl' and then of '.mid' is removed from the name,
    and a leading 'all_2rounds_' prefix is dropped if present.
    """
    prefix = 'all_2rounds_'
    stem = filename
    for marker in ('.pkl', '.mid'):
        stem = stem.replace(marker, '')
    return stem[len(prefix):] if stem.startswith(prefix) else stem

def load_sample(pkl_path, max_notes=5000):
    """Read one pickled sample and return its features and note locations.

    The pickle must hold a dict with an 'input' entry and a 'note_location'
    dict containing 'beat', 'measure', 'voice' and 'section'. The features are
    cut to at most ``max_notes`` entries along the first dimension and each
    location list is cut to the same length.

    Returns:
        (x, note_locations): a float32 tensor and a dict of int64 tensors.

    NOTE: pickle.load executes arbitrary code from the file; only use it on
    trusted data.
    """
    with open(pkl_path, 'rb') as handle:
        record = pickle.load(handle)

    features = torch.tensor(record['input'], dtype=torch.float32)
    if len(features) > max_notes:
        features = features[:max_notes]

    kept = len(features)
    note_locations = {
        level: torch.tensor(record['note_location'][level][:kept], dtype=torch.long)
        for level in ('beat', 'measure', 'voice', 'section')
    }
    return features, note_locations

def predict_single(model, x, note_locations, device, sigmoid):
    """Run ``model`` on a single sample and return its prediction as numpy.

    ``x`` is packed as a one-element sequence batch and every note-location
    tensor gets a leading batch dimension before being moved to ``device``.
    The last element of the model output is passed through ``sigmoid``, the
    batch dimension is squeezed away and the result is returned as an array.
    """
    packed = pack_sequence([x], enforce_sorted=True).to(device)

    batched_locations = {}
    for level, positions in note_locations.items():
        batched_locations[level] = positions.unsqueeze(0).to(device)

    # The two None arguments are passed through to the model unchanged.
    with torch.no_grad():
        final_output = model(packed, None, None, batched_locations)[-1]
        scores = sigmoid(final_output).squeeze(0)
        result = scores.cpu().numpy()
    return result

print("Helper functions defined")

In [None]:
# Generate symbolic predictions
# Results are cached in symbolic_predictions.json; when that file exists it is
# loaded instead of re-running inference.
symbolic_preds_file = DATA_ROOT / 'symbolic_predictions.json'

if not symbolic_preds_file.exists():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    sigmoid = torch.nn.Sigmoid()
    
    # Load config
    CONFIG_PATH = PERCEPIANO_PATH.parent / 'ymls' / 'shared' / 'label19' / 'han_measnote_nomask_bigger256.yml'
    with open(CONFIG_PATH, 'r') as f:
        config = yaml.safe_load(f)
    net_param = OmegaConf.create(config['nn_params'])
    net_param.graph_keys = []
    
    # Label key -> prediction list, accumulated over all folds.
    symbolic_predictions = {}
    
    for fold_id in range(4):
        print(f"\n{'='*50}")
        print(f"FOLD {fold_id}")
        print('='*50)
        
        # Load fold checkpoint
        # A missing checkpoint skips the whole fold; the predictions file is
        # still written at the end with whatever folds succeeded.
        checkpoint_path = PP_CKPT_ROOT / f'fold{fold_id}_best.pt'
        if not checkpoint_path.exists():
            print(f"Checkpoint not found: {checkpoint_path}")
            continue
        
        # Load fold stats
        fold_path = PP_DATA_ROOT / f'fold{fold_id}'
        with open(fold_path / 'train' / 'stat.pkl', 'rb') as f:
            fold_stats = pickle.load(f)
        
        # Update input size
        # Taken as the largest second element among the 'input' entries of
        # key_to_dim (presumably the end index of each feature -- confirm).
        net_param.input_size = max(v[1] for v in fold_stats['key_to_dim']['input'].values())
        
        # Load model
        # NOTE: weights_only=False unpickles arbitrary objects; only load
        # checkpoints from a trusted source.
        model = VirtuosoNetMultiLevel(net_param, fold_stats, multi_level="total_note_cat")
        checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
        model.load_state_dict(checkpoint['state_dict'])
        model = model.to(device)
        model.eval()
        
        print(f"Loaded model (R2={checkpoint['r2']:.4f}, epoch {checkpoint['epoch']})")
        
        # KEY FIX: Use PercePiano fold's VALID directory, not audio fold assignments!
        valid_dir = fold_path / 'valid'
        valid_files = [f for f in valid_dir.glob('*.pkl') if f.name != 'stat.pkl']
        
        print(f"Generating predictions for {len(valid_files)} validation samples...")
        
        # Samples whose derived key has no label are skipped and not counted.
        count = 0
        for pkl_file in valid_files:
            key = extract_label_key(pkl_file.name)
            if key not in LABELS:
                continue
            
            x, note_locations = load_sample(pkl_file)
            pred = predict_single(model, x, note_locations, device, sigmoid)
            symbolic_predictions[key] = pred.tolist()
            count += 1
        
        print(f"Generated {count} predictions for fold {fold_id}")
        
        # Cleanup
        # Drop the model reference before the next fold's model is built.
        del model
        torch.cuda.empty_cache()
    
    with open(symbolic_preds_file, 'w') as f:
        json.dump(symbolic_predictions, f)
    print(f"\nSaved {len(symbolic_predictions)} symbolic predictions")
else:
    with open(symbolic_preds_file) as f:
        symbolic_predictions = json.load(f)
    print(f"Loaded {len(symbolic_predictions)} symbolic predictions")

In [None]:
# Align predictions and labels
# Build the sample_key -> fold_id lookup; if a key were listed in more than
# one fold, the highest fold id wins.
FOLD_ASSIGNMENTS = {}
for fold_id in range(4):
    FOLD_ASSIGNMENTS.update(
        dict.fromkeys(FOLD_ASSIGNMENTS_RAW.get(f"fold_{fold_id}", []), fold_id)
    )

# Keep only samples that have an audio prediction, a symbolic prediction,
# a label and a fold assignment; sorting fixes the row order of the arrays.
SAMPLE_KEYS = sorted(
    set(audio_predictions).intersection(symbolic_predictions, LABELS, FOLD_ASSIGNMENTS)
)

print(f"Audio predictions: {len(audio_predictions)}")
print(f"Symbolic predictions: {len(symbolic_predictions)}")
print(f"Aligned samples: {len(SAMPLE_KEYS)}")

# One row per aligned sample, in SAMPLE_KEYS order; labels are cut to their
# first 19 values.
LABELS_ARR = np.array([LABELS[sample][:19] for sample in SAMPLE_KEYS])
AUDIO_ARR = np.array([audio_predictions[sample] for sample in SAMPLE_KEYS])
SYMBOLIC_ARR = np.array([symbolic_predictions[sample] for sample in SAMPLE_KEYS])

print(f"Shapes: labels={LABELS_ARR.shape}, audio={AUDIO_ARR.shape}, symbolic={SYMBOLIC_ARR.shape}")

---
## Part 3: Statistical Tests

In [None]:
# Ids of all statistical (S*), fusion (F*) and ablation (A*) experiments run
# in the remaining cells.
FUSION_EXPERIMENT_IDS = [
    'S0_bootstrap', 'S1_paired_tests', 'S2_multiple_correction',
    'F0_simple', 'F1_weighted', 'F2_ridge', 'F3_confidence',
    'A0_stability', 'A1_category', 'A2_error_corr',
]

# Re-query the completed experiments and print the status of each id.
FUSION_COMPLETED = get_completed_experiments(GDRIVE_RESULTS)
print_experiment_status(FUSION_EXPERIMENT_IDS, FUSION_COMPLETED)

In [None]:
# S0: Bootstrap CIs
# Skipped when should_run_experiment reports the experiment as done. The
# bootstrap uses 10000 resamples; the result is stored in ALL_RESULTS and
# handed to the save and GDrive-sync helpers.
if should_run_experiment('S0_bootstrap', CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, FUSION_COMPLETED):
    ALL_RESULTS['S0_bootstrap'] = run_bootstrap_experiment(
        'S0_bootstrap', AUDIO_ARR, SYMBOLIC_ARR, LABELS_ARR, n_bootstrap=10000
    )
    save_fusion_experiment('S0_bootstrap', ALL_RESULTS['S0_bootstrap'], RESULTS_DIR, ALL_RESULTS)
    sync_experiment_to_gdrive('S0_bootstrap', ALL_RESULTS['S0_bootstrap'], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# S1: Paired Tests
# Runs the paired-tests helper on the aligned audio and symbolic predictions
# against the labels; skipped when already complete.
if should_run_experiment('S1_paired_tests', CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, FUSION_COMPLETED):
    ALL_RESULTS['S1_paired_tests'] = run_paired_tests_experiment(
        'S1_paired_tests', AUDIO_ARR, SYMBOLIC_ARR, LABELS_ARR
    )
    save_fusion_experiment('S1_paired_tests', ALL_RESULTS['S1_paired_tests'], RESULTS_DIR, ALL_RESULTS)
    sync_experiment_to_gdrive('S1_paired_tests', ALL_RESULTS['S1_paired_tests'], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# S2: Multiple Correction
# Operates on the S1 results rather than on the raw predictions.
if should_run_experiment('S2_multiple_correction', CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, FUSION_COMPLETED):
    # S1 may have been skipped in this session; fall back to its saved JSON.
    # NOTE: open() raises FileNotFoundError if that file is not in RESULTS_DIR.
    if 'S1_paired_tests' not in ALL_RESULTS:
        with open(RESULTS_DIR / 'S1_paired_tests.json') as f:
            ALL_RESULTS['S1_paired_tests'] = json.load(f)
    
    ALL_RESULTS['S2_multiple_correction'] = run_multiple_correction_experiment(
        'S2_multiple_correction', ALL_RESULTS['S1_paired_tests']
    )
    save_fusion_experiment('S2_multiple_correction', ALL_RESULTS['S2_multiple_correction'], RESULTS_DIR, ALL_RESULTS)
    sync_experiment_to_gdrive('S2_multiple_correction', ALL_RESULTS['S2_multiple_correction'], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

---
## Part 4: Fusion Strategies

In [None]:
# F0: Simple Average
# Needs only the aligned prediction and label arrays (no fold information);
# skipped when already complete.
if should_run_experiment('F0_simple', CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, FUSION_COMPLETED):
    ALL_RESULTS['F0_simple'] = run_simple_fusion_experiment(
        'F0_simple', AUDIO_ARR, SYMBOLIC_ARR, LABELS_ARR
    )
    save_fusion_experiment('F0_simple', ALL_RESULTS['F0_simple'], RESULTS_DIR, ALL_RESULTS)
    sync_experiment_to_gdrive('F0_simple', ALL_RESULTS['F0_simple'], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# F1: Weighted Fusion (CV)
# In addition to the arrays, the helper receives the sample -> fold mapping
# and the key order of the array rows.
if should_run_experiment('F1_weighted', CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, FUSION_COMPLETED):
    ALL_RESULTS['F1_weighted'] = run_weighted_fusion_experiment(
        'F1_weighted', AUDIO_ARR, SYMBOLIC_ARR, LABELS_ARR,
        FOLD_ASSIGNMENTS, SAMPLE_KEYS
    )
    save_fusion_experiment('F1_weighted', ALL_RESULTS['F1_weighted'], RESULTS_DIR, ALL_RESULTS)
    sync_experiment_to_gdrive('F1_weighted', ALL_RESULTS['F1_weighted'], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# F2: Ridge Stacking
# Receives the same inputs as F1: arrays plus fold mapping and row key order.
if should_run_experiment('F2_ridge', CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, FUSION_COMPLETED):
    ALL_RESULTS['F2_ridge'] = run_ridge_fusion_experiment(
        'F2_ridge', AUDIO_ARR, SYMBOLIC_ARR, LABELS_ARR,
        FOLD_ASSIGNMENTS, SAMPLE_KEYS
    )
    save_fusion_experiment('F2_ridge', ALL_RESULTS['F2_ridge'], RESULTS_DIR, ALL_RESULTS)
    sync_experiment_to_gdrive('F2_ridge', ALL_RESULTS['F2_ridge'], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# F3: Confidence Weighted
# Needs only the aligned prediction and label arrays; skipped when already
# complete.
if should_run_experiment('F3_confidence', CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, FUSION_COMPLETED):
    ALL_RESULTS['F3_confidence'] = run_confidence_fusion_experiment(
        'F3_confidence', AUDIO_ARR, SYMBOLIC_ARR, LABELS_ARR
    )
    save_fusion_experiment('F3_confidence', ALL_RESULTS['F3_confidence'], RESULTS_DIR, ALL_RESULTS)
    sync_experiment_to_gdrive('F3_confidence', ALL_RESULTS['F3_confidence'], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

---
## Part 5: Ablations

In [None]:
# A0: Weight Stability
# Analyses the per-fold weights stored under 'fold_weights' in the F1 result.
if should_run_experiment('A0_stability', CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, FUSION_COMPLETED):
    # F1 may have been skipped in this session; fall back to its saved JSON.
    # NOTE: open() raises FileNotFoundError if that file is not in RESULTS_DIR.
    if 'F1_weighted' not in ALL_RESULTS:
        with open(RESULTS_DIR / 'F1_weighted.json') as f:
            ALL_RESULTS['F1_weighted'] = json.load(f)
    
    ALL_RESULTS['A0_stability'] = run_weight_stability_experiment(
        'A0_stability', ALL_RESULTS['F1_weighted']['fold_weights']
    )
    save_fusion_experiment('A0_stability', ALL_RESULTS['A0_stability'], RESULTS_DIR, ALL_RESULTS)
    sync_experiment_to_gdrive('A0_stability', ALL_RESULTS['A0_stability'], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# A1: Category Fusion
# Receives the arrays plus the fold mapping and row key order, like F1 and F2.
if should_run_experiment('A1_category', CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, FUSION_COMPLETED):
    ALL_RESULTS['A1_category'] = run_category_fusion_experiment(
        'A1_category', AUDIO_ARR, SYMBOLIC_ARR, LABELS_ARR,
        FOLD_ASSIGNMENTS, SAMPLE_KEYS
    )
    save_fusion_experiment('A1_category', ALL_RESULTS['A1_category'], RESULTS_DIR, ALL_RESULTS)
    sync_experiment_to_gdrive('A1_category', ALL_RESULTS['A1_category'], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# A2: Error Correlation
# Needs only the aligned prediction and label arrays; skipped when already
# complete.
if should_run_experiment('A2_error_corr', CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, FUSION_COMPLETED):
    ALL_RESULTS['A2_error_corr'] = run_error_correlation_experiment(
        'A2_error_corr', AUDIO_ARR, SYMBOLIC_ARR, LABELS_ARR
    )
    save_fusion_experiment('A2_error_corr', ALL_RESULTS['A2_error_corr'], RESULTS_DIR, ALL_RESULTS)
    sync_experiment_to_gdrive('A2_error_corr', ALL_RESULTS['A2_error_corr'], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

---
## Results Summary

In [None]:
# Load all results from disk
# Experiments that were skipped in this session are not in ALL_RESULTS yet, so
# pick up whatever result files exist locally.
for exp_id in FUSION_EXPERIMENT_IDS:
    if exp_id in ALL_RESULTS:
        continue
    result_file = RESULTS_DIR / f"{exp_id}.json"
    if result_file.exists():
        with open(result_file) as f:
            ALL_RESULTS[exp_id] = json.load(f)

# Print summary
print("="*80)
print("ALIGNED FUSION RESULTS (PercePiano Folds)")
print("="*80)

# Baselines
# Without S0_bootstrap the two R2 values fall back to 0. That fallback is not
# a measurement, so every comparison below is gated on have_baselines instead
# of being computed against 0.
have_baselines = 'S0_bootstrap' in ALL_RESULTS
audio_r2 = ALL_RESULTS.get('S0_bootstrap', {}).get('audio', {}).get('overall', {}).get('r2', 0)
symbolic_r2 = ALL_RESULTS.get('S0_bootstrap', {}).get('symbolic', {}).get('overall', {}).get('r2', 0)
best_single = max(audio_r2, symbolic_r2)

print(f"\n{'Model':<25} {'R2':>10} {'95% CI':>25} {'vs Best':>12}")
print("-"*75)

if have_baselines:
    s0 = ALL_RESULTS['S0_bootstrap']
    a = s0['audio']['overall']
    s = s0['symbolic']['overall']
    print(f"{'Audio (MERT L7-12)':<25} {a['r2']:>10.4f} [{a['ci_lower']:.3f}, {a['ci_upper']:.3f}] {'---':>12}")
    print(f"{'Symbolic (PercePiano)':<25} {s['r2']:>10.4f} [{s['ci_lower']:.3f}, {s['ci_upper']:.3f}] {'---':>12}")

print("-"*75)

fusion_exps = [('F0_simple', 'Simple Average'), ('F1_weighted', 'Weighted CV'),
               ('F2_ridge', 'Ridge Stacking'), ('F3_confidence', 'Confidence')]
for exp_id, name in fusion_exps:
    if exp_id in ALL_RESULTS:
        r = ALL_RESULTS[exp_id]
        r2 = r['overall_r2']
        b = r['bootstrap']['overall']
        if have_baselines:
            diff = r2 - best_single
            diff_text = f"{diff:>+12.4f}"
        else:
            # No baseline to compare against; show a placeholder, not r2 - 0.
            diff_text = f"{'n/a':>12}"
        print(f"{name:<25} {r2:>10.4f} [{b['ci_lower']:.3f}, {b['ci_upper']:.3f}] {diff_text}")

print("="*75)

# Key comparison
print(f"\nKEY FINDING:")
if not have_baselines:
    print("  Baseline R2 unavailable (S0_bootstrap results not found)")
elif audio_r2 > symbolic_r2:
    print(f"  Audio beats Symbolic by {audio_r2 - symbolic_r2:.4f} R2")
elif symbolic_r2 > audio_r2:
    print(f"  Symbolic beats Audio by {symbolic_r2 - audio_r2:.4f} R2")
else:
    print(f"  Audio and Symbolic are tied at {audio_r2:.4f} R2")

In [None]:
# Final sync
# Write the combined results of every experiment to one JSON file.
# NOTE: default=str turns any value json cannot serialise into its string
# form instead of raising.
with open(RESULTS_DIR / 'aligned_fusion_all_results.json', 'w') as f:
    json.dump(ALL_RESULTS, f, indent=2, default=str)

# Upload the results directory and only report success when rclone actually
# exited cleanly.
final_sync = subprocess.run(
    ['rclone', 'copy', str(RESULTS_DIR), GDRIVE_RESULTS],
    capture_output=True, text=True,
)
if final_sync.returncode == 0:
    print(f"Done! Results at: {GDRIVE_RESULTS}")
else:
    print(f"Warning: final sync failed: {final_sync.stderr[:200]}")
    print(f"Results are still available locally at: {RESULTS_DIR}")