# Definitive Experiments

## Parts
1. **M1a-M1d**: MuQ Layer Ablation (find optimal layers)
2. **F8-F11**: MuQ + Symbolic Fusion
3. **D9a-D9c**: MERT + MuQ Audio Fusion
4. **X1-X3**: Cross-Dataset Validation (PianoVAM, ASAP, PSyllabus)
5. **S3-S4**: Statistical Rigor (Bootstrap, Bonferroni)
6. **A3-A7**: Analysis (Error correlation, dimensions, calibration)
7. **Export**: Save all results to GDrive

## Requirements
- Compute: A100 (80GB VRAM)
- rclone configured with `gdrive:` remote
- External datasets: PianoVAM, ASAP, PSyllabus

In [None]:
# Cell 1: CUDA setup (must be before any CUDA operations)
import os

# Exported before torch is imported so the setting is in place before any CUDA operation.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

import torch

print(f"CUDA available: {torch.cuda.is_available()}")
# Guard clause: availability is reported first, then the notebook aborts without a GPU.
if not torch.cuda.is_available():
    raise RuntimeError("GPU required")

print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# Cell 2: Install rclone
# Pipes the official install script into `sudo bash` and keeps only the lines that
# mention "successfully" or "already".
# NOTE(review): the trailing `|| echo "rclone installed"` runs whenever grep matches
# nothing, so that message is also printed if the installer failed. Cell 5 checks
# `rclone listremotes`, which is the real verification.
!curl -fsSL https://rclone.org/install.sh | sudo bash 2>&1 | grep -E "(successfully|already)" || echo "rclone installed"

In [None]:
# Cell 3: Install dependencies and clone repo
!pip install transformers librosa soundfile pytorch_lightning nnAudio scipy scikit-learn muq requests tqdm --quiet

import os
REPO_DIR = '/tmp/crescendai'
if os.path.exists(REPO_DIR):
    !cd {REPO_DIR} && git pull origin main
else:
    !git clone https://github.com/jai-dhiman/crescendai.git {REPO_DIR}

print(f"Repo: {REPO_DIR}")

In [None]:
# Cell 4: Imports
import sys
sys.path.insert(0, f'{REPO_DIR}/model/src')

import json
import subprocess
import warnings
from pathlib import Path
from typing import Dict, List, Tuple, Any, Optional

import numpy as np
import torch
import pytorch_lightning as pl
from scipy import stats
from sklearn.metrics import r2_score

from audio_experiments import PERCEPIANO_DIMENSIONS, DIMENSION_CATEGORIES, BASE_CONFIG, SEED
from audio_experiments.extractors import (
    extract_mert_for_layer_range,
    extract_muq_embeddings,
)
from audio_experiments.models import (
    MuQStatsModel,
    MuQBaseModel,
    MERTMuQEnsemble,
    MERTMuQConcatModel,
    AsymmetricGatedFusion,
)
from audio_experiments.training import (
    run_4fold_mert_experiment,
    run_4fold_dual_experiment,
    restore_all_from_gdrive,
    should_run_experiment,
    sync_experiment_to_gdrive,
    get_completed_experiments,
    print_experiment_status,
    # Fusion runners
    run_simple_fusion_experiment,
    run_weighted_fusion_experiment,
    run_ridge_fusion_experiment,
    run_confidence_fusion_experiment,
    run_error_correlation_experiment,
    save_fusion_experiment,
    # Statistics
    bootstrap_r2_extended,
    bootstrap_r2_comparison,
    paired_ttest_per_sample,
    wilcoxon_test,
    cohens_d,
    bonferroni_correction,
    fdr_correction,
    # Fusion strategies
    simple_average_fusion,
    weighted_fusion_grid_search,
    compute_error_correlation,
    compute_per_dimension_comparison,
)
from audio_experiments.training.sync import numpy_serializer

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Selects the 'medium' float32 matmul precision mode (see the
# torch.set_float32_matmul_precision documentation for the exact trade-off).
torch.set_float32_matmul_precision('medium')
# Seed with the project-wide SEED imported from audio_experiments; workers=True is
# forwarded to Lightning's seed_everything.
pl.seed_everything(SEED, workers=True)

print(f"PyTorch: {torch.__version__}")
print(f"Imports: OK")

In [None]:
# Cell 5: Path configuration
# All local working data lives under DATA_ROOT.
DATA_ROOT = Path('/tmp/definitive_experiments')
AUDIO_DIR = DATA_ROOT / 'audio'
LABEL_DIR = DATA_ROOT / 'labels'
# Per-layer-range embedding caches are created as subdirectories of these roots.
MUQ_CACHE_ROOT = DATA_ROOT / 'muq_cache'
MERT_CACHE_ROOT = DATA_ROOT / 'mert_cache'
CHECKPOINT_ROOT = DATA_ROOT / 'checkpoints'
RESULTS_DIR = DATA_ROOT / 'results'
LOG_DIR = DATA_ROOT / 'logs'
FIGURES_DIR = RESULTS_DIR / 'figures'

# Cross-dataset directories
PIANOVAM_DIR = DATA_ROOT / 'pianovam'
ASAP_DIR = DATA_ROOT / 'asap'
PSYLLABUS_DIR = DATA_ROOT / 'psyllabus'

# GDrive paths
# All remote locations use the rclone remote named `gdrive:`.
GDRIVE_AUDIO = 'gdrive:crescendai_data/audio_baseline/percepiano_rendered'
GDRIVE_LABELS = 'gdrive:crescendai_data/percepiano_labels'
GDRIVE_FOLDS = 'gdrive:crescendai_data/percepiano_fold_assignments.json'
GDRIVE_MERT_CACHE = 'gdrive:crescendai_data/audio_baseline/mert_embeddings/L7-12'
GDRIVE_MUQ_CACHE = 'gdrive:crescendai_data/audio_baseline/muq_embeddings'
GDRIVE_RESULTS = 'gdrive:crescendai_data/checkpoints/definitive_experiments'
GDRIVE_SYMBOLIC = 'gdrive:crescendai_data/checkpoints/aligned_fusion/symbolic_predictions.json'

# Create every local directory up front; existing directories are left untouched.
for d in [AUDIO_DIR, LABEL_DIR, MUQ_CACHE_ROOT, MERT_CACHE_ROOT, CHECKPOINT_ROOT,
          RESULTS_DIR, LOG_DIR, FIGURES_DIR, PIANOVAM_DIR, ASAP_DIR, PSYLLABUS_DIR]:
    d.mkdir(parents=True, exist_ok=True)

def run_rclone(cmd, desc=""):
    """Run an rclone command and return the completed process.

    Args:
        cmd: Argument list handed to ``subprocess.run`` (no shell involved).
        desc: Optional label; when non-empty, ``"<desc>..."`` is printed first.

    Returns:
        The ``subprocess.CompletedProcess`` with stdout/stderr captured as text.

    A non-zero exit status is reported with a warning (plus captured stderr)
    instead of raising, so best-effort transfers do not abort the notebook but
    failures are no longer silent.
    """
    if desc:
        print(f"{desc}...")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        label = desc or ' '.join(map(str, cmd))
        print(f"WARNING: {label} failed (exit code {result.returncode})")
        stderr = result.stderr.strip()
        if stderr:
            print(stderr)
    return result

# Check rclone
try:
    result = subprocess.run(['rclone', 'listremotes'], capture_output=True, text=True)
except FileNotFoundError as err:
    # The binary itself is missing; say so instead of surfacing a bare OS error.
    raise RuntimeError("rclone is not installed or not on PATH") from err

# Compare whole remote names (one per output line): a substring test would also
# accept a differently named remote such as "mygdrive:".
if 'gdrive:' not in {line.strip() for line in result.stdout.splitlines()}:
    raise RuntimeError("rclone 'gdrive' not configured")

print(f"Data root: {DATA_ROOT}")
print(f"GDrive results: {GDRIVE_RESULTS}")

In [None]:
# Cell 6: Download data
# Fetch audio, labels and the fold-assignment file from GDrive.
run_rclone(['rclone', 'copy', GDRIVE_AUDIO, str(AUDIO_DIR), '--progress'], "Downloading audio")
run_rclone(['rclone', 'copy', GDRIVE_LABELS, str(LABEL_DIR)], "Downloading labels")

FOLD_FILE = DATA_ROOT / 'folds.json'
run_rclone(['rclone', 'copyto', GDRIVE_FOLDS, str(FOLD_FILE)], "Downloading folds")

# Parse the label and fold JSON files.
LABEL_FILE = LABEL_DIR / 'label_2round_mean_reg_19_with0_rm_highstd0.json'
LABELS = json.loads(LABEL_FILE.read_text())
FOLD_ASSIGNMENTS = json.loads(FOLD_FILE.read_text())

# Invert the "fold_<n>" -> [keys] lists into a key -> fold index lookup
# (a key listed in several folds keeps the highest fold index).
FOLD_BY_KEY = {
    sample_key: fold_idx
    for fold_idx in range(4)
    for sample_key in FOLD_ASSIGNMENTS.get(f"fold_{fold_idx}", [])
}

ALL_KEYS = sorted(FOLD_BY_KEY)
print(f"Samples per fold: {[len(FOLD_ASSIGNMENTS.get(f'fold_{i}', [])) for i in range(4)]}")
print(f"Total samples: {len(ALL_KEYS)}")
print(f"Audio files: {sum(1 for _ in AUDIO_DIR.glob('*.wav'))}")

In [None]:
# Cell 7: Initialize results tracking
# In-memory store of results, keyed by experiment ID; filled by the cells below.
ALL_RESULTS = {}

# Get completed experiments from GDrive
print("Checking GDrive for completed experiments...")
# COMPLETED_CACHE is handed to every should_run_experiment() call in later cells.
COMPLETED_CACHE = get_completed_experiments(GDRIVE_RESULTS)
print(f"Found {len(COMPLETED_CACHE)} completed experiments")

# Define experiment IDs
# Full list of experiment IDs, grouped by notebook part; used for the status printout.
EXPERIMENT_IDS = [
    # Part 1: MuQ Layer Ablation
    'M1a_muq_L1-6',
    'M1b_muq_L7-12',
    'M1c_muq_L13-24',
    'M1d_muq_L1-24',
    # Part 2: MuQ + Symbolic Fusion
    'F8_muq_symbolic_simple',
    'F9_muq_symbolic_weighted',
    'F10_muq_symbolic_ridge',
    'F11_muq_symbolic_confidence',
    # Part 3: MERT + MuQ Fusion
    'D9a_mert_muq_ensemble',
    'D9b_mert_muq_concat',
    'D9c_mert_muq_gated',
    # Part 4: Cross-Dataset
    'X1_pianovam_skill',
    'X2_asap_multiperformer',
    'X3_psyllabus_difficulty',
    # Part 5: Statistics
    'S3_bootstrap_all',
    'S4_significance_tests',
    # Part 6: Analysis
    'A3_error_correlation',
    'A4_dimension_breakdown',
    'A5_failure_cases',
    'A6_calibration',
    'A7_gate_visualization',
]

print_experiment_status(EXPERIMENT_IDS, COMPLETED_CACHE)

---
## Part 1: MuQ Layer Ablation (M1a-M1d)

Find the optimal MuQ layer range, mirroring the MERT experiments B1a-B1d.

In [None]:
# Cell 9: MuQ Layer Configurations
# One entry per ablation experiment. In this notebook 'layer_end' is one past the last
# layer named in the label (e.g. start=1, end=7 is described as "layers 1-6").
MUQ_LAYER_CONFIGS = {
    'M1a_muq_L1-6': {'layer_start': 1, 'layer_end': 7, 'desc': 'MuQ layers 1-6 (early acoustic)'},
    'M1b_muq_L7-12': {'layer_start': 7, 'layer_end': 13, 'desc': 'MuQ layers 7-12 (mid perceptual)'},
    'M1c_muq_L13-24': {'layer_start': 13, 'layer_end': 25, 'desc': 'MuQ layers 13-24 (late semantic)'},
    'M1d_muq_L1-24': {'layer_start': 1, 'layer_end': 25, 'desc': 'MuQ all layers 1-24'},
}

# MuQ Stats pooling config (proven best in prior experiments)
# Starts from the shared BASE_CONFIG and overrides/adds the keys below;
# 'max_epochs' (read by make_muq_stats_model) must therefore come from BASE_CONFIG.
MUQ_CONFIG = {
    **BASE_CONFIG,
    'input_dim': 1024,
    'hidden_dim': 512,
    'dropout': 0.2,
    'learning_rate': 1e-4,
    'weight_decay': 1e-5,
    'pooling_stats': 'mean_std',  # 2x input dim
}

def make_muq_stats_model(cfg):
    """Instantiate a MuQStatsModel from a config mapping.

    Every constructor keyword is read from the identically named key of
    ``cfg``; a missing key raises ``KeyError``.
    """
    hyperparams = (
        'input_dim', 'hidden_dim', 'dropout', 'learning_rate',
        'weight_decay', 'pooling_stats', 'max_epochs',
    )
    return MuQStatsModel(**{name: cfg[name] for name in hyperparams})

print("MuQ layer ablation configs ready")
# Show each range with its last layer as layer_end - 1, matching the experiment labels.
for exp_id, cfg in MUQ_LAYER_CONFIGS.items():
    print(f"  {exp_id}: layers {cfg['layer_start']}-{cfg['layer_end']-1}")

In [None]:
# Cell 10: M1a - MuQ Layers 1-6
exp_id = 'M1a_muq_L1-6'
cfg = MUQ_LAYER_CONFIGS[exp_id]

# Extraction and training only happen when should_run_experiment() returns True.
if should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    # Create layer-specific cache
    # The directory label uses layer_end - 1, i.e. "L1-6" for this config.
    cache_dir = MUQ_CACHE_ROOT / f"L{cfg['layer_start']}-{cfg['layer_end']-1}"
    cache_dir.mkdir(parents=True, exist_ok=True)
    
    # Extract embeddings
    extract_muq_embeddings(
        AUDIO_DIR, cache_dir, ALL_KEYS,
        layer_start=cfg['layer_start'],
        layer_end=cfg['layer_end']
    )
    
    # Train
    # The MERT 4-fold runner is reused for MuQ: the MuQ cache is passed through its
    # mert_cache_dir parameter.
    ALL_RESULTS[exp_id] = run_4fold_mert_experiment(
        exp_id=exp_id,
        description=cfg['desc'],
        model_factory=make_muq_stats_model,
        mert_cache_dir=cache_dir,
        labels=LABELS,
        fold_assignments=FOLD_ASSIGNMENTS,
        config=MUQ_CONFIG,
        checkpoint_root=CHECKPOINT_ROOT,
        results_dir=RESULTS_DIR,
        log_dir=LOG_DIR,
    )
    # presumably uploads this experiment's outputs to GDRIVE_RESULTS — confirm in audio_experiments.training
    sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# Cell 11: M1b - MuQ Layers 7-12
exp_id = 'M1b_muq_L7-12'
cfg = MUQ_LAYER_CONFIGS[exp_id]

# Extraction and training only happen when should_run_experiment() returns True.
if should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    # Layer-specific cache directory; the label uses layer_end - 1, i.e. "L7-12" here.
    cache_dir = MUQ_CACHE_ROOT / f"L{cfg['layer_start']}-{cfg['layer_end']-1}"
    cache_dir.mkdir(parents=True, exist_ok=True)
    
    # Extract MuQ embeddings for this layer range into the cache.
    extract_muq_embeddings(
        AUDIO_DIR, cache_dir, ALL_KEYS,
        layer_start=cfg['layer_start'],
        layer_end=cfg['layer_end']
    )
    
    # Train with the MERT 4-fold runner; the MuQ cache is passed through its
    # mert_cache_dir parameter.
    ALL_RESULTS[exp_id] = run_4fold_mert_experiment(
        exp_id=exp_id,
        description=cfg['desc'],
        model_factory=make_muq_stats_model,
        mert_cache_dir=cache_dir,
        labels=LABELS,
        fold_assignments=FOLD_ASSIGNMENTS,
        config=MUQ_CONFIG,
        checkpoint_root=CHECKPOINT_ROOT,
        results_dir=RESULTS_DIR,
        log_dir=LOG_DIR,
    )
    # presumably uploads this experiment's outputs to GDRIVE_RESULTS — confirm in audio_experiments.training
    sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# Cell 12: M1c - MuQ Layers 13-24
exp_id = 'M1c_muq_L13-24'
cfg = MUQ_LAYER_CONFIGS[exp_id]

# Extraction and training only happen when should_run_experiment() returns True.
if should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    # Layer-specific cache directory; the label uses layer_end - 1, i.e. "L13-24" here.
    cache_dir = MUQ_CACHE_ROOT / f"L{cfg['layer_start']}-{cfg['layer_end']-1}"
    cache_dir.mkdir(parents=True, exist_ok=True)
    
    # Extract MuQ embeddings for this layer range into the cache.
    extract_muq_embeddings(
        AUDIO_DIR, cache_dir, ALL_KEYS,
        layer_start=cfg['layer_start'],
        layer_end=cfg['layer_end']
    )
    
    # Train with the MERT 4-fold runner; the MuQ cache is passed through its
    # mert_cache_dir parameter.
    ALL_RESULTS[exp_id] = run_4fold_mert_experiment(
        exp_id=exp_id,
        description=cfg['desc'],
        model_factory=make_muq_stats_model,
        mert_cache_dir=cache_dir,
        labels=LABELS,
        fold_assignments=FOLD_ASSIGNMENTS,
        config=MUQ_CONFIG,
        checkpoint_root=CHECKPOINT_ROOT,
        results_dir=RESULTS_DIR,
        log_dir=LOG_DIR,
    )
    # presumably uploads this experiment's outputs to GDRIVE_RESULTS — confirm in audio_experiments.training
    sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# Cell 13: M1d - MuQ All Layers
exp_id = 'M1d_muq_L1-24'
cfg = MUQ_LAYER_CONFIGS[exp_id]

# Extraction and training only happen when should_run_experiment() returns True.
if should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    # Layer-specific cache directory; the label uses layer_end - 1, i.e. "L1-24" here
    # (the same directory Cell 25 falls back to).
    cache_dir = MUQ_CACHE_ROOT / f"L{cfg['layer_start']}-{cfg['layer_end']-1}"
    cache_dir.mkdir(parents=True, exist_ok=True)
    
    # Extract MuQ embeddings for this layer range into the cache.
    extract_muq_embeddings(
        AUDIO_DIR, cache_dir, ALL_KEYS,
        layer_start=cfg['layer_start'],
        layer_end=cfg['layer_end']
    )
    
    # Train with the MERT 4-fold runner; the MuQ cache is passed through its
    # mert_cache_dir parameter.
    ALL_RESULTS[exp_id] = run_4fold_mert_experiment(
        exp_id=exp_id,
        description=cfg['desc'],
        model_factory=make_muq_stats_model,
        mert_cache_dir=cache_dir,
        labels=LABELS,
        fold_assignments=FOLD_ASSIGNMENTS,
        config=MUQ_CONFIG,
        checkpoint_root=CHECKPOINT_ROOT,
        results_dir=RESULTS_DIR,
        log_dir=LOG_DIR,
    )
    # presumably uploads this experiment's outputs to GDRIVE_RESULTS — confirm in audio_experiments.training
    sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# Cell 14: MuQ Layer Ablation Summary
# Prints one row per ablation experiment and records the best one in
# best_muq_exp / best_muq_r2 (plus BEST_MUQ_CONFIG / BEST_MUQ_CACHE when one exists).
print("\n" + "="*70)
print("MuQ LAYER ABLATION RESULTS")
print("="*70)
print(f"{'Experiment':<25} {'Layers':<12} {'R2':>10} {'Std':>10}")
print("-"*70)

best_muq_exp = None
# Start below every attainable score: R2 is negative when a model does worse than
# predicting the mean, and such a run must still be selectable as the best one.
best_muq_r2 = float('-inf')

# MUQ_LAYER_CONFIGS holds exactly the M1a-M1d experiments, in that order.
for exp_id in MUQ_LAYER_CONFIGS:
    # Load from disk if not in memory
    if exp_id not in ALL_RESULTS:
        result_file = RESULTS_DIR / f"{exp_id}.json"
        if result_file.exists():
            with open(result_file) as f:
                ALL_RESULTS[exp_id] = json.load(f)

    if exp_id in ALL_RESULTS:
        r = ALL_RESULTS[exp_id]
        r2 = r['summary']['avg_r2']
        std = r['summary']['std_r2']
        cfg = MUQ_LAYER_CONFIGS[exp_id]
        layers = f"{cfg['layer_start']}-{cfg['layer_end']-1}"
        print(f"{exp_id:<25} {layers:<12} {r2:>10.4f} {std:>10.4f}")

        if r2 > best_muq_r2:
            best_muq_r2 = r2
            best_muq_exp = exp_id

print("-"*70)
if best_muq_exp:
    print(f"BEST: {best_muq_exp} (R2={best_muq_r2:.4f})")
    BEST_MUQ_CONFIG = MUQ_LAYER_CONFIGS[best_muq_exp]
    # Same directory naming as the extraction cells: "L<start>-<end - 1>".
    BEST_MUQ_CACHE = MUQ_CACHE_ROOT / f"L{BEST_MUQ_CONFIG['layer_start']}-{BEST_MUQ_CONFIG['layer_end']-1}"

---
## Part 2: MuQ + Symbolic Fusion (F8-F11)

In [None]:
# Cell 16: Load Symbolic Predictions
SYMBOLIC_PRED_FILE = DATA_ROOT / 'symbolic_predictions.json'
# The rclone result is not inspected here; a file that is still missing after the
# transfer surfaces as an error from the open() call below.
run_rclone(['rclone', 'copyto', GDRIVE_SYMBOLIC, str(SYMBOLIC_PRED_FILE)], "Downloading symbolic predictions")

with open(SYMBOLIC_PRED_FILE) as f:
    SYMBOLIC_PREDICTIONS = json.load(f)

# Cell 18 indexes this object by sample key, so it is expected to be a dict keyed by sample.
print(f"Loaded symbolic predictions for {len(SYMBOLIC_PREDICTIONS)} samples")

In [None]:
# Cell 17: Generate MuQ Predictions
def generate_muq_predictions(checkpoint_dir: Path, cache_dir: Path, fold_assignments: Dict, labels: Dict) -> Dict[str, List[float]]:
    """Generate CV predictions from trained MuQ models.

    For each of the four folds, loads ``fold{n}_best.ckpt`` from
    ``checkpoint_dir`` and runs it over that fold's ``"val"`` split of a
    ``MERTDataset`` built on ``cache_dir``.

    Args:
        checkpoint_dir: Directory holding the per-fold ``fold{n}_best.ckpt`` files.
        cache_dir: Embedding cache directory handed to ``MERTDataset``.
        fold_assignments: Fold assignment mapping, passed through to ``MERTDataset``.
        labels: Label mapping, passed through to ``MERTDataset``.

    Returns:
        Mapping from sample key to the model's prediction as a list of floats.
        Folds whose checkpoint file is missing are skipped with a printed
        warning, so the result may cover only part of the data.
    """
    from audio_experiments.data import MERTDataset, mert_collate_fn
    from torch.utils.data import DataLoader

    predictions = {}
    device = torch.device('cuda')

    for fold in range(4):
        ckpt_path = checkpoint_dir / f"fold{fold}_best.ckpt"
        if not ckpt_path.exists():
            print(f"Warning: checkpoint not found: {ckpt_path}")
            continue

        model = MuQStatsModel.load_from_checkpoint(ckpt_path)
        model = model.to(device).eval()

        # The dataset picks the validation samples for this fold itself.
        val_ds = MERTDataset(cache_dir, labels, fold_assignments, fold, "val", max_frames=1000)
        val_dl = DataLoader(val_ds, batch_size=32, shuffle=False, collate_fn=mert_collate_fn)

        with torch.no_grad():
            for batch in val_dl:
                pred = model(batch['embeddings'].to(device), batch['attention_mask'].to(device))
                for key, p in zip(batch['keys'], pred.cpu().numpy()):
                    predictions[key] = p.tolist()

        # Drop the model and release cached GPU memory before loading the next fold.
        del model
        torch.cuda.empty_cache()

    return predictions

# Generate predictions from best MuQ model
if not best_muq_exp:
    # No ablation result is available: leave the prediction table empty.
    MUQ_PREDICTIONS = {}
    print("WARNING: No MuQ model trained yet")
else:
    print(f"Generating MuQ predictions from {best_muq_exp}...")
    MUQ_PREDICTIONS = generate_muq_predictions(
        checkpoint_dir=CHECKPOINT_ROOT / best_muq_exp,
        cache_dir=BEST_MUQ_CACHE,
        fold_assignments=FOLD_ASSIGNMENTS,
        labels=LABELS,
    )
    print(f"Generated predictions for {len(MUQ_PREDICTIONS)} samples")

In [None]:
# Cell 18: Align predictions
# Keep only samples present in the MuQ predictions, the symbolic predictions and the labels.
FUSION_KEYS = sorted(
    set(MUQ_PREDICTIONS.keys()).intersection(SYMBOLIC_PREDICTIONS.keys(), LABELS.keys())
)
print(f"Aligned samples: {len(FUSION_KEYS)}")

# Row i of every array belongs to FUSION_KEYS[i]; labels are cut to their first 19 values.
MUQ_ARR = np.array([MUQ_PREDICTIONS[sample] for sample in FUSION_KEYS])
SYMBOLIC_ARR = np.array([SYMBOLIC_PREDICTIONS[sample] for sample in FUSION_KEYS])
LABELS_ARR = np.array([LABELS[sample][:19] for sample in FUSION_KEYS])

for shape_label, aligned in (("MuQ", MUQ_ARR), ("Symbolic", SYMBOLIC_ARR), ("Labels", LABELS_ARR)):
    print(f"{shape_label} shape: {aligned.shape}")

In [None]:
# Cell 19: F8 - Simple Average Fusion
exp_id = 'F8_muq_symbolic_simple'

# Runs only when should_run_experiment() returns True.
if should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    # Operates on the aligned arrays from Cell 18; n_bootstrap is forwarded to the runner.
    ALL_RESULTS[exp_id] = run_simple_fusion_experiment(
        exp_id, MUQ_ARR, SYMBOLIC_ARR, LABELS_ARR, n_bootstrap=10000
    )
    # Save locally first, then sync to GDrive.
    save_fusion_experiment(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, ALL_RESULTS)
    sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# Cell 20: F9 - Weighted Fusion
exp_id = 'F9_muq_symbolic_weighted'

# Runs only when should_run_experiment() returns True.
if should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    # Unlike F8, this runner also receives the key -> fold lookup and the row-aligned keys.
    ALL_RESULTS[exp_id] = run_weighted_fusion_experiment(
        exp_id, MUQ_ARR, SYMBOLIC_ARR, LABELS_ARR, FOLD_BY_KEY, FUSION_KEYS, n_bootstrap=10000
    )
    # Save locally first, then sync to GDrive.
    save_fusion_experiment(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, ALL_RESULTS)
    sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# Cell 21: F10 - Ridge Stacking
exp_id = 'F10_muq_symbolic_ridge'

# Runs only when should_run_experiment() returns True.
if should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    # Receives the key -> fold lookup and the row-aligned keys alongside the arrays.
    ALL_RESULTS[exp_id] = run_ridge_fusion_experiment(
        exp_id, MUQ_ARR, SYMBOLIC_ARR, LABELS_ARR, FOLD_BY_KEY, FUSION_KEYS, n_bootstrap=10000
    )
    # Save locally first, then sync to GDrive.
    save_fusion_experiment(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, ALL_RESULTS)
    sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# Cell 22: F11 - Confidence Weighted
exp_id = 'F11_muq_symbolic_confidence'

# Runs only when should_run_experiment() returns True.
if should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    # Operates on the aligned arrays from Cell 18; no fold information is passed.
    ALL_RESULTS[exp_id] = run_confidence_fusion_experiment(
        exp_id, MUQ_ARR, SYMBOLIC_ARR, LABELS_ARR, n_bootstrap=10000
    )
    # Save locally first, then sync to GDrive.
    save_fusion_experiment(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, ALL_RESULTS)
    sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

---
## Part 3: MERT + MuQ Audio Fusion (D9a-D9c)

Test whether the two audio encoders, MERT and MuQ, provide complementary information.

In [None]:
# Cell 24: Extract MERT embeddings (layers 7-12, best from prior ablation)
MERT_CACHE = MERT_CACHE_ROOT / 'L7-12'
MERT_CACHE.mkdir(parents=True, exist_ok=True)

# Try to download from GDrive first
# Best effort: the rclone result is not checked; gaps are left to the extractor call below.
run_rclone(['rclone', 'copy', GDRIVE_MERT_CACHE, str(MERT_CACHE)], "Downloading MERT cache")

# Extract any missing
# Arguments 7 and 13 follow the same start / one-past-end convention as the MuQ
# configs (layers 7-12).
extract_mert_for_layer_range(7, 13, AUDIO_DIR, MERT_CACHE, ALL_KEYS)
print(f"MERT embeddings ready: {len(list(MERT_CACHE.glob('*.pt')))} files")

In [None]:
# Cell 25: Ensure MuQ embeddings for best config
# The best-config cache is only filled when the matching M1* cell actually ran in this
# session. When its results were loaded from disk instead (Cell 14), the directory
# can be missing or empty, so it is created and topped up here as well.
if best_muq_exp:
    MUQ_CACHE = BEST_MUQ_CACHE
    muq_layer_start = BEST_MUQ_CONFIG['layer_start']
    muq_layer_end = BEST_MUQ_CONFIG['layer_end']
    MUQ_CACHE.mkdir(parents=True, exist_ok=True)
    # assumes one '*.pt' file per sample key, as the file count printed below suggests —
    # TODO confirm against extract_muq_embeddings
    muq_needs_extract = len(list(MUQ_CACHE.glob('*.pt'))) < len(ALL_KEYS)
else:
    # Default to all layers if no ablation done yet
    MUQ_CACHE = MUQ_CACHE_ROOT / 'L1-24'
    muq_layer_start, muq_layer_end = 1, 25
    MUQ_CACHE.mkdir(parents=True, exist_ok=True)
    muq_needs_extract = True

if muq_needs_extract:
    extract_muq_embeddings(
        AUDIO_DIR, MUQ_CACHE, ALL_KEYS,
        layer_start=muq_layer_start,
        layer_end=muq_layer_end,
    )

print(f"MuQ cache: {MUQ_CACHE}")
print(f"MuQ embeddings: {len(list(MUQ_CACHE.glob('*.pt')))} files")

In [None]:
# Cell 26: D9a - MERT+MuQ Ensemble (Late Fusion)
exp_id = 'D9a_mert_muq_ensemble'

# Shared config for the dual-encoder experiments (D9a-D9c): BASE_CONFIG plus the
# overrides below. Among the factories in these cells, only make_ensemble_model
# reads 'fusion_weight'.
DUAL_CONFIG = {
    **BASE_CONFIG,
    'input_dim': 1024,
    'hidden_dim': 512,
    'dropout': 0.2,
    'learning_rate': 1e-4,
    'weight_decay': 1e-5,
    'fusion_weight': 0.5,
}

def make_ensemble_model(cfg):
    """Instantiate the MERT+MuQ late-fusion ensemble from a config mapping.

    The keywords named in the comprehension below are read from the identically
    named keys of ``cfg`` (a missing key raises ``KeyError``); pooling is fixed
    to ``'attention'``.
    """
    return MERTMuQEnsemble(
        pooling='attention',
        **{
            name: cfg[name]
            for name in (
                'input_dim', 'hidden_dim', 'dropout', 'learning_rate',
                'weight_decay', 'fusion_weight', 'max_epochs',
            )
        },
    )

# Runs only when should_run_experiment() returns True.
if should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    # The dual-encoder runner receives both embedding caches (MERT from Cell 24,
    # MuQ from Cell 25).
    ALL_RESULTS[exp_id] = run_4fold_dual_experiment(
        exp_id=exp_id,
        description='MERT+MuQ late fusion ensemble',
        model_factory=make_ensemble_model,
        mert_cache_dir=MERT_CACHE,
        muq_cache_dir=MUQ_CACHE,
        labels=LABELS,
        fold_assignments=FOLD_ASSIGNMENTS,
        config=DUAL_CONFIG,
        checkpoint_root=CHECKPOINT_ROOT,
        results_dir=RESULTS_DIR,
        log_dir=LOG_DIR,
    )
    # presumably uploads this experiment's outputs to GDRIVE_RESULTS — confirm in audio_experiments.training
    sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# Cell 27: D9b - MERT+MuQ Concat (Early Fusion)
exp_id = 'D9b_mert_muq_concat'

def make_concat_model(cfg):
    """Instantiate the MERT+MuQ early-fusion (concat) model from a config mapping.

    Both encoder widths come from ``cfg['input_dim']``; pooling is fixed to
    ``'attention'``. A missing key raises ``KeyError``.
    """
    encoder_dim = cfg['input_dim']
    return MERTMuQConcatModel(
        mert_dim=encoder_dim,
        muq_dim=encoder_dim,
        pooling='attention',
        **{
            name: cfg[name]
            for name in ('hidden_dim', 'dropout', 'learning_rate', 'weight_decay', 'max_epochs')
        },
    )

# Runs only when should_run_experiment() returns True.
if should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    # Reuses DUAL_CONFIG, MERT_CACHE and MUQ_CACHE defined in earlier cells.
    ALL_RESULTS[exp_id] = run_4fold_dual_experiment(
        exp_id=exp_id,
        description='MERT+MuQ early fusion concat',
        model_factory=make_concat_model,
        mert_cache_dir=MERT_CACHE,
        muq_cache_dir=MUQ_CACHE,
        labels=LABELS,
        fold_assignments=FOLD_ASSIGNMENTS,
        config=DUAL_CONFIG,
        checkpoint_root=CHECKPOINT_ROOT,
        results_dir=RESULTS_DIR,
        log_dir=LOG_DIR,
    )
    # presumably uploads this experiment's outputs to GDRIVE_RESULTS — confirm in audio_experiments.training
    sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# Cell 28: D9c - MERT+MuQ Gated Fusion
exp_id = 'D9c_mert_muq_gated'

def make_gated_model(cfg):
    """Instantiate the asymmetric gated MERT+MuQ fusion model from a config mapping.

    ``cfg['input_dim']`` supplies both encoder widths and ``cfg['hidden_dim']``
    supplies both ``mert_hidden`` and ``shared_dim``; pooling is fixed to
    ``'attention'``. A missing key raises ``KeyError``.
    """
    encoder_dim = cfg['input_dim']
    hidden_width = cfg['hidden_dim']
    return AsymmetricGatedFusion(
        mert_dim=encoder_dim,
        muq_dim=encoder_dim,
        mert_hidden=hidden_width,
        shared_dim=hidden_width,
        pooling='attention',
        **{
            name: cfg[name]
            for name in ('dropout', 'learning_rate', 'weight_decay', 'max_epochs')
        },
    )

# Runs only when should_run_experiment() returns True.
if should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    # Reuses DUAL_CONFIG, MERT_CACHE and MUQ_CACHE defined in earlier cells.
    ALL_RESULTS[exp_id] = run_4fold_dual_experiment(
        exp_id=exp_id,
        description='MERT+MuQ asymmetric gated fusion',
        model_factory=make_gated_model,
        mert_cache_dir=MERT_CACHE,
        muq_cache_dir=MUQ_CACHE,
        labels=LABELS,
        fold_assignments=FOLD_ASSIGNMENTS,
        config=DUAL_CONFIG,
        checkpoint_root=CHECKPOINT_ROOT,
        results_dir=RESULTS_DIR,
        log_dir=LOG_DIR,
    )
    # presumably uploads this experiment's outputs to GDRIVE_RESULTS — confirm in audio_experiments.training
    sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# Cell 29: Extract gate weights from D9c
# Loads the fold-0 checkpoint of the gated fusion model, runs one batch of up to 32
# fold-0 samples through get_learned_gates(), and stores one MERT weight per
# PercePiano dimension in GATE_WEIGHTS (empty dict when the checkpoint is missing).
exp_id = 'D9c_mert_muq_gated'
ckpt_path = CHECKPOINT_ROOT / exp_id / 'fold0_best.ckpt'

if ckpt_path.exists():
    from audio_experiments.data import DualEmbeddingDataset, dual_collate_fn
    from torch.utils.data import DataLoader

    model = AsymmetricGatedFusion.load_from_checkpoint(ckpt_path)
    model = model.to('cuda').eval()

    # Get sample batch for gate extraction
    val_keys = FOLD_ASSIGNMENTS.get('fold_0', [])[:32]
    ds = DualEmbeddingDataset(MERT_CACHE, MUQ_CACHE, LABELS, val_keys, max_frames=1000)
    dl = DataLoader(ds, batch_size=32, collate_fn=dual_collate_fn)
    batch = next(iter(dl))

    # Inference only: skip autograd bookkeeping, as generate_muq_predictions does.
    with torch.no_grad():
        gate_info = model.get_learned_gates(
            batch['mert_embeddings'].cuda(),
            batch['muq_embeddings'].cuda(),
            batch['mert_mask'].cuda(),
            batch['muq_mask'].cuda(),
        )

    # Store gate weights per dimension
    GATE_WEIGHTS = {
        dim: float(gate_info['mert_weight_per_dim'][i])
        for i, dim in enumerate(PERCEPIANO_DIMENSIONS)
    }

    print("\nLearned Gate Weights (higher = more MERT):")
    for dim, weight in sorted(GATE_WEIGHTS.items(), key=lambda x: -x[1]):
        print(f"  {dim:<25}: {weight:.3f}")

    # Save to results
    # Only the in-memory entry is updated, and only if D9c results exist in this session.
    if exp_id in ALL_RESULTS:
        ALL_RESULTS[exp_id]['gate_weights'] = GATE_WEIGHTS

    # Release the model and cached GPU memory.
    del model
    torch.cuda.empty_cache()
else:
    print(f"Checkpoint not found: {ckpt_path}")
    GATE_WEIGHTS = {}

---
## Part 4: Cross-Dataset Validation (X1-X3)

Validate model generalization on external datasets.

In [None]:
# Cell 31: X1 - PianoVAM Skill Level Validation
# Dataset: https://yonghyunk1m.github.io/PianoVAM
# 106 recordings, 10 amateur pianists, 3 skill levels

exp_id = 'X1_pianovam_skill'

def download_pianovam(data_dir: Path) -> Tuple[List[Path], np.ndarray, List[str]]:
    """Download the PianoVAM dataset and return audio paths with skill levels.

    Metadata is fetched first. If it cannot be fetched or parsed, the function
    falls back to ``*.wav`` files already present in ``data_dir / 'audio'`` and
    derives the skill level from the file name.

    Args:
        data_dir: Dataset root; audio is stored under ``data_dir / 'audio'``
            (created if missing).

    Returns:
        ``(audio_paths, skill_levels, keys)`` where ``skill_levels`` is an
        integer array (0=Beginner, 1=Intermediate, 2=Advanced) and ``keys``
        are the file stems, all index-aligned.

    Raises:
        FileNotFoundError: Metadata is unavailable and no local audio exists.

    Note:
        An unrecognised skill label (or a file name containing neither
        'advanced' nor 'intermediate') is mapped to 0.
    """
    import requests
    from tqdm import tqdm

    # PianoVAM metadata
    # Note: Replace with actual download URL when available
    metadata_url = "https://raw.githubusercontent.com/yonghyunk1m/PianoVAM/main/metadata.json"
    audio_base = "https://github.com/yonghyunk1m/PianoVAM/releases/download/v1.0/"
    skill_map = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}

    audio_dir = data_dir / 'audio'
    audio_dir.mkdir(parents=True, exist_ok=True)

    # Try to download metadata
    try:
        resp = requests.get(metadata_url, timeout=10)
        resp.raise_for_status()
        metadata = resp.json()
    except (requests.RequestException, ValueError):
        # Only network/HTTP/JSON failures trigger the fallback; a bare except
        # would also swallow KeyboardInterrupt and SystemExit.
        print("PianoVAM metadata not accessible. Using local data if available.")
        # Sorted so the returned order is reproducible across runs.
        local_audio = sorted(audio_dir.glob('*.wav'))
        if not local_audio:
            raise FileNotFoundError("PianoVAM data not found")

        # Assume skill levels from filename pattern
        skill_levels = []
        keys = []
        for p in local_audio:
            name = p.stem.lower()
            if 'advanced' in name:
                skill_levels.append(2)
            elif 'intermediate' in name:
                skill_levels.append(1)
            else:
                skill_levels.append(0)
            keys.append(p.stem)
        return local_audio, np.array(skill_levels), keys

    # Download audio files
    audio_paths = []
    skill_levels = []
    keys = []

    for item in tqdm(metadata['recordings'], desc='Downloading PianoVAM'):
        audio_path = audio_dir / item['filename']

        if not audio_path.exists():
            try:
                resp = requests.get(audio_base + item['filename'], timeout=60)
            except requests.RequestException as e:
                # Same outcome as a non-200 response: the recording is skipped.
                print(f"  Failed to download {item['filename']}: {e}")
                continue
            if resp.status_code == 200:
                # Write to a side file and rename, so an interrupted write
                # never leaves a truncated .wav that later looks complete.
                tmp_path = audio_path.with_name(audio_path.name + '.part')
                tmp_path.write_bytes(resp.content)
                tmp_path.replace(audio_path)

        if audio_path.exists():
            audio_paths.append(audio_path)
            skill_levels.append(skill_map.get(item['skill_level'], 0))
            keys.append(audio_path.stem)

    return audio_paths, np.array(skill_levels), keys

if should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    # X1: check whether the per-recording mean predicted score separates the
    # three PianoVAM skill groups (one-way ANOVA plus eta-squared).
    model = None  # bound up front so the cleanup in `finally` is always valid
    try:
        # Download dataset
        vam_audio_paths, vam_skill_levels, vam_keys = download_pianovam(PIANOVAM_DIR)
        print(f"PianoVAM: {len(vam_audio_paths)} recordings")
        print(f"Skill distribution: Beginner={sum(vam_skill_levels==0)}, Intermediate={sum(vam_skill_levels==1)}, Advanced={sum(vam_skill_levels==2)}")

        # Extract MuQ embeddings
        vam_cache = PIANOVAM_DIR / 'muq_cache'
        vam_cache.mkdir(parents=True, exist_ok=True)

        if best_muq_exp:
            cfg = BEST_MUQ_CONFIG
            extract_muq_embeddings(
                PIANOVAM_DIR / 'audio', vam_cache, vam_keys,
                layer_start=cfg['layer_start'], layer_end=cfg['layer_end']
            )

        # Load trained model and predict
        ckpt_path = CHECKPOINT_ROOT / best_muq_exp / 'fold0_best.ckpt'
        model = MuQStatsModel.load_from_checkpoint(ckpt_path)
        model = model.to('cuda').eval()

        # Score only recordings that have an embedding. Substituting an
        # all-zero prediction for a missing one would pull that skill group's
        # mean toward zero and distort the ANOVA.
        vam_predictions = []
        scored_idx = []
        with torch.no_grad():
            for idx, key in enumerate(vam_keys):
                emb_path = vam_cache / f"{key}.pt"
                if not emb_path.exists():
                    continue
                emb = torch.load(emb_path).unsqueeze(0).cuda()
                mask = torch.ones(1, emb.shape[1], dtype=torch.bool).cuda()
                vam_predictions.append(model(emb, mask).cpu().numpy()[0])
                scored_idx.append(idx)

        n_missing = len(vam_keys) - len(scored_idx)
        if n_missing:
            print(f"  WARNING: {n_missing} recordings have no embedding and are excluded")
        if not scored_idx:
            raise FileNotFoundError("No PianoVAM embeddings available")

        vam_predictions = np.array(vam_predictions)
        scored_levels = vam_skill_levels[scored_idx]

        # ANOVA test on the per-recording mean across dimensions
        beginner_preds = vam_predictions[scored_levels == 0].mean(axis=1)
        intermediate_preds = vam_predictions[scored_levels == 1].mean(axis=1)
        advanced_preds = vam_predictions[scored_levels == 2].mean(axis=1)
        groups = [beginner_preds, intermediate_preds, advanced_preds]

        f_stat, p_value = stats.f_oneway(*groups)

        # Effect size (eta-squared) = between-group SS / total SS
        all_preds = np.concatenate(groups)
        grand_mean = all_preds.mean()
        ss_between = sum(len(g) * (g.mean() - grand_mean)**2 for g in groups)
        ss_total = ((all_preds - grand_mean)**2).sum()
        eta_squared = ss_between / ss_total if ss_total > 0 else 0

        ALL_RESULTS[exp_id] = {
            'exp_id': exp_id,
            'n_samples': len(vam_keys),
            'n_scored': len(scored_idx),
            'n_missing_embeddings': n_missing,
            'skill_distribution': {
                'beginner': int(sum(vam_skill_levels == 0)),
                'intermediate': int(sum(vam_skill_levels == 1)),
                'advanced': int(sum(vam_skill_levels == 2)),
            },
            'mean_predictions': {
                'beginner': float(beginner_preds.mean()),
                'intermediate': float(intermediate_preds.mean()),
                'advanced': float(advanced_preds.mean()),
            },
            'anova': {
                'f_statistic': float(f_stat),
                'p_value': float(p_value),
                'significant': bool(p_value < 0.01),
            },
            'eta_squared': float(eta_squared),
            'monotonic': bool(advanced_preds.mean() > intermediate_preds.mean() > beginner_preds.mean()),
        }

        print(f"\nPianoVAM Results:")
        print(f"  F-statistic: {f_stat:.2f}")
        print(f"  p-value: {p_value:.4f}")
        print(f"  Eta-squared: {eta_squared:.4f}")
        print(f"  Monotonic ordering: {ALL_RESULTS[exp_id]['monotonic']}")

        save_fusion_experiment(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, ALL_RESULTS)
        sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

    except FileNotFoundError as e:
        print(f"SKIP {exp_id}: {e}")
    except Exception as e:
        print(f"ERROR {exp_id}: {e}")
    finally:
        # Release the model on failure paths too.
        del model
        torch.cuda.empty_cache()

In [None]:
# Cell 32: X2 - ASAP Multi-Performer Analysis
# Dataset: https://github.com/fosfrancesco/asap-dataset
# For pieces with 5+ performances, measures how much the predicted scores vary
# between performances of the same piece.

exp_id = 'X2_asap_multiperformer'

if should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    model = None  # bound up front so the cleanup in `finally` is always valid
    try:
        # ASAP requires MAESTRO audio - check if available
        asap_audio = ASAP_DIR / 'audio'
        if not asap_audio.exists() or not any(asap_audio.glob('*.wav')):
            print("ASAP audio not available. Please run: python initialize_dataset.py -m [maestro_path]")
            raise FileNotFoundError("ASAP audio not found")

        # Load ASAP metadata
        asap_metadata = ASAP_DIR / 'metadata.json'
        if not asap_metadata.exists():
            raise FileNotFoundError("ASAP metadata not found")

        with open(asap_metadata) as f:
            metadata = json.load(f)

        # Group performances by piece, then keep pieces with 5+ performances
        piece_to_performances = {}
        for item in metadata['performances']:
            piece_to_performances.setdefault(item['piece_id'], []).append(item)

        multi_performer_pieces = {
            p: perfs for p, perfs in piece_to_performances.items() if len(perfs) >= 5
        }

        print(f"Found {len(multi_performer_pieces)} pieces with 5+ performances")

        # Extract embeddings and predict
        asap_cache = ASAP_DIR / 'muq_cache'
        asap_cache.mkdir(parents=True, exist_ok=True)

        # Load model
        ckpt_path = CHECKPOINT_ROOT / best_muq_exp / 'fold0_best.ckpt'
        model = MuQStatsModel.load_from_checkpoint(ckpt_path)
        model = model.to('cuda').eval()

        piece_variances = {}
        for piece_id, performances in list(multi_performer_pieces.items())[:10]:  # Limit for speed
            piece_preds = []
            for perf in performances:
                audio_path = asap_audio / perf['audio_filename']
                key = audio_path.stem

                # Extract embedding if needed
                emb_path = asap_cache / f"{key}.pt"
                if not emb_path.exists() and audio_path.exists():
                    extract_muq_embeddings(
                        audio_path.parent, asap_cache, [key],
                        layer_start=BEST_MUQ_CONFIG['layer_start'],
                        layer_end=BEST_MUQ_CONFIG['layer_end']
                    )

                # Performances without an embedding are left out of the piece
                if emb_path.exists():
                    with torch.no_grad():
                        emb = torch.load(emb_path).unsqueeze(0).cuda()
                        mask = torch.ones(1, emb.shape[1], dtype=torch.bool).cuda()
                        pred = model(emb, mask).cpu().numpy()[0]
                        piece_preds.append(pred)

            # A spread needs at least two scored performances
            if len(piece_preds) >= 2:
                piece_preds = np.array(piece_preds)
                piece_variances[piece_id] = {
                    'n_performances': len(piece_preds),
                    'mean_pred': float(piece_preds.mean()),
                    # Std across performances of the per-performance mean score
                    'std_pred': float(piece_preds.mean(axis=1).std()),
                    'per_dim_std': piece_preds.std(axis=0).tolist(),
                }

        # Compute overall statistics
        all_stds = [v['std_pred'] for v in piece_variances.values()]
        mean_intra_piece_std = np.mean(all_stds) if all_stds else 0

        ALL_RESULTS[exp_id] = {
            'exp_id': exp_id,
            'n_pieces': len(piece_variances),
            'mean_intra_piece_std': float(mean_intra_piece_std),
            'meaningful_variation': bool(mean_intra_piece_std > 0.05),
            'piece_details': piece_variances,
        }

        print(f"\nASAP Multi-Performer Results:")
        print(f"  Pieces analyzed: {len(piece_variances)}")
        print(f"  Mean intra-piece std: {mean_intra_piece_std:.4f}")
        print(f"  Meaningful variation: {ALL_RESULTS[exp_id]['meaningful_variation']}")

        save_fusion_experiment(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, ALL_RESULTS)
        sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

    except FileNotFoundError as e:
        print(f"SKIP {exp_id}: {e}")
    except Exception as e:
        print(f"ERROR {exp_id}: {e}")
    finally:
        # Release the model on failure paths too.
        del model
        torch.cuda.empty_cache()

In [None]:
# Cell 33: X3 - PSyllabus Difficulty Correlation
# Dataset: https://zenodo.org/records/14794592
# Spearman correlation between piece difficulty (levels 1-11) and the mean
# predicted score across dimensions.

exp_id = 'X3_psyllabus_difficulty'

if should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    model = None  # bound up front so the cleanup in `finally` is always valid
    try:
        # Check for PSyllabus data
        psyllabus_metadata = PSYLLABUS_DIR / 'metadata.json'
        psyllabus_audio = PSYLLABUS_DIR / 'audio'

        if not psyllabus_metadata.exists():
            print("PSyllabus metadata not found. Please download from Zenodo.")
            raise FileNotFoundError("PSyllabus data not found")

        with open(psyllabus_metadata) as f:
            metadata = json.load(f)

        # Sample pieces across difficulty levels
        pieces_by_difficulty = {i: [] for i in range(1, 12)}  # Levels 1-11
        for item in metadata['pieces']:
            diff = item.get('difficulty', 0)
            # Membership test instead of a range check: a value such as 3.5
            # lies inside 1..11 but is not a bucket, and would raise KeyError
            # and abort the whole experiment.
            if diff in pieces_by_difficulty:
                pieces_by_difficulty[diff].append(item)

        # Sample up to 50 per difficulty level
        sampled_pieces = []
        for pieces in pieces_by_difficulty.values():
            sampled_pieces.extend(pieces[:50])

        print(f"PSyllabus: {len(sampled_pieces)} pieces sampled")

        # Extract embeddings and predict
        psyllabus_cache = PSYLLABUS_DIR / 'muq_cache'
        psyllabus_cache.mkdir(parents=True, exist_ok=True)

        ckpt_path = CHECKPOINT_ROOT / best_muq_exp / 'fold0_best.ckpt'
        model = MuQStatsModel.load_from_checkpoint(ckpt_path)
        model = model.to('cuda').eval()

        # Index-aligned: one entry per piece that has an embedding
        difficulties = []
        predictions = []

        for piece in sampled_pieces:
            audio_path = psyllabus_audio / piece['audio_filename']
            key = audio_path.stem

            emb_path = psyllabus_cache / f"{key}.pt"
            if not emb_path.exists() and audio_path.exists():
                extract_muq_embeddings(
                    audio_path.parent, psyllabus_cache, [key],
                    layer_start=BEST_MUQ_CONFIG['layer_start'],
                    layer_end=BEST_MUQ_CONFIG['layer_end']
                )

            if emb_path.exists():
                with torch.no_grad():
                    emb = torch.load(emb_path).unsqueeze(0).cuda()
                    mask = torch.ones(1, emb.shape[1], dtype=torch.bool).cuda()
                    pred = model(emb, mask).cpu().numpy()[0]
                    predictions.append(pred.mean())  # Mean across dimensions
                    difficulties.append(piece['difficulty'])

        if len(predictions) > 10:
            # Compute Spearman correlation
            rho, p_value = stats.spearmanr(difficulties, predictions)

            ALL_RESULTS[exp_id] = {
                'exp_id': exp_id,
                'n_samples': len(predictions),
                'spearman_rho': float(rho),
                'p_value': float(p_value),
                'significant': bool(p_value < 0.05),
                'weak_positive': bool(rho > 0.2),
                'difficulty_range': [int(min(difficulties)), int(max(difficulties))],
            }

            print(f"\nPSyllabus Results:")
            print(f"  Samples: {len(predictions)}")
            print(f"  Spearman rho: {rho:.4f}")
            print(f"  p-value: {p_value:.4f}")

            save_fusion_experiment(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, ALL_RESULTS)
            sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)
        else:
            print("Insufficient data for correlation analysis")

    except FileNotFoundError as e:
        print(f"SKIP {exp_id}: {e}")
    except Exception as e:
        print(f"ERROR {exp_id}: {e}")
    finally:
        # Release the model on failure paths too.
        del model
        torch.cuda.empty_cache()

---
## Part 5: Statistical Rigor (S3-S4)

Bootstrap CIs and significance tests for all comparisons.

In [None]:
# Cell 35: S3 - Bootstrap CIs for all comparisons
# Runs up to two bootstrap analyses (10000 resamples each) on the prediction
# arrays and stores whatever was computed under ALL_RESULTS[exp_id]['comparisons'].
exp_id = 'S3_bootstrap_all'

if should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    # Holds one entry per analysis whose input arrays are non-empty.
    bootstrap_results = {}
    
    # MuQ vs Symbolic
    # The comparison result is read below via the keys 'r2_a' (MuQ),
    # 'r2_b' (Symbolic), 'difference' and 'a_significantly_better'.
    if len(MUQ_ARR) > 0 and len(SYMBOLIC_ARR) > 0:
        print("Computing MuQ vs Symbolic bootstrap...")
        bootstrap_results['muq_vs_symbolic'] = bootstrap_r2_comparison(
            LABELS_ARR, MUQ_ARR, SYMBOLIC_ARR, n_bootstrap=10000
        )
        print(f"  MuQ: {bootstrap_results['muq_vs_symbolic']['r2_a']:.4f}")
        print(f"  Symbolic: {bootstrap_results['muq_vs_symbolic']['r2_b']:.4f}")
        print(f"  Diff: {bootstrap_results['muq_vs_symbolic']['difference']:.4f}")
        print(f"  MuQ significantly better: {bootstrap_results['muq_vs_symbolic']['a_significantly_better']}")
    
    # MuQ CIs
    # The result's 'overall' entry supplies 'r2', 'ci_lower' and 'ci_upper'.
    if len(MUQ_ARR) > 0:
        print("\nComputing MuQ bootstrap CIs...")
        bootstrap_results['muq_ci'] = bootstrap_r2_extended(LABELS_ARR, MUQ_ARR, n_bootstrap=10000)
        print(f"  R2: {bootstrap_results['muq_ci']['overall']['r2']:.4f}")
        print(f"  95% CI: [{bootstrap_results['muq_ci']['overall']['ci_lower']:.4f}, {bootstrap_results['muq_ci']['overall']['ci_upper']:.4f}]")
    
    # Recorded and synced even when no analysis ran; 'comparisons' is then {}.
    ALL_RESULTS[exp_id] = {
        'exp_id': exp_id,
        'n_bootstrap': 10000,
        'comparisons': bootstrap_results,
    }
    
    save_fusion_experiment(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, ALL_RESULTS)
    sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# Cell 36: S4 - Significance Tests
# MuQ vs Symbolic: overall paired t-test, Wilcoxon test and Cohen's d, then a
# paired t-test per dimension with Bonferroni correction.
exp_id = 'S4_significance_tests'

if should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    significance_results = {}

    if len(MUQ_ARR) > 0 and len(SYMBOLIC_ARR) > 0:
        # Paired t-test
        ttest = paired_ttest_per_sample(LABELS_ARR, MUQ_ARR, SYMBOLIC_ARR)
        significance_results['paired_ttest'] = ttest
        print(f"Paired t-test: t={ttest['t_stat']:.4f}, p={ttest['p_value']:.2e}")

        # Wilcoxon
        wilcox = wilcoxon_test(LABELS_ARR, MUQ_ARR, SYMBOLIC_ARR)
        significance_results['wilcoxon'] = wilcox
        print(f"Wilcoxon: stat={wilcox['stat']:.4f}, p={wilcox['p_value']:.2e}")

        # Cohen's d
        d = cohens_d(LABELS_ARR, MUQ_ARR, SYMBOLIC_ARR)
        significance_results['cohens_d'] = d
        print(f"Cohen's d: {d:.4f}")

        # Per-dimension tests with Bonferroni.
        # The count comes from PERCEPIANO_DIMENSIONS rather than a literal 19,
        # because the result dict below indexes per_dim_p by that same list.
        n_dims = len(PERCEPIANO_DIMENSIONS)
        per_dim_p = []
        for i in range(n_dims):
            # i:i+1 keeps a 2-D single-column slice for the helper
            t = paired_ttest_per_sample(
                LABELS_ARR[:, i:i+1],
                MUQ_ARR[:, i:i+1],
                SYMBOLIC_ARR[:, i:i+1]
            )
            per_dim_p.append(t['p_value'])

        bonf_corrected, bonf_sig = bonferroni_correction(np.array(per_dim_p))
        significance_results['per_dimension'] = {
            dim: {
                'raw_p': per_dim_p[i],
                'corrected_p': float(bonf_corrected[i]),
                'significant': bool(bonf_sig[i]),
            }
            for i, dim in enumerate(PERCEPIANO_DIMENSIONS)
        }

        print(f"\nBonferroni correction: {sum(bonf_sig)}/{n_dims} dimensions significant")

    # Recorded and synced even when no test ran; 'tests' is then {}.
    ALL_RESULTS[exp_id] = {
        'exp_id': exp_id,
        'tests': significance_results,
    }

    save_fusion_experiment(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, ALL_RESULTS)
    sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

---
## Part 6: Analysis (A3-A7)

In [None]:
# Cell 38: A3 - Error Correlation Analysis
# Delegates to run_error_correlation_experiment when the experiment is due and
# both prediction arrays are non-empty.
exp_id = 'A3_error_correlation'

if (
    should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE)
    and len(MUQ_ARR) > 0
    and len(SYMBOLIC_ARR) > 0
):
    ALL_RESULTS[exp_id] = run_error_correlation_experiment(
        exp_id, MUQ_ARR, SYMBOLIC_ARR, LABELS_ARR
    )
    save_fusion_experiment(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, ALL_RESULTS)
    sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# Cell 39: A4 - Per-Dimension Breakdown
# Per-dimension R2 for MuQ, Symbolic and their simple-average fusion, plus a
# per-category tally of which model wins.
exp_id = 'A4_dimension_breakdown'

if (
    should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE)
    and len(MUQ_ARR) > 0
    and len(SYMBOLIC_ARR) > 0
):
    fused = simple_average_fusion(MUQ_ARR, SYMBOLIC_ARR)

    dim_comparison = {}
    for i, dim in enumerate(PERCEPIANO_DIMENSIONS):
        muq_r2 = r2_score(LABELS_ARR[:, i], MUQ_ARR[:, i])
        symbolic_r2 = r2_score(LABELS_ARR[:, i], SYMBOLIC_ARR[:, i])
        fusion_r2 = r2_score(LABELS_ARR[:, i], fused[:, i])

        # First category listing this dimension, or None when no category does
        category = next(
            (name for name, members in DIMENSION_CATEGORIES.items() if dim in members),
            None,
        )

        # Ties go to 'symbolic' (strict comparison)
        if muq_r2 > symbolic_r2:
            winner = 'muq'
        else:
            winner = 'symbolic'

        dim_comparison[dim] = {
            'muq_r2': float(muq_r2),
            'symbolic_r2': float(symbolic_r2),
            'fusion_r2': float(fusion_r2),
            'winner': winner,
            'muq_advantage': float(muq_r2 - symbolic_r2),
            'category': category,
        }

    # Count winners by category
    category_summary = {}
    for cat in DIMENSION_CATEGORIES:
        cat_winners = [v['winner'] for v in dim_comparison.values() if v['category'] == cat]
        muq_wins = cat_winners.count('muq')
        category_summary[cat] = {
            'total': len(cat_winners),
            'muq_wins': muq_wins,
            'symbolic_wins': len(cat_winners) - muq_wins,
        }

    ALL_RESULTS[exp_id] = {
        'exp_id': exp_id,
        'per_dimension': dim_comparison,
        'category_summary': category_summary,
        'muq_total_wins': [v['winner'] for v in dim_comparison.values()].count('muq'),
    }

    print(f"\nDimension Breakdown:")
    print(f"  MuQ wins: {ALL_RESULTS[exp_id]['muq_total_wins']}/19 dimensions")

    save_fusion_experiment(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, ALL_RESULTS)
    sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# Cell 40: A5 - Failure Cases
# Records the (up to) ten samples with the highest per-sample MSE for MuQ,
# ordered worst first, each with its three largest per-dimension errors.
exp_id = 'A5_failure_cases'

if should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    if len(MUQ_ARR) > 0:
        # Compute per-sample MSE
        mse_per_sample = ((LABELS_ARR - MUQ_ARR) ** 2).mean(axis=1)

        # Find worst predictions. argsort is ascending, so reverse it before
        # slicing: failure_cases[0] is then the worst sample, and the
        # "Worst samples" line below really prints the top three.
        worst_indices = np.argsort(mse_per_sample)[::-1][:10]

        failure_cases = []
        for idx in worst_indices:
            key = FUSION_KEYS[idx]
            sample_mse = mse_per_sample[idx]

            # Three dimensions with the largest absolute error, largest first
            dim_errors = np.abs(LABELS_ARR[idx] - MUQ_ARR[idx])
            worst_dims = np.argsort(dim_errors)[::-1][:3]

            failure_cases.append({
                'key': key,
                'mse': float(sample_mse),
                'worst_dimensions': [PERCEPIANO_DIMENSIONS[i] for i in worst_dims],
                'predicted': MUQ_ARR[idx].tolist(),
                'actual': LABELS_ARR[idx].tolist(),
            })

        ALL_RESULTS[exp_id] = {
            'exp_id': exp_id,
            'n_samples': len(mse_per_sample),
            'mean_mse': float(mse_per_sample.mean()),
            'max_mse': float(mse_per_sample.max()),
            'failure_cases': failure_cases,
        }

        print(f"\nFailure Case Analysis:")
        print(f"  Mean MSE: {ALL_RESULTS[exp_id]['mean_mse']:.4f}")
        print(f"  Max MSE: {ALL_RESULTS[exp_id]['max_mse']:.4f}")
        print(f"  Worst samples: {[f['key'] for f in failure_cases[:3]]}")

        save_fusion_experiment(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, ALL_RESULTS)
        sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# Cell 41: A6 - Calibration
# Bins all MuQ predictions into ten equal-width bins over [0, 1], compares the
# mean prediction with the mean label per bin, and reports the dispersion
# ratio (prediction std / label std).
exp_id = 'A6_calibration'

if should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    if len(MUQ_ARR) > 0:
        # Equal-width bins (not quantiles), so bin counts can differ widely
        n_bins = 10
        calibration = []

        # Flatten for overall calibration
        preds_flat = MUQ_ARR.flatten()
        labels_flat = LABELS_ARR.flatten()

        # Bin by predicted values
        bins = np.linspace(0, 1, n_bins + 1)
        for i in range(n_bins):
            lower, upper = bins[i], bins[i+1]
            if i == n_bins - 1:
                # Last bin is closed on the right; with a strict '<' a
                # prediction of exactly 1.0 would fall into no bin.
                mask = (preds_flat >= lower) & (preds_flat <= upper)
            else:
                mask = (preds_flat >= lower) & (preds_flat < upper)
            if mask.sum() > 0:
                pred_mean = preds_flat[mask].mean()
                label_mean = labels_flat[mask].mean()
                calibration.append({
                    'bin': i,
                    'bin_range': [float(lower), float(upper)],
                    'count': int(mask.sum()),
                    'mean_predicted': float(pred_mean),
                    'mean_actual': float(label_mean),
                    'error': float(pred_mean - label_mean),
                })

        # Predictions outside [0, 1] belong to no bin; count them so they are
        # not lost silently.
        n_out_of_range = int(((preds_flat < 0) | (preds_flat > 1)).sum())
        if n_out_of_range:
            print(f"  WARNING: {n_out_of_range} predictions fall outside [0, 1] and are not binned")

        # Dispersion ratio
        pred_std = MUQ_ARR.std()
        label_std = LABELS_ARR.std()
        dispersion_ratio = pred_std / label_std if label_std > 0 else 0

        ALL_RESULTS[exp_id] = {
            'exp_id': exp_id,
            'calibration_bins': calibration,
            'n_out_of_range': n_out_of_range,
            'dispersion_ratio': float(dispersion_ratio),
            'pred_std': float(pred_std),
            'label_std': float(label_std),
        }

        print(f"\nCalibration Analysis:")
        print(f"  Dispersion ratio: {dispersion_ratio:.4f}")
        print(f"  Prediction std: {pred_std:.4f}")
        print(f"  Label std: {label_std:.4f}")

        save_fusion_experiment(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, ALL_RESULTS)
        sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

In [None]:
# Cell 42: A7 - Gate Weight Visualization
# Summarises the per-dimension MERT gate weights: the five highest, the five
# lowest, and the mean per dimension category.
exp_id = 'A7_gate_visualization'

if should_run_experiment(exp_id, CHECKPOINT_ROOT, RESULTS_DIR, GDRIVE_RESULTS, COMPLETED_CACHE):
    if not GATE_WEIGHTS:
        print("No gate weights available (D9c not trained)")
    else:
        # Highest MERT weight first
        sorted_dims = sorted(GATE_WEIGHTS.items(), key=lambda pair: -pair[1])

        # Per-category view; a dimension without a gate weight counts as 0.5
        category_gates = {}
        for cat, dims in DIMENSION_CATEGORIES.items():
            cat_weights = [GATE_WEIGHTS.get(d, 0.5) for d in dims]
            category_gates[cat] = {
                'mean_mert_weight': float(np.mean(cat_weights)),
                'dimensions': dict(zip(dims, cat_weights)),
            }

        ALL_RESULTS[exp_id] = {
            'exp_id': exp_id,
            'gate_weights': GATE_WEIGHTS,
            'mert_preferred_dims': [name for name, _ in sorted_dims[:5]],
            'muq_preferred_dims': [name for name, _ in sorted_dims[-5:]],
            'category_summary': category_gates,
            'mean_gate': float(np.mean(list(GATE_WEIGHTS.values()))),
        }

        print(f"\nGate Weight Analysis:")
        print(f"  Mean gate (0.5=balanced): {ALL_RESULTS[exp_id]['mean_gate']:.3f}")
        print(f"  MERT-preferred: {ALL_RESULTS[exp_id]['mert_preferred_dims']}")
        print(f"  MuQ-preferred: {ALL_RESULTS[exp_id]['muq_preferred_dims']}")

        save_fusion_experiment(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, ALL_RESULTS)
        sync_experiment_to_gdrive(exp_id, ALL_RESULTS[exp_id], RESULTS_DIR, CHECKPOINT_ROOT, GDRIVE_RESULTS, ALL_RESULTS)

---
## Part 7: Results Export

In [None]:
# Cell 44: Export all results
# Fills ALL_RESULTS from per-experiment JSON files on disk, writes the
# aggregate JSON, then copies the results directory to GDrive.
print(f"\n{'=' * 70}")
print("EXPORTING RESULTS")
print("=" * 70)

# Pick up results that exist on disk but are not in memory
for exp_id in EXPERIMENT_IDS:
    if exp_id in ALL_RESULTS:
        continue
    result_file = RESULTS_DIR / f"{exp_id}.json"
    if not result_file.exists():
        continue
    with open(result_file) as f:
        ALL_RESULTS[exp_id] = json.load(f)

# Write every result into one aggregate file
aggregate_file = RESULTS_DIR / 'definitive_all_results.json'
with open(aggregate_file, 'w') as f:
    json.dump(ALL_RESULTS, f, indent=2, default=numpy_serializer)
print(f"Saved: {aggregate_file}")

# Copy the results directory to the GDrive remote
run_rclone(
    ['rclone', 'copy', str(RESULTS_DIR), GDRIVE_RESULTS],
    "Syncing results to GDrive",
)
print(f"Synced to: {GDRIVE_RESULTS}")

In [None]:
# Cell 45: Final Summary
# Prints the headline metric of each experiment found in ALL_RESULTS, grouped
# by notebook part, followed by the number of completed experiments.
print(f"\n{'=' * 70}")
print("DEFINITIVE EXPERIMENTS SUMMARY")
print("=" * 70)

# Part 1: MuQ Layer Ablation (metric under summary -> avg_r2)
print("\nPart 1: MuQ Layer Ablation")
print("-" * 40)
for exp_id in ('M1a_muq_L1-6', 'M1b_muq_L7-12', 'M1c_muq_L13-24', 'M1d_muq_L1-24'):
    if exp_id not in ALL_RESULTS or 'summary' not in ALL_RESULTS[exp_id]:
        continue
    r2 = ALL_RESULTS[exp_id]['summary']['avg_r2']
    print(f"  {exp_id}: R2={r2:.4f}")

# Part 2: MuQ + Symbolic Fusion (metric under overall_r2)
print("\nPart 2: MuQ + Symbolic Fusion")
print("-" * 40)
for exp_id in (
    'F8_muq_symbolic_simple',
    'F9_muq_symbolic_weighted',
    'F10_muq_symbolic_ridge',
    'F11_muq_symbolic_confidence',
):
    if exp_id not in ALL_RESULTS or 'overall_r2' not in ALL_RESULTS[exp_id]:
        continue
    r2 = ALL_RESULTS[exp_id]['overall_r2']
    print(f"  {exp_id}: R2={r2:.4f}")

# Part 3: MERT + MuQ Fusion (metric under summary -> avg_r2)
print("\nPart 3: MERT + MuQ Audio Fusion")
print("-" * 40)
for exp_id in ('D9a_mert_muq_ensemble', 'D9b_mert_muq_concat', 'D9c_mert_muq_gated'):
    if exp_id not in ALL_RESULTS or 'summary' not in ALL_RESULTS[exp_id]:
        continue
    r2 = ALL_RESULTS[exp_id]['summary']['avg_r2']
    print(f"  {exp_id}: R2={r2:.4f}")

# Part 4: Cross-Dataset
print("\nPart 4: Cross-Dataset Validation")
print("-" * 40)
if 'X1_pianovam_skill' in ALL_RESULTS:
    r = ALL_RESULTS['X1_pianovam_skill']
    # PianoVAM is printed only when the ANOVA block is present
    if 'anova' in r:
        anova = r['anova']
        print(f"  PianoVAM: F={anova['f_statistic']:.2f}, p={anova['p_value']:.4f}")
if 'X2_asap_multiperformer' in ALL_RESULTS:
    r = ALL_RESULTS['X2_asap_multiperformer']
    # A missing metric is printed as 0
    print(f"  ASAP: intra-piece std={r.get('mean_intra_piece_std', 0):.4f}")
if 'X3_psyllabus_difficulty' in ALL_RESULTS:
    r = ALL_RESULTS['X3_psyllabus_difficulty']
    print(f"  PSyllabus: rho={r.get('spearman_rho', 0):.4f}")

# Completion stats
completed = len([e for e in EXPERIMENT_IDS if e in ALL_RESULTS])
print(f"\nCompleted: {completed}/{len(EXPERIMENT_IDS)} experiments")
print("=" * 70)