# 00: PercePiano Data Quality Audit

Comprehensive investigation of PercePiano's 19 quality dimensions:
- Which dimensions are redundant or noisy?
- How many independent factors do the labels encode?
- Which dimensions are "audible" (predictable from MuQ embeddings)?
- Does attention-pooling or nonlinear probing improve audibility?
- Can a reduced dimension set improve downstream STOP prediction?

Runs locally on CPU/MPS. No cloud GPU required.

## 1. Setup

In [None]:
import json
import sys
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from scipy import stats as sp_stats
from sklearn.cross_decomposition import CCA
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Resolve the model root two directories above the current working directory
# and put its `src` folder first on sys.path so the project packages below
# can be imported.
MODEL_ROOT = Path('../..').resolve()
sys.path.insert(0, str(MODEL_ROOT / 'src'))

from audio_experiments.constants import PERCEPIANO_DIMENSIONS, DIMENSION_CATEGORIES

# Cache directories read by later cells.
CACHE_DIR = MODEL_ROOT / 'data' / 'percepiano_cache'
MASTERCLASS_CACHE = MODEL_ROOT / 'data' / 'masterclass_cache'

# Dimension names and their count; N_DIMS drives every per-dimension loop below.
DIM_NAMES = PERCEPIANO_DIMENSIONS
N_DIMS = len(DIM_NAMES)

# Collect all results for consolidated output
# (each section stores its findings under its own key).
audit_results = {}

print(f'{N_DIMS} dimensions: {DIM_NAMES}')

In [None]:
# Load PercePiano labels and the predefined cross-validation folds.
# JSON is UTF-8 by specification, so do not rely on the platform default encoding.
with open(CACHE_DIR / 'labels.json', encoding='utf-8') as f:
    raw_labels = json.load(f)

with open(CACHE_DIR / 'folds.json', encoding='utf-8') as f:
    folds = json.load(f)

# Build the label array in sorted-key order so that row i belongs to all_keys[i].
all_keys = sorted(raw_labels.keys())
# Keep the first N_DIMS values of each entry so the column count always matches
# DIM_NAMES (assumes each entry is a sequence with at least N_DIMS scores --
# TODO confirm against the cache writer).
Y = np.array([raw_labels[k][:N_DIMS] for k in all_keys])  # [N, N_DIMS]
Y_std = StandardScaler().fit_transform(Y)

# Seeded generator shared by the Monte Carlo steps in later cells.
rng = np.random.default_rng(42)

print(f'Samples: {len(all_keys)}, Labels shape: {Y.shape}')

## 2. Correlation & Redundancy

In [None]:
# Correlation matrix with hierarchical clustering order
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import squareform

corr = np.corrcoef(Y.T)  # [N_DIMS, N_DIMS]

# Turn correlations into distances (1 - r). linkage() treats a 2-D input as raw
# observation vectors, so a precomputed distance matrix must be passed in
# condensed (upper-triangle) form; otherwise the rows of the matrix would be
# clustered as points instead of the dimensions themselves.
dist = np.clip(1.0 - corr, 0.0, None)  # guard against r marginally above 1 from rounding
np.fill_diagonal(dist, 0.0)            # self-distance must be exactly zero
Z = linkage(squareform(dist, checks=False), method='average')
cluster_order = [DIM_NAMES[i] for i in leaves_list(Z)]

print(f'Cluster ordering: {cluster_order}')

In [None]:
# Rank every unordered pair of dimensions by absolute correlation.
# triu_indices(k=1) walks the strict upper triangle row by row, i.e. (i, j) with i < j.
upper_rows, upper_cols = np.triu_indices(N_DIMS, k=1)
pairs = sorted(
    ((DIM_NAMES[a], DIM_NAMES[b], corr[a, b]) for a, b in zip(upper_rows, upper_cols)),
    key=lambda item: abs(item[2]),
    reverse=True,
)

# Count strongly and moderately correlated pairs (plain Python ints for JSON output).
n_high = len([r for _, _, r in pairs if abs(r) > 0.7])
n_moderate = len([r for _, _, r in pairs if 0.5 < abs(r) <= 0.7])

# Variance inflation factor of each standardized label column.
vifs = [variance_inflation_factor(Y_std, col) for col in range(N_DIMS)]
severe_vif = len([v for v in vifs if v > 10])

top_pairs = [(first, second, float(r)) for first, second, r in pairs[:15]]
vif_by_dim = {DIM_NAMES[col]: float(vifs[col]) for col in range(N_DIMS)}

audit_results['correlation'] = {
    'top_pairs': top_pairs,
    'n_high_corr': n_high,
    'n_moderate_corr': n_moderate,
    'total_pairs': len(pairs),
    'vifs': vif_by_dim,
    'severe_vif_count': severe_vif,
    'cluster_order': cluster_order,
}

print(f'Pairs |r|>0.7: {n_high}/{len(pairs)}, 0.5<|r|<=0.7: {n_moderate}, VIF>10: {severe_vif}/{N_DIMS}')

## 3. Factor Analysis (PCA)

In [None]:
# PCA on standardized labels + parallel analysis
pca = PCA(n_components=N_DIMS)
pca.fit(Y_std)

explained = pca.explained_variance_ratio_
cumulative = np.cumsum(explained)

# Parallel analysis: Monte Carlo simulation for significant components.
# The noise data must have the same shape as the matrix that was analysed, so
# take the sample count from Y_std itself rather than from a separate key list.
n_samples = Y_std.shape[0]
n_simulations = 1000
random_eigenvalues = np.zeros((n_simulations, N_DIMS))
for i in range(n_simulations):
    random_data = rng.standard_normal((n_samples, N_DIMS))
    random_pca = PCA(n_components=N_DIMS)
    random_pca.fit(random_data)
    random_eigenvalues[i] = random_pca.explained_variance_

threshold_eigenvalues = np.percentile(random_eigenvalues, 95, axis=0)

# Retain components only up to the first one that fails to beat its noise
# threshold. n_significant is used below (and in later cells) as a slice of the
# *leading* components, so a later component that happens to exceed its
# threshold must not be counted.
exceeds_noise = pca.explained_variance_ > threshold_eigenvalues
n_significant = int(N_DIMS if exceeds_noise.all() else np.argmin(exceeds_noise))

# Number of components needed to reach 90% / 95% cumulative explained variance.
n_90 = int(np.searchsorted(cumulative, 0.90) + 1)
n_95 = int(np.searchsorted(cumulative, 0.95) + 1)

# Factor loadings for significant components
loadings = pca.components_[:n_significant].T  # [N_DIMS, n_significant]
factor_interpretations = {}
for j in range(n_significant):
    # Report only dimensions with a substantial loading, strongest first.
    strong = [(DIM_NAMES[i], float(loadings[i, j])) for i in range(N_DIMS) if abs(loadings[i, j]) > 0.3]
    strong.sort(key=lambda x: abs(x[1]), reverse=True)
    factor_interpretations[f'PC{j+1}'] = {
        'explained_variance': float(explained[j]),
        'loadings': strong,
    }

audit_results['pca'] = {
    'n_significant': n_significant,
    'n_90_variance': n_90,
    'n_95_variance': n_95,
    'explained_variance': [float(e) for e in explained],
    'cumulative_variance': [float(c) for c in cumulative],
    'factor_interpretations': factor_interpretations,
}

print(f'Parallel analysis: {n_significant} significant factors, {n_90} for 90%, {n_95} for 95%')

## 4. Per-Dimension Distributions

In [None]:
# Per-dimension distribution statistics and KDE-based differential entropy,
# computed in a single pass over the label columns.
flagged = []
dim_stats = {}
entropies = {}
for i, dim_name in enumerate(DIM_NAMES):
    vals = Y[:, i]

    # Summary statistics.
    var = float(np.var(vals))
    skew = float(sp_stats.skew(vals))
    floor_pct = float(np.mean(vals < 0.05))
    ceil_pct = float(np.mean(vals > 0.95))
    dim_stats[dim_name] = {'var': var, 'skew': skew, 'floor_pct': floor_pct, 'ceil_pct': ceil_pct}

    # Each check pairs a condition with the message to report when it fires.
    checks = [
        (var < 0.01, f'low var ({var:.4f})'),
        (abs(skew) > 1, f'skew ({skew:+.2f})'),
        (floor_pct > 0.5, f'floor ({floor_pct:.0%})'),
        (ceil_pct > 0.5, f'ceil ({ceil_pct:.0%})'),
    ]
    flags = [message for fired, message in checks if fired]
    if flags:
        flagged.append((dim_name, flags))

    # Differential entropy: Riemann sum of -p*log(p) over a 500-point grid
    # padded by 0.1 on each side of the observed range.
    kde = sp_stats.gaussian_kde(vals)
    x_grid = np.linspace(vals.min() - 0.1, vals.max() + 0.1, 500)
    pdf = kde(x_grid)
    pdf = pdf[pdf > 0]  # log(0) is undefined; zero-density points contribute nothing
    dx = x_grid[1] - x_grid[0]
    entropies[dim_name] = float(-np.sum(pdf * np.log(pdf) * dx))

audit_results['distributions'] = {
    'dim_stats': dim_stats,
    'flagged': flagged,
    'entropies': entropies,
}

print(f'Flagged dimensions: {len(flagged)}')

## -- Load MuQ Embeddings (required for sections 5-9) --

Run `scripts/extract_percepiano_muq.py` first if `muq_embeddings.pt` does not exist.

In [None]:
# Load pre-extracted MuQ embeddings
emb_path = CACHE_DIR / 'muq_embeddings.pt'
embeddings = torch.load(emb_path, map_location='cpu', weights_only=True)

# Align keys: only keep segments that have both labels and embeddings
keys = sorted(set(all_keys) & set(embeddings.keys()))
# NOTE: this overwrites the Y / Y_std built from all labelled segments with the
# aligned subset; every later cell works on the aligned rows. Slice with N_DIMS
# so the column count always matches DIM_NAMES.
Y = np.array([raw_labels[k][:N_DIMS] for k in keys])
Y_std = StandardScaler().fit_transform(Y)

# Stats-pool MuQ embeddings: per-feature mean and std over dim 0, concatenated.
# NOTE(review): assumes each embedding is [T, D] with T > 1; torch's default
# (unbiased) std returns NaN for a single frame -- confirm no such segment exists.
X_muq = np.stack([
    torch.cat([embeddings[k].mean(dim=0), embeddings[k].std(dim=0)]).numpy()
    for k in keys
])

print(f'Aligned samples (labels + embeddings): {len(keys)}')
print(f'Labels shape: {Y.shape}')
print(f'MuQ features shape: {X_muq.shape}')

## 5. MuQ Probing (per-dimension audibility)

In [None]:
# Per-dimension Ridge probing: MuQ stats-pooled -> PercePiano score,
# cross-validated over the predefined folds.
key_to_idx = {k: i for i, k in enumerate(keys)}

r2_per_dim = np.zeros(N_DIMS)
r2_per_dim_per_fold = np.zeros((N_DIMS, len(folds)))

# Held-out targets and predictions per dimension, pooled across folds.
pooled_true = [[] for _ in range(N_DIMS)]
pooled_pred = [[] for _ in range(N_DIMS)]

# The split indices and the standardized features depend only on the fold, not
# on the target dimension, so compute them once per fold instead of once per
# (dimension, fold) pair. The fitted models and scores are unchanged.
for fold_i, fold in enumerate(folds):
    # Folds may reference segments without embeddings; skip those keys.
    train_idx = [key_to_idx[k] for k in fold['train'] if k in key_to_idx]
    val_idx = [key_to_idx[k] for k in fold['val'] if k in key_to_idx]

    # Fit the scaler on training rows only so validation statistics do not leak.
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X_muq[train_idx])
    X_va = scaler.transform(X_muq[val_idx])

    for dim_i in range(N_DIMS):
        y_dim = Y[:, dim_i]

        ridge = Ridge(alpha=1.0)
        ridge.fit(X_tr, y_dim[train_idx])
        preds = ridge.predict(X_va)

        r2_per_dim_per_fold[dim_i, fold_i] = r2_score(y_dim[val_idx], preds)
        pooled_true[dim_i].extend(y_dim[val_idx])
        pooled_pred[dim_i].extend(preds)

# Headline score per dimension: R2 over all held-out predictions pooled together.
for dim_i in range(N_DIMS):
    r2_per_dim[dim_i] = r2_score(pooled_true[dim_i], pooled_pred[dim_i])

audit_results['ridge_probing'] = {
    DIM_NAMES[i]: {
        'r2': float(r2_per_dim[i]),
        'fold_r2': [float(r2_per_dim_per_fold[i, f]) for f in range(len(folds))],
    }
    for i in range(N_DIMS)
}

print(f'Stats-pooled Ridge probing complete. Best R2: {r2_per_dim.max():.3f} ({DIM_NAMES[r2_per_dim.argmax()]})')

## 5b. Attention-Pooled Probing

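Learn a lightweight attention layer over MuQ frame embeddings instead of
stats-pooling. The attention layer and a linear head are trained jointly in
PyTorch on each training fold; the resulting attention-pooled features are then
probed with a per-dimension Ridge regression (4-fold CV).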

In [None]:
class AttentionPooler(nn.Module):
    """Attention-weighted pooling over frames followed by a linear multi-target head.

    A small scorer (Linear -> Tanh -> Linear) assigns one scalar score per
    frame; the scores are softmax-normalized over the frame axis and used to
    form a weighted sum of the frames.
    """

    def __init__(self, input_dim=1024, attn_hidden=128, n_targets=19):
        super().__init__()
        scorer_layers = [
            nn.Linear(input_dim, attn_hidden),
            nn.Tanh(),
            nn.Linear(attn_hidden, 1),
        ]
        self.attn = nn.Sequential(*scorer_layers)
        self.head = nn.Linear(input_dim, n_targets)

    def pool(self, x):
        """Return the attention-weighted sum of the frames in ``x`` ([T, D] -> [D])."""
        frame_scores = self.attn(x)                # [T, 1]
        weights = frame_scores.softmax(dim=0)      # normalized over the frame axis
        return torch.sum(weights * x, dim=0)       # [D]

    def forward(self, x):
        """Pool a single sample's frames and map them to ``n_targets`` predictions."""
        return self.head(self.pool(x))


def train_attention_pooler(emb_list, targets_matrix, n_epochs=50, lr=1e-3, wd=1e-3):
    """Train an AttentionPooler jointly on all target columns.

    Args:
        emb_list: sequence of per-sample frame-embedding tensors; each is moved
            to the training device and passed to the model on its own.
        targets_matrix: [N, n_targets] array-like with one row per sample.
        n_epochs: number of passes over the samples.
        lr: Adam learning rate.
        wd: Adam weight decay.

    Returns:
        The trained model, left on the training device (MPS if available,
        otherwise CPU).
    """
    device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')

    # Convert the targets once up front instead of building a new tensor for
    # every sample on every epoch.
    targets = torch.tensor(targets_matrix, dtype=torch.float32, device=device)

    model = AttentionPooler(n_targets=targets.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    for _ in range(n_epochs):
        model.train()
        # One optimizer step per sample, visited in a fresh random order each
        # epoch. Plain Python ints keep the list indexing independent of
        # tensor-to-index conversion.
        for idx in torch.randperm(len(emb_list)).tolist():
            x = emb_list[idx].to(device)
            pred = model(x)
            loss = ((pred - targets[idx]) ** 2).mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    return model


# Attention-pooled probing: train multi-target model per fold, then Ridge per dimension
attn_r2_per_dim = np.zeros(N_DIMS)
attn_r2_per_fold = np.zeros((N_DIMS, len(folds)))

# Held-out targets and predictions per dimension, pooled across folds, so the
# headline R2 is computed the same way as for the Ridge and MLP probes.
attn_true = [[] for _ in range(N_DIMS)]
attn_pred = [[] for _ in range(N_DIMS)]

for fold_i, fold in enumerate(folds):
    # Folds may reference segments without embeddings; skip those keys.
    train_keys = [k for k in fold['train'] if k in key_to_idx]
    val_keys = [k for k in fold['val'] if k in key_to_idx]
    train_idx = [key_to_idx[k] for k in train_keys]
    val_idx = [key_to_idx[k] for k in val_keys]

    # Train attention pooler on all dims jointly, using training segments only
    train_embs = [embeddings[k] for k in train_keys]
    model = train_attention_pooler(train_embs, Y[train_idx])
    model.eval()
    device = next(model.parameters()).device

    # Pool all samples with trained attention (the linear head is not used here)
    with torch.no_grad():
        X_tr_attn = np.stack([model.pool(embeddings[k].to(device)).cpu().numpy() for k in train_keys])
        X_va_attn = np.stack([model.pool(embeddings[k].to(device)).cpu().numpy() for k in val_keys])

    # The standardized features are identical for every dimension: scale once per fold.
    scaler = StandardScaler()
    X_tr_s = scaler.fit_transform(X_tr_attn)
    X_va_s = scaler.transform(X_va_attn)

    # Ridge per dimension on attention-pooled features
    for dim_i in range(N_DIMS):
        ridge = Ridge(alpha=1.0)
        ridge.fit(X_tr_s, Y[train_idx, dim_i])
        preds = ridge.predict(X_va_s)
        attn_r2_per_fold[dim_i, fold_i] = r2_score(Y[val_idx, dim_i], preds)

        attn_true[dim_i].extend(Y[val_idx, dim_i])
        attn_pred[dim_i].extend(preds)

    print(f'  Fold {fold_i+1}/{len(folds)} done')

# Aggregate across folds: R2 over all held-out predictions pooled together.
# A mean of per-fold R2 is a different estimator from the pooled R2 stored in
# r2_per_dim, which would make 'delta_vs_ridge' compare unlike quantities.
for dim_i in range(N_DIMS):
    attn_r2_per_dim[dim_i] = r2_score(attn_true[dim_i], attn_pred[dim_i])

audit_results['attention_probing'] = {
    DIM_NAMES[i]: {
        'r2': float(attn_r2_per_dim[i]),
        'fold_r2': [float(attn_r2_per_fold[i, f]) for f in range(len(folds))],
        'delta_vs_ridge': float(attn_r2_per_dim[i] - r2_per_dim[i]),
    }
    for i in range(N_DIMS)
}

improved = sum(1 for i in range(N_DIMS) if attn_r2_per_dim[i] > r2_per_dim[i])
print(f'Attention probing complete. Improved over Ridge: {improved}/{N_DIMS} dims')
print(f'Best R2: {attn_r2_per_dim.max():.3f} ({DIM_NAMES[attn_r2_per_dim.argmax()]})')

## 5c. Nonlinear Probing (MLP)

2-layer MLP on stats-pooled features. If MLP R2 >> Ridge R2, the
relationship between MuQ representations and the dimension is nonlinear.

In [None]:
class MLPProber(nn.Module):
    """Two-layer perceptron (Linear -> ReLU -> Linear) that predicts one scalar per input row."""

    def __init__(self, input_dim=2048, hidden_dim=256):
        super().__init__()
        hidden_layer = nn.Linear(input_dim, hidden_dim)
        readout_layer = nn.Linear(hidden_dim, 1)
        self.net = nn.Sequential(hidden_layer, nn.ReLU(), readout_layer)

    def forward(self, x):
        """Return the prediction with the trailing size-1 output axis removed."""
        out = self.net(x)
        return out.squeeze(-1)


def train_mlp_prober(X_train, y_train, n_epochs=80, lr=1e-3, wd=1e-3, batch_size=64):
    """Fit an MLPProber to (X_train, y_train) with mini-batch Adam on squared error.

    The data is copied to the training device (MPS if available, otherwise CPU)
    as float32 and reshuffled every epoch. Returns the trained model, left on
    that device.
    """
    use_mps = torch.backends.mps.is_available()
    device = torch.device('mps' if use_mps else 'cpu')

    model = MLPProber(input_dim=X_train.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    features = torch.tensor(X_train, dtype=torch.float32, device=device)
    targets = torch.tensor(y_train, dtype=torch.float32, device=device)

    n_samples = len(X_train)
    batch_starts = range(0, n_samples, batch_size)

    for _ in range(n_epochs):
        model.train()
        order = torch.randperm(n_samples, device=device)
        for lo in batch_starts:
            batch = order[lo:lo + batch_size]
            residual = model(features[batch]) - targets[batch]
            loss = (residual ** 2).mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    return model


# MLP probing: one MLP per (dimension, fold), cross-validated over the
# predefined folds. Mirrors the Ridge probe so the two R2 tables are comparable.
mlp_r2_per_dim = np.zeros(N_DIMS)
mlp_r2_per_fold = np.zeros((N_DIMS, len(folds)))

for dim_i in range(N_DIMS):
    y_dim = Y[:, dim_i]
    # Held-out targets/predictions pooled across folds for the headline R2.
    all_true, all_pred = [], []

    for fold_i, fold in enumerate(folds):
        # Folds may reference segments without embeddings; skip those keys.
        train_idx = [key_to_idx[k] for k in fold['train'] if k in key_to_idx]
        val_idx = [key_to_idx[k] for k in fold['val'] if k in key_to_idx]

        # Scaler is fit on training rows only.
        scaler = StandardScaler()
        X_tr = scaler.fit_transform(X_muq[train_idx])
        X_va = scaler.transform(X_muq[val_idx])

        model = train_mlp_prober(X_tr, y_dim[train_idx])
        model.eval()
        # Run inference on whichever device the trainer left the model on.
        device = next(model.parameters()).device

        with torch.no_grad():
            X_va_t = torch.tensor(X_va, dtype=torch.float32, device=device)
            preds = model(X_va_t).cpu().numpy()

        r2_fold = r2_score(y_dim[val_idx], preds)
        mlp_r2_per_fold[dim_i, fold_i] = r2_fold
        all_true.extend(y_dim[val_idx])
        all_pred.extend(preds)

    # R2 over all held-out predictions pooled together.
    mlp_r2_per_dim[dim_i] = r2_score(all_true, all_pred)

    # Progress message every five dimensions.
    if (dim_i + 1) % 5 == 0:
        print(f'  MLP probing: {dim_i+1}/{N_DIMS} dims done')

audit_results['mlp_probing'] = {
    DIM_NAMES[i]: {
        'r2': float(mlp_r2_per_dim[i]),
        'fold_r2': [float(mlp_r2_per_fold[i, f]) for f in range(len(folds))],
        'delta_vs_ridge': float(mlp_r2_per_dim[i] - r2_per_dim[i]),
    }
    for i in range(N_DIMS)
}

# A dimension counts as substantially nonlinear when the MLP beats Ridge by more than 0.05 R2.
nonlinear_dims = sum(1 for i in range(N_DIMS) if mlp_r2_per_dim[i] > r2_per_dim[i] + 0.05)
print(f'MLP probing complete. Substantially nonlinear (MLP R2 > Ridge R2 + 0.05): {nonlinear_dims}/{N_DIMS}')
print(f'Best R2: {mlp_r2_per_dim.max():.3f} ({DIM_NAMES[mlp_r2_per_dim.argmax()]})')

## 5d. Factor-Level Probing

Project Y onto significant PCA components from section 3. Ridge regression
MuQ -> PC scores. If factor-level R2 > individual dimension R2, consolidation
into factors is validated.

In [None]:
# Project labels onto significant PCA components.
# NOTE(review): `pca` was fit in section 3, while Y_std here is whatever the
# embedding-alignment cell last assigned -- confirm the two standardizations
# are close enough for the projection to be meaningful.
Y_pc = pca.transform(Y_std)[:, :n_significant]  # [N, n_significant]

factor_r2 = np.zeros(n_significant)
factor_r2_per_fold = np.zeros((n_significant, len(folds)))

# Held-out PC scores and predictions per component, pooled across folds.
factor_true = [[] for _ in range(n_significant)]
factor_pred = [[] for _ in range(n_significant)]

# The split indices and the standardized features depend only on the fold, so
# compute them once per fold rather than once per (component, fold) pair.
for fold_i, fold in enumerate(folds):
    # Folds may reference segments without embeddings; skip those keys.
    train_idx = [key_to_idx[k] for k in fold['train'] if k in key_to_idx]
    val_idx = [key_to_idx[k] for k in fold['val'] if k in key_to_idx]

    # Scaler is fit on training rows only.
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X_muq[train_idx])
    X_va = scaler.transform(X_muq[val_idx])

    for pc_i in range(n_significant):
        y_pc = Y_pc[:, pc_i]

        ridge = Ridge(alpha=1.0)
        ridge.fit(X_tr, y_pc[train_idx])
        preds = ridge.predict(X_va)

        factor_r2_per_fold[pc_i, fold_i] = r2_score(y_pc[val_idx], preds)
        factor_true[pc_i].extend(y_pc[val_idx])
        factor_pred[pc_i].extend(preds)

# Headline score per component: R2 over all held-out predictions pooled together.
for pc_i in range(n_significant):
    factor_r2[pc_i] = r2_score(factor_true[pc_i], factor_pred[pc_i])

# Compare: mean factor R2 vs mean individual dimension R2
mean_factor_r2 = float(factor_r2.mean())
mean_dim_r2 = float(r2_per_dim.mean())

audit_results['factor_probing'] = {
    f'PC{i+1}': {
        'r2': float(factor_r2[i]),
        'fold_r2': [float(factor_r2_per_fold[i, f]) for f in range(len(folds))],
        'explained_variance': float(explained[i]),
    }
    for i in range(n_significant)
}
audit_results['factor_probing']['mean_factor_r2'] = mean_factor_r2
audit_results['factor_probing']['mean_dim_r2'] = mean_dim_r2
audit_results['factor_probing']['consolidation_validated'] = mean_factor_r2 > mean_dim_r2

print(f'Factor-level probing: mean R2={mean_factor_r2:.3f} vs individual dims mean R2={mean_dim_r2:.3f}')
print(f'Consolidation validated: {mean_factor_r2 > mean_dim_r2}')

## 6. MuQ Residual Analysis

In [None]:
# MuQ residual analysis: what PercePiano labels capture that MuQ can't explain.
# X_muq_std defined here is reused by the CCA cell below.
scaler_full = StandardScaler()
X_muq_std = scaler_full.fit_transform(X_muq)

# One multi-output Ridge fit on all samples. Predictions are made on the same
# rows the model was fit on, so the residuals below are in-sample.
ridge_multi = Ridge(alpha=1.0)
ridge_multi.fit(X_muq_std, Y)
Y_pred = ridge_multi.predict(X_muq_std)

residuals = Y - Y_pred  # one row per sample, one column per dimension

# PCA on residuals
pca_resid = PCA(n_components=N_DIMS)
pca_resid.fit(residuals)

# Compare to random noise baseline: a single draw of independent Gaussian noise
# scaled to each dimension's residual standard deviation.
random_residuals = rng.standard_normal(residuals.shape) * residuals.std(axis=0)
pca_random = PCA(n_components=N_DIMS)
pca_random.fit(random_residuals)

# Count residual components whose variance exceeds the matching noise component.
n_structured = int(np.sum(pca_resid.explained_variance_ > pca_random.explained_variance_))

# Per-dimension unexplained variance (residual variance / label variance)
resid_var = residuals.var(axis=0)
total_var = Y.var(axis=0)
unexplained_frac = resid_var / total_var

audit_results['residuals'] = {
    'n_structured_components': n_structured,
    'unexplained_fraction': {DIM_NAMES[i]: float(unexplained_frac[i]) for i in range(N_DIMS)},
}

print(f'Structured residual components (above noise floor): {n_structured}')

## 7. Canonical Correlation Analysis (CCA)

In [None]:
# CCA between MuQ embeddings and PercePiano labels
n_cca_components = min(N_DIMS, 19)
# Reduce the standardized MuQ features (X_muq_std from the residual-analysis
# cell) to 50 principal components before fitting CCA.
pca_muq = PCA(n_components=50)
X_muq_pca = pca_muq.fit_transform(X_muq_std)

# Fit and transform on the same samples, so the correlations below are in-sample.
cca = CCA(n_components=n_cca_components)
X_c, Y_c = cca.fit_transform(X_muq_pca, Y)

# Pearson correlation between each pair of canonical variates.
canonical_corrs = [float(np.corrcoef(X_c[:, i], Y_c[:, i])[0, 1]) for i in range(n_cca_components)]
# "Significant" here is a fixed r > 0.3 cut-off, not a statistical test.
n_significant_cca = sum(1 for c in canonical_corrs if c > 0.3)

audit_results['cca'] = {
    'canonical_correlations': canonical_corrs,
    'n_significant': n_significant_cca,
    'total_shared_variance': float(sum(c**2 for c in canonical_corrs)),
    'muq_pca_variance_retained': float(pca_muq.explained_variance_ratio_.sum()),
}

print(f'CCA: {n_significant_cca}/{n_cca_components} significant variates, shared var={sum(c**2 for c in canonical_corrs):.3f}')

## 8. Downstream: STOP Prediction with Reduced Dims

Reuse the masterclass 98-segment STOP/CONTINUE data to test whether reduced dimension sets improve or hurt prediction.

In [None]:
from masterclass_experiments.data import load_moments, identify_segments
from masterclass_experiments.features import stats_pool
from masterclass_experiments.evaluation import leave_one_video_out_cv

# Input locations for the masterclass STOP/CONTINUE experiment.
REPO_ROOT = MODEL_ROOT.parent
MOMENTS_PATH = REPO_ROOT / 'tools' / 'masterclass-pipeline' / 'all_moments.jsonl'
SEGMENT_DIR = MASTERCLASS_CACHE / 'segments'
MUQ_CACHE_DIR = MASTERCLASS_CACHE / 'muq_embeddings'
CHECKPOINT_DIR = MODEL_ROOT / 'data' / 'checkpoints' / 'percepiano'
# Sorted so the checkpoint order is stable across runs.
CHECKPOINT_PATHS = sorted(CHECKPOINT_DIR.glob('fold*_best.ckpt'))

# Load masterclass data
moments = load_moments(MOMENTS_PATH)
segments = identify_segments(moments)
# Parallel arrays, one entry per segment, in the order returned by identify_segments.
segment_ids = np.array([s.segment_id for s in segments])
video_ids = np.array([s.video_id for s in segments])
# Binary target: 1 where the segment label is 'stop', 0 otherwise.
stop_labels = np.array([1 if s.label == 'stop' else 0 for s in segments])

print(f'Masterclass segments: {len(segments)} ({stop_labels.sum()} STOP, {(1-stop_labels).sum()} CONTINUE)')
print(f'PercePiano checkpoints: {len(CHECKPOINT_PATHS)}')

In [None]:
# Extract MuQ embeddings for masterclass segments (cached)
from audio_experiments.extractors.muq import MuQExtractor
from masterclass_experiments.features import extract_quality_scores

# Extract one MuQ embedding per masterclass segment from its WAV file
# (presumably served from MUQ_CACHE_DIR when already computed -- verify in MuQExtractor).
extractor = MuQExtractor(cache_dir=MUQ_CACHE_DIR)
mc_raw_embeddings = {}
for seg in segments:
    wav_path = SEGMENT_DIR / f'{seg.segment_id}.wav'
    mc_raw_embeddings[seg.segment_id] = extractor.extract_from_file(wav_path)

# Get per-segment quality scores from the PercePiano checkpoints
mc_quality = extract_quality_scores(mc_raw_embeddings, CHECKPOINT_PATHS)
# Rows follow segment_ids so they line up with stop_labels / video_ids.
X_mc_quality = np.stack([mc_quality[sid].numpy() for sid in segment_ids])  # [n_segments, n_scores]
print(f'Quality scores shape: {X_mc_quality.shape}')

In [None]:
# Compare STOP prediction with different dimension reduction strategies.
# Maps a configuration label to its AUC.
stop_results = {}

def safe_lovo(X_sub, label, stop_labels, video_ids, segment_ids):
    """Return the 'auc' entry of leave-one-video-out CV, or None when it cannot be computed.

    None is returned without running CV when ``X_sub`` has no feature columns,
    and after printing a warning tagged with ``label`` when CV raises.
    """
    n_features = X_sub.shape[1]
    if n_features == 0:
        return None

    try:
        cv_result = leave_one_video_out_cv(X_sub, stop_labels, video_ids, segment_ids)
        auc_value = cv_result['auc']
    except Exception as err:
        # Best-effort: one failing configuration must not abort the comparison.
        print(f'  WARNING: {label} failed: {err}')
        return None
    return auc_value

# Each configuration below is scored with safe_lovo; a configuration is
# recorded in stop_results only when an AUC was actually produced.

# 1. All 19 dims (baseline)
auc = safe_lovo(X_mc_quality, 'All 19 dims', stop_labels, video_ids, segment_ids)
if auc is not None:
    stop_results['All 19 dims'] = auc

# 2. PCA-reduced (n_significant components from parallel analysis)
# NOTE(review): the scaler and PCA are fit on all segments before CV, so the
# projection has seen the held-out videos' features (not their labels).
pca_mc = PCA(n_components=n_significant)
X_mc_pca = pca_mc.fit_transform(StandardScaler().fit_transform(X_mc_quality))
auc = safe_lovo(X_mc_pca, f'PCA ({n_significant} components)', stop_labels, video_ids, segment_ids)
if auc is not None:
    stop_results[f'PCA ({n_significant} components)'] = auc

# 3. Category-level means: one feature per entry of DIMENSION_CATEGORIES,
# averaging the score columns of the dimensions listed under that category.
category_features = []
category_names = []
for cat_name, cat_dims in DIMENSION_CATEGORIES.items():
    cat_idx = [DIM_NAMES.index(d) for d in cat_dims]
    category_features.append(X_mc_quality[:, cat_idx].mean(axis=1))
    category_names.append(cat_name)
X_mc_cats = np.column_stack(category_features)
auc = safe_lovo(X_mc_cats, f'Category means ({len(category_names)} cats)', stop_labels, video_ids, segment_ids)
if auc is not None:
    stop_results[f'Category means ({len(category_names)} cats)'] = auc

# 4. Top-5 by highest R2 (least negative), using the Ridge probing scores
top5_idx = np.argsort(r2_per_dim)[::-1][:5]
top5_dims = [DIM_NAMES[i] for i in top5_idx]
X_mc_top5 = X_mc_quality[:, top5_idx]
auc = safe_lovo(X_mc_top5, f'Top-5 by R2', stop_labels, video_ids, segment_ids)
if auc is not None:
    stop_results[f'Top-5 by R2 ({top5_dims})'] = auc

# 5. Bottom-5 by R2
bottom5_idx = np.argsort(r2_per_dim)[:5]
bottom5_dims = [DIM_NAMES[i] for i in bottom5_idx]
X_mc_bottom5 = X_mc_quality[:, bottom5_idx]
auc = safe_lovo(X_mc_bottom5, f'Bottom-5 by R2', stop_labels, video_ids, segment_ids)
if auc is not None:
    stop_results[f'Bottom-5 by R2 ({bottom5_dims})'] = auc

# 6. Attention-pooled MuQ features
# Train attention pooler on all PercePiano data, then pool masterclass segments
attn_model = train_attention_pooler(
    [embeddings[k] for k in keys],
    Y,  # all dims jointly
    n_epochs=50,
)
attn_model.eval()
attn_device = next(attn_model.parameters()).device

# Only the pooling part of the model is used; the prediction head is ignored.
with torch.no_grad():
    X_mc_attn = np.stack([
        attn_model.pool(mc_raw_embeddings[sid].to(attn_device)).cpu().numpy()
        for sid in segment_ids
    ])

auc = safe_lovo(X_mc_attn, 'Attention-pooled MuQ', stop_labels, video_ids, segment_ids)
if auc is not None:
    stop_results['Attention-pooled MuQ (1024d)'] = auc

# Sort results, best AUC first
sorted_pairs = sorted(stop_results.items(), key=lambda x: x[1], reverse=True)

audit_results['stop_prediction'] = {
    'results': {name: float(auc_val) for name, auc_val in sorted_pairs},
    # None when the baseline configuration did not produce an AUC.
    'baseline_auc': stop_results.get('All 19 dims'),
    'n_segments': len(segments),
    'n_stop': int(stop_labels.sum()),
    'n_continue': int((1 - stop_labels).sum()),
}

print(f'STOP prediction complete. {len(stop_results)} configurations tested.')

## 10. Competition Validation

Spearman correlation between PercePiano quality predictions and Chopin 2021 competition placement.
Negative rho = higher PercePiano score correlates with better placement (lower number).

Requires running `scripts/collect_competition_data.py` first.

In [None]:
# 10. Competition Validation: Spearman correlation between
# PercePiano quality predictions and competition placement

import jsonlines
from model_improvement.competition import load_competition_metadata

COMP_CACHE = MODEL_ROOT / 'data' / 'competition_cache' / 'chopin2021'
metadata_path = COMP_CACHE / 'metadata.jsonl'

if metadata_path.exists():
    # Load competition metadata
    comp_records = load_competition_metadata(COMP_CACHE)

    # Load MuQ embeddings for competition recordings, keyed by file stem
    comp_emb_dir = COMP_CACHE / 'muq_embeddings'
    comp_embeddings = {
        p.stem: torch.load(p, map_location='cpu', weights_only=True)
        for p in comp_emb_dir.glob('*.pt')
    }

    # Get PercePiano quality predictions for competition recordings
    comp_quality = extract_quality_scores(comp_embeddings, CHECKPOINT_PATHS)

    # Keep only records that have a quality prediction, so the arrays stay aligned
    valid_records = [r for r in comp_records if r['recording_id'] in comp_quality]

    if valid_records:
        # Build arrays aligned by recording_id
        placements = np.array([r['placement'] for r in valid_records])
        quality_scores = np.stack([
            comp_quality[r['recording_id']].numpy() for r in valid_records
        ])  # one row per valid record

        # Per-dimension Spearman correlation (negative rho = higher quality -> lower placement number)
        competition_correlations = {}
        for dim_i in range(N_DIMS):
            rho, pval = sp_stats.spearmanr(quality_scores[:, dim_i], placements)
            competition_correlations[DIM_NAMES[dim_i]] = {'rho': float(rho), 'pval': float(pval)}

        # Overall correlation: mean score across dimensions vs placement
        mean_quality = quality_scores.mean(axis=1)
        overall_rho, overall_pval = sp_stats.spearmanr(mean_quality, placements)

        audit_results['competition_validation'] = {
            'overall_rho': float(overall_rho),
            'overall_pval': float(overall_pval),
            'per_dimension': competition_correlations,
            'n_recordings': len(valid_records),
        }

        # Print results, most negative rho first
        print(f'Competition validation: {len(valid_records)} recordings')
        print(f'Overall Spearman rho: {overall_rho:.3f} (p={overall_pval:.4f})')
        print(f'(Negative rho = higher PercePiano score correlates with better placement)')
        print()
        for dim_name in sorted(competition_correlations, key=lambda d: competition_correlations[d]['rho']):
            info = competition_correlations[dim_name]
            sig = '*' if info['pval'] < 0.05 else ''
            print(f'  {dim_name:25s}: rho={info["rho"]:+.3f}  p={info["pval"]:.4f} {sig}')
    else:
        # np.stack raises on an empty list; record the section as unavailable
        # instead of aborting the notebook.
        print('Competition metadata found, but no recording has quality predictions; skipping validation.')
        audit_results['competition_validation'] = None
else:
    print('Competition data not yet collected. Run scripts/collect_competition_data.py first.')
    audit_results['competition_validation'] = None

## Consolidated Results

In [None]:
import json as _json

# Banner for the consolidated report; everything below only reads the values
# stored in audit_results by the earlier cells.
print('=' * 70)
print('PERCEPIANO DATA QUALITY AUDIT -- CONSOLIDATED RESULTS')
print('=' * 70)

# --- 1. CORRELATION & REDUNDANCY ---
cr = audit_results['correlation']
print(f'\n## 1. CORRELATION & REDUNDANCY')
print(f'Pairs with |r| > 0.7: {cr["n_high_corr"]}/{cr["total_pairs"]}')
print(f'Pairs with 0.5 < |r| <= 0.7: {cr["n_moderate_corr"]}')
print(f'Dimensions with VIF > 10 (severe multicollinearity): {cr["severe_vif_count"]}/{N_DIMS}')
print(f'Hierarchical cluster order: {cr["cluster_order"]}')
print(f'\nTop 15 correlated pairs:')
for d1, d2, r in cr['top_pairs']:
    print(f'  {d1:25s} <-> {d2:25s}  r = {r:+.3f}')
print(f'\nVIF per dimension:')
# Highest VIF first; > 10 is marked severe, > 5 moderate.
for name in sorted(cr['vifs'], key=cr['vifs'].get, reverse=True):
    v = cr['vifs'][name]
    flag = ' *** SEVERE' if v > 10 else ' ** moderate' if v > 5 else ''
    print(f'  {name:25s}: {v:8.2f}{flag}')

# --- 2. FACTOR STRUCTURE ---
pc = audit_results['pca']
print(f'\n## 2. FACTOR STRUCTURE (PCA)')
print(f'Parallel analysis: {pc["n_significant"]} statistically significant factors')
print(f'Components for 90% variance: {pc["n_90_variance"]}')
print(f'Components for 95% variance: {pc["n_95_variance"]}')
print(f'\nExplained variance per component:')
for i, (e, c) in enumerate(zip(pc['explained_variance'], pc['cumulative_variance'])):
    # Mark the last component counted as significant.
    marker = ' <-- significant cutoff' if i + 1 == pc['n_significant'] else ''
    print(f'  PC{i+1}: {e:.3f} (cumulative: {c:.3f}){marker}')
print(f'\nFactor interpretations (|loading| > 0.3):')
for pc_name, info in pc['factor_interpretations'].items():
    dims_str = ', '.join(f'{name}({v:+.2f})' for name, v in info['loadings'])
    print(f'  {pc_name} ({info["explained_variance"]:.1%}): {dims_str}')

# --- 3. DISTRIBUTION FLAGS ---
dist = audit_results['distributions']
print(f'\n## 3. DISTRIBUTION FLAGS')
if dist['flagged']:
    for name, flags in dist['flagged']:
        print(f'  {name}: {", ".join(flags)}')
else:
    print('  No dimensions flagged.')
print(f'\nEntropy ranking (least to most informative):')
# Ascending entropy.
for name in sorted(dist['entropies'], key=dist['entropies'].get):
    print(f'  {name:25s}: {dist["entropies"][name]:.3f} nats')

# --- 4. PROBING COMPARISON (Ridge vs Attention vs MLP) ---
print(f'\n## 4. MuQ PROBING COMPARISON (4-fold CV R2)')
print(f'{"Dimension":25s}  {"Ridge":>8s}  {"Attention":>10s}  {"MLP":>8s}  {"Best":>10s}')
print(f'{"-"*25}  {"-"*8}  {"-"*10}  {"-"*8}  {"-"*10}')
# Rows ordered by Ridge R2, best first.
for i in np.argsort(r2_per_dim)[::-1]:
    name = DIM_NAMES[i]
    ridge_r2 = r2_per_dim[i]
    attn_r2 = attn_r2_per_dim[i]
    mlp_r2 = mlp_r2_per_dim[i]
    # On ties the first method in this order wins (Ridge, then Attention, then MLP).
    best_val = max(ridge_r2, attn_r2, mlp_r2)
    best_method = ['Ridge', 'Attention', 'MLP'][[ridge_r2, attn_r2, mlp_r2].index(best_val)]
    print(f'  {name:25s}  {ridge_r2:+8.3f}  {attn_r2:+10.3f}  {mlp_r2:+8.3f}  {best_method:>10s}')

# --- 5. FACTOR-LEVEL PROBING ---
fp = audit_results['factor_probing']
print(f'\n## 5. FACTOR-LEVEL PROBING')
print(f'Mean factor R2: {fp["mean_factor_r2"]:.3f} vs mean individual dim R2: {fp["mean_dim_r2"]:.3f}')
print(f'Consolidation validated: {fp["consolidation_validated"]}')
# Sort the 'PC<n>' keys by their numeric suffix; a plain string sort would
# print PC10 before PC2 once ten or more factors are significant.
pc_keys = sorted((k for k in fp if k.startswith('PC')), key=lambda k: int(k[2:]))
for key in pc_keys:
    info = fp[key]
    print(f'  {key}: R2={info["r2"]:.3f} (explains {info["explained_variance"]:.1%} of label variance)')

# --- 6. RESIDUAL STRUCTURE ---
res = audit_results['residuals']
print(f'\n## 6. RESIDUAL STRUCTURE')
print(f'Structured residual components (above noise floor): {res["n_structured_components"]}')
print(f'Per-dimension unexplained fraction (what MuQ cannot explain):')
# Largest unexplained fraction first.
for name in sorted(res['unexplained_fraction'], key=res['unexplained_fraction'].get, reverse=True):
    print(f'  {name:25s}: {res["unexplained_fraction"][name]:.3f}')

# --- 7. CCA ---
cc = audit_results['cca']
print(f'\n## 7. CCA (MuQ <-> Labels)')
print(f'Significant canonical variates (r > 0.3): {cc["n_significant"]}/{len(cc["canonical_correlations"])}')
print(f'Total shared variance (sum r^2): {cc["total_shared_variance"]:.3f}')
print(f'MuQ PCA variance retained (50 components): {cc["muq_pca_variance_retained"]:.1%}')
print(f'Canonical correlations: {[f"{c:.3f}" for c in cc["canonical_correlations"]]}')

# --- 8. STOP PREDICTION ---
# .get() with defaults keeps this section printable even when the STOP
# prediction cells were not run.
sp = audit_results.get('stop_prediction', {})
print(f'\n## 8. STOP PREDICTION (LOVO AUC)')
print(f'Segments: {sp.get("n_segments", "?")} ({sp.get("n_stop", "?")} STOP, {sp.get("n_continue", "?")} CONTINUE)')
baseline = sp.get('baseline_auc')
for name, auc_val in sp.get('results', {}).items():
    # The delta is shown only when a baseline AUC is available.
    delta = f' (delta={auc_val - baseline:+.3f})' if baseline is not None else ''
    print(f'  {name:45s}: AUC={auc_val:.3f}{delta}')

print('\n' + '=' * 70)
print('END OF AUDIT')
print('=' * 70)

# Also dump as JSON for programmatic consumption.
# NOTE: default=str silently stringifies any value json cannot encode natively.
print('\n\n--- JSON (for programmatic use) ---')
print(_json.dumps(audit_results, indent=2, default=str))