# Noisy Student Training

**Goal**: Exceed teacher model performance using Noisy Student Training.

## How Noisy Student Works

1. **Teacher** generates pseudo-labels (no noise)
2. **Student** trains on real + pseudo labels with **noise** (dropout, augmentation)
3. Noise acts as regularization, helping student generalize better
4. Student can potentially **exceed** teacher performance

## Key Differences from Standard Pseudo-Labeling

| Aspect | Standard | Noisy Student |
|--------|----------|---------------|
| Teacher inference | Standard | Standard |
| Student training | Standard | **With noise** |
| Student size | Same | **Equal or larger** |
| Expected result | Match teacher | **Exceed teacher** |

## Prerequisites

1. Trained teacher model (R^2 >= 0.25)
2. MAESTRO pseudo-labeled by the teacher (`maestro_pseudo_train.json`, downloaded in Step 2)

## Reference

Xie et al., "Self-training with Noisy Student improves ImageNet classification", CVPR 2020

## Step 1: Environment Setup

In [None]:
import torch
# Report whether a CUDA device is visible and, if so, the name of device 0.
cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")

In [None]:
!curl -fsSL https://rclone.org/install.sh | sudo bash 2>&1 | grep -E "(successfully|already)" || echo "rclone installed"
!curl -LsSf https://astral.sh/uv/install.sh | sh

import os
os.environ['PATH'] = f"{os.environ['HOME']}/.cargo/bin:{os.environ['PATH']}"

if not os.path.exists('/tmp/crescendai'):
    !git clone https://github.com/Jai-Dhiman/crescendai.git /tmp/crescendai

%cd /tmp/crescendai/model
!git pull
!uv pip install --system -e .
!pip install tensorboard rich

In [None]:
import os
from pathlib import Path
import subprocess
import json

# Local working directories and their rclone (Google Drive) counterparts.
CHECKPOINT_ROOT = '/tmp/checkpoints/noisy_student'
GDRIVE_CHECKPOINT_PATH = 'gdrive:crescendai_checkpoints/noisy_student'
GDRIVE_DATA_PATH = 'gdrive:percepiano_data'
GDRIVE_PSEUDO_PATH = 'gdrive:crescendai_checkpoints/pseudo_labels'
DATA_ROOT = Path('/tmp/percepiano_data')
PSEUDO_ROOT = Path('/tmp/pseudo_labels')

os.makedirs(CHECKPOINT_ROOT, exist_ok=True)
DATA_ROOT.mkdir(parents=True, exist_ok=True)
PSEUDO_ROOT.mkdir(parents=True, exist_ok=True)

# Drive sync is usable only when rclone is installed AND a 'gdrive:' remote is configured.
# A missing rclone binary makes subprocess.run raise FileNotFoundError; treat that as
# "not available" instead of aborting the cell.
try:
    result = subprocess.run(['rclone', 'listremotes'], capture_output=True, text=True)
except FileNotFoundError:
    result = None
    RCLONE_AVAILABLE = False
else:
    RCLONE_AVAILABLE = 'gdrive:' in result.stdout

if not RCLONE_AVAILABLE:
    print("WARNING: rclone 'gdrive:' remote not available; Drive sync will be skipped")

print("="*70)
print("NOISY STUDENT TRAINING")
print("="*70)

## Step 2: Download Data

In [None]:
# Download PercePiano and pseudo labels.
# Each entry is (file whose presence means "already cached", rclone source, local target dir).
pseudo_file = PSEUDO_ROOT / 'maestro_pseudo_train.json'
_downloads = [
    (DATA_ROOT / 'percepiano_train.json', GDRIVE_DATA_PATH, DATA_ROOT),
    (pseudo_file, GDRIVE_PSEUDO_PATH, PSEUDO_ROOT),
]
for _marker, _remote, _local_dir in _downloads:
    if _marker.exists():
        continue
    _proc = subprocess.run(['rclone', 'copy', _remote, str(_local_dir), '--progress'])
    # Stay best-effort, but surface a failed transfer instead of continuing silently.
    if _proc.returncode != 0:
        print(f"WARNING: rclone copy from {_remote} exited with code {_proc.returncode}")

# Verify: report sample counts, and name any file that is still missing.
print("\nData verification:")
for split in ['train', 'val', 'test']:
    path = DATA_ROOT / f'percepiano_{split}.json'
    if path.exists():
        with open(path) as f:
            print(f"  {split}: {len(json.load(f))} samples")
    else:
        print(f"  {split}: MISSING ({path})")

# pseudo_data is only defined when the pseudo-label file is present.
if pseudo_file.exists():
    with open(pseudo_file) as f:
        pseudo_data = json.load(f)
    print(f"  pseudo: {len(pseudo_data)} samples")
else:
    print(f"  pseudo: MISSING ({pseudo_file}); CONFIG['pseudo_data_path'] will be None")

In [None]:
# Point every sample at the local MIDI directory and attach named score dicts.
MIDI_DIR = DATA_ROOT / 'PercePiano' / 'virtuoso' / 'data' / 'all_2rounds'
SCORE_DIR = DATA_ROOT / 'PercePiano' / 'virtuoso' / 'data' / 'score_xml'

# Dimension names, in the positional order used to index 'percepiano_scores'.
PERCEPIANO_DIMENSIONS = [
    "timing", "articulation_length", "articulation_touch",
    "pedal_amount", "pedal_clarity", "timbre_variety",
    "timbre_depth", "timbre_brightness", "timbre_loudness",
    "dynamic_range", "tempo", "space", "balance", "drama",
    "mood_valence", "mood_energy", "mood_imagination",
    "sophistication", "interpretation",
]

for split_name in ('train', 'val', 'test'):
    split_path = DATA_ROOT / f'percepiano_{split_name}.json'
    with open(split_path) as src:
        samples = json.load(src)

    for entry in samples:
        # Keep only the file name; the directory part is replaced by MIDI_DIR.
        entry['midi_path'] = str(MIDI_DIR / Path(entry['midi_path']).name)
        if 'percepiano_scores' not in entry:
            continue
        raw_scores = entry['percepiano_scores']
        named_scores = {}
        for position, dim_name in enumerate(PERCEPIANO_DIMENSIONS):
            named_scores[dim_name] = raw_scores[position]
        entry['scores'] = named_scores

    # Overwrite the split file in place with the updated samples.
    with open(split_path, 'w') as dst:
        json.dump(samples, dst, indent=2)

print("Paths updated")

## Step 3: Noisy Student Configuration

Key noise parameters:
- **Higher dropout** (0.3 vs 0.2)
- **Stochastic depth** (random layer dropping; the value is stored on the model, but `NoisyStudentModule.forward` does not apply it yet)
- **Input noise** (optional)

In [None]:
import torch
torch.set_float32_matmul_precision('medium')

# CONFIG is assembled section by section; key order matches a single flat literal.
CONFIG = {}

# Data locations. 'pseudo_data_path' is None when the pseudo-label file is absent.
CONFIG.update({
    'data_dir': str(DATA_ROOT),
    'score_dir': str(SCORE_DIR),
    'pseudo_data_path': str(pseudo_file) if pseudo_file.exists() else None,
})

# Pseudo-label settings (full weight for Noisy Student).
CONFIG.update({
    'pseudo_weight': 1.0,
    'min_pseudo_confidence': 0.5,
})

# Model architecture.
CONFIG.update({
    'hidden_size': 256,
    'note_layers': 2,
    'voice_layers': 2,
    'beat_layers': 2,
    'measure_layers': 1,
    'num_attention_heads': 8,
    'final_hidden': 128,
})

# Noise parameters: dropout is set higher than the teacher's 0.2.
CONFIG.update({
    'dropout': 0.3,
    'stochastic_depth': 0.1,
    'input_noise_std': 0.01,
})

# Optimisation; early-stopping patience is raised for noisy training.
CONFIG.update({
    'learning_rate': 2.5e-5,
    'weight_decay': 0.01,
    'batch_size': 8,
    'max_epochs': 100,
    'early_stopping_patience': 25,
    'gradient_clip_val': 1.0,
    'precision': '16-mixed',
})

# Checkpoint destinations (local directory and rclone remote).
CONFIG.update({
    'checkpoint_dir': CHECKPOINT_ROOT,
    'gdrive_checkpoint': GDRIVE_CHECKPOINT_PATH,
})

_rule = "="*70
print(_rule)
print("NOISY STUDENT CONFIGURATION")
print(_rule)
print("\nNoise parameters (key differences from teacher):")
print(f"  dropout: {CONFIG['dropout']} (teacher: 0.2)")
for _noise_key in ('stochastic_depth', 'input_noise_std'):
    print(f"  {_noise_key}: {CONFIG[_noise_key]}")
print(_rule)

## Step 4: Create Noisy Student Model

In [None]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from src.models.percepiano_replica import PercePianoReplicaModule


class NoisyStudentModule(PercePianoReplicaModule):
    """
    PercePiano replica that perturbs its inputs while training.

    In training mode, when ``input_noise_std`` is positive, Gaussian noise
    scaled by ``input_noise_std`` is added to the note-level and global score
    features before the parent ``forward`` runs. Otherwise the inputs pass
    through unchanged.

    NOTE(review): ``stochastic_depth`` is stored on the instance but is not
    read anywhere in this class, so no layer dropping happens here. Beyond
    the input noise, any regularisation comes from the parent module.
    """

    def __init__(self, input_noise_std: float = 0.01, stochastic_depth: float = 0.1, **kwargs):
        """
        Args:
            input_noise_std: Scale of the Gaussian noise added to the inputs in training mode.
            stochastic_depth: Stored for reference only; not applied by this class.
            **kwargs: Forwarded unchanged to ``PercePianoReplicaModule.__init__``.
        """
        super().__init__(**kwargs)
        self.input_noise_std = input_noise_std
        self.stochastic_depth = stochastic_depth

    def forward(
        self,
        score_note_features,
        score_global_features,
        score_tempo_curve,
        note_locations,
        attention_mask=None,
    ):
        """Optionally add noise to the two feature inputs, then delegate to the parent forward."""
        noise_active = self.training and self.input_noise_std > 0
        if noise_active:
            sigma = self.input_noise_std
            # Note features are perturbed before global features (fixed RNG draw order).
            score_note_features = score_note_features + torch.randn_like(score_note_features) * sigma
            score_global_features = score_global_features + torch.randn_like(score_global_features) * sigma

        # Tempo curve, note locations and attention mask are passed through untouched.
        return super().forward(
            score_note_features,
            score_global_features,
            score_tempo_curve,
            note_locations,
            attention_mask,
        )


# Build the noisy student. Input feature sizes are fixed here; every other
# hyperparameter is forwarded from CONFIG under the same name, in this order.
_FORWARDED_KEYS = (
    'hidden_size',
    'note_layers',
    'voice_layers',
    'beat_layers',
    'measure_layers',
    'num_attention_heads',
    'final_hidden',
    'learning_rate',
    'weight_decay',
    'dropout',
    'input_noise_std',
    'stochastic_depth',
)
model = NoisyStudentModule(
    score_note_features=20,
    score_global_features=12,
    **{key: CONFIG[key] for key in _FORWARDED_KEYS},
)

print("Noisy Student Model")
print(f"  Parameters: {model.count_parameters():,}")
print(f"  Dropout: {CONFIG['dropout']}")
print(f"  Input noise: {CONFIG['input_noise_std']}")

## Step 5: Create DataLoaders

In [None]:
from src.data.mixed_dataset import create_mixed_dataloaders

# Build train/val/test loaders. The pseudo-label path is optional and passed as None when unset.
_pseudo_path = CONFIG['pseudo_data_path']
_loaders = create_mixed_dataloaders(
    real_data_dir=Path(CONFIG['data_dir']),
    pseudo_data_path=Path(_pseudo_path) if _pseudo_path else None,
    score_dir=Path(CONFIG['score_dir']),
    batch_size=CONFIG['batch_size'],
    pseudo_weight=CONFIG['pseudo_weight'],
    min_pseudo_confidence=CONFIG['min_pseudo_confidence'],
    num_workers=4,
)
train_loader, val_loader, test_loader = _loaders

for _label, _loader in (("Train", train_loader), ("Val", val_loader), ("Test", test_loader)):
    print(f"{_label} batches: {len(_loader)}")

## Step 6: Train Noisy Student

In [None]:
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger

# Keep the three best checkpoints by validation R^2, plus the most recent one.
# The filename template must use the exact monitored key ('val/mean_r2').
# A template key that is not a logged metric would not show the real score
# (presumably Lightning fills in 0 for unknown keys - confirm against the docs).
# Because the key contains a slash, auto_insert_metric_name is disabled and
# the "name=" labels are written out by hand, so no sub-directories are created.
checkpoint_callback = ModelCheckpoint(
    dirpath=CONFIG['checkpoint_dir'],
    filename='noisy_student-epoch={epoch:02d}-val_mean_r2={val/mean_r2:.4f}',
    auto_insert_metric_name=False,
    monitor='val/mean_r2',
    mode='max',
    save_top_k=3,
    save_last=True,
)

# Stop once validation R^2 has not improved for the configured number of checks.
early_stopping = EarlyStopping(
    monitor='val/mean_r2',
    patience=CONFIG['early_stopping_patience'],
    mode='max',
)

# Single-GPU trainer; val_check_interval=0.5 requests validation twice per epoch.
trainer = pl.Trainer(
    max_epochs=CONFIG['max_epochs'],
    accelerator='gpu',
    devices=1,
    precision=CONFIG['precision'],
    gradient_clip_val=CONFIG['gradient_clip_val'],
    callbacks=[checkpoint_callback, early_stopping, LearningRateMonitor()],
    logger=TensorBoardLogger('/tmp/logs', name='noisy_student'),
    log_every_n_steps=10,
    val_check_interval=0.5,
)

In [None]:
# Seed RNGs (including dataloader workers) right before fitting.
# NOTE(review): the model was already constructed in an earlier cell, so its
# initial weights are not covered by this seed.
pl.seed_everything(42, workers=True)

_rule = "="*70
print(_rule)
print("NOISY STUDENT TRAINING")
print(_rule)
print("Training with noise:")
print(f"  Dropout: {CONFIG['dropout']} (higher than teacher)")
print(f"  Input noise: {CONFIG['input_noise_std']}")
print(f"  Stochastic depth: {CONFIG['stochastic_depth']}")
print(_rule)

trainer.fit(model, train_loader, val_loader)

In [None]:
# Copy local checkpoints to the Drive remote; skipped when no 'gdrive:' remote was detected.
if RCLONE_AVAILABLE:
    _sync_cmd = ['rclone', 'copy', CONFIG['checkpoint_dir'], CONFIG['gdrive_checkpoint'], '--progress']
    subprocess.run(_sync_cmd)

## Step 7: Evaluation

In [None]:
import numpy as np

# Run Lightning's test loop against the best checkpoint found during training.
best_path = checkpoint_callback.best_model_path
test_results = trainer.test(model, test_loader, ckpt_path=best_path)

# Reload that checkpoint into a fresh module for a manual prediction pass.
# eval() clears the training flag, so NoisyStudentModule.forward adds no input noise.
best_model = NoisyStudentModule.load_from_checkpoint(best_path)
best_model.eval()
best_model.cuda()

pred_chunks, target_chunks = [], []
with torch.no_grad():
    for batch in test_loader:
        # Move tensors to the GPU; non-tensor entries are kept as they are.
        gpu_batch = {}
        for key, value in batch.items():
            gpu_batch[key] = value.cuda() if isinstance(value, torch.Tensor) else value

        # Regroup the flat 'note_locations_*' entries into the dict the model takes.
        note_locations = {
            level: gpu_batch[f'note_locations_{level}']
            for level in ('beat', 'measure', 'voice')
        }
        outputs = best_model(
            gpu_batch['score_note_features'],
            gpu_batch['score_global_features'],
            gpu_batch['score_tempo_curve'],
            note_locations,
        )
        pred_chunks.append(outputs['predictions'].cpu())
        target_chunks.append(gpu_batch['scores'].cpu())

# Concatenate per-batch results into NumPy arrays for the metric code.
all_preds = torch.cat(pred_chunks).numpy()
all_targets = torch.cat(target_chunks).numpy()

In [None]:
from src.evaluation import compute_all_metrics, compare_to_sota, format_comparison_table

# Score the collected predictions, then print the comparison table against published results.
_dimension_names = list(best_model.dimensions)
metrics = compute_all_metrics(all_preds, all_targets, _dimension_names)
_r2_metric = metrics['r2']
our_r2 = _r2_metric.value

comparison = compare_to_sota(
    model_r2=our_r2,
    model_name="Noisy Student",
    split_type="piece",
    per_dimension_r2=_r2_metric.per_dimension,
)

_table = format_comparison_table(comparison)
print(_table)

In [None]:
# Summary
# Thresholds for the verdict. "Exceeded SOTA" is judged against the same value
# that is printed as SOTA, so a score just above it is not labelled "Near SOTA".
SOTA_R2 = 0.397
NEAR_SOTA_R2 = 0.35

print("="*70)
print("NOISY STUDENT RESULTS")
print("="*70)
print(f"\nStudent R^2: {our_r2:.4f}")
print(f"PercePiano SOTA: {SOTA_R2}")
print("")
if our_r2 > SOTA_R2:
    print("SUCCESS: Exceeded PercePiano SOTA!")
elif our_r2 > NEAR_SOTA_R2:
    print("Good: Near SOTA performance")
else:
    print("Consider: Iterate with this model as new teacher")
print("="*70)

## Step 8: Save as New Teacher (Optional)

If the student exceeds the teacher, use it as the new teacher for another round.

In [None]:
# Bundle the best student's weights and metadata into one file for use as a teacher.
teacher_path = Path(CONFIG['checkpoint_dir']) / 'noisy_student_teacher.pt'

_teacher_payload = {
    'state_dict': best_model.state_dict(),
    'hparams': dict(best_model.hparams),
    'dimensions': list(best_model.dimensions),
    'metrics': {'r2': our_r2, 'per_dimension_r2': metrics['r2'].per_dimension},
    'training_type': 'noisy_student',
    # Record the noise settings this student was configured with.
    'noise_params': {
        _key: CONFIG[_key]
        for _key in ('dropout', 'input_noise_std', 'stochastic_depth')
    },
}
torch.save(_teacher_payload, teacher_path)

print(f"Saved to {teacher_path}")
print("\nTo use as new teacher:")
print(f"  python scripts/pseudo_label_maestro.py --teacher {teacher_path}")

# Push the whole checkpoint directory (now including the teacher file) to Drive.
if RCLONE_AVAILABLE:
    _sync_cmd = ['rclone', 'copy', CONFIG['checkpoint_dir'], CONFIG['gdrive_checkpoint'], '--progress']
    subprocess.run(_sync_cmd)

## Next Steps

If student R^2 > teacher R^2:
1. **Iterate**: Use student as new teacher
2. Re-run pseudo-labeling with new teacher
3. Train another noisy student
4. Repeat until convergence

---

**References**:
- Xie et al., "Self-training with Noisy Student improves ImageNet classification", CVPR 2020
- Park et al., "PercePiano", ISMIR/Nature 2024