# Audio Model Training

This notebook trains two audio classification models on Xeno-Canto MFCC features:
1. **AudioCNN** - Convolutional neural network for audio
2. **AudioViT** - Vision Transformer adapted for audio spectrograms

Both models classify bird species from MFCC features extracted from 3-second audio clips.

In [None]:
import sys
from pathlib import Path
import pandas as pd
import json
import torch
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import numpy as np

sys.path.insert(0, str(Path('..').resolve()))

from src.models.audio_cnn import AudioCNN
from src.models.audio_vit import AudioViT
from src.datasets.audio import AudioMFCCDataset
from src.training.trainer import Trainer

# --- Paths ---
# All inputs and outputs live under ../artifacts, relative to the notebook's working directory.
ARTIFACTS = Path('../artifacts')
MODELS_DIR = ARTIFACTS / 'models'
# parents=True: do not abort here just because ../artifacts has not been created yet.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# --- Reproducibility ---
# Training is stochastic (weight init, shuffling); seed the RNGs once, up front,
# so that Restart Kernel -> Run All yields comparable runs.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# --- Device ---
# device_obj is used for model.to(...); the plain-string form is what gets passed to Trainer.
device_obj = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = str(device_obj)
print(f"Using device: {device}")
print(f"PyTorch version: {torch.__version__}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Load Data and Create Datasets

In [None]:
# Load filtered Xeno-Canto data
xc_df = pd.read_parquet(ARTIFACTS / 'xeno_canto_filtered.parquet')

# Filter to species with >=2 samples
xc_counts = xc_df['species_normalized'].value_counts()
species_to_keep = xc_counts[xc_counts >= 2].index
xc_df = xc_df[xc_df['species_normalized'].isin(species_to_keep)].copy()
# NOTE(review): the frame keeps its original row index after filtering (no reset_index).
# Whether the split indices below are positional or index labels depends on
# AudioMFCCDataset and on how the splits file was generated -- confirm they agree.

# Load splits
with open(ARTIFACTS / 'splits' / 'xeno_canto_audio_splits.json', 'r') as f:
    splits = json.load(f)

# Create species to label mapping (sorted, so label ids are stable across runs)
species_list = sorted(xc_df['species_normalized'].unique())
species_to_idx = {sp: i for i, sp in enumerate(species_list)}
num_classes = len(species_list)

print(f"Dataset: {len(xc_df)} recordings, {num_classes} species")
print(f"Train: {len(splits['train'])} samples")
print(f"Val: {len(splits['val'])} samples")
print(f"Test: {len(splits['test'])} samples")

# Create datasets
cache_dir = ARTIFACTS / 'audio_mfcc_cache' / 'xeno_canto'

# Everything except the split indices is identical across the three datasets.
dataset_kwargs = dict(
    df=xc_df,
    cache_dir=cache_dir,
    species_to_idx=species_to_idx,
    transform=None,
)

train_dataset = AudioMFCCDataset(indices=splits['train'], **dataset_kwargs)
val_dataset = AudioMFCCDataset(indices=splits['val'], **dataset_kwargs)
test_dataset = AudioMFCCDataset(indices=splits['test'], **dataset_kwargs)

print("\nDataset sizes:")
print(f"  Train: {len(train_dataset)}")
print(f"  Val: {len(val_dataset)}")
print(f"  Test: {len(test_dataset)}")

# Create dataloaders
BATCH_SIZE = 32
NUM_WORKERS = 4

loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    # Pinned host memory only helps host->GPU copies; requesting it on a
    # CPU-only machine just triggers a warning.
    pin_memory=torch.cuda.is_available(),
)

# Only the training loader shuffles; val/test keep a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

print("\nDataloader batches:")
print(f"  Train: {len(train_loader)} batches")
print(f"  Val: {len(val_loader)} batches")
print(f"  Test: {len(test_loader)} batches")

## Train AudioCNN

Convolutional neural network designed for audio MFCC features.

In [None]:
# Initialize AudioCNN with one output per species and move it to the selected device
model = AudioCNN(num_classes=num_classes).to(device_obj)
print(f"Model: {sum(p.numel() for p in model.parameters()):,} parameters")

# Setup optimizer and scheduler
# Adam with L2 weight decay; StepLR halves the learning rate every 10 scheduler steps
# (presumably one step per epoch -- depends on how Trainer drives the scheduler).
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# Create trainer
# Checkpoints for this run are directed to MODELS_DIR / 'audio_cnn'.
# NOTE(review): use_amp=True is passed unconditionally; confirm Trainer handles
# mixed precision sensibly when running on CPU.
trainer = Trainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    scheduler=scheduler,
    device=device,
    checkpoint_dir=MODELS_DIR / 'audio_cnn',
    experiment_name='AudioCNN',
    use_amp=True,
    gradient_clip=1.0,
    early_stopping_patience=7
)

# The banner previously contained mojibake (UTF-8 bytes decoded as cp1252); restored to the intended emoji.
print("\n🚀 Starting AudioCNN training...")
print("This may take 20-40 minutes depending on your GPU.\n")

In [None]:
# Train the model (up to 50 epochs; the trainer was configured with early stopping)
history = trainer.train(num_epochs=50)

# `history` is indexed below as a dict of per-epoch metric lists:
# 'train_loss', 'val_loss' and 'val_acc' are the keys read here.
# Status marks previously rendered as mojibake; restored to the intended check mark.
print("\n✓ AudioCNN training complete")
print(f"✓ Best val accuracy: {max(history['val_acc']):.4f}")
print(f"✓ Final train loss: {history['train_loss'][-1]:.4f}")
print(f"✓ Final val loss: {history['val_loss'][-1]:.4f}")

In [None]:
# Plot training curves
cnn_dir = MODELS_DIR / 'audio_cnn'
# Do not rely on the trainer having created its checkpoint directory:
# savefig/open below would fail if it is missing.
cnn_dir.mkdir(parents=True, exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One panel per metric: (history key suffix, axis label / title suffix)
panels = [('loss', 'Loss'), ('acc', 'Accuracy')]
for ax, (metric, label) in zip(axes, panels):
    ax.plot(history[f'train_{metric}'], label='Train', linewidth=2)
    ax.plot(history[f'val_{metric}'], label='Val', linewidth=2)
    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel(label, fontsize=12)
    ax.set_title(f'AudioCNN - {label}', fontsize=14, fontweight='bold')
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(cnn_dir / 'training_curves.png', dpi=150, bbox_inches='tight')
plt.show()

# Save history
# Values are cast to plain floats so that numpy/tensor scalars serialize as JSON numbers.
with open(cnn_dir / 'history.json', 'w') as f:
    json.dump({k: [float(v) for v in vals] for k, vals in history.items()}, f, indent=2)

# Status mark previously rendered as mojibake; restored to the intended check mark.
print(f"✓ Saved training curves and history to {cnn_dir}")

## Train AudioViT

Vision Transformer adapted for audio spectrograms (MFCC features).

In [None]:
# Initialize AudioViT
# `pretrained` names the backbone weights to start from
# (presumably fetched from a model hub on first use -- confirm in AudioViT).
model = AudioViT(num_classes=num_classes, pretrained='google/vit-base-patch16-224').to(device_obj)
print(f"Model: {sum(p.numel() for p in model.parameters()):,} parameters")

# Setup optimizer and scheduler (AdamW + Cosine annealing for ViT)
# T_max=50 mirrors num_epochs=50 in the training cell; keep the two in sync
# if either is changed.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-2)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=50, eta_min=1e-6
)

# Create trainer
# Note: `model`, `optimizer`, `scheduler` and `trainer` are rebound here, replacing
# the AudioCNN objects from the earlier cells.
trainer = Trainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    scheduler=scheduler,
    device=device,
    checkpoint_dir=MODELS_DIR / 'audio_vit',
    experiment_name='AudioViT',
    use_amp=True,
    gradient_clip=1.0,
    early_stopping_patience=10
)

# The banner previously contained mojibake (UTF-8 bytes decoded as cp1252); restored to the intended emoji.
print("\n🚀 Starting AudioViT training...")
print("This may take 40-80 minutes depending on your GPU.\n")

In [None]:
# Train the model (up to 50 epochs; the trainer was configured with early stopping)
vit_history = trainer.train(num_epochs=50)

# `vit_history` is indexed below as a dict of per-epoch metric lists:
# 'train_loss', 'val_loss' and 'val_acc' are the keys read here.
# Status marks previously rendered as mojibake; restored to the intended check mark.
print("\n✓ AudioViT training complete")
print(f"✓ Best val accuracy: {max(vit_history['val_acc']):.4f}")
print(f"✓ Final train loss: {vit_history['train_loss'][-1]:.4f}")
print(f"✓ Final val loss: {vit_history['val_loss'][-1]:.4f}")

In [None]:
# Plot training curves for AudioViT
vit_dir = MODELS_DIR / 'audio_vit'
# Do not rely on the trainer having created its checkpoint directory:
# savefig/open below would fail if it is missing.
vit_dir.mkdir(parents=True, exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One panel per metric: (history key suffix, axis label / title suffix)
panels = [('loss', 'Loss'), ('acc', 'Accuracy')]
for ax, (metric, label) in zip(axes, panels):
    ax.plot(vit_history[f'train_{metric}'], label='Train', linewidth=2)
    ax.plot(vit_history[f'val_{metric}'], label='Val', linewidth=2)
    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel(label, fontsize=12)
    ax.set_title(f'AudioViT - {label}', fontsize=14, fontweight='bold')
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(vit_dir / 'training_curves.png', dpi=150, bbox_inches='tight')
plt.show()

# Save history
# Values are cast to plain floats so that numpy/tensor scalars serialize as JSON numbers.
with open(vit_dir / 'history.json', 'w') as f:
    json.dump({k: [float(v) for v in vals] for k, vals in vit_history.items()}, f, indent=2)

# Status mark previously rendered as mojibake; restored to the intended check mark.
print(f"✓ Saved training curves and history to {vit_dir}")

## Summary

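Both audio models have been trained and their checkpoints saved; training curves and histories are written to `../artifacts/models/audio_cnn` and `../artifacts/models/audio_vit`. The held-out test split (`test_loader`) was not used for training or model selection above, so the models can now be evaluated on the test set.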