# **MLiP Group 25 BirdCLEF 2025 Training Notebook**

This is our training pipeline for the BirdCLEF 2025 challenge using PyTorch and Timm (for pretrained models). This is the corresponding inference notebook:

- [EfficientNet-B0 Inference](https://www.kaggle.com/code/maxgewald/efficientnet-b0-inference)

This notebook starts from the previous work of Kadircan İdrisoğlu:

- [EfficientNet B0 Pytorch [Train] | BirdCLEF'25](https://www.kaggle.com/code/kadircandrisolu/efficientnet-b0-pytorch-train-birdclef-25)


**Original Features**
* Implementation with Pytorch and Timm
* Flexible audio processing with both pre-computed and on-the-fly mel spectrograms
* Stratified 5-fold cross-validation with ensemble capability
* Mixup training for improved generalization
* Spectrogram augmentations (time/frequency masking, brightness adjustment)
* AdamW optimizer with Cosine Annealing LR scheduling
* Debug mode for quick experimentation with smaller datasets

**Features added by us**
* Weighted combination of FocalBCE loss and BCE loss with label smoothing
* Early stopping
* Selection of best five second window with labels from external classifier
* Pseudolabeled data from soundscapes (directly soft labels or hard labels via thresholding)
* Stratified sampling of pseudo-labels for better representation of rare classes
* Weighting of secondary labels (in loss calculation, not in targets)
* ROC/AUC calculation even when using soft labels
* Speed and memory optimizations (model compilation, automatic mixed precision)
* Expanded spectrogram augmentations, introduced cutmix for higher robustness
* Experimental optimized fold splitting (use with caution, could worsen results)

**Pre-computed Spectrograms** \
For faster training, we used pre-computed mel spectrograms for every five-second segment of the audio. These large directories are not uploaded to Kaggle, but they allowed us to avoid recomputing the spectrograms on each run. As a result, LOAD_DATA=True does not work here, except when neither train filtering nor pseudolabels are used; in that case the single .npy file in the dataset is loaded.

## Libraries

In [None]:
# Basic imports
import numpy as np, pandas as pd, os, random, warnings, json, datetime, time, sys
from tqdm.auto import tqdm
from copy import deepcopy
from pathlib import Path
sys.path.insert(1, "/kaggle/input/birdcelf-scripts")


# Specific imports
import logging
from metric_logger import MetricLogger

# PyTorch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.amp import autocast, GradScaler

# Other ML imports
from sklearn.model_selection import StratifiedKFold
import timm

# Custom imports
from processing import process_audio_file, generate_spectrograms
from utilities import set_seed, collate_fn
from training_utilities import get_scheduler, get_criterion, clean_gpu_memory, compile_model
from training_utilities import calculate_soft_label_metrics, calculate_hard_label_metrics, get_improved_folds
from label_loading import load_pseudolabels, load_soft_pseudolabels, filter_training_labels

# Suppress warnings and set logging level
# NOTE: "ignore" silences every Python warning for the whole process, and the
# root logger only emits messages of level ERROR and above.
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.ERROR)

## Configuration
Here, all paths, training parameters, and modes (such as train filtering) are defined. The config is saved to a .json file once training has run successfully (saving is skipped in debug mode).

In [None]:
class CFG:
    """Central configuration for the training notebook.

    All settings are plain class attributes. Note that the class body has
    side effects: the output directories are created and the selected device
    is printed as soon as the class is defined.
    """

    seed = 2025
    debug = True
    LOAD_DATA = True # works in Kaggle only with False

    # Paths and directories
    # All output sub-directories hang off OUTPUT_DIR; parents=True makes each
    # mkdir independent of the creation order.
    OUTPUT_DIR = Path('output/')
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    plots_dir = OUTPUT_DIR / 'plots'
    plots_dir.mkdir(parents=True, exist_ok=True)
    metrics_dir = OUTPUT_DIR / 'metrics'
    metrics_dir.mkdir(parents=True, exist_ok=True)
    configs_dir = OUTPUT_DIR / 'configs'
    configs_dir.mkdir(parents=True, exist_ok=True)
    models_dir = OUTPUT_DIR / 'models'
    models_dir.mkdir(parents=True, exist_ok=True)
    train_datadir = 'birdclef-2025/train_audio'
    train_csv = 'birdclef-2025/train.csv'
    train_soundscapes = 'birdclef-2025/train_soundscapes'
    test_soundscapes = 'birdclef-2025/test_soundscapes'
    submission_csv = 'birdclef-2025/sample_submission.csv'
    taxonomy_csv = 'birdclef-2025/taxonomy.csv'
    spectrogram_npy = 'archive/birdclef25-mel-spectrograms/birdclef2025_melspec_5sec_256_256.npy'
    train_soundscapes_spectrograms = 'archive/train_soundscapes_melspec_12x5_256_256/'
    train_full_spectrograms = 'archive/train_audio_melspec_Xx5_256_256/'

    # Label processing
    use_train_filtering = True
    train_pseudolabels = 'train_pseudolabels.csv'
    train_label_confidence = 0.0
    secondary_label_confidence = 0.0
    secondary_weight = 1.0
    rare_label_threshold = 20
    min_confidence_rare = 0.2

    normalize_labels = False

    # Pseudolabeling settings
    use_external_pseudolabels = True
    use_soft_labels = False
    external_pseudolabels = 'pseudolabels_best.csv'
    pseudolabel_confidence_threshold = 0.2  # Only use predictions above this threshold, only if using hard labels
    max_pseudolabels = 30000  # Maximum number of pseudolabeled samples to use
    stratified_pseudolabels = True  # Use stratified sampling for pseudolabels (currently only for hard labels)

    # Used to tag saved configs and model checkpoints of this run
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    # Training settings
    epochs = 15
    n_fold = 5
    folds_to_train = [0, 1, 2, 3, 4]
    use_early_stopping = True
    early_stopping_epochs = 3
    use_improved_folds = False

    # Mel spectrogram parameters
    # only relevant if LOAD_DATA is False
    FS = 32000
    TARGET_DURATION = 5.0
    TARGET_SHAPE = (256, 256)
    N_FFT = 1024
    HOP_LENGTH = 512
    N_MELS = 128
    FMIN = 50
    FMAX = 14000

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    # Loss parameters
    criterion = 'CombinedLoss'  # Options: 'BCEWithLogitsLoss', 'FocalLoss', 'CombinedLoss'
    focal_alpha = 1.0
    focal_gamma = 3.0
    bce_weight = 0.5
    focal_weight = 0.5

    # optimizer parameters
    lr = 0.5e-3
    weight_decay = 5.0e-5

    #scheduler parameters
    scheduler = 'CosineAnnealingLR'
    min_lr = 1e-6
    use_lr_warmup = True
    warmup_epochs = 2

    # augmentation options
    aug_prob = 0.5
    spec_augment = True
    mixup_alpha = 0.5
    cutmix_alpha = 1.0
    use_cutmix = True

    # Model architecture options
    model_name = 'seresnext26t_32x4d'  # options: efficientnet_bx seresnext26t_32x4d
    pretrained = True
    in_channels = 1
    dropout_rate = 0.2
    drop_path_rate = 0.2
    projection_dim = 512 # only effnet
    projection_dropout = 0.3 # only effnet

    # Memory and speed optimizations
    gradient_accumulation_steps = 2  # Increase effective batch size without more memory
    use_amp = True                   # Use automatic mixed precision
    pin_memory = True                # Faster data transfer to GPU
    num_workers = 0                  # Set to 0 on Windows
    batch_size = 32                  # Effective batch size will be batch_size * gradient_accumulation_steps

    # Compiler settings
    compile_backend = "eager"        # Options: "eager", "inductor"
    compile_mode = "default"         # Options: "default", "reduce-overhead", "max-autotune"

    def update_debug_settings(self):
        """Shrink the run to 2 folds and 2 epochs when ``debug`` is enabled."""
        if self.debug:
            self.n_fold = 2
            self.epochs = 2

    def save_config(self):
        """Save configuration to file unless in debug mode"""
        if not self.debug:
            # Collect every non-dunder, non-callable attribute (class and instance level).
            config_dict = {attr: getattr(self, attr) for attr in dir(self) if not attr.startswith('__') and not callable(getattr(self, attr))}
            filename = f"config_{self.timestamp}_{self.model_name}.json"
            filepath = os.path.join(self.configs_dir, filename)

            with open(filepath, 'w') as f:
                # default=str stringifies values json cannot encode (e.g. Path objects)
                json.dump(config_dict, f, indent=4, default=str)
            print(f"Config saved to {filepath}")

# Global configuration instance used throughout the notebook.
cfg = CFG()
# set_seed comes from the project's utilities module; presumably it seeds the
# Python/NumPy/PyTorch RNGs -- TODO confirm.
set_seed(cfg.seed)
# With debug enabled this reduces the run to 2 folds and 2 epochs.
cfg.update_debug_settings()

## Dataset Preparation and Data Augmentations
We convert audio to mel spectrograms (if they are not pre-computed) and, in training mode, apply spectrogram augmentations with probability `aug_prob` (0.5 by default). Each individual augmentation is then drawn at random: time masking, frequency masking, and brightness/contrast adjustment with 50% probability each, and additive noise and small random shifts with 30% probability each. This randomized approach creates diverse training samples from the same audio files. The dataset does not differentiate between real and pseudo-labels and expects the same format from both dataframes.

In [None]:
class BirdCLEFDatasetFromNPY(Dataset):
    """Dataset yielding ``{'melspec', 'target'}`` dicts for BirdCLEF training.

    Spectrograms are taken from the ``spectrograms`` mapping (keyed by
    ``samplename``) when available; otherwise they are computed from the audio
    file if ``cfg.LOAD_DATA`` is False, or replaced by zeros (with a warning).
    Targets are multi-hot vectors over the taxonomy, or the pre-computed
    vector stored in ``secondary_labels`` for rows whose ``primary_label`` is
    the literal ``'soft'``.
    """

    # Currently unused; kept for backward compatibility.
    _taxonomy_cache = {}

    def __init__(self, df, cfg, spectrograms=None, mode="train"):
        """Store the inputs and build the label -> class-index mapping.

        Args:
            df: DataFrame with at least ``samplename`` and ``primary_label``
                columns (``secondary_labels``, ``filepath`` and ``timestamp``
                are read when needed).
            cfg: configuration object.
            spectrograms: optional mapping ``samplename -> 2-D numpy array``.
            mode: augmentations are only applied when this is ``"train"``.
        """
        self.df, self.cfg, self.mode, self.spectrograms = df, cfg, mode, spectrograms

        tax_df = pd.read_csv(cfg.taxonomy_csv)
        self.species_ids = tax_df['primary_label'].tolist()
        self.num_classes = len(self.species_ids)
        self.label_to_idx = {l: i for i, l in enumerate(self.species_ids)}

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _parse_secondary_labels(value):
        """Return the secondary labels of a row as a list of non-empty labels.

        Accepts a list-like, a string holding a Python list literal (as read
        from CSV) or a missing value (``None`` / NaN), which yields ``[]``.

        Raises:
            ValueError: if a string value is not a valid Python literal.
        """
        # Missing cells arrive as None or as a float NaN; a NaN never compares
        # equal to np.nan, so it has to be detected by type.
        if value is None or isinstance(value, float):
            return []
        if isinstance(value, str):
            import ast  # local import: only needed for string-encoded lists

            text = value.strip()
            if not text:
                return []
            try:
                # literal_eval only accepts literals, unlike eval on CSV content
                value = ast.literal_eval(text)
            except (ValueError, SyntaxError) as err:
                raise ValueError(f"Cannot parse secondary_labels: {value!r}") from err
            if isinstance(value, str):
                value = [value]
        if not isinstance(value, (list, tuple, set, np.ndarray)):
            return []
        return [label for label in value if label]

    def __getitem__(self, idx):
        """Return the spectrogram tensor (1, H, W) and target vector for row ``idx``."""
        row = self.df.iloc[idx]
        samplename = row['samplename']

        # Get spectrogram
        if self.spectrograms and samplename in self.spectrograms:
            spec = self.spectrograms[samplename]
        elif not self.cfg.LOAD_DATA:
            # Pass timestamp if available in the row
            timestamp = row.get('timestamp', None)
            spec = process_audio_file(row['filepath'], self.cfg, timestamp=timestamp)
        else:
            spec = np.zeros(self.cfg.TARGET_SHAPE, dtype=np.float32)
            print(f"Warning: Spectrogram for {samplename} not found, using zero array.")

        # Convert to tensor and apply augmentations.
        # NOTE: for float32 input this tensor shares memory with the (possibly
        # cached) numpy array; apply_spec_augmentations works on a copy.
        spec = torch.from_numpy(spec).float().unsqueeze(0)
        if self.mode == "train" and random.random() < self.cfg.aug_prob and self.cfg.spec_augment:
            spec = self.apply_spec_augmentations(spec)

        # Create target
        if row["primary_label"] == 'soft':
            # Soft pseudolabel rows carry the full target vector in 'secondary_labels'
            target = np.asarray(row['secondary_labels'], dtype=np.float32)
        else:
            target = np.zeros(self.num_classes, dtype=np.float32)
            primary_idx = self.label_to_idx.get(row['primary_label'])
            if primary_idx is not None:
                target[primary_idx] = 1.0

            # Handle secondary labels
            sec_labels = self._parse_secondary_labels(row['secondary_labels']) if 'secondary_labels' in row else []
            if sec_labels:
                for label in sec_labels:
                    idx = self.label_to_idx.get(label)
                    if idx is not None:
                        target[idx] = self.cfg.secondary_weight

                if self.cfg.normalize_labels:
                    total = np.sum(target)
                    # Skip all-zero targets (only unknown labels) to avoid NaNs
                    if total > 0:
                        target /= total

        return {'melspec': spec, 'target': torch.from_numpy(target).float()}

    def apply_spec_augmentations(self, spec):
        """Return a randomly augmented copy of ``spec`` (shape (1, H, W)).

        Applies, each with its own probability: time masking, frequency
        masking, brightness/contrast scaling, additive Gaussian noise and a
        small circular shift. The input tensor is never modified.
        """
        # The masks below write in place; clone so the caller's tensor (and any
        # numpy array backing it) stays untouched.
        spec = spec.clone()

        # Time masking
        if random.random() < 0.5:
            for _ in range(random.randint(1, 3)):
                w, s = random.randint(5, 20), random.randint(0, max(0, spec.shape[2] - 20))
                spec[0, :, s:s+w] = 0

        # Frequency masking
        if random.random() < 0.5:
            for _ in range(random.randint(1, 3)):
                h, s = random.randint(5, 20), random.randint(0, max(0, spec.shape[1] - 20))
                spec[0, s:s+h, :] = 0

        # Combined brightness/contrast adjustment and noise
        needs_clamp = False

        if random.random() < 0.5:
            spec = spec * random.uniform(0.8, 1.2) + random.uniform(-0.1, 0.1)
            needs_clamp = True

        if random.random() < 0.3:
            spec = spec + torch.randn_like(spec) * random.uniform(0.001, 0.005)
            needs_clamp = True

        if needs_clamp:
            # Clamp the values back into [0, 1] after scaling/noise
            spec = torch.clamp(spec, 0, 1)

        # Random shifts
        if random.random() < 0.3:
            x, y = random.randint(-4, 4), random.randint(-4, 4)
            if x or y:
                spec = torch.roll(spec, shifts=(y, x), dims=(1, 2))

        return spec

## Model Definition

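Creates the specified model from the timm library with parameters (such as dropout) defined in the config. EfficientNet-style backbones get a projection head, while ResNet/SEResNeXt backbones get an attention head; the model also provides mixup and CutMix helpers that are used during training. Different backbones are supported, but we focused mostly on EfficientNet and ResNeXt.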

In [None]:
# Used for Resnext
class AttBlockV2(nn.Module):
    """Attention pooling head built from two 1x1 ``Conv1d`` layers.

    ``att`` produces attention scores that are softmax-normalised over the
    last axis; ``cla`` produces per-position class outputs. The block output
    is the attention-weighted sum of the class outputs over that axis.
    """

    _SUPPORTED_ACTIVATIONS = ("linear", "sigmoid")

    def __init__(self, in_features: int, out_features: int, activation="linear"):
        """
        Args:
            in_features: number of input channels.
            out_features: number of output channels (classes).
            activation: ``"linear"`` (identity) or ``"sigmoid"``, applied to
                the per-position class outputs.

        Raises:
            ValueError: if ``activation`` is not supported.
        """
        super().__init__()

        # Fail fast instead of letting nonlinear_transform return None later.
        if activation not in self._SUPPORTED_ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation {activation!r}; expected one of {self._SUPPORTED_ACTIVATIONS}"
            )

        self.activation = activation
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )
        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )

        self.init_weights()

    def init_weights(self):
        """Initialise both convolutions with the module-level ``init_layer``."""
        init_layer(self.att)
        init_layer(self.cla)

    def forward(self, x):
        """Return ``(pooled, norm_att, cla)`` for input ``x``."""
        # x: (n_samples, n_in, n_time)
        norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        # Attention-weighted sum over the last (time) axis
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        """Apply the configured activation to ``x``.

        Raises:
            ValueError: if ``self.activation`` is not a supported name.
        """
        if self.activation == "linear":
            return x
        if self.activation == "sigmoid":
            return torch.sigmoid(x)
        raise ValueError(
            f"Unsupported activation {self.activation!r}; expected one of {self._SUPPORTED_ACTIVATIONS}"
        )

def init_layer(layer):
    """Xavier-uniform initialise ``layer.weight`` and zero the bias if the layer has one."""
    nn.init.xavier_uniform_(layer.weight)

    # Layers without a bias attribute, or with bias=None, are left as they are.
    bias = getattr(layer, "bias", None)
    if bias is None:
        return
    bias.data.fill_(0.0)

def init_bn(bn):
    """Reset a batch-norm layer's affine parameters: weight to one, bias to zero."""
    for param, value in ((bn.bias, 0.0), (bn.weight, 1.0)):
        param.data.fill_(value)

In [None]:
class BirdCLEFModel(nn.Module):
    """timm backbone with either an attention head or an EfficientNet-style head.

    Models whose name contains ``'seresnext'`` or ``'resnet'`` use the
    attention-based head (``AttBlockV2``); every other model uses global
    average pooling followed by an (optionally projected) linear classifier.
    ``forward`` returns one output per class for each input sample. The class
    also offers ``mixup_data`` / ``cutmix_data`` helpers for the training loop.
    """

    def __init__(self, cfg):
        """Build backbone and head from ``cfg``.

        The number of classes is the number of rows in ``cfg.taxonomy_csv``.
        """
        super().__init__()
        self.cfg = cfg
        taxonomy_df = pd.read_csv(cfg.taxonomy_csv)
        self.num_classes = len(taxonomy_df)

        # Support for different model architectures
        self.backbone = timm.create_model(
            cfg.model_name,
            pretrained=cfg.pretrained,
            in_chans=cfg.in_channels,
            drop_rate=cfg.dropout_rate,
            drop_path_rate=cfg.drop_path_rate
        )

        # Determine model type based on model_name
        if 'seresnext' in cfg.model_name or 'resnet' in cfg.model_name:
            self.model_type = 'attention_based'
        else:
            self.model_type = 'efficientnet'

        # Extract feature dimension based on model type
        if 'efficientnet' in cfg.model_name:
            backbone_out = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
        elif 'convnext' in cfg.model_name:
            backbone_out = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
        elif 'resnet' in cfg.model_name or 'seresnext' in cfg.model_name:
            backbone_out = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
        else:
            backbone_out = self.backbone.get_classifier().in_features
            self.backbone.reset_classifier(0, '')

        # Setup architecture based on model type
        if self.model_type == 'attention_based':
            # Attention-based architecture (from SEResNeXt notebook)
            self.bn0 = nn.BatchNorm2d(cfg.TARGET_SHAPE[0])
            # Drop the last two children of the backbone and keep the rest as encoder
            layers = list(self.backbone.children())[:-2]
            self.encoder = nn.Sequential(*layers)

            self.fc1 = nn.Linear(backbone_out, backbone_out, bias=True)
            self.att_block = AttBlockV2(backbone_out, self.num_classes, activation="linear")
            self.pooling = None  # Not used in attention model

        else:
            # EfficientNet-style architecture
            self.pooling = nn.AdaptiveAvgPool2d(1)
            self.feat_dim = backbone_out

            # Use self.num_classes (from the taxonomy) rather than
            # cfg.num_classes, which only exists after load_data() has run.
            if hasattr(cfg, 'projection_dim') and cfg.projection_dim > 0:
                self.projection = nn.Sequential(
                    nn.Linear(backbone_out, cfg.projection_dim),
                    nn.BatchNorm1d(cfg.projection_dim),
                    nn.ReLU(inplace=True),
                    nn.Dropout(cfg.projection_dropout),
                    nn.Linear(cfg.projection_dim, self.num_classes)
                )
                self.classifier = self.projection
            else:
                self.classifier = nn.Linear(backbone_out, self.num_classes)

        # Mixup and CutMix support
        self.mixup_enabled = hasattr(cfg, 'mixup_alpha') and cfg.mixup_alpha > 0
        self.cutmix_enabled = hasattr(cfg, 'use_cutmix') and cfg.use_cutmix and cfg.cutmix_alpha > 0

        if self.mixup_enabled:
            self.mixup_alpha = cfg.mixup_alpha
        if self.cutmix_enabled:
            self.cutmix_alpha = cfg.cutmix_alpha

    def extract_feature(self, x):
        """Feature extraction for attention-based models"""
        x = x.permute((0, 1, 3, 2))
        frames_num = x.shape[2]

        # Move the last axis into the channel position so bn0 normalises over it
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        x = x.transpose(2, 3)
        # (batch_size, channels, freq, frames)
        x = self.encoder(x)

        # (batch_size, channels, frames)
        x = torch.mean(x, dim=2)

        # channel smoothing
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2

        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)
        return x, frames_num

    def forward(self, x):
        """Return the per-class outputs for a batch of spectrograms ``x``."""
        if self.model_type == 'attention_based':
            # Attention-based forward pass
            x, frames_num = self.extract_feature(x)
            (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
            return clipwise_output
        else:
            # EfficientNet-style forward pass
            features = self.backbone(x)

            if isinstance(features, dict):
                features = features['features']

            if len(features.shape) == 4:
                features = self.pooling(features)
                features = features.view(features.size(0), -1)

            logits = self.classifier(features)
            return logits

    def mixup_data(self, x, targets):
        """Applies mixup to the data batch

        Returns ``(mixed_x, targets, targets[indices], lam)`` where ``lam`` is
        drawn from ``Beta(mixup_alpha, mixup_alpha)`` and ``indices`` is a
        random permutation of the batch.
        """
        batch_size = x.size(0)
        lam = np.random.beta(self.mixup_alpha, self.mixup_alpha)
        indices = torch.randperm(batch_size).to(x.device, non_blocking=True)
        mixed_x = lam * x + (1 - lam) * x[indices]

        return mixed_x, targets, targets[indices], lam

    def cutmix_data(self, x, targets):
        """Applies CutMix to the data batch.

        A random box is copied from a permuted version of the batch into
        ``x``. Returns ``(x_mixed, targets, targets[indices], lam)`` where
        ``lam`` is the fraction of each sample that was left unchanged.
        """
        batch_size = x.size(0)
        lam = np.random.beta(self.cutmix_alpha, self.cutmix_alpha)

        # Get random indices for mixing
        indices = torch.randperm(batch_size).to(x.device)

        # Get random box coordinates (W and H are simply the sizes of dims 2 and 3)
        W, H = x.size(2), x.size(3)
        cut_ratio = np.sqrt(1. - lam)
        cut_w = np.int_(W * cut_ratio)
        cut_h = np.int_(H * cut_ratio)

        cx = np.random.randint(W)
        cy = np.random.randint(H)

        bbx1 = np.clip(cx - cut_w // 2, 0, W)
        bby1 = np.clip(cy - cut_h // 2, 0, H)
        bbx2 = np.clip(cx + cut_w // 2, 0, W)
        bby2 = np.clip(cy + cut_h // 2, 0, H)

        # Apply cutmix
        x_mixed = x.clone()
        x_mixed[:, :, bbx1:bbx2, bby1:bby2] = x[indices, :, bbx1:bbx2, bby1:bby2]

        # Adjust lambda to actual area ratio
        lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (W * H))

        return x_mixed, targets, targets[indices], lam

## Training Loop
For training one epoch. Uses optimizer, LR-scheduler, etc. defined in config. Collects metrics from training and validation data and returns them.

In [None]:
def train_one_epoch(model, loader, optimizer, criterion, device, scheduler=None, use_amp=True, grad_accum_steps=1):
    """Train ``model`` for one pass over ``loader`` and return training metrics.

    Args:
        model: model exposing ``mixup_enabled`` / ``cutmix_enabled`` flags and
            the matching ``mixup_data`` / ``cutmix_data`` helpers.
        loader: iterable of batches with ``'melspec'`` and ``'target'`` tensors.
        optimizer: optimizer that is stepped every ``grad_accum_steps`` batches
            (and on the last batch).
        criterion: loss called as ``criterion(outputs, targets)``.
        device: device string; also used as the autocast ``device_type``.
        scheduler: only stepped here (once per optimizer step) when it is a
            ``OneCycleLR``; other schedulers are left to the caller.
        use_amp: enable autocast and gradient scaling.
        grad_accum_steps: number of batches to accumulate per optimizer step.

    Returns:
        dict with ``train_loss``, ``train_auc``, ``learning_rate``,
        ``epoch_time_minutes``, ``samples_per_second`` and ``total_samples``.

    Note:
        Reads the module-level ``cfg`` (``batch_size``, ``use_soft_labels``).
        The AUC is computed from the raw model outputs on the (possibly
        mixed) inputs against the original, unmixed targets.
    """
    model.train()
    # A new scaler is created on every call, so its scale state does not carry over.
    scaler = GradScaler(enabled=use_amp)

    # Use lists to accumulate batches, but don't keep all outputs in memory
    batch_count = 0
    running_loss = 0
    outputs_for_metrics = []
    targets_for_metrics = []

    optimizer.zero_grad(set_to_none=True)
    pbar = tqdm(enumerate(loader), total=len(loader), desc="Training")

    # Track additional metrics
    total_samples = 0
    epoch_start_time = time.time()

    for step, batch in pbar:
        inputs = batch['melspec'].to(device, non_blocking=True)
        targets = batch['target'].to(device, non_blocking=True)

        with autocast(enabled=use_amp, device_type=device):
            # Handle model outputs with mixup/cutmix
            if (model.mixup_enabled or model.cutmix_enabled) and model.training:
                if model.mixup_enabled and model.cutmix_enabled:
                    # Randomly choose between mixup and cutmix
                    if random.random() < 0.5:
                        mixed_x, targets_a, targets_b, lam = model.mixup_data(inputs, targets)
                    else:
                        mixed_x, targets_a, targets_b, lam = model.cutmix_data(inputs, targets)
                elif model.mixup_enabled:
                    mixed_x, targets_a, targets_b, lam = model.mixup_data(inputs, targets)
                else:
                    mixed_x, targets_a, targets_b, lam = model.cutmix_data(inputs, targets)

                outputs = model(mixed_x)
                loss = lam * criterion(outputs, targets_a) + (1 - lam) * criterion(outputs, targets_b)
            else:
                outputs = model(inputs)
                loss = criterion(outputs, targets)

        # Normalize loss for gradient accumulation
        loss = loss / grad_accum_steps
        scaler.scale(loss).backward()

        batch_count += 1
        # Undo the accumulation scaling so the logged loss is per batch
        running_loss += loss.item() * grad_accum_steps
        total_samples += inputs.size(0)

        # Cast to float32 before leaving torch: autocast outputs can be
        # bfloat16 (the CPU default), which cannot be converted to numpy.
        outputs_for_metrics.append(outputs.detach().float().cpu())
        targets_for_metrics.append(targets.detach().float().cpu())

        # Step optimizer after accumulating gradients
        if batch_count % grad_accum_steps == 0 or step == len(loader) - 1:
            # Unscale before possible gradient clipping
            scaler.unscale_(optimizer)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

            if scheduler and isinstance(scheduler, lr_scheduler.OneCycleLR):
                scheduler.step()

        # Update progress bar with running loss
        elapsed = pbar.format_dict['elapsed']
        pbar.set_postfix({
            # Guard against a zero elapsed time on the very first update
            'it/s/bs' : pbar.n / elapsed / cfg.batch_size if elapsed else 0.0,
            'train_loss': running_loss / (step + 1),
            'lr': optimizer.param_groups[0]['lr'],
        })
        del inputs, outputs

    all_outputs = torch.cat(outputs_for_metrics)
    all_targets = torch.cat(targets_for_metrics)

    # Metric calculation
    epoch_time = time.time() - epoch_start_time
    if cfg.use_soft_labels:
        soft_metrics = calculate_soft_label_metrics(all_targets.numpy(), all_outputs.numpy())
    else:
        soft_metrics = calculate_hard_label_metrics(all_targets.numpy(), all_outputs.numpy())
    auc = soft_metrics['macro_auc']
    avg_loss = running_loss / len(loader)

    # Return comprehensive metrics
    metrics = {
        'train_loss': avg_loss,
        'train_auc': auc,
        'learning_rate': optimizer.param_groups[0]['lr'],
        'epoch_time_minutes': epoch_time / 60,
        'samples_per_second': total_samples / epoch_time if epoch_time > 0 else 0,
        'total_samples': total_samples
    }

    del outputs_for_metrics, targets_for_metrics, all_outputs, all_targets

    return metrics

# Fixed validate function to properly accumulate predictions
def validate(model, loader, criterion, device, use_amp=True):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: model to evaluate (switched to eval mode here).
        loader: iterable of batches with ``'melspec'`` and ``'target'`` tensors.
        criterion: loss called as ``criterion(outputs, targets)``.
        device: device string; also used as the autocast ``device_type``.
        use_amp: enable autocast.

    Returns:
        ``(metrics, roc_data)`` where ``metrics`` holds ``val_loss``,
        ``val_auc``, ``val_valid_classes``, ``val_time_minutes``,
        ``val_samples_per_second`` and ``val_total_samples``, and ``roc_data``
        is the tuple ``(targets_array, probs_array)`` of stacked numpy arrays.

    Note:
        Reads the module-level ``cfg.use_soft_labels`` to pick the metric function.
    """
    model.eval()
    total_loss = 0.0

    # Store all predictions and targets for final metrics
    all_probs_accumulated = []
    all_targets_accumulated = []
    total_samples = 0
    val_start_time = time.time()

    with torch.no_grad():
        for batch_idx, batch in enumerate(tqdm(loader, desc="Validation")):
            inputs = batch['melspec'].to(device, non_blocking=True)
            targets = batch['target'].to(device, non_blocking=True)

            with autocast(enabled=use_amp, device_type=device):
                outputs = model(inputs)
                loss = criterion(outputs, targets)

            total_loss += loss.item()
            total_samples += inputs.size(0)

            # Cast to float32 first: autocast outputs can be bfloat16 (the CPU
            # default), which cannot be converted to numpy.
            probs = torch.sigmoid(outputs.float()).cpu().numpy()
            targets_np = targets.float().cpu().numpy()

            # Accumulate all predictions and targets
            all_probs_accumulated.append(probs)
            all_targets_accumulated.append(targets_np)

            # Clear memory but keep accumulating
            del inputs, outputs, targets, probs, targets_np

    # Combine all accumulated predictions and targets
    probs_array = np.vstack(all_probs_accumulated)
    targets_array = np.vstack(all_targets_accumulated)

    val_time = time.time() - val_start_time

    # Use macro-averaged AUC as primary metric
    if cfg.use_soft_labels:
        soft_metrics = calculate_soft_label_metrics(targets_array, probs_array)
    else:
        soft_metrics = calculate_hard_label_metrics(targets_array, probs_array)
    auc = soft_metrics['macro_auc']
    valid_classes = soft_metrics['valid_classes']

    avg_loss = total_loss / len(loader)

    # Return comprehensive metrics focused on macro-averaged AUC
    metrics = {
        'val_loss': avg_loss,
        'val_auc': auc,  # This is macro-averaged AUC
        'val_valid_classes': valid_classes,
        'val_time_minutes': val_time / 60,
        'val_samples_per_second': total_samples / val_time if val_time > 0 else 0,
        'val_total_samples': total_samples
    }

    # Return ROC data
    roc_data = (targets_array, probs_array)

    # Clean up memory
    del all_probs_accumulated, all_targets_accumulated

    return metrics, roc_data

## Training one Fold
Reinstantiate datasets, loaders, and model for every fold to prevent leakage between folds. A metric logger collects and stores the metrics so they can be saved and plotted later. At the end, memory and GPU are cleaned up.


In [None]:
def train_fold(cfg, train_df, val_df, fold, metric_logger, spectrograms=None):
    """Train and validate a fresh model on one cross-validation fold.

    Builds datasets, loaders, model, optimizer, criterion and scheduler from
    ``cfg``, runs up to ``cfg.epochs`` epochs with optional early stopping and
    logs every epoch to ``metric_logger``. Outside debug mode, a checkpoint is
    saved to ``cfg.models_dir`` whenever the validation AUC improves.

    Args:
        cfg: configuration object (``cfg.T_max`` is set here for CosineAnnealingLR).
        train_df: training rows for this fold.
        val_df: validation rows for this fold.
        fold: fold index, used for logging and in the checkpoint name.
        metric_logger: object providing
            ``log_metrics(epoch, fold, train_metrics, val_metrics, roc_data)``.
        spectrograms: optional mapping of pre-computed spectrograms passed to
            the datasets.

    Returns:
        ``(fold_metrics, best_auc)`` where ``fold_metrics`` contains
        ``fold_time_minutes``, ``best_epoch`` and ``best_val_auc``.
    """

    print(f'Training set: {len(train_df)} samples')
    print(f'Validation set: {len(val_df)} samples')

    # Prepare datasets
    train_dataset = BirdCLEFDatasetFromNPY(train_df, cfg, spectrograms=spectrograms, mode='train')
    val_dataset = BirdCLEFDatasetFromNPY(val_df, cfg, spectrograms=spectrograms, mode='valid')

    # Honour the config switch instead of hard-coding it (defaults to the old behaviour)
    pin_memory = getattr(cfg, 'pin_memory', True)

    # Prepare data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=cfg.batch_size,
        shuffle=True,
        num_workers=cfg.num_workers,
        pin_memory=pin_memory,
        collate_fn=collate_fn,
        drop_last=True
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=cfg.batch_size * 2,
        shuffle=False,
        num_workers=cfg.num_workers,
        pin_memory=pin_memory,
        collate_fn=collate_fn
    )

    print(f"\n{'-'*20} Training Model {'-'*20}")

    model = BirdCLEFModel(cfg).to(cfg.device, non_blocking=True)
    model = compile_model(model, cfg)
    optimizer = optim.AdamW(
        model.parameters(),
        lr=cfg.lr,
        weight_decay=cfg.weight_decay
    )
    criterion = get_criterion(cfg)

    if cfg.scheduler == 'CosineAnnealingLR':
        cfg.T_max = cfg.epochs
    scheduler = get_scheduler(optimizer, cfg, len(train_loader))

    best_auc, best_epoch = 0, 0
    fold_start_time = time.time()

    # Pre-bind so the cleanup below also works when cfg.epochs == 0
    train_metrics = val_metrics = roc_data = None

    for epoch in range(cfg.epochs):
        print(f"\n{'='*50}")
        print(f"Epoch {epoch+1}/{cfg.epochs} | Fold {fold}")
        print(f"{'='*50}")

        # Get comprehensive training metrics
        # Only a OneCycleLR scheduler is stepped per optimizer step inside the epoch
        train_metrics = train_one_epoch(
            model, train_loader, optimizer, criterion, cfg.device,
            scheduler if isinstance(scheduler, lr_scheduler.OneCycleLR) else None,
            use_amp=cfg.use_amp,
            grad_accum_steps=cfg.gradient_accumulation_steps
        )

        # Get comprehensive validation metrics
        val_metrics, roc_data = validate(model, val_loader, criterion, cfg.device, use_amp=cfg.use_amp)

        # Log metrics to our metrics dataframe
        metric_logger.log_metrics(epoch, fold, train_metrics, val_metrics, roc_data)

        # All other schedulers are stepped once per epoch
        if scheduler is not None and not isinstance(scheduler, lr_scheduler.OneCycleLR):
            if isinstance(scheduler, lr_scheduler.ReduceLROnPlateau):
                scheduler.step(val_metrics['val_loss'])
            else:
                scheduler.step()

        # Print epoch summary
        print(f"\n--- Epoch {epoch+1} Summary ---")
        print(f"Train Loss: {train_metrics['train_loss']:.4f}, Train AUC: {train_metrics['train_auc']:.4f}")
        print(f"Val Loss: {val_metrics['val_loss']:.4f}, Val AUC: {val_metrics['val_auc']:.4f}")
        print(f"LR: {train_metrics['learning_rate']:.6f}, Epoch Time: {train_metrics['epoch_time_minutes']:.2f}m")
        print(f"Valid Classes: {val_metrics['val_valid_classes']}")

        if val_metrics['val_auc'] > best_auc:
            best_auc = val_metrics['val_auc']
            best_epoch = epoch
            print(f"New best AUC: {best_auc:.4f} at epoch {best_epoch+1}")

            if not cfg.debug:
                model_path = f"{cfg.models_dir}/model_{cfg.timestamp}_{cfg.model_name}_fold{fold}.pth"
                torch.save({
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict() if scheduler else None,
                    'epoch': epoch,
                    'best_auc': best_auc,
                    'cfg': cfg
                }, model_path)
        elif cfg.use_early_stopping and epoch - best_epoch >= cfg.early_stopping_epochs:
            # No improvement for early_stopping_epochs epochs in a row
            print(f"Early stopping at epoch {epoch+1}")
            break

    # Calculate fold completion time
    fold_time = time.time() - fold_start_time
    fold_metrics = {
        'fold_time_minutes': fold_time / 60,
        'best_epoch': best_epoch,
        'best_val_auc': best_auc
    }

    print(f"\n*** FOLD {fold} COMPLETE ***")
    print(f"Best AUC: {best_auc:.4f} at epoch {best_epoch+1}")
    print(f"Fold training time: {fold_time/60:.1f} minutes")

    # Drop the local references before cleaning the GPU memory
    del model, optimizer, scheduler, criterion
    del train_loader, val_loader
    del train_dataset, val_dataset
    del val_df, train_df, spectrograms
    del train_metrics, val_metrics, roc_data
    clean_gpu_memory(cfg)

    return fold_metrics, best_auc

## Data Loading
If enabled, applies data filtering to find the best five seconds of training audio and generate dataframe for pseudo-labeled samples. Also loads spectrograms, if LOAD_DATA is set to True.

In [None]:
def load_data(df, cfg, soundscape_df=None):
    """Assemble the training dataframe and the matching spectrogram lookup.

    Reads the taxonomy CSV to set ``cfg.num_classes``, optionally subsamples
    ``df`` in debug mode, prepares the labeled samples (filtered or
    unfiltered) and, when enabled, appends pseudo-labeled soundscape samples.

    Args:
        df: Training metadata. The unfiltered path reads its ``filename``
            column and works on a copy, so the caller's frame does not gain
            the derived ``filepath``/``samplename`` columns.
        cfg: Configuration object. Attributes read here: ``taxonomy_csv``,
            ``debug``, ``seed``, ``use_train_filtering``,
            ``train_pseudolabels``, ``LOAD_DATA``,
            ``train_full_spectrograms``, ``train_datadir``,
            ``spectrogram_npy``, ``use_external_pseudolabels``,
            ``use_soft_labels``, ``pseudolabel_confidence_threshold`` and
            ``train_soundscapes_spectrograms``. ``cfg.num_classes`` is
            written.
        soundscape_df: Optional external pseudolabel table. Pseudolabels are
            only added when this is not ``None`` and
            ``cfg.use_external_pseudolabels`` is set.

    Returns:
        Tuple ``(df, spectrograms)``. ``spectrograms`` maps sample names to
        arrays, or is ``None`` when nothing was loaded or generated here
        (unfiltered data, ``LOAD_DATA`` off, no pseudolabels).
    """
    taxonomy_df = pd.read_csv(cfg.taxonomy_csv)
    cfg.num_classes = len(taxonomy_df['primary_label'])

    if cfg.debug:
        print("Debug mode: using a small subset of the data")
        df = df.sample(min(1000, len(df)), random_state=cfg.seed).reset_index(drop=True)

    if cfg.use_train_filtering:
        external_df = pd.read_csv(cfg.train_pseudolabels)
        df = filter_training_labels(df, external_df, cfg)

        if cfg.LOAD_DATA:
            # One .npy file per sample; the directory prefix is expected to
            # already end with a path separator.
            spectrograms = {
                name: np.load(f"{cfg.train_full_spectrograms}{name}.npy", allow_pickle=True)
                for name in tqdm(df['samplename'], desc="Loading training spectrograms", total=len(df))
            }
        else:
            # Generate spectrograms on-the-fly for filtered training data
            spectrograms = generate_spectrograms(df, cfg)

    else:
        # Copy first: without it the derived columns would be written onto
        # the caller's DataFrame whenever debug mode is off.
        df = df.copy()
        df['filepath'] = cfg.train_datadir + '/' + df['filename']
        df['samplename'] = df['filename'].map(
            lambda x: x.split('/')[0] + '-' + x.split('/')[-1].split('.')[0]
        )

        if cfg.LOAD_DATA:
            # The archive holds a pickled dict, hence allow_pickle; only load
            # files from a trusted source.
            spectrograms = np.load(cfg.spectrogram_npy, allow_pickle=True).item()
            print(f"Loaded {len(spectrograms)} pre-computed mel spectrograms for labeled data")
        else:
            spectrograms = None
            print("Will generate spectrograms on-the-fly during training.")

    if cfg.use_external_pseudolabels and soundscape_df is not None:
        print("Loading external pseudolabels...")
        if cfg.use_soft_labels:
            print("Using soft pseudolabels")
            pseudolabel_df = load_soft_pseudolabels(soundscape_df, cfg)
        else:
            print("Filtering pseudolabels with confidence threshold:", cfg.pseudolabel_confidence_threshold)
            pseudolabel_df = load_pseudolabels(soundscape_df, cfg)

        if cfg.LOAD_DATA:
            if spectrograms is None:
                spectrograms = {}
            for name in tqdm(pseudolabel_df['samplename'], desc="Loading soundscape spectrograms",
                             total=len(pseudolabel_df)):
                spectrograms[name] = np.load(
                    f"{cfg.train_soundscapes_spectrograms}{name}.npy", allow_pickle=True
                )
        else:
            # Generate spectrograms on-the-fly for pseudolabels
            soundscape_spectrograms = generate_spectrograms(pseudolabel_df, cfg)

            # Merge with existing spectrograms if any
            if spectrograms is None:
                spectrograms = soundscape_spectrograms
            else:
                spectrograms.update(soundscape_spectrograms)
            # NOTE(review): the message below says "will generate", but
            # generate_spectrograms has already been called above - confirm
            # the wording against what that helper actually returns.
            print("Will generate soundscape spectrograms on-the-fly during data loading.")

        df = pd.concat([df, pseudolabel_df], ignore_index=True)

    return df, spectrograms

## Finally: Training!
Load the dataframes from the CSV files, run the data loading to generate the samples, split them into folds, and then train the folds in separate processes to avoid leakage (degradation of the validation score across successive folds was a major problem and was finally fixed by this clear separation). At the end, save the metrics and config and display the final scores.

In [None]:
# Cross-validation driver: load the data once, build the fold splits, train
# the requested folds and report the per-fold and aggregate validation AUC.
metric_logger = MetricLogger(cfg)
all_fold_metrics = []
best_scores = []
# Real fold ids of the folds that were trained. cfg.folds_to_train may skip
# folds, so the position in best_scores is not the fold number.
trained_folds = []
train_df = pd.read_csv(cfg.train_csv)

soundscape_df = None
if cfg.use_external_pseudolabels:
    print(f"Will use external pseudolabels from: {cfg.external_pseudolabels}")
    soundscape_df = pd.read_csv(cfg.external_pseudolabels)

# Pass a copy so the raw metadata in train_df stays untouched, without
# reading the CSV a second time.
df, spectrograms = load_data(train_df.copy(), cfg, soundscape_df=soundscape_df)

# Create folds
if cfg.n_fold > 1:
    if cfg.use_improved_folds:
        folds = get_improved_folds(df, cfg)
    else:
        skf = StratifiedKFold(n_splits=cfg.n_fold, shuffle=True, random_state=cfg.seed)
        folds = skf.split(df, df['primary_label'])
else:
    # Single-fold mode: train and validate on the full dataset.
    folds = [(np.arange(len(df)), np.arange(len(df)))]

for fold, (train_idx, val_idx) in enumerate(folds):
    if cfg.n_fold > 1 and fold not in cfg.folds_to_train:
        continue
    print(f"\n{'*' * 30} Starting Fold {fold} {'*' * 30}")
    # Each fold gets its own copy of the spectrogram lookup so nothing done
    # to it during training can carry over into the next fold.
    spectrograms_fold = deepcopy(spectrograms)
    set_seed(cfg.seed + fold)
    train_df = df.iloc[train_idx].reset_index(drop=True)
    val_df = df.iloc[val_idx].reset_index(drop=True)

    if __name__ == "__main__":
        # Run training and get best scores
        fold_metrics, best_auc = train_fold(cfg, train_df, val_df, fold, metric_logger, spectrograms=spectrograms_fold)
        trained_folds.append(fold)
        best_scores.append(best_auc)
        all_fold_metrics.append(fold_metrics)

print("\n" + "="*60 + "\n" + "CROSS-VALIDATION RESULTS:" + "\n" + "="*60)
if best_scores:
    for fold_id, score, metrics in zip(trained_folds, best_scores, all_fold_metrics):
        print(f"Fold {fold_id}: {score:.4f} (Best epoch: {metrics['best_epoch']+1})")

    print(f"\nMean AUC: {np.mean(best_scores):.4f} ± {np.std(best_scores):.4f}")
    print(f"Min AUC: {np.min(best_scores):.4f}")
    print(f"Max AUC: {np.max(best_scores):.4f}")
else:
    # np.min / np.max raise on an empty sequence, so report the situation
    # instead of crashing after the loop.
    print("No folds were trained; check cfg.folds_to_train.")
print("="*60)

print("\nTraining complete!")

# Generate and save plots & config
if not cfg.debug:
    metric_logger.plot_metrics()
    cfg.save_config()

# Additional analysis
print("\nFinal Results Summary:")
print(f"Best scores per fold: {[f'{score:.4f}' for score in best_scores]}")
if best_scores:
    print(f"Cross-validation mean: {np.mean(best_scores):.4f} ± {np.std(best_scores):.4f}")

## Cross-Validation Analysis

Before training, let's analyze class distribution across folds to understand potential causes of variance.

In [None]:
def analyze_fold_distributions(df, cfg):
    """Analyze and visualize validation class distributions across folds.

    Builds the same fold split the training loop would use (improved folds
    or ``StratifiedKFold``), counts the validation samples per
    ``primary_label`` in every fold and derives per-class mean, standard
    deviation and coefficient of variation. Shows a heatmap of the 20 most
    variable classes and a histogram of the CV values.

    Args:
        df: DataFrame with a ``primary_label`` column.
        cfg: Configuration object; reads ``use_improved_folds``, ``n_fold``
            and ``seed``.

    Returns:
        DataFrame indexed by class label with one ``'Fold <i>'`` column per
        fold plus ``'Mean'``, ``'Std'`` and ``'CV'`` (percent), sorted by
        ``'CV'`` in descending order.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Create folds
    if cfg.use_improved_folds:
        fold_indices = get_improved_folds(df, cfg)
    else:
        skf = StratifiedKFold(n_splits=cfg.n_fold, shuffle=True, random_state=cfg.seed)
        fold_indices = list(skf.split(df, df['primary_label']))

    # Count validation samples per class per fold
    fold_class_counts = {
        f'Fold {fold_idx}': df.iloc[val_idx]['primary_label'].value_counts()
        for fold_idx, (_, val_idx) in enumerate(fold_indices)
    }

    # Classes missing from a fold show up as NaN; treat them as zero samples.
    counts = pd.DataFrame(fold_class_counts).fillna(0)
    fold_columns = list(counts.columns)

    # Compute every statistic from the per-fold counts only. Deriving 'Std'
    # from a frame that already contains 'Mean' would treat the mean as an
    # extra fold and understate the spread.
    fold_df = counts.copy()
    fold_df['Mean'] = counts.mean(axis=1)
    fold_df['Std'] = counts.std(axis=1)
    fold_df['CV'] = (fold_df['Std'] / fold_df['Mean']) * 100  # CV as percentage

    # Sort by CV to identify classes with highest variance
    fold_df = fold_df.sort_values('CV', ascending=False)

    # Visualize distribution for top 20 most variable classes
    plt.figure(figsize=(15, 10))
    sns.heatmap(fold_df[fold_columns].head(20),
                annot=True, cmap='YlGnBu', fmt='.0f')
    plt.title('Top 20 Classes with Highest Variance Across Folds')
    plt.tight_layout()
    plt.show()

    # Plot coefficient of variation distribution
    median_cv = fold_df['CV'].median()
    plt.figure(figsize=(15, 6))
    sns.histplot(fold_df['CV'].dropna(), bins=50)
    plt.title('Distribution of Coefficient of Variation Across Classes')
    plt.xlabel('Coefficient of Variation (%)')
    plt.axvline(median_cv, color='red', linestyle='--',
                label=f'Median CV: {median_cv:.2f}%')
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Full table, most variable classes first
    return fold_df

# Fold-balance report: re-read the raw training metadata, run the fold
# analysis on it and show the ten classes whose counts vary most.
train_df = pd.read_csv(cfg.train_csv)
fold_analysis = analyze_fold_distributions(train_df, cfg)
print("Top 10 most variable classes across folds:")
display(fold_analysis.loc[:, ['Mean', 'Std', 'CV']].head(10))