# **BirdCLEF 2025 Training Notebook**

This is a baseline training pipeline for BirdCLEF 2025 using EfficientNet-B0 with PyTorch and timm (for the pretrained EfficientNet weights). You can find the inference and preprocessing notebooks at the following links:

- [EfficientNet B0 Pytorch [Inference] | BirdCLEF'25](https://www.kaggle.com/code/kadircandrisolu/efficientnet-b0-pytorch-inference-birdclef-25)

  
- [Transforming Audio-to-Mel Spec. | BirdCLEF'25](https://www.kaggle.com/code/kadircandrisolu/transforming-audio-to-mel-spec-birdclef-25)  

Note that when Debug Mode is enabled (`debug = True` in `CFG`), the notebook trains on a single fold for only 2 epochs; the [weight](https://www.kaggle.com/datasets/kadircandrisolu/birdclef25-effnetb0-starter-weight) I used in the inference notebook was obtained after 10 epochs of training.

**Features**
* Implemented with PyTorch and timm
* Flexible audio processing with both pre-computed and on-the-fly mel spectrograms
* Stratified 5-fold cross-validation with ensemble capability
* Mixup training for improved generalization
* Spectrogram augmentations (time/frequency masking, brightness adjustment)
* AdamW optimizer with Cosine Annealing LR scheduling
* Debug mode for quick experimentation with smaller datasets

**Pre-computed Spectrograms**
For faster training, you can use pre-computed mel spectrograms from [this dataset](https://www.kaggle.com/datasets/kadircandrisolu/birdclef25-mel-spectrograms) by setting `LOAD_DATA = True`

## Libraries

In [2]:
# Basic imports
import numpy as np, pandas as pd, math, os, random, warnings, json, datetime
from tqdm.auto import tqdm


# Specific imports
import logging, gc, cv2

# Audio processing imports
import librosa

# PyTorch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from torch.amp import autocast, GradScaler

# Other ML imports
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import timm

# Custom imports
from processing import audio2melspec, process_audio_file, generate_spectrograms
from utilities import set_seed, collate_fn
from training_utilities import get_optimizer, get_scheduler, get_criterion, clean_gpu_memory, calculate_auc, compile_model

# Suppress warnings and set logging level
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.ERROR)

## Configuration

In [None]:
class CFG:
    """Central configuration for the training notebook.

    All settings are class attributes so they can be read either from the
    class or from an instance. ``update_debug_settings`` shrinks the run when
    ``debug`` is set, and ``save_config`` dumps every non-callable, non-dunder
    attribute to a JSON file inside ``OUTPUT_DIR``.
    """

    seed = 42
    debug = False
    LOAD_DATA = True  # True: use pre-computed spectrograms; False: generate on the fly

    # Paths and directories
    OUTPUT_DIR = 'output/'
    train_datadir = 'birdclef-2025/train_audio'
    train_csv = 'birdclef-2025/train.csv'
    train_soundscapes = 'birdclef-2025/train_soundscapes'
    test_soundscapes = 'birdclef-2025/test_soundscapes'
    submission_csv = 'birdclef-2025/sample_submission.csv'
    taxonomy_csv = 'birdclef-2025/taxonomy.csv'
    spectrogram_npy = 'archive/train_melspec_5_256_256.npy'
    train_soundscapes_spectrograms = 'archive/train_soundscapes_melspec_12x5_256_256/'

    # External pseudolabels settings
    use_external_pseudolabels = True
    external_pseudolabels = 'pseudolabels_better.csv'
    pseudolabel_confidence_threshold = 0.1  # Only use predictions above this threshold
    max_pseudolabels = 10000  # Maximum number of pseudolabeled samples to use
    stratified_pseudolabels = False  # Use stratified sampling for pseudolabels

    # Evaluated once, when the class body runs; used in saved file names.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    # Training settings
    epochs = 15
    n_fold = 5

    # Mel spectrogram parameters
    FS = 32000
    TARGET_DURATION = 5.0
    TARGET_SHAPE = (256, 256)
    N_FFT = 1024
    HOP_LENGTH = 512
    N_MELS = 128
    FMIN = 50
    FMAX = 14000

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    # Loss parameters
    criterion = 'CombinedLoss'  # Options: 'BCEWithLogitsLoss', 'FocalLoss', 'CombinedLoss'
    focal_alpha = 1.0
    focal_gamma = 3.0
    bce_weight = 0.5
    focal_weight = 0.5

    # Optimizer and scheduler parameters
    optimizer = 'AdamW'
    lr = 5e-4
    weight_decay = 1e-5
    scheduler = 'CosineAnnealingLR'
    min_lr = 1e-6
    use_lr_warmup = True
    warmup_epochs = 2

    # Augmentation options
    aug_prob = 0.5
    spec_augment = True
    spec_augment_params = {
        'time_mask_param': 30,
        'freq_mask_param': 20,
        'num_masks': 2,
    }
    mixup_alpha = 0.5
    cutmix_alpha = 1.0
    use_cutmix = True

    # Model architecture options
    model_name = 'efficientnet_b0'  # Options: 'efficientnetv2_s', 'convnext_tiny', 'efficientnet_b0'
    pretrained = True
    in_channels = 1
    dropout_rate = 0.2
    drop_path_rate = 0.2

    # Regularization techniques
    label_smoothing = 0.01
    use_stochastic_depth = True

    # Memory and speed optimizations
    gradient_accumulation_steps = 4  # Increase effective batch size without more memory
    use_amp = True                   # Use automatic mixed precision
    pin_memory = True                # Faster data transfer to GPU
    persistent_workers = True        # Keep workers alive between epochs
    # 0 loads data in the main process; raise it towards the CPU core count to
    # parallelise loading.
    # NOTE(review): torch's DataLoader rejects persistent_workers=True and a
    # prefetch_factor when num_workers == 0 -- confirm the loader construction
    # guards these two options.
    num_workers = 0
    prefetch_factor = 4              # Number of batches to prefetch (default is 2)
    batch_size = 32                  # Effective batch size will be batch_size * gradient_accumulation_steps

    # Compiler settings
    compile_backend = "eager"        # Options: "eager", "torchscript", "onnx", "inductor"
    compile_mode = "default"         # Options: "default", "reduce-overhead", "max-autotune"

    # Memory usage management
    gc_after_epoch = True            # Force garbage collection after each epoch
    cache_dataset = False            # Cache dataset in memory if possible

    def update_debug_settings(self):
        """Shrink the run to a single fold and two epochs when ``debug`` is set."""
        if self.debug:
            self.n_fold = 1
            self.epochs = 2

    def save_config(self):
        """Write the current configuration as JSON into ``OUTPUT_DIR``.

        The output directory is created when it does not exist yet. Values
        that ``json`` cannot serialise are stringified (``default=str``).
        """
        config_dict = {
            attr: getattr(self, attr)
            for attr in dir(self)
            if not attr.startswith('__') and not callable(getattr(self, attr))
        }
        suffix = "_DEBUG" if self.debug else ""
        filename = f"config_{self.timestamp}_{self.model_name}{suffix}.json"
        config_path = os.path.join(self.OUTPUT_DIR, filename)

        # A fresh checkout has no output directory; open() would fail without this.
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
        with open(config_path, 'w') as f:
            json.dump(config_dict, f, indent=4, default=str)
        print(f"Config saved to {config_path}")

# Build the shared configuration object used by the rest of the notebook.
cfg = CFG()
# `set_seed` comes from the project's `utilities` module; presumably it seeds
# the Python / NumPy / PyTorch RNGs -- confirm against that module.
set_seed(cfg.seed)
# Only has an effect when cfg.debug is True (single fold, 2 epochs).
cfg.update_debug_settings()

Using device: cuda


## Pre-processing
These functions handle the transformation of audio files to mel spectrograms for model input, with flexibility controlled by the `LOAD_DATA` parameter. The process involves either loading pre-computed spectrograms from this [dataset](https://www.kaggle.com/datasets/kadircandrisolu/birdclef25-mel-spectrograms) (when `LOAD_DATA=True`) or dynamically generating them (when `LOAD_DATA=False`), transforming audio data into spectrogram representations, and preparing it for the neural network.

## Dataset Preparation and Data Augmentations
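We'll convert audio to mel spectrograms and, for training samples, apply random spectrogram augmentations with probability `aug_prob` (50% by default): time and frequency masking, gain/bias (brightness) adjustment, Gaussian noise, and small time/frequency shifts, each triggered independently with its own probability. This randomized approach creates diverse training samples from the same audio files.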

In [4]:
class BirdCLEFDatasetFromNPY(Dataset):
    """Dataset yielding ``{'melspec', 'target'}`` items for BirdCLEF training.

    Spectrograms are looked up by sample name in ``spectrograms`` when given;
    otherwise they are generated with ``process_audio_file`` (only when
    ``cfg.LOAD_DATA`` is False) or replaced by zeros with a warning.

    Args:
        df: Metadata frame. For regular audio it needs ``filename`` and
            ``primary_label`` (``secondary_labels`` is optional); for
            soundscapes it needs ``row_id``. NOTE: ``df`` is modified in
            place (columns are added/renamed), as in the original code.
        cfg: Configuration object (paths, ``aug_prob``, ``TARGET_SHAPE``, ...).
        spectrograms: Optional mapping ``samplename -> 2-D array``.
        mode: ``"train"`` enables random augmentation; any other value disables it.
        label_dict: ``samplename -> [labels]`` mapping, required when
            ``soundscape_loading`` is True (first label is the primary one).
        soundscape_loading: Treat ``df`` as pseudolabeled soundscape rows.
    """

    _cache = {}  # Class-level cache shared by all instances: key -> (spec, target)

    def __init__(self, df, cfg, spectrograms=None, mode="train", label_dict=None, soundscape_loading=False):
        self.df = df
        self.cfg = cfg
        self.mode = mode
        self.spectrograms = spectrograms
        self.label_dict = label_dict
        self.soundscape_loading = soundscape_loading

        taxonomy_df = pd.read_csv(self.cfg.taxonomy_csv)
        self.species_ids = taxonomy_df['primary_label'].tolist()
        self.num_classes = len(self.species_ids)
        self.label_to_idx = {label: idx for idx, label in enumerate(self.species_ids)}
        self.data_dir = cfg.train_datadir if not soundscape_loading else cfg.train_soundscapes

        if self.soundscape_loading:
            # Sample names look like "<a>_<b>_<c>_<seconds>": the first three
            # parts name the .ogg file, the fourth is parsed as an int timestamp.
            self.df.rename(columns={"row_id": "samplename"}, inplace=True)
            self.df['filename'] = self.df['samplename'].apply(lambda x: "_".join(x.split('_')[:3]) + ".ogg")
            self.df['timestamp'] = self.df['samplename'].apply(lambda x: int(x.split('_')[3]))
            self.df['filepath'] = self.df['filename'].apply(lambda x: os.path.join(self.data_dir, x))
            self.df['primary_label'] = self.df['samplename'].apply(lambda x: self.label_dict.get(x, [])[0])
            self.df['secondary_labels'] = self.df['samplename'].apply(lambda x: self.label_dict.get(x, [])[1:])
        else:
            self.df['filepath'] = self.data_dir + '/' + self.df.filename
            self.df['samplename'] = self.df.filename.map(
                lambda x: x.split('/')[0] + '-' + x.split('/')[-1].split('.')[0]
            )

        if cfg.debug:
            self.df = self.df.sample(min(1000, len(self.df)), random_state=cfg.seed).reset_index(drop=True)

        self.use_cache = getattr(cfg, 'cache_dataset', False)
        self.cache_hits = 0
        self.cache_misses = 0

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return one sample as ``{'melspec': (1, H, W) tensor, 'target': tensor}``."""
        row = self.df.iloc[idx]
        samplename = row['samplename']

        # Key on the sample name, not the positional index: the cache is shared
        # by every instance, and index 0 means a different recording in every
        # fold / dataset.
        cache_key = f"{self.mode}_{samplename}"
        cached = self.__class__._cache.get(cache_key) if self.use_cache else None

        if cached is not None:
            self.cache_hits += 1
            spec, target = cached
        else:
            spec = torch.from_numpy(self._load_spectrogram(row, samplename)).float().unsqueeze(0)
            target = torch.from_numpy(self._build_target(row)).float()

            if self.use_cache:
                self.cache_misses += 1
                # Cache the clean (un-augmented) tensors so augmentation stays random.
                self.__class__._cache[cache_key] = (spec, target)

                # Print cache stats occasionally
                total = self.cache_hits + self.cache_misses
                if total % 1000 == 0:
                    hit_rate = self.cache_hits / total
                    print(f"Cache hit rate: {hit_rate:.2%}, hits: {self.cache_hits}, misses: {self.cache_misses}")

        if self.mode == "train" and random.random() < self.cfg.aug_prob:
            spec = self.apply_spec_augmentations(spec)

        return {
            'melspec': spec,
            'target': target,
        }

    def _load_spectrogram(self, row, samplename):
        """Fetch the spectrogram array for ``samplename`` (pre-computed, generated, or zeros)."""
        if self.spectrograms and samplename in self.spectrograms:
            return self.spectrograms[samplename]
        if not self.cfg.LOAD_DATA:
            return process_audio_file(row['filepath'], self.cfg)
        print(f"Warning: Spectrogram for {samplename} not found and could not be generated")
        return np.zeros(self.cfg.TARGET_SHAPE, dtype=np.float32)

    def _build_target(self, row):
        """Build the multi-hot target from the primary and secondary labels of ``row``."""
        target = self.encode_label(row['primary_label'])
        for label in self._parse_secondary_labels(row.get('secondary_labels')):
            label_idx = self.label_to_idx.get(label)
            if label_idx is not None:
                target[label_idx] = 1.0
        return target

    @staticmethod
    def _parse_secondary_labels(value):
        """Normalise a ``secondary_labels`` cell to a list of label strings."""
        import ast

        if value is None or isinstance(value, float):
            # Missing cells arrive as None or as a float NaN from pandas.
            return []
        if isinstance(value, str):
            value = value.strip()
            if not value:
                return []
            # The CSV stores a Python list literal; parse it without executing code.
            value = ast.literal_eval(value)
        return list(value)

    def apply_spec_augmentations(self, spec):
        """Return a randomly augmented copy of ``spec`` (shape ``(1, H, W)``).

        The input tensor is never modified: it may share memory with the
        pre-computed spectrogram array (``torch.from_numpy``) or with the cache.
        """
        spec = spec.clone()

        # Time masking: zero out random column bands
        if random.random() < 0.5:
            for _ in range(random.randint(1, 3)):
                width = min(random.randint(5, 20), spec.shape[2])
                start = random.randint(0, spec.shape[2] - width)
                spec[0, :, start:start + width] = 0

        # Frequency masking: zero out random row bands
        if random.random() < 0.5:
            for _ in range(random.randint(1, 3)):
                height = min(random.randint(5, 20), spec.shape[1])
                start = random.randint(0, spec.shape[1] - height)
                spec[0, start:start + height, :] = 0

        # Random brightness/contrast adjustment
        if random.random() < 0.5:
            gain = random.uniform(0.8, 1.2)
            bias = random.uniform(-0.1, 0.1)
            spec = spec * gain + bias
            spec = torch.clamp(spec, 0, 1)

        # Gaussian noise for robustness
        if random.random() < 0.3:
            noise = torch.randn_like(spec) * random.uniform(0.001, 0.005)
            spec = spec + noise
            spec = torch.clamp(spec, 0, 1)

        # Random circular shifts along both spectrogram axes
        if random.random() < 0.3:
            shift_x = random.randint(-4, 4)
            shift_y = random.randint(-4, 4)
            spec = torch.roll(spec, shifts=(shift_y, shift_x), dims=(1, 2))

        return spec

    def encode_label(self, label):
        """Encode label to one-hot vector (all zeros when the label is unknown)."""
        target = np.zeros(self.num_classes)
        idx = self.label_to_idx.get(label)
        if idx is not None:
            target[idx] = 1.0
        return target

## Model Definition

In [5]:
class BirdCLEFModel(nn.Module):
    """timm backbone with its head removed, followed by a linear (or projected) classifier.

    The number of output classes is the row count of ``cfg.taxonomy_csv`` and
    is also written back to ``cfg.num_classes``. The class additionally offers
    ``mixup_data`` / ``cutmix_data`` helpers for batch mixing.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        cfg.num_classes = len(pd.read_csv(cfg.taxonomy_csv))

        # Backbone from timm; drop_path_rate falls back to 0.2 when cfg lacks it
        self.backbone = timm.create_model(
            cfg.model_name,
            pretrained=cfg.pretrained,
            in_chans=cfg.in_channels,
            drop_rate=cfg.dropout_rate,
            drop_path_rate=getattr(cfg, 'drop_path_rate', 0.2),
        )

        # Strip the architecture-specific head and remember its input width
        arch = cfg.model_name
        if 'efficientnet' in arch:
            feature_width = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
        elif 'convnext' in arch:
            feature_width = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
        elif 'resnet' in arch:
            feature_width = self.backbone.fc.in_features
            self.backbone.fc = nn.Identity()
        else:
            feature_width = self.backbone.get_classifier().in_features
            self.backbone.reset_classifier(0, '')

        self.pooling = nn.AdaptiveAvgPool2d(1)
        self.feat_dim = feature_width

        # Optional projection head, used only when cfg defines a positive projection_dim
        if getattr(cfg, 'projection_dim', 0) > 0:
            hidden = cfg.projection_dim
            self.projection = nn.Sequential(
                nn.Linear(feature_width, hidden),
                nn.BatchNorm1d(hidden),
                nn.ReLU(inplace=True),
                nn.Dropout(0.3),
                nn.Linear(hidden, cfg.num_classes),
            )
            self.classifier = self.projection
        else:
            self.classifier = nn.Linear(feature_width, cfg.num_classes)

        # Mixup and CutMix switches, read by the training loop
        self.mixup_enabled = hasattr(cfg, 'mixup_alpha') and cfg.mixup_alpha > 0
        self.cutmix_enabled = (
            hasattr(cfg, 'use_cutmix')
            and cfg.use_cutmix
            and hasattr(cfg, 'cutmix_alpha')
            and cfg.cutmix_alpha > 0
        )

        if self.mixup_enabled:
            self.mixup_alpha = cfg.mixup_alpha
        if self.cutmix_enabled:
            self.cutmix_alpha = cfg.cutmix_alpha

    def forward(self, x, targets=None):
        """Return logits, or ``(logits, loss)`` when mixup is applied internally.

        Internal mixup only happens in training mode, with mixup enabled and
        ``targets`` supplied; the loss is then a mixup-weighted BCE-with-logits.
        """
        mix_inside = self.training and self.mixup_enabled and targets is not None

        if mix_inside:
            x, targets_a, targets_b, lam = self.mixup_data(x, targets)

        features = self.backbone(x)
        if isinstance(features, dict):
            features = features['features']

        # Spatial feature maps are pooled and flattened to (batch, channels)
        if features.dim() == 4:
            features = torch.flatten(self.pooling(features), 1)

        logits = self.classifier(features)

        if not mix_inside:
            return logits

        loss = self.mixup_criterion(F.binary_cross_entropy_with_logits, logits, targets_a, targets_b, lam)
        return logits, loss

    def mixup_data(self, x, targets):
        """Blend each sample with a randomly chosen partner from the batch.

        Returns ``(mixed_x, targets, partner_targets, lam)`` with ``lam``
        drawn from ``Beta(mixup_alpha, mixup_alpha)``.
        """
        lam = np.random.beta(self.mixup_alpha, self.mixup_alpha)
        partner = torch.randperm(x.size(0)).to(x.device, non_blocking=True)
        blended = lam * x + (1 - lam) * x[partner]
        return blended, targets, targets[partner], lam

    def mixup_criterion(self, criterion, pred, y_a, y_b, lam):
        """Combine the criterion on both target sets, weighted by ``lam``."""
        loss_a = criterion(pred, y_a)
        loss_b = criterion(pred, y_b)
        return lam * loss_a + (1 - lam) * loss_b

    def cutmix_data(self, x, targets):
        """Paste a random box from a shuffled copy of the batch into ``x``.

        Returns ``(x_mixed, targets, partner_targets, lam)`` where ``lam`` is
        recomputed as the fraction of each sample left untouched by the box.
        """
        lam = np.random.beta(self.cutmix_alpha, self.cutmix_alpha)

        # Partner sample for every batch element
        partner = torch.randperm(x.size(0)).to(x.device)

        # Box extents along tensor dims 2 and 3, scaled by sqrt(1 - lam)
        size_a, size_b = x.size(2), x.size(3)
        scale = np.sqrt(1. - lam)
        span_a = np.int_(size_a * scale)
        span_b = np.int_(size_b * scale)

        # Random box centre (drawn in the same order: dim 2 first, then dim 3)
        centre_a = np.random.randint(size_a)
        centre_b = np.random.randint(size_b)

        lo_a = np.clip(centre_a - span_a // 2, 0, size_a)
        lo_b = np.clip(centre_b - span_b // 2, 0, size_b)
        hi_a = np.clip(centre_a + span_a // 2, 0, size_a)
        hi_b = np.clip(centre_b + span_b // 2, 0, size_b)

        # Copy the box from the partner samples into a clone of the batch
        patched = x.clone()
        patched[:, :, lo_a:hi_a, lo_b:hi_b] = x[partner, :, lo_a:hi_a, lo_b:hi_b]

        # The box may have been clipped, so derive lam from its real area
        box_area = (hi_a - lo_a) * (hi_b - lo_b)
        lam = 1 - (box_area / (size_a * size_b))

        return patched, targets, targets[partner], lam

## Training Utilities
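We configure the optimization strategy with the AdamW optimizer, cosine-annealing learning-rate scheduling, and the loss selected by `CFG.criterion` (`BCEWithLogitsLoss`, `FocalLoss`, or `CombinedLoss`; `CombinedLoss` in the configuration above).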

## Training Loop

In [6]:
def _mix_batch(model, inputs, targets):
    """Apply mixup or cutmix as enabled on ``model``; return ``None`` when neither is on.

    When both are enabled one of them is picked with equal probability.
    The result is ``(mixed_inputs, targets_a, targets_b, lam)``.
    """
    use_mixup = model.mixup_enabled
    use_cutmix = model.cutmix_enabled
    if not (use_mixup or use_cutmix):
        return None
    if use_mixup and use_cutmix:
        mixer = model.mixup_data if random.random() < 0.5 else model.cutmix_data
    elif use_mixup:
        mixer = model.mixup_data
    else:
        mixer = model.cutmix_data
    return mixer(inputs, targets)


def train_one_epoch(model, loader, optimizer, criterion, device, scheduler=None, use_amp=True, grad_accum_steps=1):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: Module exposing ``mixup_enabled`` / ``cutmix_enabled`` and the
            matching ``mixup_data`` / ``cutmix_data`` helpers.
        loader: Iterable of batches with ``'melspec'`` and ``'target'`` tensors.
        optimizer: Optimizer stepped every ``grad_accum_steps`` batches and
            after the last batch.
        criterion: Loss called as ``criterion(logits, targets)``.
        device: Device string or ``torch.device`` the batches are moved to.
        scheduler: Stepped here only when it is a ``OneCycleLR`` (once per
            optimizer step); other schedulers are left to the caller.
        use_amp: Enable autocast and gradient scaling.
        grad_accum_steps: Number of batches whose gradients are accumulated.

    Returns:
        ``(avg_loss, auc)`` where ``auc`` is computed by ``calculate_auc`` on a
        subsample of batches (raw logits vs. the un-mixed targets), or ``0.0``
        when nothing was collected.
    """
    model.train()
    scaler = GradScaler(enabled=use_amp)

    # autocast wants the bare device type ("cuda"/"cpu"), not "cuda:0" or a torch.device
    device_type = torch.device(device).type

    num_batches = len(loader)
    # Read the batch size from the loader instead of a module-level config object
    batch_size = getattr(loader, 'batch_size', None) or 1

    batch_count = 0
    running_loss = 0
    # Only a subset of batches is kept for the training metric to bound memory
    outputs_for_metrics = []
    targets_for_metrics = []
    metric_collection_interval = min(100, num_batches // 10 or 1)

    optimizer.zero_grad(set_to_none=True)
    pbar = tqdm(enumerate(loader), total=num_batches, desc="Training")

    for step, batch in pbar:
        inputs = batch['melspec'].to(device, non_blocking=True)
        targets = batch['target'].to(device, non_blocking=True)

        with autocast(enabled=use_amp, device_type=device_type):
            mixed = _mix_batch(model, inputs, targets) if model.training else None
            if mixed is not None:
                mixed_x, targets_a, targets_b, lam = mixed
                outputs = model(mixed_x)
                loss = lam * criterion(outputs, targets_a) + (1 - lam) * criterion(outputs, targets_b)
            else:
                outputs = model(inputs)
                loss = criterion(outputs, targets)

        # Normalize loss for gradient accumulation
        loss = loss / grad_accum_steps
        scaler.scale(loss).backward()

        batch_count += 1
        # Undo the accumulation scaling so the reported loss is per batch
        running_loss += loss.item() * grad_accum_steps

        if step % metric_collection_interval == 0:
            outputs_for_metrics.append(outputs.detach().cpu())
            targets_for_metrics.append(targets.detach().cpu())

        # Step optimizer after accumulating gradients (and flush on the last batch)
        if batch_count % grad_accum_steps == 0 or step == num_batches - 1:
            # Unscale before possible gradient clipping
            scaler.unscale_(optimizer)

            # Optional gradient clipping
            # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

            if scheduler and isinstance(scheduler, lr_scheduler.OneCycleLR):
                scheduler.step()

        # Update progress bar; guard against a zero elapsed time on the first steps
        elapsed = pbar.format_dict['elapsed']
        pbar.set_postfix({
            'it/s/bs': pbar.n / elapsed / batch_size if elapsed else 0.0,
            'train_loss': running_loss / (step + 1),
            'lr': optimizer.param_groups[0]['lr'],
        })

        # Free memory explicitly
        del inputs, outputs
        if step % 10 == 0:  # Periodically clear CUDA cache
            torch.cuda.empty_cache()

    # Calculate metrics on the subset of data we collected
    if outputs_for_metrics:
        all_outputs = torch.cat(outputs_for_metrics)
        all_targets = torch.cat(targets_for_metrics)
        auc = calculate_auc(all_targets.numpy(), all_outputs)
    else:
        auc = 0.0

    avg_loss = running_loss / num_batches

    # Clean up
    del outputs_for_metrics, targets_for_metrics
    torch.cuda.empty_cache()

    return avg_loss, auc

# Validation pass: average loss plus AUC over the *entire* validation set
def validate(model, loader, criterion, device, use_amp=True):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: Module called as ``model(inputs)`` and returning logits.
        loader: Iterable of batches with ``'melspec'`` and ``'target'`` tensors.
        criterion: Loss called as ``criterion(logits, targets)``.
        device: Device string or ``torch.device`` the batches are moved to.
        use_amp: Run the forward pass under autocast.

    Returns:
        ``(avg_loss, auc)``; ``auc`` comes from ``calculate_auc`` applied to
        the targets and sigmoid probabilities of every batch.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    model.eval()
    total_loss = 0.0

    # autocast wants the bare device type ("cuda"/"cpu"), not "cuda:0" or a torch.device
    device_type = torch.device(device).type

    # Per-batch results live on the host as NumPy arrays. Every batch is kept:
    # dropping earlier chunks would make the AUC cover only the tail of the set.
    all_probs = []
    all_targets = []

    with torch.no_grad():
        for batch_idx, batch in enumerate(tqdm(loader, desc="Validation")):
            inputs = batch['melspec'].to(device, non_blocking=True)
            targets = batch['target'].to(device, non_blocking=True)

            with autocast(enabled=use_amp, device_type=device_type):
                outputs = model(inputs)
                loss = criterion(outputs, targets)

            total_loss += loss.item()

            # Cast to float32 first: NumPy has no bfloat16, which autocast may produce
            all_probs.append(torch.sigmoid(outputs).float().cpu().numpy())
            all_targets.append(targets.cpu().numpy())

            # Clear memory periodically
            del inputs, outputs, targets
            if batch_idx % 10 == 0:
                torch.cuda.empty_cache()

    if not all_probs:
        raise ValueError("validate() received an empty loader; nothing to evaluate")

    probs_array = np.concatenate(all_probs, axis=0)
    targets_array = np.concatenate(all_targets, axis=0)

    # Calculate AUC
    auc = calculate_auc(targets_array, probs_array)
    avg_loss = total_loss / len(all_probs)

    # Final cleanup
    del all_probs, all_targets, probs_array, targets_array
    torch.cuda.empty_cache()

    return avg_loss, auc

In [7]:
def load_pseudolabels(soundscape_df, cfg, fold_seed=None):
    """Select confident pseudolabels from a soundscape prediction table.

    Each row's label scores are normalised to sum to 1; labels whose
    normalised score is at least ``cfg.pseudolabel_confidence_threshold`` are
    kept, ordered from most to least confident. Rows with a non-positive score
    sum or with no surviving label are dropped.

    Args:
        soundscape_df: Frame whose first column is ``row_id`` and whose
            remaining columns hold one numeric score per label.
        cfg: Needs ``seed``, ``pseudolabel_confidence_threshold``,
            ``stratified_pseudolabels`` and ``max_pseudolabels``.
        fold_seed: Seed for sampling/shuffling; falls back to ``cfg.seed``.

    Returns:
        ``(filtered_df, label_dict)`` where ``label_dict`` maps ``row_id`` to
        its kept labels.

        * ``cfg.stratified_pseudolabels`` True: up to
          ``max_pseudolabels // num_labels`` (at least 1) rows are drawn per
          label; both outputs contain only the drawn rows.
        * Otherwise: every confident row is returned, shuffled;
          ``cfg.max_pseudolabels`` is NOT applied on this path.

    The global ``random`` state is left untouched; sampling uses private
    generators seeded from the fold seed and a stable hash of the label.
    """
    import zlib

    random_seed = fold_seed if fold_seed is not None else cfg.seed
    threshold = cfg.pseudolabel_confidence_threshold

    # Step 1: keep, per row, the labels whose normalised score passes the threshold
    label_cols = list(soundscape_df.columns[1:])  # Skip row_id column
    row_ids = soundscape_df['row_id'].tolist()
    scores = soundscape_df[label_cols].to_numpy(dtype=float)

    high_confidence_dict = {}  # {sample_id: [labels], most confident first}
    for samplename, row_scores in zip(row_ids, scores):
        total = np.nansum(row_scores)
        if not total > 0:
            continue
        normalized = row_scores / total
        kept = np.flatnonzero(normalized >= threshold)
        if kept.size == 0:
            continue
        # Stable sort keeps column order among equally confident labels
        ranked = kept[np.argsort(-normalized[kept], kind='stable')]
        high_confidence_dict[samplename] = [label_cols[i] for i in ranked]
    print(f"Found {len(high_confidence_dict)} samples with confidence > {cfg.pseudolabel_confidence_threshold}")

    if not cfg.stratified_pseudolabels:
        filtered_df = soundscape_df[soundscape_df['row_id'].isin(list(high_confidence_dict))].reset_index(drop=True)
        filtered_df = filtered_df.sample(frac=1, random_state=random_seed).reset_index(drop=True)
        print(f"Loaded all pseudolabels that pass confidence threshold")
        return filtered_df, high_confidence_dict

    print("Stratified sampling of pseudolabels")
    # Step 2: map every label to the samples that carry it
    label_to_samples = {}
    for samplename, labels in high_confidence_dict.items():
        for label in labels:
            label_to_samples.setdefault(label, []).append(samplename)

    # Step 3: per-label quota (0 when no label survived, avoiding a division by zero)
    num_unique_labels = len(label_to_samples)
    target_per_label = max(1, int(cfg.max_pseudolabels / num_unique_labels)) if num_unique_labels else 0
    print(f"Found {num_unique_labels} unique bird species in pseudolabels")
    print(f"Target ~{target_per_label} samples per species to stay under {cfg.max_pseudolabels} total")

    # Shuffle labels to ensure different folds get different priority
    shuffled_labels = list(label_to_samples)
    random.Random(random_seed).shuffle(shuffled_labels)

    # Step 4: draw up to the quota for each label
    selected_samples = set()
    for label in shuffled_labels:
        samples = label_to_samples[label]
        # Built-in hash() of a str is salted per process, so derive the
        # per-label seed offset from a CRC to keep runs reproducible.
        label_offset = zlib.crc32(str(label).encode('utf-8')) % 10000
        random.Random(random_seed + label_offset).shuffle(samples)
        selected_samples.update(samples[:target_per_label])

    # Step 5: restrict both outputs to the drawn samples
    filtered_dict = {k: v for k, v in high_confidence_dict.items() if k in selected_samples}
    filtered_df = soundscape_df[soundscape_df['row_id'].isin(selected_samples)].reset_index(drop=True)

    # Report statistics
    print(f"Sampled {len(filtered_dict)} pseudolabeled samples")

    return filtered_df, filtered_dict

In [None]:
def run_training(df, cfg, soundscape_df=None):
    """Train one model per cross-validation fold and print the per-fold best validation AUC.

    For every fold the function builds train/validation datasets, optionally
    appends a pseudolabeled soundscape dataset to the training set, trains for
    ``cfg.epochs`` epochs and writes a checkpoint to ``cfg.OUTPUT_DIR`` each
    time the validation AUC improves.

    Args:
        df: Labeled training metadata. Must contain a ``primary_label`` column
            when ``cfg.n_fold > 1`` (it is used as the stratification target).
        cfg: Configuration object. It is mutated: ``cfg.num_classes`` is set
            from the taxonomy CSV, and ``cfg.T_max`` is set to ``cfg.epochs``
            when ``cfg.scheduler == 'CosineAnnealingLR'``.
        soundscape_df: Optional pseudolabel table handed to
            ``load_pseudolabels``. Only used when
            ``cfg.use_external_pseudolabels`` is truthy.

    Returns:
        None. Results are reported via ``print`` and checkpoints on disk.
    """
    taxonomy_df = pd.read_csv(cfg.taxonomy_csv)
    species_ids = taxonomy_df['primary_label'].tolist()
    cfg.num_classes = len(species_ids)

    if cfg.LOAD_DATA:
        # NOTE(review): allow_pickle=True executes arbitrary pickled code on load;
        # only point cfg.spectrogram_npy at trusted files.
        spectrograms = np.load(cfg.spectrogram_npy, allow_pickle=True).item()
        print(f"Loaded {len(spectrograms)} pre-computed mel spectrograms for labeled data")
    else:
        spectrograms = None
        print("Will generate spectrograms on-the-fly during training.")

    # Create cross-validation folds
    if cfg.n_fold > 1:
        skf = StratifiedKFold(n_splits=cfg.n_fold, shuffle=True, random_state=cfg.seed)
        folds = skf.split(df, df['primary_label'])
    else:
        # Single "fold": train and validation indices are the same full range,
        # so the reported validation AUC is measured on training data.
        all_idx = np.arange(len(df))
        folds = [(all_idx, all_idx)]

    # Loader settings shared by the train and validation DataLoaders.
    # persistent_workers / prefetch_factor are only meaningful with worker processes.
    use_workers = cfg.num_workers > 0
    loader_kwargs = dict(
        num_workers=cfg.num_workers,
        pin_memory=cfg.pin_memory,
        persistent_workers=cfg.persistent_workers if use_workers else False,
        prefetch_factor=cfg.prefetch_factor if use_workers else None,
        collate_fn=collate_fn,
    )

    best_scores = []

    for fold, (train_idx, val_idx) in enumerate(folds):
        print(f'\n{"="*30} Fold {fold} {"="*30}')

        train_df = df.iloc[train_idx].reset_index(drop=True)
        val_df = df.iloc[val_idx].reset_index(drop=True)

        print(f'Training set: {len(train_df)} samples')
        print(f'Validation set: {len(val_df)} samples')

        # Prepare datasets
        train_dataset = BirdCLEFDatasetFromNPY(train_df, cfg, spectrograms=spectrograms, mode='train')
        val_dataset = BirdCLEFDatasetFromNPY(val_df, cfg, spectrograms=spectrograms, mode='valid')

        # Bind the pseudolabel-related names on every fold so the cleanup `del`
        # at the end of the fold is valid even when pseudolabels are disabled
        # (previously that path raised UnboundLocalError).
        pseudolabeled_dataset = None
        filtered_soundscape_df = None
        high_confidence_dict = None
        soundscape_spectrograms = None

        if cfg.use_external_pseudolabels and soundscape_df is not None:
            print("\nInitializing soundscape dataset with external pseudolabels...")
            # Get stratified samples from soundscape_df; the seed varies per fold
            filtered_soundscape_df, high_confidence_dict = load_pseudolabels(
                soundscape_df, cfg, fold_seed=cfg.seed + fold * 1337
            )
            if cfg.LOAD_DATA:
                soundscape_spectrograms = {}
                for samplename in high_confidence_dict:
                    samplepath = os.path.join(cfg.train_soundscapes_spectrograms, samplename + ".npy")
                    if os.path.exists(samplepath):
                        soundscape_spectrograms[samplename] = np.load(samplepath, allow_pickle=True)
                    else:
                        print(f"Warning: Spectrogram for {samplename} not found in soundscapes directory")

            pseudolabeled_dataset = BirdCLEFDatasetFromNPY(
                filtered_soundscape_df,
                cfg,
                spectrograms=soundscape_spectrograms,
                mode='train',
                label_dict=high_confidence_dict,
                soundscape_loading=True
            )

        # Use original training set by default
        final_train_dataset = train_dataset

        # Combine with pseudolabels if available
        if pseudolabeled_dataset is not None and len(pseudolabeled_dataset) > 0:
            final_train_dataset = ConcatDataset([train_dataset, pseudolabeled_dataset])
            print(f"Training with combined dataset: {len(train_dataset)} original + {len(pseudolabeled_dataset)} pseudolabeled = {len(final_train_dataset)} total samples")

        # Prepare data loaders
        train_loader = DataLoader(
            final_train_dataset,
            batch_size=cfg.batch_size,
            shuffle=True,
            drop_last=True,
            **loader_kwargs
        )

        val_loader = DataLoader(
            val_dataset,
            batch_size=cfg.batch_size * 2,  # Can use larger batch size for validation
            shuffle=False,
            **loader_kwargs
        )

        # Train the model
        print(f"\n{'-'*20} Training Model {'-'*20}")
        model = BirdCLEFModel(cfg).to(cfg.device, non_blocking=True)
        model = compile_model(model, cfg)
        optimizer = get_optimizer(model, cfg)
        criterion = get_criterion(cfg)

        # Configure scheduler
        if cfg.scheduler == 'CosineAnnealingLR':
            cfg.T_max = cfg.epochs
        scheduler = get_scheduler(optimizer, cfg, len(train_loader))

        # OneCycleLR is handed to train_one_epoch; every other scheduler is
        # stepped once per epoch below.
        is_one_cycle = isinstance(scheduler, lr_scheduler.OneCycleLR)

        best_auc, best_epoch = 0, 0

        for epoch in range(cfg.epochs):
            print(f"\nEpoch {epoch+1}/{cfg.epochs}")

            train_loss, train_auc = train_one_epoch(
                model, train_loader, optimizer, criterion, cfg.device,
                scheduler if is_one_cycle else None,
                use_amp=cfg.use_amp,
                grad_accum_steps=cfg.gradient_accumulation_steps
            )

            val_loss, val_auc = validate(model, val_loader, criterion, cfg.device, use_amp=cfg.use_amp)

            if scheduler is not None and not is_one_cycle:
                if isinstance(scheduler, lr_scheduler.ReduceLROnPlateau):
                    # ReduceLROnPlateau needs the monitored metric
                    scheduler.step(val_loss)
                else:
                    scheduler.step()

            print(f"Train Loss: {train_loss:.4f}, Train AUC: {train_auc:.4f}")
            print(f"Val Loss: {val_loss:.4f}, Val AUC: {val_auc:.4f}")

            if val_auc > best_auc:
                best_auc = val_auc
                best_epoch = epoch + 1
                print(f"New best AUC: {best_auc:.4f} at epoch {best_epoch}")

                # Save best model (the same path is overwritten on each improvement)
                model_path = f"{cfg.OUTPUT_DIR}/model_{cfg.timestamp}_{cfg.model_name}_fold{fold}.pth"
                torch.save({
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict() if scheduler else None,
                    'epoch': epoch,
                    'val_auc': val_auc,
                    'train_auc': train_auc,
                    'cfg': cfg
                }, model_path)

        # Record best score for this fold
        best_scores.append(best_auc)
        print(f"\nTraining complete for fold {fold}. Best AUC: {best_auc:.4f} at epoch {best_epoch}")

        # Memory cleanup: drop per-fold references before the next fold allocates
        del model, optimizer, scheduler
        del train_loader, val_loader
        del train_dataset, val_dataset, final_train_dataset, pseudolabeled_dataset
        del filtered_soundscape_df, high_confidence_dict, soundscape_spectrograms

        clean_gpu_memory()

    print("\n" + "="*60)
    print("Cross-Validation Results:")
    for fold, score in enumerate(best_scores):
        print(f"Fold {fold}: {score:.4f}")
    print(f"Mean AUC: {np.mean(best_scores):.4f}")
    print("="*60)

In [9]:
if __name__ == "__main__":
    # Labeled training metadata is always required.
    print("\nLoading training data...")
    train_df = pd.read_csv(cfg.train_csv)

    # The soundscape pseudolabel table is read only when the config enables it;
    # otherwise run_training is given None for it.
    if cfg.use_external_pseudolabels:
        print(f"Will use external pseudolabels from: {cfg.external_pseudolabels}")
        soundscape_df = pd.read_csv(cfg.external_pseudolabels)
    else:
        soundscape_df = None

    print("\nStarting training...")
    print(f"LOAD_DATA is set to {cfg.LOAD_DATA}")

    # Run all folds, then persist the configuration that was used.
    run_training(train_df, cfg, soundscape_df=soundscape_df)
    print("\nTraining complete!")
    cfg.save_config()


Loading training data...
Will use external pseudolabels from: pseudolabels_better.csv

Starting training...
LOAD_DATA is set to True
Loaded 28579 pre-computed mel spectrograms for labeled data

Training set: 22863 samples
Validation set: 5716 samples

Initializing soundscape dataset with external pseudolabels...
Found 6637 samples with confidence > 0.1
Loaded all pseudolabels that pass confidence threshold
Training with combined dataset: 22863 original + 6637 pseudolabeled = 29500 total samples

-------------------- Training Model --------------------
Model compiled successfully with backend 'inductor', mode 'default'

Epoch 1/15


Training:   0%|          | 0/921 [00:00<?, ?it/s]

W0519 22:07:09.337000 9600 torch\_inductor\utils.py:1137] [0/0] Not enough SMs to use max_autotune_gemm mode


BackendCompilerFailed: backend='inductor' raised:
CalledProcessError: Command '['C:\\Users\\maxge\\AppData\\Local\\MYSYS2\\ucrt64\\bin\\gcc.EXE', 'C:\\Users\\maxge\\AppData\\Local\\Temp\\tmpp4mqv5lq\\main.c', '-O3', '-shared', '-fPIC', '-Wno-psabi', '-o', 'C:\\Users\\maxge\\AppData\\Local\\Temp\\tmpp4mqv5lq\\cuda_utils.cp311-win_amd64.pyd', '-lcuda', '-LC:\\Users\\maxge\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311\\site-packages\\triton\\backends\\nvidia\\lib', '-LC:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.8\\lib\\x64', '-LC:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\libs', '-LC:\\Program Files (x86)\\Windows Kits\\10\\Lib\\10.0.26100.0\\ucrt\\x64', '-LC:\\Program Files (x86)\\Windows Kits\\10\\Lib\\10.0.26100.0\\um\\x64', '-LC:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\libs', '-LC:\\Program Files (x86)\\Windows Kits\\10\\Lib\\10.0.26100.0\\ucrt\\x64', '-LC:\\Program Files (x86)\\Windows Kits\\10\\Lib\\10.0.26100.0\\um\\x64', '-LC:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\libs', '-LC:\\Program Files (x86)\\Windows Kits\\10\\Lib\\10.0.26100.0\\ucrt\\x64', '-LC:\\Program Files (x86)\\Windows Kits\\10\\Lib\\10.0.26100.0\\um\\x64', '-LC:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\libs', '-LC:\\Program Files (x86)\\Windows Kits\\10\\Lib\\10.0.26100.0\\ucrt\\x64', '-LC:\\Program Files (x86)\\Windows Kits\\10\\Lib\\10.0.26100.0\\um\\x64', '-IC:\\Users\\maxge\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311\\site-packages\\triton\\backends\\nvidia\\include', '-IC:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.8\\include', '-IC:\\Users\\maxge\\AppData\\Local\\Temp\\tmpp4mqv5lq', '-IC:\\Program 
Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\Include', '-IC:\\Program Files (x86)\\Windows Kits\\10\\Include\\10.0.26100.0\\shared', '-IC:\\Program Files (x86)\\Windows Kits\\10\\Include\\10.0.26100.0\\ucrt', '-IC:\\Program Files (x86)\\Windows Kits\\10\\Include\\10.0.26100.0\\um']' returned non-zero exit status 1.

Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information


You can suppress this exception and fall back to eager by setting:
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True
