## Inference Notebook for final submission
Combined data filtering, pseudo-labels and spectrogram augmentations in one final model. \
Using more powerful [SE-ResNeXt](https://paperswithcode.com/model/seresnext?variant=seresnext50-32x4d) model. \
For more informmation, see training notebook:
- [MLiP Group 25 BirdCLEF 2025 Training Notebook](https://www.kaggle.com/code/maxgewald/mlip-25-birdclef2025-training) 

This notebook is a modified version of:
- [Bird2025 | Single SED Model Inference [LB 0.857]](www.kaggle.com/code/maxgewald/mlip-birdclef-seresnext-inference/edit)

In [1]:
import os
import gc
import warnings
import logging
import time
import math
import cv2
from pathlib import Path
import joblib

import numpy as np
import pandas as pd
import librosa
import soundfile as sf
from soundfile import SoundFile 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import autocast, GradScaler
import timm
from tqdm.auto import tqdm
from glob import glob
import torchaudio
import random
import itertools
from typing import Union

import concurrent.futures

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.ERROR)

In [2]:
class CFG:
    
    seed = 42
    print_freq = 100
    num_workers = 4
    batch_size=4

    train_datadir = '/kaggle/input/birdclef-2025/train_audio'
    train_csv = '/kaggle/input/birdclef-2025/train.csv'
    test_soundscapes = '/kaggle/input/birdclef-2025/test_soundscapes'
    submission_csv = '/kaggle/input/birdclef-2025/sample_submission.csv'
    taxonomy_csv = '/kaggle/input/birdclef-2025/taxonomy.csv'
    model_paths = ['/kaggle/input/birdclef-2025-mlip-submission/pytorch/label-filtering-pseudo-labels-5-folds/1/',
                  '/kaggle/input/birdclef-2025-mlip-submission/pytorch/seresnext-full-training/1']
 
    pretrained = False
    in_channels = 1
    
    # Mel spectrogram parameters
    n_fft = 1024
    hop_length = 512
    n_mels = 128
    f_min = 50
    f_max = 14000
    target_shape = (256,256)

    projection_dim = 512
    projection_dropout = 0.0
    
    SR = 32000
    target_duration = 5
    infer_duration = 5
    train_duration = 5
    
    device = 'cpu'

cfg = CFG()

In [3]:
print(f"Using device: {cfg.device}")
print(f"Loading taxonomy data...")
taxonomy_df = pd.read_csv(cfg.taxonomy_csv)
species_ids = taxonomy_df['primary_label'].tolist()
num_classes = len(species_ids)
print(f"Number of classes: {num_classes}")

Using device: cpu
Loading taxonomy data...
Number of classes: 206


In [4]:
def set_seed(seed=42):
    """
    Set seed for reproducibility
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(cfg.seed)


In [5]:
class AttBlockV2(nn.Module):
    def __init__(self, in_features: int, out_features: int, activation="linear"):
        super().__init__()

        self.activation = activation
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )
        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )

        self.init_weights()

    def init_weights(self):
        init_layer(self.att)
        init_layer(self.cla)

    def forward(self, x):
        # x: (n_samples, n_in, n_time)
        norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla

    def nonlinear_transform(self, x):
        if self.activation == "linear":
            return x
        elif self.activation == "sigmoid":
            return torch.sigmoid(x)


def init_layer(layer):
    nn.init.xavier_uniform_(layer.weight)

    if hasattr(layer, "bias"):
        if layer.bias is not None:
            layer.bias.data.fill_(0.0)

def init_bn(bn):
    bn.bias.data.fill_(0.0)
    bn.weight.data.fill_(1.0)

In [6]:
class SeResNextModel(nn.Module):
    def __init__(self, cfg, name):
        super().__init__()
        
        taxonomy_df = pd.read_csv('/kaggle/input/birdclef-2025/taxonomy.csv')
        self.num_classes = len(taxonomy_df)

        self.bn0 = nn.BatchNorm2d(cfg.target_shape[0])
        
        self.backbone = timm.create_model(
            name,
            pretrained=False,
            in_chans=cfg.in_channels,
            drop_rate=0.0,
            drop_path_rate=0.0,
        )

        layers = list(self.backbone.children())[:-2]
        self.encoder = nn.Sequential(*layers)

        backbone_out = self.backbone.fc.in_features
        self.backbone.fc = nn.Identity()
        
        self.fc1 = nn.Linear(backbone_out, backbone_out, bias=True)
        self.att_block = AttBlockV2(backbone_out, self.num_classes, activation="sigmoid")

        # this is not used, but needed for compatibility
        self.melspec_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=cfg.SR,
            hop_length=cfg.hop_length,
            n_mels=cfg.n_mels,
            f_min=cfg.f_min,
            f_max=cfg.f_max,
            n_fft=cfg.n_fft,
            pad_mode="constant",
            norm="slaney",
            onesided=True,
            mel_scale="htk",
        )


    def extract_feature(self,x):
        x = x.permute((0, 1, 3, 2))
        frames_num = x.shape[2]
        
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)
        
        x = x.transpose(2, 3)
        # (batch_size, channels, freq, frames)
        x = self.encoder(x)
        
        # (batch_size, channels, frames)
        x = torch.mean(x, dim=2)
        
        # channel smoothing
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2
        
        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)
        return x, frames_num

    def forward(self, x):

        x, frames_num = self.extract_feature(x)
        
        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
        logit = torch.sum(norm_att * self.att_block.cla(x), dim=2)
        segmentwise_logit = self.att_block.cla(x).transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)

        return torch.logit(clipwise_output)

    def infer(self, x, tta_delta=2):

        x,_ = self.extract_feature(x)
        time_att = torch.tanh(self.att_block.att(x))
        feat_time = x.size(-1)
        start = (
            feat_time / 2 - feat_time * (cfg.infer_duration / cfg.train_duration) / 2
        )
        end = start + feat_time * (cfg.infer_duration / cfg.train_duration)
        start = int(start)
        end = int(end)
        pred = self.attention_infer(start,end,x,time_att)

        start_minus = max(0, start-tta_delta)
        end_minus=end-tta_delta
        pred_minus = self.attention_infer(start_minus,end_minus,x,time_att)

        start_plus = start+tta_delta
        end_plus=min(feat_time, end+tta_delta)
        pred_plus = self.attention_infer(start_plus,end_plus,x,time_att)

        pred = 0.5*pred + 0.25*pred_minus + 0.25*pred_plus
        return pred
        
    def attention_infer(self,start,end,x,time_att):
        feat = x[:, :, start:end]
        # att = torch.softmax(time_att[:, :, start:end], dim=-1)
        #             print(feat_time, start, end)
        #             print(att_a.sum(), att.sum(), time_att.shape)
        framewise_pred = torch.sigmoid(self.att_block.cla(feat))
        framewise_pred_max = framewise_pred.max(dim=2)[0]
        # clipwise_output = torch.sum(framewise_pred * att, dim=-1)
        #logits = torch.sum(
        #    self.att_block.cla(feat) * att,
        #    dim=-1,
        #)

        # return clipwise_output
        return framewise_pred_max

In [7]:
class EffnetModel(nn.Module):
    def __init__(self, cfg, name):
        super().__init__()
        self.cfg = cfg
        self.backbone = timm.create_model(
            name,
            pretrained=cfg.pretrained,
            in_chans=cfg.in_channels,
        )
        backbone_out = self.backbone.classifier.in_features
        self.backbone.classifier = nn.Identity()
        
        self.pooling = nn.AdaptiveAvgPool2d(1)
        self.feat_dim = backbone_out
        if cfg.projection_dim > 0:
            self.projection = nn.Sequential(
                nn.Linear(backbone_out, cfg.projection_dim),
                nn.BatchNorm1d(cfg.projection_dim),
                nn.ReLU(inplace=True),
                nn.Dropout(cfg.projection_dropout),
                nn.Linear(cfg.projection_dim, num_classes)
            )
            self.classifier = self.projection
        else:
            self.classifier = nn.Linear(backbone_out, num_classes)

        
    def forward(self, x, targets=None):

        features = self.backbone(x)
            
        if len(features.shape) == 4:
            features = self.pooling(features)
            features = features.view(features.size(0), -1)
        
        logits = self.classifier(features)
        return logits

    def infer(self, x):
        return self.forward(x)

In [8]:
def transform_to_spec(audio_data):

    if np.isnan(audio_data).any():
        mean_signal = np.nanmean(audio_data)
        audio_data = np.nan_to_num(audio_data, nan=mean_signal)

    mel_spec = librosa.feature.melspectrogram(
        y=audio_data,
        sr=cfg.SR,
        n_fft=cfg.n_fft,
        hop_length=cfg.hop_length,
        n_mels=cfg.n_mels,
        fmin=cfg.f_min,
        fmax=cfg.f_max,
        power=2.0
    )

    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    mel_min, mel_max = mel_spec_db.min(), mel_spec_db.max()
    mel_spec_norm = (mel_spec_db - mel_min) / (mel_max - mel_min + 1e-8)
    
    if mel_spec_norm.shape != cfg.target_shape:
        mel_spec_norm = cv2.resize(mel_spec_norm, cfg.target_shape, interpolation=cv2.INTER_LINEAR)
    
    return torch.tensor(mel_spec_norm, dtype=torch.float32).unsqueeze(0).unsqueeze(0)

In [9]:
def load_sample(path, cfg):
    audio, _ = sf.read(path, dtype="float32")
    audio_length = cfg.SR * cfg.target_duration
    step = audio_length
    segments = []
    
    # Pre-calculate all segments
    for i in range(audio_length, len(audio) + step, step):
        start = max(0, i - audio_length)
        end = start + audio_length
        if end <= len(audio):
            segments.append((start, end))
    
    # Pad audio once
    padded_audio = np.pad(audio, (len(audio), len(audio)), mode='wrap')
    
    audios = []
    train_length = int(cfg.SR * cfg.train_duration)
    target_length = int(cfg.SR * cfg.target_duration)
    pad_length = (train_length - target_length) // 2
    
    for i, (start, end) in enumerate(segments):
        center = len(audio) + (start + end) // 2
        segment_start = center - train_length // 2
        segment_end = segment_start + train_length
        
        y = padded_audio[segment_start:segment_end].astype(np.float32)
        
        # Apply padding only at boundaries
        if i == 0:
            y[:pad_length] = 0
        elif i == len(segments) - 1:
            y[-pad_length:] = 0
            
        audios.append(y)
    
    return audios

def sigmoid(x):
    s = 1 / (1 + np.exp(-x))
    return s

In [10]:
def find_model_files(cfg):
    """
    Find all .pth model files in the specified model directory
    """
    model_files = []
    
    model_dirs = [Path(path) for path in cfg.model_paths]

    for model_dir in model_dirs:
        for path in model_dir.glob('**/*.pth'):
            model_files.append(str(path))
    
    return model_files

def load_models(cfg, num_classes):
    """
    Load all found model files and prepare them for ensemble
    """
    models = []
    
    model_files = find_model_files(cfg)
    
    if not model_files:
        print(f"Warning: No model files found under {cfg.model_path}!")
        return models
    
    print(f"Found a total of {len(model_files)} model files.")
    
    for i, model_path in enumerate(model_files):
        print(f"Loading model: {model_path}")
        checkpoint = torch.load(model_path, map_location=torch.device(cfg.device), weights_only=False)
    
        if "efficientnet_b3" in model_path:
            model = EffnetModel(cfg, "efficientnet_b3")
        elif "resnext" in model_path:
            model = SeResNextModel(cfg, "seresnext26t_32x4d")
            
        state_dict = checkpoint['model_state_dict']
        if any(k.startswith('_orig_mod.') for k in state_dict.keys()):
            state_dict = {k.replace('_orig_mod.', ''): v for k, v in state_dict.items()}
            
        model.load_state_dict(state_dict)
        model = model.to(cfg.device)
        model.eval()
        model.zero_grad()
        model = model.half().float()
        models.append(model)
    torch.set_num_threads(4)

    
    return models

def predict_on_spectrogram(audio_paths, models, cfg):
    """Process multiple audio files in batches"""
    all_row_ids = []
    all_predictions = []
    
    for i in range(0, len(audio_paths), cfg.batch_size):
        batch_paths = audio_paths[i:i+cfg.batch_size]
        batch_specs = []
        batch_row_ids = []
        
        # Load and preprocess batch
        for audio_path in batch_paths:
            audio_path = str(audio_path)
            soundscape_id = Path(audio_path).stem
            audio_data = load_sample(audio_path, cfg)
            
            for segment_idx, audio_input in enumerate(audio_data):
                end_time_sec = (segment_idx + 1) * cfg.target_duration
                row_id = f"{soundscape_id}_{end_time_sec}"
                batch_row_ids.append(row_id)
                
                spec = transform_to_spec(audio_input)
                batch_specs.append(spec)
        
        # Process batch
        if batch_specs:
            batch_tensor = torch.cat(batch_specs, dim=0)
            
            with torch.no_grad():
                if len(models) == 1:
                    outputs = models[0].infer(batch_tensor)
                    batch_preds = outputs.cpu().numpy()
                else:
                    batch_ensemble_preds = []
                    for model in models:
                        outputs = model.infer(batch_tensor)
                        batch_ensemble_preds.append(outputs.cpu().numpy())
                    batch_preds = np.mean(batch_ensemble_preds, axis=0)
            
            all_row_ids.extend(batch_row_ids)
            all_predictions.extend(batch_preds)
    
    return all_row_ids, all_predictions

In [21]:
def run_inference(cfg, models, species_ids):
    """Run inference on all test soundscapes"""
    test_files = list(Path(cfg.test_soundscapes).glob('*.ogg'))
    if len(test_files) == 0:
        test_files = sorted(glob(str(Path('/kaggle/input/birdclef-2025/train_soundscapes') / '*.ogg')))[:50]
    
    print(f"Found {len(test_files)} test soundscapes")
    
    all_row_ids = []
    all_predictions = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=cfg.num_workers) as executor:
        results = list(
            executor.map(
                predict_on_spectrogram,
                test_files,
                itertools.repeat(models),
                itertools.repeat(cfg),
            )
        )

    for rids, preds in results:
        all_row_ids.extend(rids)
        all_predictions.extend(preds)
    
    return all_row_ids, all_predictions

def create_submission(row_ids, predictions, species_ids, cfg):
    """Create submission dataframe"""
    print("Creating submission dataframe...")

    submission_dict = {'row_id': row_ids}
    
    for i, species in enumerate(species_ids):
        submission_dict[species] = [pred[i] for pred in predictions]

    submission_df = pd.DataFrame(submission_dict)

    submission_df.set_index('row_id', inplace=True)

    sample_sub = pd.read_csv(cfg.submission_csv, index_col='row_id')

    missing_cols = set(sample_sub.columns) - set(submission_df.columns)
    if missing_cols:
        print(f"Warning: Missing {len(missing_cols)} species columns in submission")
        for col in missing_cols:
            submission_df[col] = 0.0

    submission_df = submission_df[sample_sub.columns]

    submission_df = submission_df.reset_index()
    
    return submission_df


def smooth_submission(submission_path):
        """
        Post-process the submission CSV by smoothing predictions to enforce temporal consistency.
        
        For each soundscape (grouped by the file name part of 'row_id'), each row's predictions
        are averaged with those of its neighbors using defined weights.
        
        :param submission_path: Path to the submission CSV file.
        """
        print("Smoothing submission predictions...")
        sub = pd.read_csv(submission_path)
        cols = sub.columns[1:]
        # Extract group names by splitting row_id on the last underscore
        groups = sub['row_id'].str.rsplit('_', n=1).str[0].values
        unique_groups = np.unique(groups)
        
        for group in unique_groups:
            # Get indices for the current group
            idx = np.where(groups == group)[0]
            sub_group = sub.iloc[idx].copy()
            predictions = sub_group[cols].values
            new_predictions = predictions.copy()
            
            if predictions.shape[0] > 1:
                # Smooth the predictions using neighboring segments
                new_predictions[0] = (predictions[0] * 0.8) + (predictions[1] * 0.2)
                new_predictions[-1] = (predictions[-1] * 0.8) + (predictions[-2] * 0.2)
                for i in range(1, predictions.shape[0]-1):
                    new_predictions[i] = (predictions[i-1] * 0.2) + (predictions[i] * 0.6) + (predictions[i+1] * 0.2)
            # Replace the smoothed values in the submission dataframe
            sub.iloc[idx, 1:] = new_predictions
        
        sub.to_csv(submission_path, index=False)
        print(f"Smoothed submission saved to {submission_path}")

In [15]:
def main():
    print("Starting BirdCLEF-2025 inference...")

    models = load_models(cfg, num_classes)
    
    if not models:
        print("No models found! Please check model paths.")
        return
    
    print(f"Model usage: {'Single model' if len(models) == 1 else f'Ensemble of {len(models)} models'}")
    start_time = time.time()
    row_ids, predictions = run_inference(cfg, models, species_ids)
    end_time = time.time()
    submission_df = create_submission(row_ids, predictions, species_ids, cfg)

    submission_path = 'submission.csv'
    submission_df.to_csv(submission_path, index=False)
    print(f"Submission saved to {submission_path}")

    smooth_submission(submission_path)
    

    print(f"Inference completed in {(end_time - start_time)/60:.2f} minutes")

In [22]:
if __name__ == "__main__":
    main()

Starting BirdCLEF-2025 inference...
Found a total of 10 model files.
Loading model: /kaggle/input/birdclef-2025-mlip-submission/pytorch/label-filtering-pseudo-labels-5-folds/1/model_20250605_192722_efficientnet_b3_fold1.pth
Loading model: /kaggle/input/birdclef-2025-mlip-submission/pytorch/label-filtering-pseudo-labels-5-folds/1/model_20250605_192722_efficientnet_b3_fold0.pth
Loading model: /kaggle/input/birdclef-2025-mlip-submission/pytorch/label-filtering-pseudo-labels-5-folds/1/model_20250605_192722_efficientnet_b3_fold2.pth
Loading model: /kaggle/input/birdclef-2025-mlip-submission/pytorch/label-filtering-pseudo-labels-5-folds/1/model_20250605_192722_efficientnet_b3_fold4.pth
Loading model: /kaggle/input/birdclef-2025-mlip-submission/pytorch/label-filtering-pseudo-labels-5-folds/1/model_20250605_192722_efficientnet_b3_fold3.pth
Loading model: /kaggle/input/birdclef-2025-mlip-submission/pytorch/seresnext-full-training/1/model_20250607_002141_seresnext26t_32x4d_fold4.pth
Loading mode

LibsndfileError: Error opening '/': Format not recognised.