# BirdCLEF 2024 [Train]

This is the baseline of EfficientNetB0 with PyTorch. I strive to simplify the process to get started faster. Therefore, I only use data from BirdCLEF 2024 (no extended data from BirdCLEF'23, 22, 21, and other sources) and unlabeled soundscapes are also not used. Besides, PyTorch-Lightning is employed to organize the training.

Hope this notebook is useful for you!!

* [Pre-Processing](https://www.kaggle.com/code/zijiangyang1116/birdclef-24-speed-up-audio-to-spec-via-cupy)
* [The inference Notebook](https://www.kaggle.com/code/zijiangyang1116/birdclef-24-efficientnetb0-pytorch-inference)

## Features
- Implement with PyTorch and PyTorch-Lightning
- Speed up audio-to-spec. via CuPy
- Use EfficientNetB0 from torchvision

## Table of Contents

- [Import Packages](#Import-Packages)
- [Configuration](#Configuration)
- [Dataset & Dataloader](#Dataset-&-Dataloader)
- [Model](#Model)
- [Functions of Training Loop](#Functions-of-Training-Loop)
- [Training](#Training)

## Update

- V3: fix bug - After validation, self.validation_step_outputs should be cleared.
- V4: use XYMasking

# Import packages

Import all required packages.

In [None]:
#!pip install audiomentations

In [7]:
import os
import gc
import sys
import cv2
import math
import numpy as np
import pandas as pd
from glob import glob
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import librosa
from scipy import signal as sci_signal

import torch
from torch import nn
from torchvision.models import efficientnet

import albumentations as albu

import pytorch_lightning as pl
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from pytorch_lightning.callbacks import ModelCheckpoint, TQDMProgressBar

from datetime import date

from datetime import datetime  

# # import score function of BirdCLEF
# sys.path.append('/kaggle/input/birdclef-roc-auc')
# sys.path.append('/kaggle/usr/lib/kaggle_metric_utilities')
# from metric import score
# print('Imports done')

RuntimeError: operator torchvision::nms does not exist

# Configuration

Hyper-paramters 

# Note batch diminué pour résoudre out of memory cuda

In [2]:
class config:
    
    # == global config ==
    SEED = 2024  # random seed
    DEVICE = 'cuda'  # device to be used
    MIXED_PRECISION = False  # whether to use mixed-16 precision
    OUTPUT_DIR = '/kaggle/working/'  # output folder
    
    # == data config ==
    DATA_ROOT = '/home/christophe/birdclef'  # root folder
    PREPROCESSED_DATA_ROOT = '/kaggle/input/birdclef24-spectrograms-via-cupy'
    LOAD_DATA = True  # whether to load data from pre-processed dataset
    FS = 32000  # sample rate
    N_FFT = 1095  # n FFT of Spec.
    WIN_SIZE = 412  # WIN_SIZE of Spec.
    WIN_LAP = 100  # overlap of Spec.
    MIN_FREQ = 40  # min frequency
    MAX_FREQ = 15000  # max frequency
    
    # == model config ==
    MODEL_TYPE = 'efficientnet_b0'  # model type
    
    # == dataset config ==
    BATCH_SIZE = 4  # batch size of each step
    N_WORKERS = 4  # number of workers
    
    # == AUG ==
    USE_XYMASKING = True  # whether use XYMasking
    
    # == training config ==
    FOLDS = 10  # n fold
    EPOCHS = 2  # max epochs initialy 15
    LR = 1e-3  # learning rate
    WEIGHT_DECAY = 1e-5  # weight decay of optimizer
    
    # == other config ==
    VISUALIZE = True  # whether to visualize data and batch



def print_date_with_message(message):
    now = datetime.now()
    formatted_date_time = now.strftime("%Y-%m-%d %H:%M")
    print(f"{formatted_date_time} : {message}")

print('fix seed:')
print(config.SEED)
pl.seed_everything(config.SEED, workers=True)

print_date_with_message("Config imported")

Seed set to 2024


fix seed:
2024
2025-07-04 09:18 : Config imported


In [3]:
# labels
label_list = sorted(os.listdir(os.path.join(config.DATA_ROOT, 'train_audio')))
label_id_list = list(range(len(label_list)))
label2id = dict(zip(label_list, label_id_list))
id2label = dict(zip(label_id_list, label_list))

In [11]:
label2id

{'asbfly': 0,
 'ashdro1': 1,
 'ashpri1': 2,
 'ashwoo2': 3,
 'asikoe2': 4,
 'asiope1': 5,
 'aspfly1': 6,
 'aspswi1': 7,
 'barfly1': 8,
 'barswa': 9,
 'bcnher': 10,
 'bkcbul1': 11,
 'bkrfla1': 12,
 'bkskit1': 13,
 'bkwsti': 14,
 'bladro1': 15,
 'blaeag1': 16,
 'blakit1': 17,
 'blhori1': 18,
 'blnmon1': 19,
 'blrwar1': 20,
 'bncwoo3': 21,
 'brakit1': 22,
 'brasta1': 23,
 'brcful1': 24,
 'brfowl1': 25,
 'brnhao1': 26,
 'brnshr': 27,
 'brodro1': 28,
 'brwjac1': 29,
 'brwowl1': 30,
 'btbeat1': 31,
 'bwfshr1': 32,
 'categr': 33,
 'chbeat1': 34,
 'cohcuc1': 35,
 'comfla1': 36,
 'comgre': 37,
 'comior1': 38,
 'comkin1': 39,
 'commoo3': 40,
 'commyn': 41,
 'compea': 42,
 'comros': 43,
 'comsan': 44,
 'comtai1': 45,
 'copbar1': 46,
 'crbsun2': 47,
 'cregos1': 48,
 'crfbar1': 49,
 'crseag1': 50,
 'dafbab1': 51,
 'darter2': 52,
 'eaywag1': 53,
 'emedov2': 54,
 'eucdov': 55,
 'eurbla2': 56,
 'eurcoo': 57,
 'forwag1': 58,
 'gargan': 59,
 'gloibi': 60,
 'goflea1': 61,
 'graher1': 62,
 'grbeat1': 63,
 

In [12]:
id2label

{0: 'asbfly',
 1: 'ashdro1',
 2: 'ashpri1',
 3: 'ashwoo2',
 4: 'asikoe2',
 5: 'asiope1',
 6: 'aspfly1',
 7: 'aspswi1',
 8: 'barfly1',
 9: 'barswa',
 10: 'bcnher',
 11: 'bkcbul1',
 12: 'bkrfla1',
 13: 'bkskit1',
 14: 'bkwsti',
 15: 'bladro1',
 16: 'blaeag1',
 17: 'blakit1',
 18: 'blhori1',
 19: 'blnmon1',
 20: 'blrwar1',
 21: 'bncwoo3',
 22: 'brakit1',
 23: 'brasta1',
 24: 'brcful1',
 25: 'brfowl1',
 26: 'brnhao1',
 27: 'brnshr',
 28: 'brodro1',
 29: 'brwjac1',
 30: 'brwowl1',
 31: 'btbeat1',
 32: 'bwfshr1',
 33: 'categr',
 34: 'chbeat1',
 35: 'cohcuc1',
 36: 'comfla1',
 37: 'comgre',
 38: 'comior1',
 39: 'comkin1',
 40: 'commoo3',
 41: 'commyn',
 42: 'compea',
 43: 'comros',
 44: 'comsan',
 45: 'comtai1',
 46: 'copbar1',
 47: 'crbsun2',
 48: 'cregos1',
 49: 'crfbar1',
 50: 'crseag1',
 51: 'dafbab1',
 52: 'darter2',
 53: 'eaywag1',
 54: 'emedov2',
 55: 'eucdov',
 56: 'eurbla2',
 57: 'eurcoo',
 58: 'forwag1',
 59: 'gargan',
 60: 'gloibi',
 61: 'goflea1',
 62: 'graher1',
 63: 'grbeat1',
 

In [3]:
#os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
#print('for now we will use only one gpu')
import os
import torch
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
print('Using', torch.cuda.device_count(), 'GPU(s)')
print(torch.__version__)

# Check if CUDA is available
print(torch.cuda.is_available())

# Check CUDA version
print(torch.version.cuda)

# Check the location of PyTorch installation
print(torch.__file__)

cuda:0
Using 1 GPU(s)
2.5.1
True
11.8
/home/christophe/miniforge3/envs/complex_net/lib/python3.9/site-packages/torch/__init__.py


# Dataset & Dataloader

1. [Load Metadata](#Load-Metadata): Load metadata from dataset
2. [Pre-Processing](#Pre-Processing): The function to convert audio to spectrograms.
3. [Dataset](#Dataset): Yield samples
4. [Augmentation](#Augmentation): Data augmentation
5. [Verify](#Verify): Verify the dataset and dataloader work well

## Load Metadata

In [38]:
metadata_df = pd.read_csv(f'{config.DATA_ROOT}/train_metadata.csv')
metadata_df.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename
0,asbfly,[],['call'],39.2297,118.1987,Muscicapa dauurica,Asian Brown Flycatcher,Matt Slaymaker,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/134896,asbfly/XC134896.ogg
1,asbfly,[],['song'],51.403,104.6401,Muscicapa dauurica,Asian Brown Flycatcher,Magnus Hellström,Creative Commons Attribution-NonCommercial-Sha...,2.5,https://www.xeno-canto.org/164848,asbfly/XC164848.ogg
2,asbfly,[],['song'],36.3319,127.3555,Muscicapa dauurica,Asian Brown Flycatcher,Stuart Fisher,Creative Commons Attribution-NonCommercial-Sha...,2.5,https://www.xeno-canto.org/175797,asbfly/XC175797.ogg
3,asbfly,[],['call'],21.1697,70.6005,Muscicapa dauurica,Asian Brown Flycatcher,vir joshi,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/207738,asbfly/XC207738.ogg
4,asbfly,[],['call'],15.5442,73.7733,Muscicapa dauurica,Asian Brown Flycatcher,Albert Lastukhin & Sergei Karpeev,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/209218,asbfly/XC209218.ogg


In [10]:
for col in metadata_df.columns:
    print(col)

primary_label
secondary_labels
type
latitude
longitude
scientific_name
common_name
author
license
rating
url
filename


column target assigns a class to

Ici nous créons notre dataset 

In [39]:
train_df = metadata_df[['primary_label', 'rating', 'filename']].copy()

# create target
train_df['target'] = train_df.primary_label.map(label2id) # assigns an int to bird species
# create filepath
train_df['filepath'] = config.DATA_ROOT + '/train_audio/' + train_df.filename
# create new sample name
train_df['samplename'] = train_df.filename.map(lambda x: x.split('/')[0] + '-' + x.split('/')[-1].split('.')[0])

print(f'found {len(train_df)} samples')

train_df.sample(20)

found 24459 samples


Unnamed: 0,primary_label,rating,filename,target,filepath,samplename
4253,brodro1,4.0,brodro1/XC843971.ogg,28,/kaggle/input/birdclef-2024/train_audio/brodro...,brodro1-XC843971
9055,eucdov,4.5,eucdov/XC128849.ogg,55,/kaggle/input/birdclef-2024/train_audio/eucdov...,eucdov-XC128849
5924,comkin1,4.0,comkin1/XC742886.ogg,39,/kaggle/input/birdclef-2024/train_audio/comkin...,comkin1-XC742886
15966,labcro1,4.0,labcro1/XC143395.ogg,100,/kaggle/input/birdclef-2024/train_audio/labcro...,labcro1-XC143395
20837,rorpar,1.0,rorpar/XC505788.ogg,143,/kaggle/input/birdclef-2024/train_audio/rorpar...,rorpar-XC505788
5225,comgre,4.0,comgre/XC820709.ogg,37,/kaggle/input/birdclef-2024/train_audio/comgre...,comgre-XC820709
8578,eaywag1,4.0,eaywag1/XC338351.ogg,53,/kaggle/input/birdclef-2024/train_audio/eaywag...,eaywag1-XC338351
17159,litegr,3.0,litegr/XC589946.ogg,106,/kaggle/input/birdclef-2024/train_audio/litegr...,litegr-XC589946
1583,bcnher,5.0,bcnher/XC742880.ogg,10,/kaggle/input/birdclef-2024/train_audio/bcnher...,bcnher-XC742880
8082,comtai1,5.0,comtai1/XC788137.ogg,45,/kaggle/input/birdclef-2024/train_audio/comtai...,comtai1-XC788137


## Pre-Processing

To speed up audio-to-spectrogram, we employ CuPy. `CuPy is a NumPy/SciPy-compatible array library for GPU-accelerated computing with Python,` which can significant improve the efficiency of conversion. For more detailed analysis, you can refer to this [notebook](https://www.kaggle.com/code/zijiangyang1116/birdclef-24-speed-up-audio-to-spec-via-cupy).

Please note, in this notebook, we only use the **center 5 sec** of each audio. By default (`Load_DATA=True`), pre-processed data will be loaded from the [dataset](https://www.kaggle.com/datasets/zijiangyang1116/birdclef24-spectrograms-via-cupy). If `Load_DATA` is set to `False`, spectrograms will be create with `CuPy` (about 30 minites).

### Nous n'utilserons pas ces fonctions puisque notre réseau s'en charge

In [40]:
def oog2spec_via_scipy(audio_data):
    # handles NaNs
    mean_signal = np.nanmean(audio_data)
    audio_data = np.nan_to_num(audio_data, nan=mean_signal) if np.isnan(audio_data).mean() < 1 else np.zeros_like(audio_data)
    
    # to spec.
    frequencies, times, spec_data = sci_signal.spectrogram(
        audio_data, 
        fs=config.FS, 
        nfft=config.N_FFT, 
        nperseg=config.WIN_SIZE, 
        noverlap=config.WIN_LAP, 
        window='hann'
    )
    
    # Filter frequency range
    valid_freq = (frequencies >= config.MIN_FREQ) & (frequencies <= config.MAX_FREQ)
    spec_data = spec_data[valid_freq, :]
    
    # Log
    spec_data = np.log10(spec_data + 1e-20)
    
    # min/max normalize
    spec_data = spec_data - spec_data.min()
    spec_data = spec_data / spec_data.max()
    
    return spec_data

In [41]:
def oog2spec_via_cupy(audio_data):
    
    import cupy as cp
    from cupyx.scipy import signal as cupy_signal
    
    audio_data = cp.array(audio_data)
    
    # handles NaNs
    mean_signal = cp.nanmean(audio_data)
    audio_data = cp.nan_to_num(audio_data, nan=mean_signal) if cp.isnan(audio_data).mean() < 1 else cp.zeros_like(audio_data)
    
    # to spec.
    frequencies, times, spec_data = cupy_signal.spectrogram(
        audio_data, 
        fs=config.FS, 
        nfft=config.N_FFT, 
        nperseg=config.WIN_SIZE, 
        noverlap=config.WIN_LAP, 
        window='hann'
    )
    
    # Filter frequency range
    valid_freq = (frequencies >= config.MIN_FREQ) & (frequencies <= config.MAX_FREQ)
    spec_data = spec_data[valid_freq, :]
    
    # Log
    spec_data = cp.log10(spec_data + 1e-20)
    
    # min/max normalize
    spec_data = spec_data - spec_data.min()
    spec_data = spec_data / spec_data.max()
    
    return spec_data.get()

In [25]:
import sys
sys.exit('This code must not be executed')
if config.LOAD_DATA:
    print('load from file')
    all_bird_data = np.load(f'{config.PREPROCESSED_DATA_ROOT}/spec_center_5sec_256_256.npy', allow_pickle=True).item()
else:
    all_bird_data = dict()
    for i, row_metadata in tqdm(train_df.iterrows()):

        # load ogg
        audio_data, _ = librosa.load(row_metadata.filepath, sr=config.FS)

        # crop
        n_copy = math.ceil(5 * config.FS / len(audio_data))
        if n_copy > 1: audio_data = np.concatenate([audio_data]*n_copy)

        start_idx = int(len(audio_data) / 2 - 2.5 * config.FS)
        end_idx = int(start_idx + 5.0 * config.FS)
        input_audio = audio_data[start_idx:end_idx]

        # ogg to spec.
        input_spec = oog2spec_via_cupy(input_audio)
        
        input_spec = cv2.resize(input_spec, (256, 256), interpolation=cv2.INTER_AREA)

        all_bird_data[row_metadata.samplename] = input_spec.astype(np.float32)

    # save to file
    np.save(os.path.join(config.OUTPUT_DIR, f'spec_center_5sec_256_256.npy'), all_bird_data)

SystemExit: This code must not be executed

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


## Dataset

To yield samples.

nous pourrions ajouter

In [42]:
#from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift

# not used fo now
def augment_audio(audio):
    # Random gain
    gain = np.random.uniform(0.8, 1.2)
    audio *= gain

    # Add random noise
    noise = np.random.normal(0, 0.005, audio.shape)
    audio += noise

    return audio

class BirdDataset(torch.utils.data.Dataset):
    
    def __init__(
        self,
        metadata,
        augmentation=None,
        mode='train'
    ):
        super().__init__()
        self.metadata = metadata
        #self.augmentation = augmentation
        self.mode = mode
       # self.augmentation = Compose([
            #AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
          #  TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
          #  PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
            #Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),
        #])
    
    def __len__(self):
        return len(self.metadata)
    
    def __getitem__(self, index):
        
        row_metadata = self.metadata.iloc[index]

        audio_data, _ = librosa.load(row_metadata.filepath, sr=config.FS)

        n_copy = math.ceil(5 * config.FS / len(audio_data))
        if n_copy > 1: audio_data = np.concatenate([audio_data]*n_copy)

        start_idx = int(len(audio_data) / 2 - 2.5 * config.FS)
        end_idx = int(start_idx + 5.0 * config.FS)
        input_audio = audio_data[start_idx:end_idx]
        # for now no data augmentatio in train mode
        #if self.augmentation and self.mode=='aug':
           # audio_data = self.augmentation(samples=input_audio, sample_rate=config.FS)

        #input_model = torch.from_numpy(input_audio).unsqueeze(0).unsqueeze(0) 
        input_model = torch.from_numpy(input_audio).unsqueeze(0)
    
        bird_name=row_metadata.primary_label
        # target
        target = row_metadata.target
        
        return input_model, torch.tensor(target, dtype=torch.long),bird_name

print_date_with_message('dataset compiled')

2025-07-03 - dataset compiled


## Augmentation

In [None]:
def get_transforms(_type):
    
    if _type == 'train':
        return albu.Compose([
            albu.HorizontalFlip(0.5),
            albu.XYMasking(
                p=0.3,
                num_masks_x=(1, 3),
                num_masks_y=(1, 3),
                mask_x_length=(1, 10),
                mask_y_length=(1, 20),
            ) if config.USE_XYMASKING else albu.NoOp()
        ])
    elif _type == 'valid':
        return albu.Compose([])

## Ici nous vérifions notre dataset:

In [19]:
import random
dummy_dataset = BirdDataset(train_df,mode='train',augmentation=None)
len_dataset=len(dummy_dataset)
print('number of bird records: ',len_dataset)
num_samples=10

indices = random.sample(range(0, len_dataset), num_samples)

for idx in indices:

    test_input, test_target,bird_name = dummy_dataset[idx]
    print(test_input.detach().numpy().shape)
    print(test_target)
    print(bird_name)

del dummy_dataset
gc.collect()

number of bird records:  24459
(1, 160000)
tensor(90)
insbab1
(1, 160000)
tensor(39)
comkin1
(1, 160000)
tensor(177)
woosan
(1, 160000)
tensor(129)
plapri1
(1, 160000)
tensor(57)
eurcoo
(1, 160000)
tensor(41)
commyn
(1, 160000)
tensor(177)
woosan
(1, 160000)
tensor(76)
grywag
(1, 160000)
tensor(177)
woosan
(1, 160000)
tensor(53)
eaywag1


64665

récupération des valeurs min max de l'audio obsolete?

In [None]:
import random
import sys
sys.exit('lenghty code will exit')
dummy_dataset = BirdDataset(train_df,mode='train')
len_dataset=len(dummy_dataset)
print('number of bird records: ',len_dataset)
#num_samples=10

#indices = random.sample(range(0, len_dataset), num_samples)
max_l=0
min_l=1e30
for idx in range(len(dummy_dataset)):

    test_input, test_target,bird_name = dummy_dataset[idx]
    #print(test_input.detach().numpy().shape)
    lenght_audio=test_input.detach().numpy().shape[2]
    #print(lenght_audio)
    if lenght_audio>max_l:
        max_l=lenght_audio
    if lenght_audio<min_l:
        min_l=lenght_audio
    if idx%300==0:
        print(idx)

del dummy_dataset
gc.collect()
assert max_l!=0
assert min_l!=1e30
print('max duration: ',max_l)
print('min duration: ',min_l)

# Model

## Network

In [43]:
class LearnableSpectrogram(nn.Module):
    def __init__(self, in_channels=1, n_filters=256, kernel_size=256, stride=128, apply_log=True):
        super(LearnableSpectrogram, self).__init__()
        self.apply_log = apply_log
        
        self.conv = nn.Conv1d(in_channels, n_filters, kernel_size, stride=stride)
        self.bn = nn.BatchNorm1d(n_filters)

    def forward(self, x):
        """
        x: Tensor of shape (batch, time)
        Returns: Tensor of shape (batch, 256, 256)
        """
        if x.ndim == 2:
            x = x.unsqueeze(1)  # (batch, 1, time)

        x = self.conv(x)       # (batch, 256, time_frames)
        x = torch.abs(x)
        x = self.bn(x)

        if self.apply_log:
            x = torch.log1p(x)

        return x


class EffNet(nn.Module):
    
    def __init__(self, model_type, n_classes, pretrained=True):
        super().__init__()
        
        if model_type == 'efficientnet_b0':
            if pretrained: weights = efficientnet.EfficientNet_B0_Weights.DEFAULT
            else: weights = None
            self.base_model = efficientnet.efficientnet_b0(weights=weights)
        elif model_type == 'efficientnet_b1':
            if pretrained: weights = efficientnet.EfficientNet_B1_Weights.DEFAULT
            else: weights = None
            self.base_model = efficientnet.efficientnet_b1(weights=weights)
        elif model_type == 'efficientnet_b2':
            if pretrained: weights = efficientnet.EfficientNet_B2_Weights.DEFAULT
            else: weights = None
            self.base_model = efficientnet.efficientnet_b2(weights=weights)
        elif model_type == 'efficientnet_b3':
            if pretrained: weights = efficientnet.EfficientNet_B3_Weights.DEFAULT
            else: weights = None
            self.base_model = efficientnet.efficientnet_b3(weights=weights)
        else:
            raise ValueError('model type not supported')
        
        self.base_model.classifier[1] = nn.Linear(self.base_model.classifier[1].in_features, n_classes, dtype=torch.float32)
    
    def forward(self, x):
        x = x.unsqueeze(-1)
        x = torch.cat([x, x, x], dim=3).permute(0, 3, 1, 2)
        return self.base_model(x)

print_date_with_message('models compiled')

2025-07-03 - models compiled


signal processing train

## Model by PyTorch-Lightning

In [44]:
class BirdModel(pl.LightningModule):
    
    def __init__(self):
        super().__init__()

        self.learnable_process= LearnableSpectrogram()
        
        # == backbone ==
        self.backbone = EffNet(config.MODEL_TYPE, n_classes=len(label_list))
        
        # == loss function ==
        self.loss_fn = nn.CrossEntropyLoss()
        
        # == record ==
        self.validation_step_outputs = []
        
    def forward(self, sounds):
        images=self.learnable_process(sounds)
        return self.backbone(images)
    
    def configure_optimizers(self):
        
        # == define optimizer ==
        model_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=config.LR,
            weight_decay=config.WEIGHT_DECAY
        )
        
        # == define learning rate scheduler ==
        lr_scheduler = CosineAnnealingWarmRestarts(
            model_optimizer,
            T_0=config.EPOCHS,
            T_mult=1,
            eta_min=1e-6,
            last_epoch=-1
        )
        
        return {
            'optimizer': model_optimizer,
            'lr_scheduler': {
                'scheduler': lr_scheduler,
                'interval': 'epoch',
                'monitor': 'val_loss',
                'frequency': 1
            }
        }
    
    def training_step(self, batch, batch_idx):
        
        # == obtain input and target ==
        image, target,_ = batch
        image = image.to(self.device)
        target = target.to(self.device)
        
        # == pred ==
        y_pred = self(image)
        
        # == compute loss ==
        train_loss = self.loss_fn(y_pred, target)
        
        # == record ==
        self.log('train_loss', train_loss, True)
        
        return train_loss
    
    def validation_step(self, batch, batch_idx):
        
        # == obtain input and target ==
        image, target,_ = batch
        image = image.to(self.device)
        target = target.to(self.device)
        
        # == pred ==
        with torch.no_grad():
            y_pred = self(image)
            
        self.validation_step_outputs.append({"logits": y_pred, "targets": target})
        
    def train_dataloader(self):
        return self._train_dataloader

    def validation_dataloader(self):
        return self._validation_dataloader
    
    def on_validation_epoch_end(self):
        
        # = merge batch data =
        outputs = self.validation_step_outputs
        
        output_val = nn.Softmax(dim=1)(torch.cat([x['logits'] for x in outputs], dim=0)).cpu().detach()
        target_val = torch.cat([x['targets'] for x in outputs], dim=0).cpu().detach()
        
        # = compute validation loss =
        val_loss = self.loss_fn(output_val, target_val)
        
        # target to one-hot
        target_val = torch.nn.functional.one_hot(target_val, len(label_list))
        
        # = val with ROC AUC =
        gt_df = pd.DataFrame(target_val.numpy().astype(np.float32), columns=label_list)
        pred_df = pd.DataFrame(output_val.numpy().astype(np.float32), columns=label_list)
        
        gt_df['id'] = [f'id_{i}' for i in range(len(gt_df))]
        pred_df['id'] = [f'id_{i}' for i in range(len(pred_df))]
        
        val_score = score(gt_df, pred_df, row_id_column_name='id')
        
        self.log("val_score", val_score, True)
        
        # clear validation outputs
        self.validation_step_outputs = list()
        
        return {'val_loss': val_loss, 'val_score': val_score}

print_date_with_message('class birdmodel compiled')

2025-07-03 - class birdmodel compiled


In [28]:
dummy_model = EffNet(config.MODEL_TYPE, n_classes=len(label_list))
learn_signal=LearnableSpectrogram()

bird_model=BirdModel()


dummy_input = torch.randn(1, 256, 256)
print(dummy_model(dummy_input).shape)


import random
dummy_dataset = BirdDataset(train_df,mode='train',augmentation=None)
len_dataset=len(dummy_dataset)
print('number of bird records: ',len_dataset)
num_samples=3

indices = random.sample(range(0, len_dataset), num_samples)

for idx in indices:

    test_input, test_target,bird_name = dummy_dataset[idx]
    image=learn_signal(test_input)
    print('learn signal layer shape: ',image.shape)
    pred=dummy_model(image)
    print('side by side output',pred.shape)
    pred_2=bird_model(test_input)
    print('birdmodel output',pred_2.shape)
   

del dummy_dataset
gc.collect()

torch.Size([1, 182])
number of bird records:  24459
learn signal layer shape:  torch.Size([1, 256, 1249])
side by side output torch.Size([1, 182])
birdmodel output torch.Size([1, 182])
learn signal layer shape:  torch.Size([1, 256, 1249])
side by side output torch.Size([1, 182])
birdmodel output torch.Size([1, 182])
learn signal layer shape:  torch.Size([1, 256, 1249])
side by side output torch.Size([1, 182])
birdmodel output torch.Size([1, 182])


23

# Functions of Training Loop

In [45]:
def predict(data_loader, model):
    model.to(config.DEVICE)
    model.eval()
    predictions = []
    gts = []
    for batch in tqdm(data_loader):
        with torch.no_grad():
            x, y,_= batch
            x = x.cuda()
            outputs = model(x)
            outputs = nn.Softmax(dim=1)(outputs)
        assert outputs is not None, "model inputs is none"
        predictions.append(outputs.detach().cpu())
        gts.append(y.detach().cpu())
    
    predictions = torch.cat(predictions, dim=0).cpu().detach()
    gts = torch.cat(gts, dim=0).cpu().detach()
    gts = torch.nn.functional.one_hot(gts, len(label_list))
    
    return predictions.numpy().astype(np.float32), gts.numpy().astype(np.float32)

print_date_with_message('predict function compiled')

2025-07-03 - predict function compiled


### COMMENTED LINE WITH SCORE!!!!

In [49]:
def run_training(fold_id, total_df):
    print('================================================================')
    print(f"==== Running training for fold {fold_id} ====")
    
    # == create dataset and dataloader ==
    train_df = total_df[total_df['fold'] != fold_id].copy()
    valid_df = total_df[total_df['fold'] == fold_id].copy()
    
    print(f'Train Samples: {len(train_df)}')
    print(f'Valid Samples: {len(valid_df)}')

    train_ds = BirdDataset(train_df)
    val_ds = BirdDataset(valid_df)

    # for now no data augmentation implemented
    #train_ds = BirdDataset(train_df, get_transforms('train'), 'train')
    #val_ds = BirdDataset(valid_df, get_transforms('valid'), 'valid')
    
    train_dl = torch.utils.data.DataLoader(
        train_ds,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        num_workers=config.N_WORKERS,
        pin_memory=True,
        persistent_workers=True
    )
    
    val_dl = torch.utils.data.DataLoader(
        val_ds,
        batch_size=config.BATCH_SIZE * 2,
        shuffle=False,
        num_workers=config.N_WORKERS,
        pin_memory=True,
        persistent_workers=True
    )

    print('data loaded succesfully')
    
    # == init model ==
    bird_model = BirdModel()

    print('bird_model loaded')
    
    # == init callback ==
    checkpoint_callback = ModelCheckpoint(monitor='val_score',
                                          dirpath=config.OUTPUT_DIR,
                                          save_top_k=1,
                                          save_last=False,
                                          save_weights_only=True,
                                          filename=f"fold_{fold_id}",
                                          mode='max')
    callbacks_to_use = [checkpoint_callback, TQDMProgressBar(refresh_rate=1)]
    
    # == init trainer ==
    trainer = pl.Trainer(
        enable_progress_bar=True,        # Enables rich progress bar
        
        log_every_n_steps=10,
        max_epochs=config.EPOCHS,
        val_check_interval=0.5,
        callbacks=callbacks_to_use,
        enable_model_summary=True,
        accelerator="gpu",
        deterministic=True,
        precision='16-mixed' if config.MIXED_PRECISION else 32,
    )
    print('Will enter training')
    # == Training ==
    trainer.fit(bird_model, train_dataloaders=train_dl, val_dataloaders=val_dl)
    
    # == Prediction ==
    best_model_path = checkpoint_callback.best_model_path
    weights = torch.load(best_model_path)['state_dict']
    bird_model.load_state_dict(weights)
    
    preds, gts = predict(val_dl, bird_model)
    
    # = create dataframe =
    pred_df = pd.DataFrame(preds, columns=label_list)
    pred_df['id'] = np.arange(len(pred_df))
    gt_df = pd.DataFrame(gts, columns=label_list)
    gt_df['id'] = np.arange(len(gt_df))
    
    # = compute score =
    #val_score = score(gt_df, pred_df, row_id_column_name='id')
    
    # == save to file ==
    pred_cols = [f'pred_{t}' for t in label_list]
    valid_df = pd.concat([valid_df.reset_index(), pd.DataFrame(np.zeros((len(valid_df), len(label_list)*2)).astype(np.float32), columns=label_list+pred_cols)], axis=1)
    valid_df[label_list] = gts
    valid_df[pred_cols] = preds
    valid_df.to_csv(f"{config.OUTPUT_DIR}/pred_df_f{fold_id}.csv", index=False)
    
    #return preds, gts, val_score
    return preds, gts

print_date_with_message('run_training function compiled ')

2025-07-03 - run_training function compiled 


# Training

## KFold

In [50]:
kf = KFold(n_splits=config.FOLDS, shuffle=True, random_state=config.SEED)
train_df['fold'] = 0
for fold, (train_idx, val_idx) in enumerate(kf.split(train_df)):
    train_df.loc[val_idx, 'fold'] = fold

print('Kfolds generated')

Kfolds generated


## Training Loop

In [51]:
# training
# we reduced batch size from 32 to 4 to avoid cuda out of memory error
torch.cuda.empty_cache()
import gc

gc.collect()
torch.set_float32_matmul_precision('high')

# record
fold_val_score_list = list()
oof_df = train_df.copy()
pred_cols = [f'pred_{t}' for t in label_list]
oof_df = pd.concat([oof_df, pd.DataFrame(np.zeros((len(oof_df), len(pred_cols)*2)).astype(np.float32), columns=label_list+pred_cols)], axis=1)

for f in range(config.FOLDS):
    
    # get validation index
    val_idx = list(train_df[train_df['fold'] == f].index)
    
    # main loop of f-fold
    #val_preds, val_gts, val_score = run_training(f, train_df)
    val_preds, val_gts = run_training(f, train_df)
    
    # record
    oof_df.loc[val_idx, label_list] = val_gts
    oof_df.loc[val_idx, pred_cols] = val_preds
    #fold_val_score_list.append(val_score)
    
    # only training one fold
    break


#for idx, val_score in enumerate(fold_val_score_list):
   # print(f'Fold {idx} Val Score: {val_score:.5f}')

# oof_gt_df = oof_df[['samplename'] + label_list].copy()
# oof_pred_df = oof_df[['samplename'] + pred_cols].copy()
# oof_pred_df.columns = ['samplename'] + label_list
# oof_score = score(oof_gt_df, oof_pred_df, 'samplename')
# print(f'OOF Score: {oof_score:.5f}')

oof_df.to_csv(f"{config.OUTPUT_DIR}/oof_pred.csv", index=False)

==== Running training for fold 0 ====
Train Samples: 22013
Valid Samples: 2446
data loaded succesfully
bird_model loaded
Will enter training


/opt/conda/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /kaggle/working exists and is not empty.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

ParticipantVisibleError: Input contains NaN.

entrainement de la couche signal process

In [28]:
import random

def oog2spec_via_scipy(audio_data):
    # handles NaNs
    mean_signal = np.nanmean(audio_data)
    audio_data = np.nan_to_num(audio_data, nan=mean_signal) if np.isnan(audio_data).mean() < 1 else np.zeros_like(audio_data)
    
    # to spec.
    frequencies, times, spec_data = sci_signal.spectrogram(
        audio_data, 
        fs=config.FS, 
        nfft=config.N_FFT, 
        nperseg=config.WIN_SIZE, 
        noverlap=config.WIN_LAP, 
        window='hann'
    )
    
    # Filter frequency range
    valid_freq = (frequencies >= config.MIN_FREQ) & (frequencies <= config.MAX_FREQ)
    spec_data = spec_data[valid_freq, :]
    
    # Log
    spec_data = np.log10(spec_data + 1e-20)
    
    # min/max normalize
    spec_data = spec_data - spec_data.min()
    spec_data = spec_data / spec_data.max()
    
    return spec_data
learn_signal=LearnableSpectrogram()

dummy_dataset = BirdDataset(train_df,mode='train',augmentation=None)
len_dataset=len(dummy_dataset)
print('number of bird records: ',len_dataset)
num_samples=1

indices = random.sample(range(0, len_dataset), num_samples)

for idx in indices:

    test_input, test_target,bird_name = dummy_dataset[idx]
    image=learn_signal(test_input)
    print('learn signal layer shape: ',image.shape)
    
   

del dummy_dataset
gc.collect()

torch.Size([1, 182])
number of bird records:  24459
learn signal layer shape:  torch.Size([1, 256, 1249])
side by side output torch.Size([1, 182])
birdmodel output torch.Size([1, 182])
learn signal layer shape:  torch.Size([1, 256, 1249])
side by side output torch.Size([1, 182])
birdmodel output torch.Size([1, 182])
learn signal layer shape:  torch.Size([1, 256, 1249])
side by side output torch.Size([1, 182])
birdmodel output torch.Size([1, 182])


23

In [None]:
import sys
sys.exit('must not be executed')

class Config():
    
    # Sample Rate
    FS = 32000
    
    # make sure the spec. for each 5s audio data is [512, 512]
    N_FFT = 1095  # N FFT
    WIN_SIZE = 412  # WIN Size
    WIN_LAP = 100  # OVERLAP
    # min frequency
    MIN_FREQ = 40
    # max frequency
    MAX_FREQ = 15000
    # Competition Root Folder
    ROOT_FOLDER = '/kaggle/input/birdclef-2024'
    
CONFIG = Config()

train_metadata_df = pd.read_csv(
        '/kaggle/input/birdclef-2024/train_metadata.csv',
        dtype={
            'secondary_labels': 'string',
            'primary_label': 'category',
        },
    )

# Convert secondary_labels to iterable tuple
def parse_secondary_labels(s):
    s = s.strip("[']")
    s = s.split("', '")
    return tuple([e for e in s if len(e) > 0])

train_metadata_df['secondary_labels'] = train_metadata_df['secondary_labels'].apply(parse_secondary_labels)

# Number of samples
CONFIG.N_SAMPLES = len(train_metadata_df)
print(f'# Samples: {CONFIG.N_SAMPLES:,}')



import torch
import torch.nn as nn
import torch.nn.functional as F

class LearnableSpectrogram(nn.Module):
    def __init__(self, in_channels=1, n_filters=64, kernel_size=256, stride=128, apply_log=True):
        super(LearnableSpectrogram, self).__init__()
        self.apply_log = apply_log
        
        # Convolutional filterbank (learnable)
        self.conv = nn.Conv1d(in_channels, n_filters, kernel_size, stride=stride)

        # Optional batch norm
        self.bn = nn.BatchNorm1d(n_filters)

    def forward(self, x):
        """
        x: (batch, time) → assumes mono audio waveform
        returns: (batch, filters, time_frames)
        """
        if x.ndim == 2:
            x = x.unsqueeze(1)  # (batch, 1, time)

        x = self.conv(x)          # (batch, n_filters, time_frames)
        x = torch.abs(x)          # Magnitude of response
        x = self.bn(x)

        if self.apply_log:
            x = torch.log1p(x)    # Log-magnitude

        return x
model=LearnableSpectrogram()
model.eval()

test_audio_row = train_metadata_df.iloc[0]
test_file = f'{CONFIG.ROOT_FOLDER}/train_audio/{test_audio_row.filename}'
print(test_file)

# load file
audio_data, sample_rate = librosa.load(test_file, sr=CONFIG.FS)
print(audio_data.shape, sample_rate)
input_model = torch.from_numpy(audio_data).unsqueeze(0).unsqueeze(0) 
#input_model=torch.from_numpy(audio_data)
print(input_model.shape)
outputs=model(input_model)
print(outputs.shape)

display(train_metadata_df.head())
display(train_metadata_df.info())

In [None]:
# retrieve one sample