# **BirdCLEF 2025 Data Preprocessing Notebook**
This notebook demonstrates how we can transform audio data into mel-spectrogram data. This transformation is essential for training 2D Convolutional Neural Networks (CNNs) on audio data, as it converts the one-dimensional audio signals into two-dimensional image-like representations.
I run this public notebook in debug mode(only a few sample processing). You can find the fully preprocessed mel spectrogram training dataset here --> [BirdCLEF'25 | Mel Spectrograms](https://www.kaggle.com/datasets/kadircandrisolu/birdclef25-mel-spectrograms).


In [10]:
import cv2
import time
import os
import librosa
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")

In [11]:
class Config:
 
    DEBUG_MODE = True
    
    OUTPUT_DIR = 'archive/'
    DATASET_NAME = 'test'#train_audio_melspec_Xx5_256_256'
    AUDIO_DIR = "birdclef-2025/train_audio/"
    FS = 32000
    
    # Mel spectrogram parameters
    N_FFT = 1024
    HOP_LENGTH = 512
    N_MELS = 128
    FMIN = 50
    FMAX = 14000
    WINDOW_SIZE = 5
    
    TARGET_DURATION = 5.0
    TARGET_SHAPE = (256, 256)  
    
    N_MAX = 50 if DEBUG_MODE else None  

config = Config()

os.makedirs(f"{config.OUTPUT_DIR}{config.DATASET_NAME}", exist_ok=True)

In [12]:
print(f"Debug mode: {'ON' if config.DEBUG_MODE else 'OFF'}")
print(f"Max samples to process: {config.N_MAX if config.N_MAX is not None else 'ALL'}")

Debug mode: ON
Max samples to process: 50


In [13]:
filepaths = []
for root, dirs, files in os.walk(config.AUDIO_DIR):
    for file in files:
        if file.endswith('.ogg'):
            filepaths.append(os.path.join(root, file))

working_df = pd.DataFrame({"filepath": filepaths})
working_df['samplename'] = working_df.filepath.map(lambda x: os.path.basename(x).split('.')[0])
total_samples = min(len(working_df), config.N_MAX or len(working_df))
with open('sample_list.csv', 'w') as f:
    working_df.to_csv(f)
print(f'Total files to process: {total_samples} out of {len(working_df)} available')

Total files to process: 50 out of 28579 available


In [14]:
def audio2melspec(audio_data):
    if np.isnan(audio_data).any():
        mean_signal = np.nanmean(audio_data)
        audio_data = np.nan_to_num(audio_data, nan=mean_signal)

    mel_spec = librosa.feature.melspectrogram(
        y=audio_data,
        sr=config.FS,
        n_fft=config.N_FFT,
        hop_length=config.HOP_LENGTH,
        n_mels=config.N_MELS,
        fmin=config.FMIN,
        fmax=config.FMAX,
        power=2.0
    )

    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min() + 1e-8)
    
    return mel_spec_norm

In [15]:
def process_audio_segment(audio_data):
    """Process audio segment to get mel spectrogram"""
    if len(audio_data) < config.FS * config.WINDOW_SIZE:
        audio_data = np.pad(audio_data, 
                          (0, config.FS * config.WINDOW_SIZE - len(audio_data)), 
                          mode='constant')
    
    mel_spec = audio2melspec(audio_data)
    
    if mel_spec.shape != config.TARGET_SHAPE:
        mel_spec = cv2.resize(mel_spec, config.TARGET_SHAPE, interpolation=cv2.INTER_LINEAR)
        
    return mel_spec.astype(np.float32)

In [16]:
print("Starting audio processing...")
print(f"{'DEBUG MODE - Processing only 50 samples' if config.DEBUG_MODE else 'FULL MODE - Processing all samples'}")
start_time = time.time()

# Dictionary to track processed files (metadata only, not the actual spectrograms)
all_bird_metadata = {}
errors = []
processed_count = 0

# Process and save each spectrogram individually
for i, row in tqdm(working_df.iterrows(), total=total_samples):
    if config.N_MAX is not None and i >= config.N_MAX:
        break
    
    try:
        audio_data, _ = librosa.load(row.filepath, sr=config.FS)
        
        # Calculate the number of complete segments
        segment_samples = config.FS * config.WINDOW_SIZE
        original_len = len(audio_data) / config.FS
        
        # Calculate padding needed to make audio divisible by segment size
        remainder = original_len % config.WINDOW_SIZE
        if remainder > 0:
            padding_size = int(segment_samples - remainder * config.FS)
            audio_data = np.pad(audio_data, (0, padding_size), mode='constant')
        
        # Recalculate total segments after padding
        total_segments = len(audio_data) // segment_samples
        
        for segment_idx in range(total_segments):
            start_sample = segment_idx * segment_samples
            end_sample = start_sample + segment_samples
            segment_audio = audio_data[start_sample:end_sample]
            
            end_time_sec = (segment_idx + 1) * config.WINDOW_SIZE
            row_id = f"{row.samplename}_{end_time_sec}"
            
            # Process the audio segment
            mel_spec = process_audio_segment(segment_audio)
            
            # Save each spectrogram as a separate numpy file
            spec_filepath = f"{config.OUTPUT_DIR}{config.DATASET_NAME}/{row_id}.npy"
            np.save(spec_filepath, mel_spec)
            processed_count += 1
        
    except Exception as e:
        print(f"Error processing {row.filepath}: {e}")
        errors.append((row.filepath, str(e)))

end_time = time.time()
print(f"Processing completed in {end_time - start_time:.2f} seconds")
print(f"Successfully processed {processed_count} segments out of {total_samples} audio files")
print(f"Failed to process {len(errors)} files")

Starting audio processing...
DEBUG MODE - Processing only 50 samples


  0%|          | 0/50 [00:00<?, ?it/s]

Processing completed in 12.55 seconds
Successfully processed 740 segments out of 50 audio files
Failed to process 0 files
