# **BirdCLEF 2025 Data Preprocessing Notebook - Combined**
This notebook demonstrates how we can transform audio data into mel-spectrogram data. This transformation is essential for training 2D Convolutional Neural Networks (CNNs) on audio data, as it converts the one-dimensional audio signals into two-dimensional image-like representations.

This combined notebook can process:
- Training audio (labeled data) AND soundscape audio (unlabeled data) in sequence
- Center segments only or all segments
- Save to single file or individual files

If you run this public notebook in debug mode, only a small subset of the files is processed (50 files per data type by default). The fully preprocessed mel-spectrogram dataset, covering both the training audio and the soundscapes, is available here: [BirdCLEF'25 | Mel Spectrograms](https://www.kaggle.com/datasets/kadircandrisolu/birdclef25-mel-spectrograms).

In [None]:
import cv2
import math
import time
import os
import librosa
import pandas as pd
import numpy as np
import json
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")

In [None]:
class Config:
    """Settings for the mel-spectrogram preprocessing run.

    Everything is a class attribute. ``save_config`` serialises every
    attribute that is neither callable nor dunder-named, so any attribute
    added here also ends up in the saved JSON.
    """

    DEBUG_MODE = True

    # Inputs: which data types to process (in this order) and where they live
    DATA_TYPES = ['train_audio', 'train_soundscapes']
    DATA_ROOT = 'birdclef-2025'

    # Extraction / storage strategy
    USE_CENTER_ONLY = True  # True: one centered clip per file, False: every window
    SAVE_INDIVIDUAL_FILES = False  # True: one .npy per spectrogram, False: one dict per data type

    # Outputs
    OUTPUT_DIR = 'archive/'
    DATASET_NAME = 'combined'  # Folder name (individual files) or filename prefix (single file)

    # Audio settings
    FS = 32000
    WINDOW_SIZE = 5
    TARGET_DURATION = 5.0
    TARGET_SHAPE = (256, 256)

    # Mel spectrogram settings
    N_FFT = 1024
    HOP_LENGTH = 512
    N_MELS = 128
    FMIN = 50
    FMAX = 14000

    # Per-type file cap (debug only) and append-to-existing switch
    N_MAX = 50 if DEBUG_MODE else None
    ADD_TO_EXISTING = False  # Only consulted when SAVE_INDIVIDUAL_FILES is False

    def get_audio_dir(self, data_type):
        """Return the input directory (with trailing slash) for ``data_type``.

        Raises:
            ValueError: if ``data_type`` is neither 'train_audio' nor
                'train_soundscapes'.
        """
        if data_type not in ('train_audio', 'train_soundscapes'):
            raise ValueError(f"Unknown DATA_TYPE: {data_type}")
        return f"{self.DATA_ROOT}/{data_type}/"

    def get_output_filename(self, data_type):
        """Return the single-file output name for ``data_type``.

        The name encodes the dataset prefix, data type, extraction mode,
        window size and both target-shape dimensions.
        """
        if self.USE_CENTER_ONLY:
            mode = "center"
        else:
            mode = "all"
        prefix = f"{self.DATASET_NAME}_{data_type}_{mode}"
        dims = f"{self.WINDOW_SIZE}_{self.TARGET_SHAPE[0]}_{self.TARGET_SHAPE[1]}"
        return f"{prefix}_{dims}.npy"

    def save_config(self, filepath):
        """Write all non-callable, non-dunder attributes to ``filepath`` as JSON."""
        settings = {}
        for name in dir(self):
            if name.startswith('__'):
                continue
            value = getattr(self, name)
            if callable(value):
                continue
            settings[name] = value

        with open(filepath, 'w') as out_file:
            json.dump(settings, out_file, indent=2)
        print(f"Configuration saved to {filepath}")

# Shared settings object used by every cell below
config = Config()

# Make sure every directory that will be written to exists
if not config.SAVE_INDIVIDUAL_FILES:
    os.makedirs(config.OUTPUT_DIR, exist_ok=True)
else:
    # One sub-folder per data type; makedirs also creates OUTPUT_DIR itself
    for data_type in config.DATA_TYPES:
        per_type_dir = f"{config.OUTPUT_DIR}{config.DATASET_NAME}_{data_type}"
        os.makedirs(per_type_dir, exist_ok=True)

# Persist the settings next to the outputs
config_filename = f"{config.OUTPUT_DIR}config_{config.DATASET_NAME}.json"
config.save_config(config_filename)

In [None]:
def load_data_for_type(data_type):
    """Build the table of audio files to process for one data type.

    Args:
        data_type: 'train_audio' (labeled recordings listed in train.csv) or
            'train_soundscapes' (every file found in the soundscape directory).

    Returns:
        Tuple ``(working_df, total_samples)``. ``working_df`` contains at
        least the columns ``filepath``, ``samplename``, ``class`` and
        ``target``. ``total_samples`` is the row count after applying the
        ``config.N_MAX`` cap.

    Raises:
        ValueError: if ``data_type`` is not one of the two supported values.
    """
    if data_type == 'train_audio':
        print(f"Loading training data with labels...")
        taxonomy_df = pd.read_csv(f'{config.DATA_ROOT}/taxonomy.csv')
        species_class_map = dict(zip(taxonomy_df['primary_label'], taxonomy_df['class_name']))

        train_df = pd.read_csv(f'{config.DATA_ROOT}/train.csv')

        # Integer target ids follow the sorted order of the species labels
        label_list = sorted(train_df['primary_label'].unique())
        label2id = {label: idx for idx, label in enumerate(label_list)}

        print(f'Found {len(label_list)} unique species')
        working_df = train_df[['primary_label', 'rating', 'filename']].copy()
        working_df['target'] = working_df.primary_label.map(label2id)
        working_df['filepath'] = config.get_audio_dir(data_type) + working_df.filename
        # "<first path component>-<file stem>", e.g. "abc/XC1.ogg" -> "abc-XC1"
        working_df['samplename'] = working_df.filename.map(lambda x: x.split('/')[0] + '-' + x.split('/')[-1].split('.')[0])
        working_df['class'] = working_df.primary_label.map(lambda x: species_class_map.get(x, 'Unknown'))

        print(f'Samples by class:')
        print(working_df['class'].value_counts())

    elif data_type == 'train_soundscapes':
        print(f"Loading soundscape data (unlabeled)...")
        audio_dir = config.get_audio_dir(data_type)
        # os.listdir order is arbitrary; sort so the N_MAX subset is reproducible
        filepaths = [f"{audio_dir}{name}" for name in sorted(os.listdir(audio_dir))]
        working_df = pd.DataFrame({"filepath": filepaths})
        working_df['samplename'] = working_df.filepath.map(lambda x: os.path.basename(x).split('.')[0])
        working_df['class'] = 'soundscape'  # All soundscapes get same class
        working_df['target'] = 0  # Dummy target for consistency

    else:
        # Without this branch an unknown type would fail later with an
        # UnboundLocalError on working_df
        raise ValueError(f"Unknown DATA_TYPE: {data_type}")

    total_samples = min(len(working_df), config.N_MAX or len(working_df))
    print(f'Total files to process: {total_samples} out of {len(working_df)} available')

    return working_df, total_samples

# Build the file table and the sample count for every configured data type
all_working_dfs, all_total_samples = {}, {}

for data_type in config.DATA_TYPES:
    print(f"\n=== Loading {data_type} ===")
    loaded_df, loaded_count = load_data_for_type(data_type)
    all_working_dfs[data_type] = loaded_df
    all_total_samples[data_type] = loaded_count

In [None]:
def audio2melspec(audio_data):
    """Convert an audio signal into a min-max normalised mel spectrogram.

    Args:
        audio_data: array of audio samples, passed to librosa with sample
            rate ``config.FS``.

    Returns:
        The dB-scaled mel spectrogram rescaled so its values lie in [0, 1].
    """
    if np.isnan(audio_data).any():
        mean_signal = np.nanmean(audio_data)
        # nanmean of an all-NaN signal is itself NaN, which would leave every
        # NaN in place; fall back to silence in that case
        if np.isnan(mean_signal):
            mean_signal = 0.0
        audio_data = np.nan_to_num(audio_data, nan=mean_signal)

    mel_spec = librosa.feature.melspectrogram(
        y=audio_data,
        sr=config.FS,
        n_fft=config.N_FFT,
        hop_length=config.HOP_LENGTH,
        n_mels=config.N_MELS,
        fmin=config.FMIN,
        fmax=config.FMAX,
        power=2.0
    )

    # dB relative to the loudest bin, then min-max scaling; the epsilon avoids
    # a division by zero for a constant spectrogram
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    db_min = mel_spec_db.min()
    db_max = mel_spec_db.max()
    mel_spec_norm = (mel_spec_db - db_min) / (db_max - db_min + 1e-8)

    return mel_spec_norm

In [None]:
def process_audio_segment(audio_data):
    """Turn one audio segment into a float32 mel spectrogram of TARGET_SHAPE.

    Segments shorter than ``config.WINDOW_SIZE`` seconds are zero-padded at
    the end; longer segments are passed through unchanged.

    Args:
        audio_data: array of audio samples.

    Returns:
        float32 array resized to ``config.TARGET_SHAPE`` (rows, columns).
    """
    target_samples = int(config.WINDOW_SIZE * config.FS)

    shortfall = target_samples - len(audio_data)
    if shortfall > 0:
        audio_data = np.pad(audio_data, (0, shortfall), mode='constant')

    mel_spec = audio2melspec(audio_data)

    target_rows, target_cols = config.TARGET_SHAPE
    if mel_spec.shape != (target_rows, target_cols):
        # cv2.resize takes dsize as (width, height), i.e. (cols, rows), which
        # is the reverse of numpy's shape order. Passing TARGET_SHAPE
        # unchanged only works for square targets.
        mel_spec = cv2.resize(mel_spec, (target_cols, target_rows), interpolation=cv2.INTER_LINEAR)

    return mel_spec.astype(np.float32)

def process_center_audio(audio_data):
    """Extract the centered ``config.TARGET_DURATION``-second clip and convert it.

    Audio shorter than the target is repeated end-to-end until it is long
    enough, then the middle ``target_samples`` samples are taken.

    Args:
        audio_data: array of audio samples.

    Returns:
        The mel spectrogram produced by ``process_audio_segment`` for the
        centered clip.

    Raises:
        ValueError: if ``audio_data`` is empty (there is nothing to repeat).
    """
    # Checked first so empty input fails with a clear message rather than a
    # ZeroDivisionError from the repeat-count computation below
    if len(audio_data) == 0:
        raise ValueError("Cannot extract a center segment from empty audio")

    target_samples = int(config.TARGET_DURATION * config.FS)

    if len(audio_data) < target_samples:
        # Shorter than the target, so at least two copies are always needed
        n_copy = math.ceil(target_samples / len(audio_data))
        audio_data = np.concatenate([audio_data] * n_copy)

    # len(audio_data) >= target_samples here, so the slice is always exactly
    # target_samples long and needs no further padding
    start_idx = (len(audio_data) - target_samples) // 2
    center_audio = audio_data[start_idx:start_idx + target_samples]

    return process_audio_segment(center_audio)

In [None]:
# Main processing cell: for every data type, decode each audio file, convert
# it to one (center mode) or many (all-segments mode) mel spectrograms, and
# either save each spectrogram as its own .npy file or collect them in a dict
# that is saved once per data type.
print("Starting audio processing for both data types...")
mode_desc = "center segment" if config.USE_CENTER_ONLY else "all segments"
save_desc = "individual files" if config.SAVE_INDIVIDUAL_FILES else "single file per type"
print(f"Mode: {mode_desc}, Save: {save_desc}")
if config.DEBUG_MODE:
    # Report the configured cap rather than a hard-coded number
    print(f"DEBUG MODE - Processing only {config.N_MAX} samples per type")
else:
    print("FULL MODE - Processing all samples")

total_start_time = time.time()
all_results = {}

# Skipping already-present samples only makes sense when appending to a
# single-file dataset
skip_existing = config.ADD_TO_EXISTING and not config.SAVE_INDIVIDUAL_FILES

# Samples per window, as an integer so it can be used for slicing and modulo
segment_samples = int(config.FS * config.WINDOW_SIZE)

# Process each data type
for data_type in config.DATA_TYPES:
    print(f"\n{'='*50}")
    print(f"PROCESSING {data_type.upper()}")
    print(f"{'='*50}")

    working_df = all_working_dfs[data_type]
    total_samples = all_total_samples[data_type]
    output_folder = f"{config.OUTPUT_DIR}{config.DATASET_NAME}_{data_type}"

    start_time = time.time()
    all_bird_data = {}
    errors = []
    processed_count = 0

    # Load existing data if needed
    if skip_existing:
        try:
            existing_file = f"{config.OUTPUT_DIR}{config.get_output_filename(data_type)}"
            if os.path.exists(existing_file):
                # NOTE: allow_pickle=True executes pickled content; only load
                # files produced by this notebook
                all_bird_data = np.load(existing_file, allow_pickle=True).item()
                print(f"Loaded {len(all_bird_data)} existing samples for {data_type}")
        except Exception as e:
            print(f"Could not load existing file for {data_type}: {e}")

    # Process each audio file
    for i, row in tqdm(working_df.iterrows(), total=total_samples, desc=f"Processing {data_type}"):
        if config.N_MAX is not None and i >= config.N_MAX:
            break

        try:
            if config.USE_CENTER_ONLY:
                # Process center segment only
                sample_id = row.samplename

                # Checked before decoding so already-present samples cost nothing
                if skip_existing and sample_id in all_bird_data:
                    continue

                audio_data, _ = librosa.load(row.filepath, sr=config.FS)
                mel_spec = process_center_audio(audio_data)

                if config.SAVE_INDIVIDUAL_FILES:
                    np.save(f"{output_folder}/{sample_id}.npy", mel_spec)
                else:
                    all_bird_data[sample_id] = mel_spec

                processed_count += 1

            else:
                # Process all segments
                audio_data, _ = librosa.load(row.filepath, sr=config.FS)

                # Zero-pad the tail so the last partial window is kept. This
                # uses integer sample counts: in float seconds the padding can
                # round one sample short, and the floor division below would
                # then silently drop the final segment.
                remainder = len(audio_data) % segment_samples
                if remainder > 0:
                    audio_data = np.pad(audio_data, (0, segment_samples - remainder), mode='constant')

                total_segments = len(audio_data) // segment_samples

                for segment_idx in range(total_segments):
                    start_sample = segment_idx * segment_samples
                    segment_audio = audio_data[start_sample:start_sample + segment_samples]

                    # Segments are keyed by their end time in seconds
                    end_time_sec = (segment_idx + 1) * config.WINDOW_SIZE
                    sample_id = f"{row.samplename}_{end_time_sec}"

                    # Skip if already exists and we're adding to existing
                    if skip_existing and sample_id in all_bird_data:
                        continue

                    mel_spec = process_audio_segment(segment_audio)

                    if config.SAVE_INDIVIDUAL_FILES:
                        np.save(f"{output_folder}/{sample_id}.npy", mel_spec)
                    else:
                        all_bird_data[sample_id] = mel_spec

                    processed_count += 1

        except Exception as e:
            # Best effort: record the failure and continue with the next file
            print(f"Error processing {row.filepath}: {e}")
            errors.append((row.filepath, str(e)))

    # Save results for this data type
    if not config.SAVE_INDIVIDUAL_FILES:
        # Save single file (the dict is stored as a pickled object array)
        output_filepath = f"{config.OUTPUT_DIR}{config.get_output_filename(data_type)}"
        with open(output_filepath, 'wb') as f:
            np.save(f, all_bird_data)
        print(f"Processed data saved to {output_filepath}")

    end_time = time.time()
    print(f"{data_type} processing completed in {end_time - start_time:.2f} seconds")
    print(f"Successfully processed {processed_count} segments")
    print(f"Failed to process {len(errors)} files")

    if not config.SAVE_INDIVIDUAL_FILES:
        print(f"Total samples in {data_type} dataset: {len(all_bird_data)}")

    # Store results for visualization
    all_results[data_type] = {
        'processed_count': processed_count,
        'errors': len(errors),
        'data': all_bird_data if not config.SAVE_INDIVIDUAL_FILES else {},
        'working_df': working_df
    }

total_end_time = time.time()
print(f"Total processing time: {total_end_time - total_start_time:.2f} seconds")

In [None]:
# Visualization for both data types: one figure row per data type, showing up
# to `max_samples` example spectrograms each, saved as a PNG.
import matplotlib.pyplot as plt

print(f"Creating visualization for both data types...")

max_samples = 2  # Samples (figure columns) per data type

# squeeze=False keeps `axes` two-dimensional even for a single data type, so
# axes[row][col] is always an Axes object
fig, axes = plt.subplots(
    len(config.DATA_TYPES),
    max_samples,
    figsize=(16, 8 * len(config.DATA_TYPES)),
    squeeze=False,
)

for data_type_idx, data_type in enumerate(config.DATA_TYPES):
    result = all_results[data_type]
    working_df = result['working_df']
    folder_name = f"{config.DATASET_NAME}_{data_type}"

    samples = []
    displayed_classes = set()

    for i, row in working_df.iterrows():
        if i >= (config.N_MAX or len(working_df)):
            break

        if config.USE_CENTER_ONLY:
            sample_id = row.samplename
        else:
            sample_id = f"{row.samplename}_{config.WINDOW_SIZE}"  # Use first segment for display

        # Check if sample exists (on disk or in the in-memory dict)
        if config.SAVE_INDIVIDUAL_FILES:
            sample_exists = os.path.exists(f"{config.OUTPUT_DIR}{folder_name}/{sample_id}.npy")
        else:
            sample_exists = sample_id in result['data']

        if sample_exists:
            class_name = row.get('class', 'Unknown')
            # Prefer unseen classes, but accept repeats while fewer than
            # max_samples distinct classes have been shown
            if class_name not in displayed_classes or len(displayed_classes) < max_samples:
                samples.append((sample_id, class_name, row.get('primary_label', 'N/A'), data_type))
                displayed_classes.add(class_name)
            if len(samples) >= max_samples:
                break

    # Plot the selected samples for this data type
    for sample_idx, (sample_id, class_name, species, _) in enumerate(samples[:max_samples]):
        ax = axes[data_type_idx][sample_idx]

        if config.SAVE_INDIVIDUAL_FILES:
            mel_spec = np.load(f"{config.OUTPUT_DIR}{folder_name}/{sample_id}.npy")
        else:
            mel_spec = result['data'][sample_id]

        im = ax.imshow(mel_spec, aspect='auto', origin='lower', cmap='viridis')
        ax.set_title(f"{data_type}: {class_name} - {species}")
        # Spectrograms are min-max normalised to [0, 1], so a dB scale would
        # be misleading
        plt.colorbar(im, ax=ax, label='Normalized level (0-1)')

    # Hide every subplot that did not receive a sample
    for empty_idx in range(len(samples), max_samples):
        axes[data_type_idx][empty_idx].set_visible(False)

plt.tight_layout()
debug_note = "debug_" if config.DEBUG_MODE else ""
save_name = f'{debug_note}melspec_examples_combined.png'
plt.savefig(save_name)
plt.show()
print(f"Visualization saved as {save_name}")