Preprocessing Data

Loading library

In [14]:
import os
import pandas as pd
import numpy as np
import librosa
import librosa.display as lid
import matplotlib.pyplot as plt
import IPython.display as ipd
from matplotlib import cm
from tqdm import tqdm
tqdm.pandas()

Configuration for function

In [16]:
class CFG:
    seed = 42
    img_size = [128, 384]
    batch_size = 64
    duration = 15
    sample_rate = 32000
    audio_len = duration * sample_rate
    nfft = 2028
    window = 2048
    hop_length = audio_len // (img_size[1] - 1)
    fmin = 20
    fmax = 16000
    epochs = 10
    preset = 'efficientnetv2_b2_imagenet'
    augment = True

cmap = cm.get_cmap('coolwarm')

Loading data

In [15]:
DATA_DIR = 'E:/Desktop/BU/2025 Spring/MA679_Bird2025/'
train_meta = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))

# Label info
class_names = sorted(train_meta['primary_label'].unique())
name2id = {name: i for i, name in enumerate(class_names)}
id2name = {i: name for name, i in name2id.items()}


Brief info about the file.

In [32]:
train_meta.head()

Unnamed: 0,primary_label,secondary_labels,type,filename,collection,rating,url,latitude,longitude,scientific_name,common_name,author,license
0,1139490,[''],[''],1139490/CSA36385.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0
1,1139490,[''],[''],1139490/CSA36389.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3206,-73.7128,Ragoniella pulchella,Ragoniella pulchella,Fabio A. Sarria-S,cc-by-nc-sa 4.0
2,1192948,[''],[''],1192948/CSA36358.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0
3,1192948,[''],[''],1192948/CSA36366.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.28,-73.8582,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0
4,1192948,[''],[''],1192948/CSA36373.ogg,CSA,0.0,http://colecciones.humboldt.org.co/rec/sonidos...,7.3791,-73.7313,Oxyprora surinamensis,Oxyprora surinamensis,Fabio A. Sarria-S,cc-by-nc-sa 4.0


Loading audio, convertion of spectrum ftn
Also handling mel & label pair

In [34]:
def extract_mel_label(row):
    try:
        filepath = os.path.join(DATA_DIR, 'train_audio', row['filename'])
        audio, _ = load_audio(filepath)
        mel = audio_to_mel(audio)
        label = name2id[row['primary_label']]
        return mel, label
    except Exception as e:
        print(f"Error processing {filepath}: {e}")
        return None, None


def load_audio(filepath):
    audio, sr = librosa.load(filepath, sr=CFG.sample_rate)
    if len(audio) < CFG.audio_len:
        audio = np.pad(audio, (0, CFG.audio_len - len(audio)))
    else:
        audio = audio[:CFG.audio_len]
    return audio, sr

def audio_to_mel(audio):
    mel = librosa.feature.melspectrogram(
        y=audio,
        sr=CFG.sample_rate,
        n_fft=CFG.nfft,
        hop_length=CFG.hop_length,
        win_length=CFG.window,
        n_mels=CFG.img_size[0],
        fmin=CFG.fmin,
        fmax=CFG.fmax
    )
    mel_db = librosa.power_to_db(mel, ref=np.max)
    mel_db -= mel_db.min()
    mel_db /= mel_db.max()
    return mel_db


Display sample 1

In [36]:
import matplotlib.pyplot as plt

# Select one row (e.g., the first one)
row = train_meta.iloc[0]

# Extract mel and label
mel, label = extract_mel_label(row)

# Check if successful
if mel is not None:
    print("Mel shape:", mel.shape)
    print("Label (class id):", label, "| Label name:", id2name[label])

    # Plot the mel spectrogram
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(mel, sr=CFG.sample_rate, hop_length=CFG.hop_length,
                             fmin=CFG.fmin, fmax=CFG.fmax,
                             x_axis='time', y_axis='mel', cmap='magma')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel Spectrogram')
    plt.tight_layout()
    plt.show()

Error processing E:/Desktop/BU/2025 Spring/MA679_Bird2025/train_audio\1139490/CSA36385.ogg: Target size (2028) must be at least input size (2048)


Augmenters