In [16]:
import librosa
import numpy as np
import os
from tqdm import tqdm
import tensorflow as tf
import tensorflow_hub as hub
import torch
from torch.utils.data import Dataset, DataLoader
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Load the YAMNet audio model from TensorFlow Hub. `model` is the module-level
# handle that extract_yamnet_embedding() calls on each waveform.
model = hub.load('https://tfhub.dev/google/yamnet/1')

In [2]:
import pandas as pd

# Species taxonomy table; its `primary_label` column defines the class vocabulary.
taxonomy = pd.read_csv(
    "/Users/rohitbogulla/Desktop/Sem 2/ML/Project/BirdClef/birdclef-2025/taxonomy.csv"
)

# Class names in taxonomy order: a label's position in this list is its class index.
label_list = taxonomy['primary_label'].tolist()

# Forward (name -> index) and reverse (index -> name) lookup tables.
label_to_idx = dict(zip(label_list, range(len(label_list))))
idx_to_label = {position: name for name, position in label_to_idx.items()}

print("Total classes:", len(label_list))

Total classes: 206


In [3]:
# Convert the input OGG file to an array of per-frame dominant frequencies.
# The array has roughly (sample_rate / hop_length) * audio_length_in_seconds entries, one per STFT frame.

def ogg_to_frequency_array(ogg_file_path, frame_length=1024, hop_length=256, sr=44100):
    """
    Convert an OGG audio file into its per-frame dominant-frequency track.

    The audio is loaded at ``sr``, transformed with a short-time Fourier
    transform, and for every STFT frame the frequency of the FFT bin with the
    largest magnitude is reported.

    Args:
        ogg_file_path (str): Path to the OGG audio file.
        frame_length (int): Length of the FFT window, in samples.
        hop_length (int): Step size between successive FFT windows, in samples.
        sr (int): Sample rate the audio is resampled to when loaded. Defaults
            to 44100, the value this function previously hard-coded.

    Returns:
        numpy.ndarray: 1-D array with one entry per STFT frame, holding the
        frequency of the strongest FFT bin in that frame.
    """
    # librosa.load returns the rate it actually used; reuse it so the bin
    # frequencies always match the loaded signal.
    y, sr = librosa.load(ogg_file_path, sr=sr)

    bin_frequencies = librosa.fft_frequencies(sr=sr, n_fft=frame_length)
    magnitudes = np.abs(librosa.stft(y, n_fft=frame_length, hop_length=hop_length))

    # Row index of the loudest bin in each column (frame), mapped to its frequency.
    strongest_bins = np.argmax(magnitudes, axis=0)
    return bin_frequencies[strongest_bins]

# if __name__ == '__main__':
#     file_path = '/Users/rohitbogulla/Desktop/Sem 2/ML/Project/BirdClef/birdclef-2025/train_audio/21038/iNat297879.ogg'
#     frequency_data = ogg_to_frequency_array(file_path, 1024, 256)
#     print(len(frequency_data))

In [4]:
# Convert audio to MFCCs (Mel-Frequency Cepstral Coefficients).
# The returned array has shape [n_mfcc, num_frames].

def fixed_length_mfcc(audio_path, sr=44100, duration=10, n_mfcc=10):
    """
    Compute MFCCs for a clip that is forced to exactly ``duration`` seconds.

    At most ``duration`` seconds are loaded at sample rate ``sr``; the signal
    is then zero-padded at the end or truncated so that every clip has the
    same number of samples before the MFCCs are computed.

    Args:
        audio_path (str): Path to the audio file.
        sr (int): Sample rate used for loading and for the MFCC computation.
        duration (int | float): Target clip length in seconds.
        n_mfcc (int): Number of MFCC coefficients to compute.

    Returns:
        numpy.ndarray: MFCC matrix as returned by ``librosa.feature.mfcc``.
    """
    y, _ = librosa.load(audio_path, sr=sr, duration=duration)

    # A fractional duration makes sr * duration a float, which np.pad and
    # slicing both reject, so the sample count must be an integer.
    target_len = int(round(sr * duration))

    if len(y) < target_len:
        # Short clip: append trailing zeros (silence) up to the target length.
        y = np.pad(y, (0, target_len - len(y)))
    else:
        y = y[:target_len]

    return librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

In [5]:
def extract_yamnet_embedding(audio_path, sr=44100):
    """
    Return a fixed-size YAMNet embedding for an audio file.

    The file is loaded as mono audio at ``sr`` and then resampled to 16 kHz,
    the sample rate YAMNet is specified for, before being passed to the
    module-level ``model``. Feeding the model audio at any other rate shifts
    every frequency it sees and degrades the embeddings.

    Args:
        audio_path (str): Path to the audio file.
        sr (int): Sample rate used when loading the file. Whatever rate is
            loaded, the waveform handed to YAMNet is always 16 kHz.

    Returns:
        numpy.ndarray: Mean of the per-frame embeddings over time, shape (1024,).
    """
    yamnet_sr = 16000

    # Use the rate librosa actually loaded at (it differs from `sr` when sr=None).
    y, loaded_sr = librosa.load(audio_path, sr=sr, mono=True)
    if loaded_sr != yamnet_sr:
        y = librosa.resample(y, orig_sr=loaded_sr, target_sr=yamnet_sr)
    waveform = tf.convert_to_tensor(y, dtype=tf.float32)

    # The model returns (scores, embeddings, spectrogram); only embeddings are used.
    _, embeddings, _ = model(waveform)

    # Average over the frame axis to get one vector per file.
    return np.mean(embeddings.numpy(), axis=0)  # shape: (1024,)

In [61]:
def audio_to_logmelspec(path, random_crop=False):
    """
    Load an audio file and return a min-max normalised log-mel spectrogram.

    The audio is loaded as mono at 32 kHz. With ``random_crop`` enabled, a
    random 5-second window is taken from longer clips and shorter clips are
    zero-padded to 5 seconds; otherwise the whole clip is used, so the number
    of time frames varies with the clip length.

    Args:
        path (str): Path to the audio file.
        random_crop (bool): Whether to force the clip to a 5-second segment.

    Returns:
        numpy.ndarray: float32 array of shape (1, n_mels, num_frames) scaled
        to [0, 1]. A spectrogram with no dynamic range (e.g. a silent clip)
        yields all zeros rather than NaN.
    """
    sr = 32000
    n_mels = 128
    y, _ = librosa.load(path, sr=sr, mono=True)

    if random_crop:
        segment_samples = int(sr * 5.0)
        if len(y) > segment_samples:
            max_start = len(y) - segment_samples
            # randint's upper bound is exclusive; +1 keeps the final window reachable.
            start = np.random.randint(0, max_start + 1)
            y = y[start:start + segment_samples]
        else:
            y = np.pad(y, (0, segment_samples - len(y)))

    mel = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_fft=2048,
        hop_length=512,
        n_mels=n_mels,
        fmin=20,
        fmax=16000
    )
    logmel = librosa.power_to_db(mel, ref=np.max).astype(np.float32)

    # Min-max scale to [0, 1]. A constant spectrogram has zero range and would
    # otherwise divide by zero and fill the output with NaN.
    floor = logmel.min()
    span = logmel.max() - floor
    if span > 0:
        logmel_norm = (logmel - floor) / span
    else:
        logmel_norm = np.zeros_like(logmel)

    # Add a leading channel axis: (n_mels, T) -> (1, n_mels, T).
    return np.expand_dims(logmel_norm, axis=0)

In [57]:
class BirdClefDataset(Dataset):
    """
    Torch dataset yielding (log-mel spectrogram, multi-hot label vector) pairs.

    Every row of ``df`` must provide a ``filename`` (joined onto
    ``audio_root``) and ``parsed_labels`` (an iterable of label names). Labels
    that are not keys of ``label_to_idx`` are ignored.

    Args:
        df (pandas.DataFrame): One row per audio file; its index is reset.
        label_to_idx (dict): Maps a label name to its position in the target vector.
        audio_root (str): Directory that the ``filename`` values are relative to.
        sr (int): Sample rate the audio is loaded at.
        n_mels (int): Number of mel bands.
        target_len (int): Number of time frames every spectrogram is padded
            or cropped to.
        random_crop (bool): If True, each item is a random 5-second window of
            the recording (shorter recordings are zero-padded to 5 seconds).

    NOTE(review): with the defaults (sr=32000, hop_length=256) a 5-second crop
    spans roughly 626 frames, so target_len=309 keeps only about the first half
    of it. Confirm this is intended.
    """

    def __init__(self, df, label_to_idx, audio_root, sr=32000, n_mels=128, target_len=309, random_crop=False):
        self.df = df.reset_index(drop=True)
        self.label_to_idx = label_to_idx
        self.audio_root = audio_root
        self.sr = sr
        self.n_mels = n_mels
        self.target_len = target_len
        self.random_crop = random_crop

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.df)

    def audio_to_logmelspec(self, path):
        """
        Load ``path`` and return its log-mel spectrogram in dB.

        Returns:
            numpy.ndarray: float32 array of shape (n_mels, num_frames).
        """
        y, _ = librosa.load(path, sr=self.sr, mono=True)

        if self.random_crop:
            segment_samples = int(self.sr * 5.0)
            if len(y) > segment_samples:
                max_start = len(y) - segment_samples
                # randint's upper bound is exclusive; +1 keeps the final window reachable.
                start = np.random.randint(0, max_start + 1)
                y = y[start:start + segment_samples]
            else:
                y = np.pad(y, (0, segment_samples - len(y)))

        mel = librosa.feature.melspectrogram(
            y=y,
            sr=self.sr,
            n_fft=1024,
            hop_length=256,
            n_mels=self.n_mels,
            fmin=20,
            fmax=16000
        )
        logmel = librosa.power_to_db(mel).astype(np.float32)
        return logmel

    def pad_or_crop(self, logmel):
        """
        Force the time axis of ``logmel`` to exactly ``target_len`` frames.

        Longer spectrograms keep their first ``target_len`` frames. Shorter
        ones are right-padded with the spectrogram's own minimum value: the
        data is in dB, so a literal 0 would be an arbitrary level rather than
        silence.
        """
        _, t = logmel.shape
        if t < self.target_len:
            pad_width = self.target_len - t
            # Guard the degenerate empty spectrogram, where min() is undefined.
            fill_value = float(logmel.min()) if logmel.size else 0.0
            logmel = np.pad(
                logmel,
                ((0, 0), (0, pad_width)),
                mode='constant',
                constant_values=fill_value,
            )
        else:
            logmel = logmel[:, :self.target_len]
        return logmel

    def __getitem__(self, idx):
        """
        Return ``(mel_tensor, label_vec)`` for row ``idx``.

        ``mel_tensor`` is float32 with shape [1, n_mels, target_len];
        ``label_vec`` has one entry per class, set to 1.0 for each known label
        of the row.
        """
        row = self.df.iloc[idx]

        audio_path = os.path.join(self.audio_root, row["filename"])

        logmel = self.audio_to_logmelspec(audio_path)
        logmel = self.pad_or_crop(logmel)
        # Add a leading channel axis: [n_mels, T] -> [1, n_mels, T].
        mel_tensor = torch.tensor(logmel, dtype=torch.float32).unsqueeze(0)

        label_vec = torch.zeros(len(self.label_to_idx))
        for label in row["parsed_labels"]:
            if label in self.label_to_idx:
                label_vec[self.label_to_idx[label]] = 1.0

        return mel_tensor, label_vec

In [60]:
def _list_class_dirs(data_path):
    """Return the sorted, non-hidden sub-directory names of ``data_path`` ([] if it is not a directory)."""
    if not os.path.isdir(data_path):
        return []
    return sorted(
        name for name in os.listdir(data_path)
        if not name.startswith('.') and os.path.isdir(os.path.join(data_path, name))
    )


def _list_audio_files(class_dir):
    """Return the sorted paths of the non-hidden regular files directly inside ``class_dir``."""
    candidates = (os.path.join(class_dir, name)
                  for name in sorted(os.listdir(class_dir))
                  if not name.startswith('.'))
    return [path for path in candidates if os.path.isfile(path)]


def _build_melspec_loaders(csv_path, audio_root):
    """Read the metadata CSV and return (train_loader, val_loader) over BirdClefDataset."""
    if audio_root is None:
        audio_root = "/Users/rohitbogulla/Desktop/Sem 2/ML/Project/BirdClef/birdclef-2025/train_audio"

    train_data = pd.read_csv(csv_path)
    # Each recording currently carries exactly one label: its primary label.
    train_data["parsed_labels"] = train_data["primary_label"].map(lambda label: [label])

    # Multi-hot target matrix, used only to stratify the fold split.
    targets = np.zeros((len(train_data), len(label_to_idx)), dtype=np.float32)
    for row, labels in enumerate(train_data["parsed_labels"]):
        for label in labels:
            if label in label_to_idx:
                targets[row, label_to_idx[label]] = 1

    # Only the first of the five folds is used: ~80% train / ~20% validation.
    mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=22)
    train_idx, val_idx = next(mskf.split(train_data, targets))
    train_df = train_data.iloc[train_idx].reset_index(drop=True)
    val_df = train_data.iloc[val_idx].reset_index(drop=True)

    train_dataset = BirdClefDataset(
        df=train_df,
        label_to_idx=label_to_idx,
        audio_root=audio_root,
        random_crop=True
    )
    # NOTE(review): validation also uses random crops, so validation metrics
    # are not reproducible between runs. Confirm this is intended.
    val_dataset = BirdClefDataset(
        df=val_df,
        label_to_idx=label_to_idx,
        audio_root=audio_root,
        random_crop=True
    )
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=0)
    return train_loader, val_loader


def get_training_data(datatype="frequency_array", data_path='/Users/rohitbogulla/Desktop/Sem 2/ML/Project/BirdClef/birdclef-2025/train_audio/', audio_root=None):
    """
    Build training data in one of several representations.

    Args:
        datatype (str): Which representation to build:
            * ``"frequency_array"``: dominant-frequency track per file
              (``ogg_to_frequency_array``).
            * ``"embedding"``: YAMNet embedding per file
              (``extract_yamnet_embedding``).
            * ``"melspec2"``: flattened, randomly cropped log-mel spectrogram
              per file (``audio_to_logmelspec``).
            * ``"melspec"``: PyTorch data loaders over ``BirdClefDataset``.
        data_path (str): For ``"melspec"``, the path of the metadata CSV (it
            must contain a ``primary_label`` column, plus the ``filename``
            column that ``BirdClefDataset`` reads). For every other datatype,
            a directory holding one sub-directory of audio files per class.
        audio_root (str | None): Only used by ``"melspec"``: directory the CSV
            filenames are relative to. ``None`` keeps the previously
            hard-coded location.

    Returns:
        For ``"melspec"``: ``(train_loader, val_loader)``.
        Otherwise ``(X_train, y_train)`` as lists. ``y_train`` holds the class
        directory name of each sample for ``"melspec2"`` and is left empty for
        ``"frequency_array"`` and ``"embedding"``. Both lists are empty when
        ``data_path`` is not a directory.

    Raises:
        ValueError: If ``datatype`` is not one of the supported names.
    """
    if datatype == "melspec":
        return _build_melspec_loaders(data_path, audio_root)

    # Pick the per-file feature extractor, and whether labels are collected.
    if datatype == "frequency_array":
        extract, keep_labels = (lambda path: ogg_to_frequency_array(path, 1024, 256)), False
    elif datatype == "embedding":
        extract, keep_labels = extract_yamnet_embedding, False
    elif datatype == "melspec2":
        extract, keep_labels = (lambda path: audio_to_logmelspec(path, True).flatten()), True
    else:
        raise ValueError(
            f"Unknown datatype {datatype!r}; expected one of "
            "'frequency_array', 'embedding', 'melspec', 'melspec2'"
        )

    X_train = []
    y_train = []
    # Sorted traversal keeps the sample order reproducible across machines.
    for label in tqdm(_list_class_dirs(data_path)):
        for file_path in _list_audio_files(os.path.join(data_path, label)):
            X_train.append(extract(file_path))
            if keep_labels:
                y_train.append(label)

    return X_train, y_train

In [52]:
# x, y = get_training_data('melspec2','/Users/rohitbogulla/Desktop/Sem 2/ML/Project/BirdClef/birdclef-2025/train_audio_ff')