In [None]:
import os
import glob
import json
import random
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image

AUDIO PROCESSING 

In [None]:
'''Load an audio file at a consistent sample rate, convert stereo to mono, and pad it to a minimum duration.
   It is called once per audio file; adapt it as needed if you want to process multiple files at once.'''

def load_audio(file_path, sample_rate=22050, min_duration=5.0):
    """Load an audio file as a mono float32 waveform of at least ``min_duration`` seconds.

    Args:
        file_path: Path of the audio file, passed straight to ``sf.read``.
        sample_rate: Target sample rate in Hz; audio stored at any other
            rate is resampled to it.
        min_duration: Minimum length of the returned clip, in seconds.

    Returns:
        A 1-D ``float32`` NumPy array sampled at ``sample_rate``. Clips
        shorter than ``min_duration`` are looped (tiled) up to exactly
        ``int(sample_rate * min_duration)`` samples; longer clips are
        returned at full length (they are NOT truncated).

    Raises:
        ValueError: If the file contains no audio samples.
    """
    min_samples = int(sample_rate * min_duration)
    audio, sr = sf.read(file_path)

    # Multi-channel data is collapsed to mono by averaging over axis 1.
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)

    # An empty clip cannot be looped up to the minimum length (the repeat
    # count below would divide by zero), so fail early with a clear message.
    if len(audio) == 0:
        raise ValueError(f"audio file contains no samples: {file_path!r}")

    # Bring every clip to the common sample rate.
    if sr != sample_rate:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)

    # Pad short clips by repeating them, then cut to the exact minimum length.
    if len(audio) < min_samples:
        repeats = (min_samples // len(audio)) + 1
        audio = np.tile(audio, repeats)[:min_samples]

    # float32 is the dtype the downstream spectrogram/tensor code works with.
    return audio.astype(np.float32)

class MelSpectrogramConverter:
    """Convert a waveform into a log-mel spectrogram scaled to [-1, 1].

    The mel filterbank is built once in ``__init__`` and reused for every
    call. It spans 20 Hz up to ``sample_rate // 2``.
    """

    def __init__(self, sample_rate=22050, n_mels=128, n_fft=2048, hop_length=512):
        """Store the analysis parameters and precompute the mel filterbank.

        Args:
            sample_rate: Sample rate (Hz) of the audio that will be converted.
            n_mels: Number of mel bands (rows of the output).
            n_fft: FFT size; also used as the STFT window length.
            hop_length: Hop between successive STFT frames, in samples.
        """
        self.sample_rate = sample_rate
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.mel_basis = librosa.filters.mel(
            sr=sample_rate,
            n_fft=n_fft,
            n_mels=n_mels,
            fmin=20,
            fmax=sample_rate // 2
        )

    def __call__(self, audio):
        """Return the normalised log-mel spectrogram of ``audio``.

        Args:
            audio: Waveform array as accepted by ``librosa.stft``.

        Returns:
            Array of dB-scaled mel energies, min-max normalised to the
            range [-1, 1]. A spectrogram with no dynamic range (for example
            one computed from an all-zero signal) is returned as all -1
            instead of NaN.
        """
        stft = librosa.stft(
            audio,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            win_length=self.n_fft
        )

        # Power spectrogram projected onto the mel filterbank, then to dB.
        power = np.abs(stft) ** 2
        mel_spec = np.dot(self.mel_basis, power)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        lowest = np.min(mel_spec_db)
        span = np.max(mel_spec_db) - lowest
        # A constant spectrogram would make the normalisation 0/0 (NaN);
        # dividing by 1 instead maps every value to 0, i.e. -1 after scaling.
        if span == 0:
            span = 1.0

        normalised = (mel_spec_db - lowest) / span
        return 2 * normalised - 1

DATA AUGMENTATION

In [None]:
# Applies frequency and time masking to spectrograms

class SpecAugment:
    """Apply one random frequency mask and one random time mask to a spectrogram.

    Masked regions are overwritten with the spectrogram's minimum value.
    """

    def __init__(self, freq_mask_max=0.15, time_mask_max=0.20):
        """Configure the maximum mask sizes.

        Args:
            freq_mask_max: Largest fraction of rows (axis 0) a frequency
                mask may cover. A value <= 0 disables frequency masking.
            time_mask_max: Largest fraction of columns (axis 1) a time mask
                may cover. A value <= 0 disables time masking.
        """
        self.freq_mask_max = freq_mask_max
        self.time_mask_max = time_mask_max

    @staticmethod
    def _random_span(size, max_fraction):
        """Pick a random ``(start, width)`` mask on an axis of length ``size``.

        Returns ``None`` when no mask of at least one element fits, which
        happens when the fraction is <= 0 or ``size * max_fraction`` rounds
        down to zero.
        """
        # Clamp to the axis length so fractions above 1 cannot overshoot.
        max_width = min(size, int(size * max_fraction))
        if max_width < 1:
            return None
        width = random.randint(1, max_width)
        start = random.randint(0, size - width)
        return start, width

    def __call__(self, spec):
        """Return a masked copy of ``spec``; the input array is not modified.

        Args:
            spec: 2-D array indexed as ``[frequency, time]``.

        Returns:
            A copy of ``spec`` with up to one band of rows and one band of
            columns set to the minimum value of ``spec``. Axes too short to
            hold a mask are left untouched.
        """
        spec = spec.copy()
        n_mels, n_time = spec.shape

        # Nothing to mask (and np.min would fail) on an empty spectrogram.
        if spec.size == 0:
            return spec

        # The fill value is the global minimum; masking with it never
        # changes that minimum, so it is computed once for both masks.
        fill_value = np.min(spec)

        # Frequency masking
        freq_span = self._random_span(n_mels, self.freq_mask_max)
        if freq_span is not None:
            freq_start, freq_bands = freq_span
            spec[freq_start:freq_start + freq_bands, :] = fill_value

        # Time masking
        time_span = self._random_span(n_time, self.time_mask_max)
        if time_span is not None:
            time_start, time_steps = time_span
            spec[:, time_start:time_start + time_steps] = fill_value

        return spec
    
# Mixes two samples (spectrograms and their labels) with a randomly drawn ratio

class AudioMixer:
    """Mixup-style augmentation: blend a sample with a random one from the dataset.

    With probability ``mix_prob`` the spectrogram and label are replaced by
    a convex combination with a second, randomly drawn sample. The mixing
    weight is drawn from ``Beta(alpha, alpha)``.
    """

    def __init__(self, alpha=0.4, mix_prob=0.5):
        """Configure the mixer.

        Args:
            alpha: Both shape parameters of the Beta distribution the
                mixing weight is drawn from.
            mix_prob: Probability that a call actually mixes the sample.
        """
        self.alpha = alpha
        self.mix_prob = mix_prob

    @staticmethod
    def _fit_time_axis(spec, n_frames):
        """Crop or loop ``spec`` along its last axis to exactly ``n_frames`` frames."""
        current = spec.shape[-1]
        if current == n_frames:
            return spec
        if current == 0:
            raise ValueError("cannot align a spectrogram with zero time frames")
        if current < n_frames:
            # Loop the shorter clip; an integer repeat count makes np.tile
            # repeat along the last axis only.
            repeats = -(-n_frames // current)  # ceiling division
            spec = np.tile(spec, repeats)
        return spec[..., :n_frames]

    def __call__(self, dataset, index, spec, label):
        """Possibly mix ``(spec, label)`` with another sample from ``dataset``.

        Args:
            dataset: Object supporting ``len()`` and ``get_sample(idx)``,
                the latter returning a ``(spectrogram, label)`` pair.
            index: Index of the current sample (accepted for interface
                compatibility; not used when choosing the partner).
            spec: Spectrogram array of the current sample, time on the
                last axis.
            label: Label vector of the current sample.

        Returns:
            ``(spec, label)`` unchanged when no mixing happens, otherwise
            the mixed pair. The mixed spectrogram always has the shape of
            ``spec``: a partner with a different number of time frames is
            cropped or looped to match before blending.
        """
        if random.random() > self.mix_prob:
            return spec, label

        idx2 = random.randint(0, len(dataset) - 1)
        spec2, label2 = dataset.get_sample(idx2)

        # Clips of different duration yield different frame counts, which
        # would otherwise make the blend below fail on a shape mismatch.
        spec2 = self._fit_time_axis(spec2, spec.shape[-1])

        lam = np.random.beta(self.alpha, self.alpha)

        mixed_spec = lam * spec + (1 - lam) * spec2
        mixed_label = lam * label + (1 - lam) * label2

        return mixed_spec, mixed_label

DATASET HANDLING

In [None]:
# Handles FSD50K dataset loading, preprocessing and augmentation

class FSD50KDataset(Dataset):
    """Dataset yielding ``(mel-spectrogram, multi-hot label)`` tensor pairs.

    Audio paths and label strings are read from a CSV manifest; the mapping
    from label name to class index is read from a JSON file. When
    ``augment`` is true, each item may additionally be mixed with another
    sample and masked in frequency/time.
    """

    def __init__(self, manifest_path, labels_map, sample_rate=22050,
                 n_mels=128, min_duration=5.0, augment=False):
        """Read the manifest and label map and set up the converters.

        Args:
            manifest_path: CSV file with a ``files`` column (audio paths)
                and a ``labels`` column (comma-separated label names).
            labels_map: Path to a JSON object mapping each label name to
                its index in the multi-hot vector.
            sample_rate: Sample rate (Hz) every clip is loaded at.
            n_mels: Number of mel bands of the spectrograms.
            min_duration: Minimum clip length in seconds (shorter clips are
                padded by ``load_audio``).
            augment: Enable mixing and spectrogram masking in
                ``__getitem__``.
        """
        with open(labels_map, 'r', encoding='utf-8') as f:
            self.labels_map = json.load(f)

        self.df = pd.read_csv(manifest_path)
        self.audio_paths = self.df['files'].values
        self.labels = self.df['labels'].values
        self.num_classes = len(self.labels_map)

        self.sample_rate = sample_rate
        self.n_mels = n_mels
        self.min_duration = min_duration
        self.augment = augment

        self.mel_converter = MelSpectrogramConverter(
            sample_rate=sample_rate,
            n_mels=n_mels
        )

        # Augmenters exist only when augmentation is enabled.
        self.spec_augment = SpecAugment() if augment else None
        self.audio_mixer = AudioMixer() if augment else None

    def __len__(self):
        """Return the number of clips listed in the manifest."""
        return len(self.audio_paths)

    def get_sample(self, idx):
        """Load one sample without augmentation.

        This is also what the mixer calls to fetch the partner sample.

        Returns:
            ``(mel_spectrogram, label_vector)`` as NumPy arrays.
        """
        audio = load_audio(
            self.audio_paths[idx],
            self.sample_rate,
            self.min_duration
        )
        return (
            self.mel_converter(audio),
            self._parse_labels(self.labels[idx])
        )

    def __getitem__(self, idx):
        """Return the (optionally augmented) sample at ``idx`` as tensors.

        Returns:
            A pair ``(spec, label)`` of ``float32`` tensors. ``spec`` has a
            leading channel dimension, i.e. shape ``[1, n_mels, time]``;
            ``label`` is the (possibly mixed) multi-hot vector.
        """
        mel_spec, label = self.get_sample(idx)

        if self.augment:
            # Mix with another sample first, then mask the mixed result.
            if self.audio_mixer:
                mel_spec, label = self.audio_mixer(self, idx, mel_spec, label)
            if self.spec_augment:
                mel_spec = self.spec_augment(mel_spec)

        # Force float32 on both outputs: depending on NumPy's promotion
        # rules the mixing step can yield float64, and a fixed dtype keeps
        # batches consistent with typical float32 models and losses.
        # unsqueeze(0) adds the channel dimension: [1, n_mels, time].
        return (
            torch.tensor(mel_spec, dtype=torch.float32).unsqueeze(0),
            torch.tensor(label, dtype=torch.float32)
        )

    def _parse_labels(self, label_str):
        """Convert a comma-separated label string to a multi-hot vector.

        Whitespace around label names is ignored (``"a, b"`` equals
        ``"a,b"``) and empty entries such as a trailing comma are skipped.

        Args:
            label_str: Comma-separated label names.

        Returns:
            ``float32`` array of length ``num_classes`` with 1 at the index
            of every listed label and 0 elsewhere.

        Raises:
            KeyError: If a label name is not present in ``labels_map``.
        """
        label_vector = np.zeros(self.num_classes, dtype=np.float32)
        for token in label_str.split(','):
            # Prefer an exact match so that map keys are honoured verbatim;
            # fall back to the whitespace-stripped name otherwise.
            name = token if token in self.labels_map else token.strip()
            if not name:
                continue
            label_vector[self.labels_map[name]] = 1
        return label_vector