In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

# List of important Directories
ROOT = '/kaggle/input'
DATASET_DIR = '/kaggle/input/baby-sounds/Infant_Cry_Sounds_Kaggle'
AUDIO_DIR = DATASET_DIR + "/data"
METADATA_DIR = DATASET_DIR + "/metadata"


In [6]:
# Making metadata (ignore this)
class_indexes = {
#     "hungry": 0,
#     "tired": 1,
#     "uncomfortable": 2,
    "cry": 0,
    "laugh": 1,
    "noise": 2,
    "silence": 3
}

In [22]:
import torch
from torch.utils.data import Dataset
import torchaudio

class BabyDataset(Dataset):
    def __init__(self, annotations_file, audio_dir, transformation, target_sr, num_of_samples, device):
        self.annotations = pd.read_csv(annotations_file) # metadata file
        self.audio_dir = audio_dir # audio directory
        self.device = device
        self.transformation = transformation.to(device) # mel-spectrogram (function)
        self.target_sr = target_sr # sample rate
        self.num_of_samples = num_of_samples
    
    def __len__(self): # returns the total number of audio_files in the dataset
        return len(self.annotations) 
    
    
    # Preprocessing audio whilst getting the audio
    def __getitem__(self, index): # returns the signal(tensor) and label name of the particular audio
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device) # send signal to particular device
        signal = self._resample_if_necessary(signal, sr) # maintains all audios with uniform sample rate
        signal = self._mix_down_if_necessary(signal) # makes audio mono-channel
        signal = self._cut_if_necessary(signal) # truncates remaining samples if samples are more than expected num of samples
        signal = self._right_pad_if_necessary(signal) # maintains uniform number of samples if samples are lesser than expected num of samples
        signal = self.transformation(signal)
        return signal, label
    
    def _get_audio_sample_path(self, index): # returns a path string of the particular audio
        path = os.path.join(self.audio_dir, self.annotations.iloc[index, 3], self.annotations.iloc[index, 1])
        return path
    
    def _get_audio_sample_label(self, index): 
        return self.annotations.iloc[index, 1]
    
    def _mix_down_if_necessary(self, signal):
        if signal.shape[0] > 1:
            signal = torch.mean(
                signal,
                dim=0,
                keepdim=True
            )
        return signal
    
    def _resample_if_necessary(self, signal, sr):
        if sr != self.target_sr:
            resampler = torchaudio.transforms.Resample(sr, self.target_sr).to(self.device)
            signal = resampler(signal)
        return signal
    
    def _cut_if_necessary(self, signal):
        if signal.shape[1] > self.num_of_samples:
            signal = signal[:, :self.num_of_samples]
        return signal
    
    def _right_pad_if_necessary(self, signal):
        signal_length = signal.shape[1]
        if signal.shape[1] < self.num_of_samples:
            num_of_missing_samples = self.num_of_samples - signal_length
            last_dim_padding = (0, num_of_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal
            
    

In [24]:
if __name__ == '__main__':
    ANNOTATIONS_FILE  = METADATA_DIR+"/metadata.csv"
    SAMPLE_RATE = 22050
    NUM_OF_SAMPLES = 22050
    
    
    # check if gpu is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("USING DEVICE:", device)
    
    mel_spectrogram = torchaudio.transforms.MelSpectrogram( # returns a function
        sample_rate=SAMPLE_RATE,
        n_fft=1024,
        hop_length=512,
        n_mels=64
    )
    baby = BabyDataset( # initializing the Babyset Object
        ANNOTATIONS_FILE,
        AUDIO_DIR,
        mel_spectrogram,
        SAMPLE_RATE,
        NUM_OF_SAMPLES,
        device
    )
    
    # pre-processed and ready-to-go audio can be retrieved by getting item from baby obj using index: baby[index]
    print(baby[0])

USING DEVICE: cuda
before: torch.Size([1, 221376])
after: torch.Size([1, 22050])
(tensor([[[1.4614e-03, 1.0155e-03, 2.9039e-02,  ..., 1.0453e-01,
          4.7356e-02, 2.5281e+00],
         [3.1175e-06, 4.5162e-07, 4.4379e-02,  ..., 4.3996e-02,
          2.0307e-01, 1.9964e+00],
         [5.0374e-07, 3.4675e-07, 3.4060e-02,  ..., 2.9633e-02,
          7.0934e-02, 1.5329e+00],
         ...,
         [4.6024e-08, 4.1675e-08, 9.8247e-06,  ..., 4.8067e-03,
          2.5472e-03, 1.0527e-02],
         [1.0236e-07, 2.8940e-08, 7.1711e-06,  ..., 3.0330e-02,
          1.5261e-02, 3.2795e-02],
         [7.4341e-08, 3.1492e-08, 4.7825e-06,  ..., 2.0047e-01,
          2.2641e-01, 1.6230e-01]]], device='cuda:0'), '1-187207-A.ogg')
