In [None]:
import os
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
import torchaudio
import torchaudio.transforms as T

This Dataset class can be used both for training and for pre-calculating the normalization statistics (mean and std) of a spectrogram dataset.

In [None]:
class audio_dataset(Dataset):
    """Map-style dataset yielding ``(mel spectrogram in dB, label)`` pairs from audio files.

    Every item is loaded with torchaudio, resampled to ``sampling_frequency``
    when its native rate differs, standardized as a raw waveform, turned into
    a mel spectrogram and finally converted from amplitude to decibels.

    The class has two modes, selected by ``calculate_norm_stats``:

    * truthy -- the un-normalized spectrogram is returned, so that the
      dataset-level mean/std can be measured;
    * falsy  -- ``(spectrogram - spec_mean) / spec_std`` is returned.

    Args:
        annotations_file: Path to a CSV file. The audio file name is read from
            positional (0-based) column 1 and the label from positional column 3.
        audio_dir: Directory that ``os.path.join`` prepends to each file name.
        calculate_norm_stats: When truthy, skip the spectrogram normalization.
        spec_mean: Value subtracted from the spectrogram when normalizing.
        spec_std: Value the spectrogram is divided by when normalizing.

    Raises:
        ValueError: If normalization is requested and ``spec_std`` contains a zero.
    """

    def __init__(self, annotations_file, audio_dir, calculate_norm_stats, spec_mean=0.0, spec_std=0.5):
        # Fail fast, before touching any file: dividing by a zero std would
        # silently fill every normalized spectrogram with inf/nan values.
        if not calculate_norm_stats and bool(torch.any(torch.as_tensor(spec_std) == 0)):
            raise ValueError("spec_std must be non-zero when calculate_norm_stats is False")

        self.annos = pd.read_csv(annotations_file)  # file name in column 1, label in column 3 (0-based)
        self.audio_dir = audio_dir  # all files in '.wav' format

        # The following setting is as specified by AST
        self.sampling_frequency = 16000
        self.mel = T.MelSpectrogram(
            # Reuse the attribute so the transform and the resampling target cannot drift apart.
            sample_rate=self.sampling_frequency,
            n_fft=400,
            win_length=400,
            hop_length=160,
            n_mels=128,
        )

        self.a2d = T.AmplitudeToDB()
        self.calculate_norm_stats = calculate_norm_stats
        self.spec_mean = spec_mean
        self.spec_std = spec_std

    def __len__(self):
        """Return the number of rows in the annotations table."""
        return len(self.annos)

    def __getitem__(self, idx):
        """Return ``(spectrogram, label)`` for the annotation row at position ``idx``.

        The spectrogram is normalized with ``spec_mean`` / ``spec_std`` unless
        the dataset was created with ``calculate_norm_stats`` set.
        """
        # Samplers may hand over tensor indices; pandas' iloc needs plain Python ints.
        if torch.is_tensor(idx):
            idx = idx.tolist()

        audio_file_name = self.annos.iloc[idx, 1]
        audio_path = os.path.join(self.audio_dir, audio_file_name)

        waveform, sample_rate = torchaudio.load(audio_path)
        if sample_rate != self.sampling_frequency:
            waveform = torchaudio.functional.resample(waveform, sample_rate, self.sampling_frequency)

        # normalize raw waveform (the epsilon guards against division by zero on silent clips)
        waveform = (waveform - torch.mean(waveform)) / (torch.std(waveform) + 1e-6)
        # generate mel spectrogram and convert amplitude to decibels
        spectrogram = self.a2d(self.mel(waveform))

        # labels
        label = int(self.annos.iloc[idx, 3])

        if not self.calculate_norm_stats:
            # normalize spectrogram with custom mean and std values
            spectrogram = (spectrogram - self.spec_mean) / self.spec_std

        return spectrogram, label

Not to be used for training. This cell only calculates the mean and std of the dataset's spectrograms.

In [None]:
# Build both splits in "stats" mode so that __getitem__ returns un-normalized spectrograms.
train_dataset = audio_dataset('ESC50/protocols/train1.csv', 'ESC50/audio/', calculate_norm_stats=True)
test_dataset = audio_dataset('ESC50/protocols/test1.csv', 'ESC50/audio/', calculate_norm_stats=True)

# Per-clip statistics, collected over the train split first and the test split second.
mean = []
std = []

# NOTE(review): the test split contributes to these statistics as well;
# confirm that this is intended before reusing the values for evaluation.
for dataset in (train_dataset, test_dataset):
    for i in range(len(dataset)):
        x, y = dataset[i]
        mean.append(torch.mean(x))
        std.append(torch.std(x))

mean = np.asarray(mean)
std = np.asarray(std)

# The reported values are averages of the per-clip mean and per-clip std
# (np.mean(std) is not the std computed over all spectrogram bins at once).
print("Mean and Std of ESC50 dataset.....", np.mean(mean), np.mean(std))
# Mean and Std of ESC50 dataset..... 5.4969063 22.096338
# NOTE(review): the output saved with this notebook shows -9.642373 27.26192 instead;
# re-run this cell to confirm which pair matches the current spectrogram settings.

Mean and Std of ESC50 dataset..... -9.642373 27.26192


Dataloader for training (to be used in the main loop)

In [None]:
# Normalization statistics and batch size shared by both splits.
SPEC_MEAN = 5.4969063
SPEC_STD = 22.096338
BATCH_SIZE = 32

train_dataset = audio_dataset('ESC50/protocols/train1.csv', 'ESC50/audio/', calculate_norm_stats=False, spec_mean=SPEC_MEAN, spec_std=SPEC_STD)
test_dataset = audio_dataset('ESC50/protocols/test1.csv', 'ESC50/audio/', calculate_norm_stats=False, spec_mean=SPEC_MEAN, spec_std=SPEC_STD)

# Shuffle only the training split: DataLoader keeps the CSV row order by default,
# which would present the samples in the same order every epoch.
trainloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
testloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Smoke test: print the shapes of the first batch from each loader.
for loader in (trainloader, testloader):
    for x, y in loader:
        print(x.shape, y.shape)
        break

torch.Size([32, 1, 224, 716]) torch.Size([32])
torch.Size([32, 1, 224, 716]) torch.Size([32])


In [None]:
# Sanity check: print the shape, minimum and maximum of the first 10 normalized test spectrograms
for sample_idx in range(10):
    spec, _label = test_dataset[sample_idx]
    print(spec.shape, spec.min(), spec.max())

torch.Size([1, 224, 716]) tensor(-4.7744) tensor(3.0022)
torch.Size([1, 224, 716]) tensor(-2.9414) tensor(3.0675)
torch.Size([1, 224, 716]) tensor(-0.7115) tensor(2.5673)
torch.Size([1, 224, 716]) tensor(-0.7269) tensor(2.5374)
torch.Size([1, 224, 716]) tensor(-1.3142) tensor(2.8114)
torch.Size([1, 224, 716]) tensor(-1.2775) tensor(2.9726)
torch.Size([1, 224, 716]) tensor(-4.7744) tensor(2.6792)
torch.Size([1, 224, 716]) tensor(-1.7821) tensor(2.8586)
torch.Size([1, 224, 716]) tensor(-2.3361) tensor(2.5816)
torch.Size([1, 224, 716]) tensor(-4.7744) tensor(2.8206)
