# soundNet: Classifying Vehicle Classes from Audio Files

Import Libraries/Modules

In [1]:
import os
from tqdm import tqdm

from statistics import mean
import random
import pandas as pd
import numpy as np
import pickle

import torch
import torchaudio 
import torchvision 
from torchaudio.transforms import AmplitudeToDB, MelSpectrogram, Resample
from torch.utils.data import Dataset, DataLoader
from torch.utils import tensorboard

Config

In [18]:
# Notebook-wide settings
# ----------------------
# Every tunable value used by the later cells is gathered in this cell.

# Root folder of the dataset and the annotation file stored inside it.
data_dir = '/media/huimin/PortableSSD/KTH/data/SED-single_vehicle-30_sec/'
annotations_path = data_dir + 'weak_labels.csv'

# Dataset description: raw class ids, annotation column names and the seed
# used for the random splits.
_SEED = 42
classes = [1, 2, 5, 12, 15]
header = [
    'filepath_wav_30',
    'vehicle_class',
    'vehicle_class_combo',
    'vehicle_class_base',
    'speed',
]

# Spectrogram front-end settings.
feats = dict(
    n_mels=128,
    n_filters=2048,
    hop_length=256,
    n_window=2048,
    sample_rate=16000,
    f_min=0,
    f_max=8000,
    duration=30,
)

# Training and data-loading settings.
# NOTE(review): n_frames presumably equals duration * sample_rate // hop_length + 1
# (30 * 16000 // 256 + 1 = 1876) -- confirm if the feature settings change.
model = dict(
    batch_size=12,
    lr=0.001,
    epochs=100,
    num_workers=4,
    n_frames=1876,
)


## Data Manipulation

### Loading audio files

In [19]:
# Read the tab-separated annotation table into a DataFrame and preview it
labels = pd.read_csv(annotations_path, delimiter='\t')
labels.head()

Unnamed: 0,filepath_wav_30,vehicle_class,vehicle_class_combo,vehicle_class_base,speed
0,wav/20230109/rs_an0005_dt_20230109_tm_015511_t...,2,16,M,33
1,wav/20230109/rs_an0005_dt_20230109_tm_022315_t...,2,16,M,77
2,wav/20230109/rs_an0005_dt_20230109_tm_022947_t...,2,16,M,28
3,wav/20230109/rs_an0005_dt_20230109_tm_023948_t...,5,18,M,20
4,wav/20230109/rs_an0005_dt_20230109_tm_024134_t...,5,18,M,38


In [20]:
# Sanity check: every row should reference a distinct audio file, i.e. the
# number of distinct file paths must match the number of rows.
unique = labels[header[0]].nunique()
length = len(labels)
if unique == length:
    print(f"There are {length} unique filenames in the dataset.")
else:
    print("There are duplicates in the dataset.")

There are 3303 unique filenames in the dataset.


In [21]:
# Check that the class ids found in the annotations are exactly the expected ones
unique_classes = np.sort(labels[header[1]].unique())
if not np.array_equal(classes, unique_classes):
    mismatch = 'The classes are not the same as the ones in the annotations file.'
    print(mismatch)
    print('Classes in annotations file: {}'.format(unique_classes))
    print('Classes in classes variable: {}'.format(classes))
    raise ValueError(mismatch)

In [22]:
# Re-encode each raw class id as its position in `classes`, so the labels lie
# in [0, num_classes - 1] as the cross-entropy loss requires.
# NOTE(review): running this cell twice re-encodes already-encoded labels.
labels[header[1]] = labels[header[1]].map(classes.index)
labels.head()

Unnamed: 0,filepath_wav_30,vehicle_class,vehicle_class_combo,vehicle_class_base,speed
0,wav/20230109/rs_an0005_dt_20230109_tm_015511_t...,1,16,M,33
1,wav/20230109/rs_an0005_dt_20230109_tm_022315_t...,1,16,M,77
2,wav/20230109/rs_an0005_dt_20230109_tm_022947_t...,1,16,M,28
3,wav/20230109/rs_an0005_dt_20230109_tm_023948_t...,2,18,M,20
4,wav/20230109/rs_an0005_dt_20230109_tm_024134_t...,2,18,M,38


In [23]:
# Build the train / validation / test splits
# ------------------------------------------
# The model is fitted on the train split, monitored on the validation split
# during training and scored on the test split afterwards. 20% of the rows are
# held out for test, then 10% of the remaining 80% for validation, which gives
# a 72/8/20 split.
from sklearn.model_selection import train_test_split

# Carve out the test rows first, then take the validation rows from the rest
train, test = train_test_split(labels, random_state=_SEED, test_size=0.2)
train, val = train_test_split(train, random_state=_SEED, test_size=0.1)

# Renumber the rows of each split from 0
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)
test = test.reset_index(drop=True)

In [24]:
# Check that the three splits together account for every annotated row
splits_total = len(train) + len(val) + len(test)
if splits_total == len(labels):
    print(f"Number of unique audio files: {length} = {len(train)} + {len(val)} + {len(test)}")
else:
    print('The splits are not correct.')

# Report the size of each split
print('Number of train files: {}'.format(len(train)))
print('Number of validation files: {}'.format(len(val)))
print('Number of test files: {}'.format(len(test)))

Number of unique audio files: 3303 = 2377 + 265 + 661
Number of train files: 2377
Number of validation files: 265
Number of test files: 661


In [25]:
# Build (or reload) the partitions
# --------------------------------
# `partitions` maps each split name ('train', 'val', 'test') to a list of
# (filename, class) pairs. It is cached as a pickle inside the data folder.
# NOTE(review): an existing partitions.pkl is loaded as-is, even if it was
# written from different splits; only the total size is checked below.
partitions_file = data_dir + 'partitions.pkl'
if os.path.exists(partitions_file):
    # NOTE: unpickling can execute arbitrary code -- only load a file written
    # by this notebook.
    with open(partitions_file, 'rb') as f:
        partitions = pickle.load(f)
else:
    # Pair every file path with its class, split by split
    partitions = {
        'train': list(zip(train[header[0]], train[header[1]])),
        'val': list(zip(val[header[0]], val[header[1]])),
        'test': list(zip(test[header[0]], test[header[1]])),
    }
    # Cache the result for later runs
    with open(partitions_file, 'wb') as f:
        pickle.dump(partitions, f)

# The three lists together must cover every annotated row
if len(labels) != sum(len(partitions[name]) for name in ('train', 'val', 'test')):
    print('The partitions are not correct.')

### Pre-processing of the audio files

In [26]:
def pad_audio(audio, target_len, fs):
    """Zero-pad or randomly crop `audio` to exactly `target_len` samples.

    The last dimension of `audio` is treated as the time axis, so both 1-D
    ``(samples,)`` and multi-channel ``(channels, samples)`` tensors are
    handled. (Using ``len(audio)`` here would measure the number of channels
    for a 2-D tensor, not the number of samples.)

    Args:
        audio: Tensor whose last dimension indexes samples.
        target_len: Number of samples the returned tensor must have along its
            last dimension.
        fs: Sampling rate, used to express the crop position in seconds.

    Returns:
        A tuple ``(audio, onset_s, offset_s, padded_indx)`` where

        * ``audio`` has ``target_len`` samples along its last dimension
          (zero-padded at the end if it was shorter, randomly cropped if it
          was longer, unchanged otherwise);
        * ``onset_s`` is the start of the kept segment in seconds, rounded to
          3 decimals (0.0 unless the audio was cropped);
        * ``offset_s`` is ``onset_s + target_len / fs`` rounded to 3 decimals;
        * ``padded_indx`` is ``[1.0]``: the ratio of ``target_len`` to the
          length of the returned audio, kept for interface compatibility.
    """
    n_samples = audio.shape[-1]
    onset_s = 0.000

    if n_samples < target_len:
        # Too short: append zeros along the time axis only.
        audio = torch.nn.functional.pad(
            audio, (0, target_len - n_samples), mode="constant"
        )
    elif n_samples > target_len:
        # Too long: keep a random window of target_len samples on the time axis.
        rand_onset = random.randint(0, n_samples - target_len)
        audio = audio[..., rand_onset:rand_onset + target_len]
        onset_s = round(rand_onset / fs, 3)

    # The returned audio always has target_len samples, so the ratio is 1.
    padded_indx = [1.0]
    offset_s = round(onset_s + (target_len / fs), 3)
    return audio, onset_s, offset_s, padded_indx

In [27]:
def get_log_melspectrogram(audio, sample_rate, window_size, hop_size, n_mels, f_min, f_max):
    """Return the log-scaled (dB) mel spectrogram of `audio`.

    A magnitude (``power=1``) mel spectrogram is computed with a non-periodic
    Hamming window whose length equals the FFT size, converted to decibels
    and finally clamped to the range [-50, 80].

    Args:
        audio: Waveform tensor passed to the ``MelSpectrogram`` transform.
        sample_rate: Sampling rate of `audio`.
        window_size: Used for both the FFT size and the window length.
        hop_size: Hop length between successive frames.
        n_mels: Number of mel bands.
        f_min: Lowest frequency of the mel filter bank.
        f_max: Highest frequency of the mel filter bank.

    Returns:
        The clamped dB-scaled mel spectrogram tensor.
    """
    # Magnitude mel spectrogram
    to_mel = MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=window_size,
        win_length=window_size,
        hop_length=hop_size,
        f_min=f_min,
        f_max=f_max,
        n_mels=n_mels,
        window_fn=torch.hamming_window,
        wkwargs={"periodic": False},
        power=1,
    )
    magnitude = to_mel(audio)

    # Amplitude -> dB, with the lower amplitude bound raised to 1e-5 (as in librosa)
    to_db = AmplitudeToDB(stype="amplitude")
    to_db.amin = 1e-5
    return to_db(magnitude).clamp(min=-50, max=80)

In [28]:
def get_log_melspectrogram_set(
    set,
    save_path,
    sample_rate=feats["sample_rate"],
    window_size=feats["n_window"],
    hop_size=feats["hop_length"],
    n_mels=feats["n_mels"],
    f_min=feats["f_min"],
    f_max=feats["f_max"],
    duration=feats["duration"],
):
    """Compute and save the log mel spectrogram of every audio file in a split.

    Each distinct file path in the first annotation column of `set` is loaded
    from ``data_dir``, resampled to `sample_rate` when its native rate
    differs, padded/cropped to ``duration * sample_rate`` samples and turned
    into a log mel spectrogram, which is written to
    ``<save_path>/<file name without .wav>.npy``.

    Args:
        set: DataFrame of annotations for one split (the name shadows the
            builtin but is kept because it is part of the call interface).
        save_path: Output directory; created if it does not exist.
        sample_rate: Rate the audio is resampled to.
        window_size: FFT size / window length of the spectrogram.
        hop_size: Hop length of the spectrogram.
        n_mels: Number of mel bands.
        f_min: Lowest frequency of the mel filter bank.
        f_max: Highest frequency of the mel filter bank.
        duration: Length in seconds every clip is padded/cropped to.
    """
    # Computed once: the original recomputed unique() on every iteration.
    filenames = set[header[0]].unique()
    n_files = len(filenames)

    # Create the output folder up front instead of checking it for each file.
    os.makedirs(save_path, exist_ok=True)

    for i, filename in enumerate(filenames):
        print(f"\rConstructing mel audio {i+1}/{n_files}", flush=True)
        audio, sr = torchaudio.load(data_dir + filename)
        # Resample only when needed. The loaded audio is used directly when it
        # is already at the target rate (previously this case hit an undefined
        # name, or reused the audio of the previous file).
        if sr != sample_rate:
            audio = Resample(sr, sample_rate)(audio)
        audio, *_ = pad_audio(audio, duration * sample_rate, sample_rate)
        log_melspectrogram = get_log_melspectrogram(
            audio, sample_rate, window_size, hop_size, n_mels, f_min, f_max
        )
        stem = filename.split('/')[-1].replace('.wav', '')
        np.save(f"{save_path}/{stem}.npy", log_melspectrogram.numpy())

In [29]:
# Pre-compute the log-mel features of every split
# ------------------------------------------------
# A split is processed only when its output folder does not exist yet.
# NOTE(review): a folder left behind by an interrupted run is treated as
# complete; delete it to recompute that split.
for split_name, split_df in (('train', train), ('val', val), ('test', test)):
    split_dir = os.path.join(data_dir + 'npy', split_name)
    if os.path.exists(split_dir):
        continue
    print(f"Constructing mel audio for {split_name} set")
    get_log_melspectrogram_set(split_df, split_dir)

### Dataloaders

In [30]:
# Create the dataset and the dataloaders
class VehicleDataset(Dataset):
    """Dataset over the saved ``.npy`` log-mel files of one split.

    Items are ``(log_melspectrogram, class_value)`` pairs. The spectrogram is
    loaded from ``<data_dir>npy/<set>/<file name without .wav>.npy`` and cut
    to at most `n_frames` entries along its third axis.

    Args:
        data_dir: Dataset root; ``'npy'`` is appended to it directly.
        partition: Mapping from split name to a list of
            ``(filename, class_value)`` pairs.
        set: Key of the split to serve (shadows the builtin; kept because it
            is part of the call interface).
        n_frames: Maximum number of frames kept per item.
    """

    def __init__(self, data_dir, partition, set='train', n_frames=model['n_frames']):
        self.partition = partition[set]
        self.data_dir = data_dir
        self.set = set
        self.n_frames = n_frames

    def __len__(self):
        """Return the number of (filename, class) pairs in the split."""
        return len(self.partition)

    def __getitem__(self, idx):
        """Load the features of item `idx` and return them with its class."""
        filename, class_value = self.partition[idx]
        # The .npy file is named after the audio file, without folder or .wav
        stem = filename.split('/')[-1].replace('.wav', '')
        npy_path = os.path.join(self.data_dir + 'npy', self.set, stem + '.npy')
        features = np.load(npy_path)
        return features[:, :, :self.n_frames], class_value
    
# One loader per split; all three share the batch size, worker count and
# shuffling taken from the configuration.
# NOTE(review): shuffling the validation and test loaders is not needed for
# evaluation -- confirm whether it is intentional.
loader_options = dict(batch_size=model['batch_size'], shuffle=True, num_workers=model['num_workers'])
train_dataloader = DataLoader(VehicleDataset(data_dir, partitions, set='train'), **loader_options)
val_dataloader = DataLoader(VehicleDataset(data_dir, partitions, set='val'), **loader_options)
test_dataloader = DataLoader(VehicleDataset(data_dir, partitions, set='test'), **loader_options)

## Model

In [31]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [32]:
# Build an untrained VGG16 (no pretrained weights) and adapt both ends to this task
vgg16 = torchvision.models.get_model('vgg16', weights=None)
# First convolution: accept a single input channel
vgg16.features[0] = torch.nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, stride=1, padding=1)
# Last classifier layer: one output per vehicle class
vgg16.classifier[6] = torch.nn.Linear(in_features=4096, out_features=len(classes))
print(vgg16)

VGG(
  (features): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [33]:
def train(model, optimizer, loader, writer, epochs=10):
    """Train `model` on `loader` with a cross-entropy loss.

    For every epoch the mean batch loss is shown on the progress bar and
    written to `writer` under the tag ``'training loss'``, using the epoch
    index as the step.

    Args:
        model: Network to optimise; called as ``model(x)``.
        optimizer: Optimizer already bound to the parameters of `model`.
        loader: Iterable of ``(x, y)`` batches.
        writer: Object exposing ``add_scalar(tag, value, step)``.
        epochs: Number of passes over `loader`.
    """
    criterion = torch.nn.CrossEntropyLoss()
    # Make sure mode-dependent layers (e.g. dropout) are in training mode,
    # even if the model was switched to eval mode earlier.
    model.train()
    for epoch in range(epochs):
        running_loss = []
        t = tqdm(loader)
        for x, y in t:
            x, y = x.to(device), y.to(device)
            outputs = model(x)
            loss = criterion(outputs, y)
            running_loss.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            t.set_description(f'training loss: {mean(running_loss)}')
        # Log against the current epoch index. Passing the total `epochs`
        # here would place every point at the same step.
        # Nothing is logged for an empty loader, whose mean is undefined.
        if running_loss:
            writer.add_scalar('training loss', mean(running_loss), epoch)

def test(model, dataloader):
    """Return the classification accuracy of `model` over `dataloader`.

    The model is evaluated in eval mode, so mode-dependent layers such as
    dropout behave deterministically, and without gradient tracking. The
    training/eval mode the model had on entry is restored before returning.

    Args:
        model: Network to evaluate; ``model(x)`` must return per-class scores
            with the classes on dimension 1.
        dataloader: Iterable of ``(x, y)`` batches.

    Returns:
        Fraction of samples whose highest-scoring class equals the label.

    Raises:
        ZeroDivisionError: If `dataloader` yields no samples.
    """
    test_corrects = 0
    total = 0
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, y in dataloader:
                x = x.to(device)
                y = y.to(device)
                y_hat = model(x).argmax(1)
                test_corrects += y_hat.eq(y).sum().item()
                total += y.size(0)
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)
    return test_corrects / total

In [34]:
# Move the network to the selected device, then create the optimizer on its
# parameters and a TensorBoard writer for the training loss.
vgg16 = vgg16.to(device=device)
writer = tensorboard.SummaryWriter()
optimizer = torch.optim.Adam(params=vgg16.parameters(), lr=model['lr'])
# NOTE(review): trains for 3 epochs, while the configuration lists
# model['epochs'] = 100 -- confirm which is intended.
train(vgg16, optimizer, train_dataloader, writer, epochs=3)

  0%|          | 0/199 [00:00<?, ?it/s]

training loss: 416.9128973881404:   3%|▎         | 6/199 [02:04<1:06:42, 20.74s/it] 


KeyboardInterrupt: 

In [None]:
# Score the trained network on the held-out test split
test_acc = test(model=vgg16, dataloader=test_dataloader)
print(f'Test accuracy:{test_acc}')