# Sound Classification - UrbanSound4K

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchaudio
import numpy as np
import os
import pandas as pd
from torchsummary import summary

## Dataset & Pre Processing 

In [2]:
class UrbanSoundDataset(Dataset):
    def __init__(self, annotations_file, audio_dir, sample_rate, sample_size):
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.sample_rate = sample_rate
        self.sample_size = sample_size
        self.device = self._is_cuda()
     
    def __len__(self):
        return len(self.annotations)
    
    def __getitem__(self, index):
        audio_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        '''
        signal = audio signal, sr = sample rate
        signal -> [num_of_channels, samples] 
        '''
        signal, sr = torchaudio.load(audio_path)
        signal = signal.to(self.device)
        signal = self._resample(signal, sr)
        signal = self._mix_down(signal)
        signal = self._right_pad(signal)
        signal = self._truncate(signal)
        signal = self._mel_spectrogram(signal)
        return signal, label
    
    def _is_cuda(self):
        if torch.cuda.is_available():
            device="cuda"
        else:
            device="cpu"
        print(f"Using {device}")
        return device
    
    def _right_pad(self, signal):
        if signal.shape[1] < self.sample_size:
            missing_samples = self.sample_size - signal.shape[1]
            last_dim_padding = (0, missing_samples)
            signal = F.pad(signal, last_dim_padding)
        return signal
    
    def _truncate(self, signal):
        if signal.shape[1] > self.sample_size:
            signal = signal[:, :self.sample_size]
        return signal
    
    def _resample(self, signal, sr):
        if sr != self.sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.sample_rate).to(self.device)
            signal = resampler(signal)
        return signal
    
    def _mel_spectrogram(self, signal):
        mel_spectrogram = torchaudio.transforms.MelSpectrogram(
                            sample_rate=sample_rate, 
                            n_fft=1024,
                            hop_length=512,
                            n_mels=64,
                            ).to(self.device)
        signal = mel_spectrogram(signal)
        return signal
        
    def _mix_down(self, signal):
        if(signal.shape[0] != 1):
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal
    
    def _get_audio_sample_path(self, index):
        fold = f"fold{self.annotations.iloc[index, 5]}"
        path = os.path.join(self.audio_dir, fold, self.annotations.iloc[index, 0])
        return path
    
    def _get_audio_sample_label(self, index):
        return self.annotations.iloc[index, 6]
    
    

In [3]:
annotations_file = "/home/mnk/MNK/Mega/MegNav/Deep-Learning/Speech-Recognition/UrbanSound8K/metadata/UrbanSound8K.csv"
audio_dir = "/home/mnk/MNK/Mega/MegNav/Deep-Learning/Speech-Recognition/UrbanSound8K/audio/"

In [4]:
sample_rate = 22000
sample_size = 22000
data = UrbanSoundDataset(annotations_file, audio_dir, sample_rate, sample_size)

Using cuda


In [5]:
signal, label = data[0]

print(signal)
print(label)
print(signal.shape)
signal_shape = signal.shape

tensor([[[8.1120e-04, 2.2000e-04, 9.0258e-04,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [2.4777e-03, 1.5648e-03, 4.2762e-04,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [2.9734e-03, 6.0683e-03, 4.1591e-03,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         ...,
         [1.7172e-04, 5.9824e-02, 7.9713e-01,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [2.0983e-04, 1.7792e-02, 4.1559e-01,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [3.0722e-04, 1.4679e-02, 3.0088e-01,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00]]], device='cuda:0')
3
torch.Size([1, 64, 43])


## Model 

In [6]:
class CNN(nn.Module):
    def __init__(self, filter):
        super(CNN, self).__init__()
        self.network = nn.Sequential(
            #Layer-1
            nn.Conv2d(
                in_channels=1,
                out_channels=filter,
                kernel_size=3,
                stride=1,
                padding=2                
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            
            #Layer-2
            nn.Conv2d(
                in_channels=filter,
                out_channels=filter*2,
                kernel_size=3,
                stride=1,
                padding=2                
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            
            #Layer-3
            nn.Conv2d(
                in_channels=filter*2,
                out_channels=filter*4,
                kernel_size=3,
                stride=1,
                padding=2                
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            
            #layer-4
            nn.Conv2d(
                in_channels=filter*4,
                out_channels=filter*8,
                kernel_size=3,
                stride=1,
                padding=2                
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            
            nn.Flatten(),
            nn.Linear(128*5*4, 10),
            nn.Softmax(dim=1),    
        )
    
    def forward(self, input):
        self.output = self.network(input)
        
        return self.output
        

In [7]:
model = CNN(filter=16).to("cuda")
summary(model, signal_shape)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 66, 45]             160
              ReLU-2           [-1, 16, 66, 45]               0
         MaxPool2d-3           [-1, 16, 33, 22]               0
            Conv2d-4           [-1, 32, 35, 24]           4,640
              ReLU-5           [-1, 32, 35, 24]               0
         MaxPool2d-6           [-1, 32, 17, 12]               0
            Conv2d-7           [-1, 64, 19, 14]          18,496
              ReLU-8           [-1, 64, 19, 14]               0
         MaxPool2d-9             [-1, 64, 9, 7]               0
           Conv2d-10           [-1, 128, 11, 9]          73,856
             ReLU-11           [-1, 128, 11, 9]               0
        MaxPool2d-12            [-1, 128, 5, 4]               0
          Flatten-13                 [-1, 2560]               0
           Linear-14                   

## Train

In [10]:
def train(model, train_loader, loss_fn, optimizer, device, epochs):
    for i in range(epochs):
        for j, (X, y) in enumerate(train_loader):
            X = X.to(device)
            y = y.to(device)
            
            predictions = model(X)
            loss = loss_fn(predictions, y)
            
            model.zero_grad()
            loss.backward()
            optimizer.step()
            
        print(f"Epoch {i} : Loss {loss.item()}")

In [12]:
batch_size = 128
train_dataloader = DataLoader(data, batch_size)


loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

device="cuda"
epochs=10
train(model, train_dataloader, loss_fn, optimizer, device, epochs)

torch.save(model.state_dict(), "sound_classification.pth")
print("Trained Feed Forward Network")


Epoch 0 : Loss 2.203418254852295
Epoch 1 : Loss 2.175096273422241
Epoch 2 : Loss 2.0296285152435303
Epoch 3 : Loss 2.030651569366455
Epoch 4 : Loss 2.2335026264190674
Epoch 5 : Loss 2.1404218673706055
Epoch 6 : Loss 2.2797720432281494
Epoch 7 : Loss 2.27921462059021
Epoch 8 : Loss 2.3166377544403076
Epoch 9 : Loss 2.316761016845703
Trained Feed Forward Network


## Predictions

In [21]:
from GPUtil import showUtilization as gpu_usage
gpu_usage()
torch.cuda.empty_cache()

| ID | GPU | MEM |
------------------
|  0 |  6% | 97% |


In [34]:
batch_size = 128
train_dataloader = DataLoader(data, batch_size)
model.eval()
predictions = []
for _, (X, y) in enumerate(train_dataloader):
    X = X.to(device)
    y = y.to(device)

    with torch.no_grad():
        pred = model(X)
        for i in range(len(pred)):
            arr = pred[i].to("cpu")
            predictions.append(np.argmax(arr))

    del pred
    torch.cuda.empty_cache()

In [None]:
count=0
i=0
print(len(predictions))
for _, (_, y) in enumerate(train_dataloader):
    y = y.to("cpu").numpy()
    for j in range(len(y)):
        if(predictions[i] == y[j]):
            print(predictions[i], y[j])
            count+=1
        i+=1

In [44]:
print(count/len(predictions))

0.2201099404489235
