In [None]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
import torch

USE_GPU = True
if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('using device:', device)

using device: cuda


In [None]:
import pandas as pd
from pathlib import Path
%cd "/content/drive/MyDrive/Colab Notebooks/SC201-Project/archive"
download_path = Path.cwd()

# Read metadata file
metadata_file = download_path/'UrbanSound8K.csv'
df = pd.read_csv(metadata_file)
df.head()

# Construct file path by concatenating fold and file name
df['relative_path'] = '/fold' + df['fold'].astype(str) + '/' + df['slice_file_name'].astype(str)

# Take relevant columns
df = df[['relative_path', 'classID']]
df.head()

/content/drive/MyDrive/Colab Notebooks/SC201-Project/archive


Unnamed: 0,relative_path,classID
0,/fold5/100032-3-0-0.wav,3
1,/fold5/100263-2-0-117.wav,2
2,/fold5/100263-2-0-121.wav,2
3,/fold5/100263-2-0-126.wav,2
4,/fold5/100263-2-0-137.wav,2


In [None]:
import math, random
import torch
! pip install torchaudio
import torchaudio
from torchaudio import transforms
from IPython.display import Audio

class AudioUtil():
  """
  The AudioUtil Class includes all the function to finish the data transform,
  preprocessing and augmentation
  """

  @staticmethod
  def open(audio_file):
    # Load an audio file. Return the signal as a tensor and the sample rate
    sig, sr = torchaudio.load(audio_file)
    return (sig, sr)


  @staticmethod
  def rechannel(aud, new_channel):
    # Convert the given audio to the desired number of channels
    sig, sr = aud
    if (sig.shape[0] == new_channel):
      # Nothing to do
      return aud
    if (new_channel == 1):
      # Convert from stereo to mono by selecting only the first channel
      resig = sig[:1, :]
    else:
      # Convert from mono to stereo by duplicating the first channel
      resig = torch.cat([sig, sig])
    return ((resig, sr))


  @staticmethod
  def resample(aud, newsr):
    # Since Resample applies to a single channel, 
    # we resample one channel at a time
    sig, sr = aud
    if (sr == newsr):
      # Nothing to do
      return aud
    num_channels = sig.shape[0]
    # Resample first channel
    resig = torchaudio.transforms.Resample(sr, newsr)(sig[:1,:])
    if (num_channels > 1):
      # Resample the second channel and merge both channels
      retwo = torchaudio.transforms.Resample(sr, newsr)(sig[1:,:])
      resig = torch.cat([resig, retwo])
    return ((resig, newsr))


  @staticmethod
  def pad_trunc(aud, max_ms):
    # Pad (or truncate) the signal to a fixed length 'max_ms' in milliseconds
    sig, sr = aud
    num_rows, sig_len = sig.shape
    max_len = sr//1000 * max_ms

    if (sig_len > max_len):
      # Truncate the signal to the given length
      sig = sig[:,:max_len]

    elif (sig_len < max_len):
      # Length of padding to add at the beginning and end of the signal
      pad_begin_len = random.randint(0, max_len - sig_len)
      pad_end_len = max_len - sig_len - pad_begin_len

      # Pad with 0s
      pad_begin = torch.zeros((num_rows, pad_begin_len))
      pad_end = torch.zeros((num_rows, pad_end_len))

      sig = torch.cat((pad_begin, sig, pad_end), 1)
    return (sig, sr)


  @staticmethod
  def time_shift(aud, shift_limit):
    # data augumentation, shifting audio to left or right
    sig,sr = aud
    _, sig_len = sig.shape
    shift_amt = int(random.random() * shift_limit * sig_len)
    return (sig.roll(shift_amt), sr)


  @staticmethod
  def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
    # transform the audio to the Mel Spectrogram
    sig,sr = aud
    top_db = 80
    # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
    spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)
    # Convert to decibels
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return (spec)


  @staticmethod
  def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    # Augment the Spectrogram by masking out some sections of it in both the frequency
    # dimension (ie. horizontal bars) and the time dimension (vertical bars) to prevent
    # overfitting and to help the model generalise better. The masked sections are
    # replaced with the mean value.
    _, n_mels, n_steps = spec.shape
    mask_value = spec.mean()
    aug_spec = spec

    freq_mask_param = max_mask_pct * n_mels
    for _ in range(n_freq_masks):
      aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

    time_mask_param = max_mask_pct * n_steps
    for _ in range(n_time_masks):
      aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

    return aug_spec

Collecting torchaudio
  Downloading torchaudio-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 5.4 MB/s 
Installing collected packages: torchaudio
Successfully installed torchaudio-0.9.0


In [None]:
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio


class SoundDS(Dataset):
  def __init__(self, df, data_path):
    # initialte the parameters and load teh dateset
    self.df = df
    self.data_path = str(data_path)
    self.duration = 4000
    self.sr = 44100
    self.channel = 2
    self.shift_pct = 0.4
            
  def __len__(self):
    # return the length of dataset
    return len(self.df)    
    

  # Get i'th item in dataset
  def __getitem__(self, idx):
    # Absolute file path of the audio file - concatenate the audio directory with
    # the relative path
    audio_file = self.data_path + self.df.loc[idx, 'relative_path']
    # Get the Class ID
    class_id = self.df.loc[idx, 'classID']
    aud = AudioUtil.open(audio_file)
    # Some sounds have a higher sample rate, or fewer channels compared to the
    # majority. So make all sounds have the same number of channels and same 
    # sample rate. Unless the sample rate is the same, the pad_trunc will still
    # result in arrays of different lengths, even though the sound duration is
    # the same.
    reaud = AudioUtil.resample(aud, self.sr)
    rechan = AudioUtil.rechannel(reaud, self.channel)

    dur_aud = AudioUtil.pad_trunc(rechan, self.duration)
    shift_aud = AudioUtil.time_shift(dur_aud, self.shift_pct)
    sgram = AudioUtil.spectro_gram(shift_aud, n_mels=64, n_fft=1024, hop_len=None)
    aug_sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)

    return aug_sgram, class_id

In [None]:
from torch.utils.data import random_split

myds = SoundDS(df, download_path)

# Random split of 80:20 between training and validation
num_items = len(myds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(myds, [num_train, num_val])

# Create training and validation data loaders
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=16, shuffle=False)

In [None]:
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import init


class AudioClassifier (nn.Module):
    # Build the model architecture
    def __init__(self):
        super().__init__()
        conv_layers = []

        # First Convolution Block with Relu and Batch Norm. Use Kaiming Initialization
        self.conv1 = nn.Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
        self.relu1 = nn.ReLU()
        self.bn1 = nn.BatchNorm2d(8)
        init.kaiming_normal_(self.conv1.weight, a=0.1)
        self.conv1.bias.data.zero_()
        conv_layers += [self.conv1, self.relu1, self.bn1]

        # Second Convolution Block
        self.conv2 = nn.Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu2 = nn.ReLU()
        self.bn2 = nn.BatchNorm2d(16)
        init.kaiming_normal_(self.conv2.weight, a=0.1)
        self.conv2.bias.data.zero_()
        conv_layers += [self.conv2, self.relu2, self.bn2]

        # Thrid Convolution Block
        self.conv3 = nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu3 = nn.ReLU()
        self.bn3 = nn.BatchNorm2d(32)
        init.kaiming_normal_(self.conv3.weight, a=0.1)
        self.conv3.bias.data.zero_()
        conv_layers += [self.conv3, self.relu3, self.bn3]

        # Fourth Convolution Block
        self.conv4 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu4 = nn.ReLU()
        self.bn4 = nn.BatchNorm2d(64)
        init.kaiming_normal_(self.conv4.weight, a=0.1)
        self.conv4.bias.data.zero_()
        conv_layers += [self.conv4, self.relu4, self.bn4]

        # Linear Classifier
        self.ap = nn.AdaptiveAvgPool2d(output_size=1)
        self.lin = nn.Linear(in_features=64, out_features=10)

        # Wrap the Convolutional Blocks
        self.conv = nn.Sequential(*conv_layers)
 

    # Forward pass computations
    def forward(self, x):
        # Run the convolutional blocks
        x = self.conv(x)

        # Adaptive pool and flatten for input to linear layer
        x = self.ap(x)
        x = x.view(x.shape[0], -1)

        # Linear layer
        x = self.lin(x)

        # Final output
        return x

# Create the model and put it on the GPU if available
myModel = AudioClassifier()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
myModel = myModel.to(device)
# Check that it is on Cuda
next(myModel.parameters()).device

device(type='cuda', index=0)

In [None]:
import time
from torchvision import models
validation_loss_history = []
validation_acc_history = []
training_loss_histroy = []
training_acc_history = []
# Training Loop
# ----------------------------
def training(train_dl, num_epochs, model):
  """
  Loss Function, Optimizer and Scheduler
  Loss Function: For multi-class problem, Cross Entropy and KL divergence are the
                 loss often used.
  Learning Rating - OneCycleLR: Sets the learning rate of each parameter group 
  according to the 1cycle learning rate policy. The 1cycle policy anneals the 
  learning rate from an initial learning rate to some maximum learning rate and 
  then from that maximum learning rate to some minimum learning rate much lower 
  than the initial learning rate.
  """
  


  if model == 'baseline':
    model_save_name = 'classifier_baseline.pt'
    model = myModel
    print(f'model={myModel}')
  elif model == 'VGG':
    model_save_name = 'classifier_vgg19bn.pt'
    model = models.vgg19_bn(pretrained=True).cuda()
    for param in model.parameters():
      param.requires_grad = False
    first_conv_layer = [nn.Conv2d(2, 3, kernel_size=3, stride=1, padding=1, dilation=1, groups=1, bias=True)]
    first_conv_layer.extend(list(model.features))  
    model.features= nn.Sequential(*first_conv_layer )  

    # add the classifier
    model.classifier[6] = nn.Sequential(
                          nn.Linear(in_features=4096, out_features=256, bias=True),
                          nn.ReLU(),
                          nn.Dropout(0.4),
                          nn.Linear(in_features=256, out_features=10, bias=True),
                          )
    model = model.to(device)
  elif model == 'ResNet':
    model_save_name = 'classifier_resnext101-unfreezed-100epooch.pt'
    model = models.resnext101_32x8d(pretrained=True).cuda()

    model = nn.Sequential(
      nn.Conv2d(2, 3, kernel_size=3, stride=1, padding=1, dilation=1, groups=1, bias=True),
      model)
    model.fc = nn.Sequential(
        nn.Linear(1000,10,bias=True))
    model = model.to(device)
    for param in model.parameters():
      param.requires_grad = True

  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
  scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                                steps_per_epoch=int(len(train_dl)),
                                                epochs=num_epochs,
                                                anneal_strategy='linear')


  # Repeat for each epoch
  print('Started Training')
  st_time = time.time()
  for epoch in range(num_epochs):
    running_loss = 0.0
    correct_prediction = 0
    total_prediction = 0

    # Repeat for each batch in the training set
    for i, data in enumerate(train_dl):
        # Get the input features and target labels, and put them on the GPU
        inputs, labels = data[0].to(device), data[1].to(device)

        # Normalize the inputs
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s

        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Keep stats for Loss and Accuracy
        running_loss += loss.item()

        # Get the predicted class with the highest score
        _, prediction = torch.max(outputs,1)
        # Count of predictions that matched the target label
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]

        if i % 50 == 0 and i != 0:    # print every 50 mini-batches
           print('At epoch {}, batch {}, having loss: {:.3f}'.format(epoch + 1, i + 1, running_loss / i))
    
    # Print stats at the end of the epoch
    
    num_batches = len(train_dl)
    avg_loss = running_loss / num_batches
    acc = correct_prediction/total_prediction
    training_loss_histroy.append(avg_loss)
    training_acc_history.append(acc)
    print(f'Epoch: {epoch+1}, Loss: {avg_loss:.3f}, Training Accuracy: {acc:.3f}')
    

    # if (epoch+1) % 5 == 0:
      # check the accuracy every 5 epoch
    inference(model, val_dl)

  et_time = time.time()
  path = F"/content/drive/MyDrive/Colab Notebooks/SC201-Project/{model_save_name}" 
  torch.save(model.state_dict(), path)
  print('Finished Training')
  print('The training time is {:.3f} seconds'.format(et_time-st_time))
num_epochs= 50
training(train_dl, num_epochs, model='ResNet')


Downloading: "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth" to /root/.cache/torch/hub/checkpoints/resnext101_32x8d-8ba56ff5.pth


  0%|          | 0.00/340M [00:00<?, ?B/s]

Started Training


KeyboardInterrupt: ignored

In [None]:
from torchvision import models
model = models.resnext101_32x8d(pretrained=True).cuda()
for param in model.parameters():
  param.requires_grad = False
model = nn.Sequential(
  nn.Conv2d(2, 3, kernel_size=3, stride=1, padding=1, dilation=1, groups=1, bias=True),
  model)
model.fc = nn.Sequential(
    nn.Linear(1000,10,bias=True))
model = model.to(device)
model_save_name = 'classifier_resnext101-unfreezed.pt'
path = F"/content/drive/MyDrive/Colab Notebooks/SC201-Project/{model_save_name}"
model.load_state_dict(torch.load(path))


<All keys matched successfully>

In [12]:
import time
from torchvision import models
validation_loss_history = []
validation_acc_history = []
training_loss_histroy = []
training_acc_history = []
# Training Loop
# ----------------------------
def training(train_dl, num_epochs, model):
  """
  Loss Function, Optimizer and Scheduler
  Loss Function: For multi-class problem, Cross Entropy and KL divergence are the
                 loss often used.
  Learning Rating - OneCycleLR: Sets the learning rate of each parameter group 
  according to the 1cycle learning rate policy. The 1cycle policy anneals the 
  learning rate from an initial learning rate to some maximum learning rate and 
  then from that maximum learning rate to some minimum learning rate much lower 
  than the initial learning rate.
  """
  
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
  scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                                steps_per_epoch=int(len(train_dl)),
                                                epochs=num_epochs,
                                                anneal_strategy='linear')


  # Repeat for each epoch
  print('Started Training')
  st_time = time.time()
  for epoch in range(num_epochs):
    running_loss = 0.0
    correct_prediction = 0
    total_prediction = 0

    # Repeat for each batch in the training set
    for i, data in enumerate(train_dl):
        # Get the input features and target labels, and put them on the GPU
        inputs, labels = data[0].to(device), data[1].to(device)

        # Normalize the inputs
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s

        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Keep stats for Loss and Accuracy
        running_loss += loss.item()

        # Get the predicted class with the highest score
        _, prediction = torch.max(outputs,1)
        # Count of predictions that matched the target label
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]

        if i % 50 == 0 and i != 0:    # print every 50 mini-batches
           print('At epoch {}, batch {}, having loss: {:.3f}'.format(epoch + 1, i + 1, running_loss / i))
    
    # Print stats at the end of the epoch
    
    num_batches = len(train_dl)
    avg_loss = running_loss / num_batches
    acc = correct_prediction/total_prediction
    training_loss_histroy.append(avg_loss)
    training_acc_history.append(acc)
    print(f'Epoch: {epoch+1}, Loss: {avg_loss:.3f}, Training Accuracy: {acc:.3f}')
    

    # if (epoch+1) % 5 == 0:
      # check the accuracy every 5 epoch
    inference(model, val_dl)

  et_time = time.time()
  model_save_name = 'classifier_resnext101-unfreezed-100epooch.pt'
  path = F"/content/drive/MyDrive/Colab Notebooks/SC201-Project/{model_save_name}" 
  torch.save(model.state_dict(), path)
  print('Finished Training')
  print('The training time is {:.3f} seconds'.format(et_time-st_time))
num_epochs= 30
training(train_dl, num_epochs, model)

Started Training


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


At epoch 1, batch 51, having loss: 0.036
At epoch 1, batch 101, having loss: 0.028
At epoch 1, batch 151, having loss: 0.028
At epoch 1, batch 201, having loss: 0.022
At epoch 1, batch 251, having loss: 0.021
At epoch 1, batch 301, having loss: 0.019
At epoch 1, batch 351, having loss: 0.019
At epoch 1, batch 401, having loss: 0.019
Epoch: 1, Loss: 0.019, Training Accuracy: 0.993
Validation Accuracy: 0.99, Validation Loss: 0.021170 Total items: 1746
At epoch 2, batch 51, having loss: 0.027
At epoch 2, batch 101, having loss: 0.024
At epoch 2, batch 151, having loss: 0.021
At epoch 2, batch 201, having loss: 0.022
At epoch 2, batch 251, having loss: 0.024
At epoch 2, batch 301, having loss: 0.023
At epoch 2, batch 351, having loss: 0.021
At epoch 2, batch 401, having loss: 0.022
Epoch: 2, Loss: 0.022, Training Accuracy: 0.993
Validation Accuracy: 0.99, Validation Loss: 0.024840 Total items: 1746
At epoch 3, batch 51, having loss: 0.035
At epoch 3, batch 101, having loss: 0.028
At epoch 

In [None]:
def inference (model, val_dl):
  correct_prediction_val = 0
  total_prediction_val = 0
  running_loss_val = 0

  # Disable gradient updates
  with torch.no_grad():
    for data in val_dl:
      criterion = nn.CrossEntropyLoss()
      # Get the input features and target labels, and put them on the GPU
      inputs, labels = data[0].to(device), data[1].to(device)

      # Normalize the inputs
      inputs_m, inputs_s = inputs.mean(), inputs.std()
      inputs = (inputs - inputs_m) / inputs_s

      # Get predictions
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      running_loss_val += loss.item()
      # Get the predicted class with the highest score
      _, prediction = torch.max(outputs,1)
      # Count of predictions that matched the target label
      correct_prediction_val += (prediction == labels).sum().item()
      total_prediction_val += prediction.shape[0]
  num_batches = len(val_dl)
  avg_loss = running_loss_val / num_batches
  acc = correct_prediction_val/total_prediction_val
  validation_loss_history.append(avg_loss)
  validation_acc_history.append(acc)
    
  print(f'Validation Accuracy: {acc:.2f}, Validation Loss: {running_loss_val/num_batches:3f} Total items: {total_prediction_val}')

# Run inference on trained model with the validation set
# inference(myModel, val_dl)

In [18]:
import pandas as pd
from google.colab import files
pd.DataFrame(validation_loss_history).to_csv('validation_loss_history_Resnet_unfreezed.csv')
pd.DataFrame(validation_acc_history).to_csv('validation_acc_history_Resnet_unfreezed.csv')
pd.DataFrame(training_loss_histroy).to_csv('training_loss_histroy_Resnet_unfreezed.csv')
pd.DataFrame(training_acc_history).to_csv('training_acc_histroy_Resnet_unfreezed.csv')
files.download('validation_loss_history_Resnet_unfreezed.csv')
files.download('validation_acc_history_Resnet_unfreezed.csv')
files.download('training_loss_histroy_Resnet_unfreezed.csv')
files.download('training_acc_histroy_Resnet_unfreezed.csv')
print(validation_loss_history)
print(validation_acc_history)
print(training_loss_histroy)
print(training_acc_history)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[0.02116993127009385, 0.024839670180530633, 0.03822971842346202, 0.04894922624962832, 0.030653590066530838, 0.030361600962406886, 0.02422721373170382, 0.021815956446014504, 0.03961132439127885, 0.028017719313010565, 0.025096471600677846, 0.025271171618804933, 0.038465522662636434, 0.025798433198059974, 0.03163592282706056, 0.046393437248240264, 0.025862392508208192, 0.025566795499515717, 0.03057516650620032, 0.024693567576181605, 0.02915768568463012, 0.031062282665846975, 0.0323049977648372, 0.02124520076220981, 0.021752606817326058, 0.026824195344512313, 0.02357776487833896, 0.015463281511920906, 0.02310781331372337, 0.018910101449428524]
[0.9936998854524628, 0.9914089347079038, 0.9896907216494846, 0.9891179839633448, 0.9908361970217641, 0.9902634593356243, 0.9908361970217641, 0.9925544100801833, 0.9873997709049256, 0.9936998854524628, 0.9936998854524628, 0.9925544100801833, 0.9908361970217641, 0.993127147766323, 0.9885452462772051, 0.9914089347079038, 0.9936998854524628, 0.9908361970

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from torchvision import models
from torchsummary import summary

In [None]:
model = models.resnext101_32x8d(pretrained=True)
for param in model.parameters():
  param.requires_grad = False
model = nn.Sequential(
  nn.Conv2d(2, 3, kernel_size=3, stride=1, padding=1, dilation=1, groups=1, bias=True),
  model)
model.fc = nn.Sequential(
    nn.Linear(1000,10,bias=True))

for param in model.parameters():
  print(param.requires_grad)
summary(model,(2,64,64))

In [None]:
model = models.resnext101_32x8d(pretrained=True)
summary(model, (3,64,64))