<a href="https://colab.research.google.com/github/JohEder/bachelor_thesis_audio_ml/blob/master/base_line_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torchaudio

Collecting torchaudio
[?25l  Downloading https://files.pythonhosted.org/packages/a8/20/eab40caad8f4b97f5e91a5de8ba5ec29115e08fa4c9a808725490b7b4844/torchaudio-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 4.3MB/s 
Installing collected packages: torchaudio
Successfully installed torchaudio-0.9.0


In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import torch.utils.data as data
from torch.autograd import Variable
import numpy as np
import torch.nn.functional as F
from torch.utils.data import Dataset 
import torchaudio
import pandas as pd
import os

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!ls "/content/drive/My Drive"

 ATT00001.gdoc
'Colab Notebooks'
'Data Science and Machine Learning.gslides'
 datasets
 models
'ProbenBisWeihnachten (1).txt.gdoc'
 ProbenBisWeihnachten.txt.gdoc
'Project Edwinter.gdoc'
'Seminar Paper: Handout and Literature.gdoc'
'Um Antwort wird gebeten (1).gform'
'Um Antwort wird gebeten.gform'
'User Interview.gdoc'
'User Interviews Drink Mates'
 vorläufige.gdoc
 VVZafa183ad-b65e-4fbb-9681-0bac29b42558.rtf.gdoc
'Wie soll unsere App heißen?_exported_on_Tue May 05 2020 17:18:32 GMT+0530 (IST).gsheet'
'Wie soll unsere App heißen? .gform'


In [5]:
class IdmtTrafficDataSet(Dataset):

    

    def __init__(self, annotations_file, audio_dir, audio_transformation, transformation, target_sample_rate):
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.audio_transformation = audio_transformation
        self.transformation = transformation
        self.target_sample_rate = target_sample_rate
        self.classes = ['None','C','T', 'M', 'B']


    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        signal = self._resample(signal, sr) #adjust sample rates
        # signal -> (num_channels, samples) i.e. (2, 16000)
        signal  = self._mix_down(signal) #stereo to mono
        signal = self.audio_transformation(signal) #(1, 16000) -> torch.Size([1, 64, 63])
        signal = self.transformation(signal)
        return signal, self.classes.index(label)

    def _resample(self, signal, sr):
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            signal = resampler(signal)
        return signal
    
    def _mix_down(self, signal):
        if signal.shape[0] > 1: #(2, 16000)
            #mean operation: aggregating multiple channels
            signal = torch.mean(signal, 0, True)
        return signal

    def _get_audio_sample_path(self, index):
        path = os.path.join(self.audio_dir, self.annotations.iloc[index, 1])
        return path + '.wav'

    def _get_audio_sample_label(self, index):
        return self.annotations.iloc[index, 9]

In [6]:
SAMPLE_RATE = 22500
N_FFT=2048 #is also window size
HOP_LENGTH=1024
N_MELS=128
melspectogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=N_FFT, # Frame Size
        hop_length=HOP_LENGTH, #here half the frame size
        n_mels=N_MELS
    )

transforms = transforms.Compose([
    #transforms.ToPILImage(mode='L'),
    #transforms.Grayscale(num_output_channels=3),
    #transforms.Resize([224, 224]),
    transforms.ToTensor(),
])

In [7]:
def get_train_and_val_idmt():
    AUDIO_DIR = "/content/drive/My Drive/datasets/IDMT_Traffic/audio"
    train_annotations = "/content/drive/My Drive/datasets/IDMT_Traffic/annotation/eusipco_2021_train.csv"
    test_annotatons = "/content/drive/My Drive/datasets/IDMT_Traffic/annotation/eusipco_2021_test.csv"
    train_data = IdmtTrafficDataSet(train_annotations, AUDIO_DIR,melspectogram, transforms, SAMPLE_RATE)
    test_data = IdmtTrafficDataSet(test_annotatons, AUDIO_DIR,melspectogram, transforms, SAMPLE_RATE)
    return train_data, test_data

In [12]:
train_data, val_data = get_train_and_val_idmt()
first_sample, first_label = train_data[0]
input_dim = first_sample.shape[1] *first_sample.shape[2]
print(f"Train Data Shape: {first_sample.shape}")
print(input_dim)

Train Data Shape: torch.Size([3, 128, 44])
5632


In [13]:
class AutoEncoder(nn.Module):
  def __init__(self, input_dim):
    super().__init__()

    self.encoder = nn.Sequential(
        #FC(Input, 64, ReLU)
        nn.Linear(in_features=input_dim, out_features=64),
        nn.ReLU(),
        #FC(64, 32, ReLU),
        nn.Linear(in_features=64, out_features=32),
        nn.ReLU(),
        #FC(32, 16, ReLU)
        nn.Linear(in_features=32, out_features=16),
        nn.ReLU()
    )
    self.decoder = nn.Sequential(
        #FC(16, 32,ReLU)
        nn.Linear(in_features=16, out_features=32),
        nn.ReLU(),
        #FC(32, 64, ReLU), 
        nn.Linear(in_features=32, out_features=64),
        nn.ReLU(),
        #FC(64, Output, none)
        nn.Linear(in_features=64, out_features=input_dim),
        nn.ReLU()
    )

  def forward(self, input_data):
    z = self.encoder(input_data)
    output = self.decoder(z)
    return output

In [22]:
LEARNING_RATE = 0.0001
EPOCHS = 1
BATCH_SIZE = 16
BATCH_SIZE_VAL = 51


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

autoencoder = AutoEncoder(input_dim=input_dim)

optimizer = torch.optim.Adam(autoencoder.parameters(), lr=LEARNING_RATE)
loss_func = nn.MSELoss()

In [23]:
train_loader = torch.utils.data.DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(val_data, batch_size=BATCH_SIZE_VAL, shuffle=True)

In [24]:
def train(model, input_dim, device, train_loader, optimizer, epoch, loss_func):
  print("Starting Training.")
  model.to(device)
  model.train() #set mode
  for batch_index, (data_batch, _) in enumerate(train_loader):
    data_batch = data_batch.view(-1, input_dim).to(device)

    optimizer.zero_grad()
    output = model(data_batch)

    # Calculate loss
    loss = loss_func(output, data_batch)
    loss.backward()                 
    optimizer.step()
    
    print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch, batch_index * len(data_batch), len(train_loader.dataset),100. * batch_index / len(train_loader), loss.item()))
    return loss

In [27]:
for epoch in range(EPOCHS):
  epoch_loss = 0
  batch_loss = train(autoencoder, input_dim, device, train_loader, optimizer, epoch, loss_func)
  epoch_loss += batch_loss.item()
  print("epoch : {}/{}, loss = {:.6f}".format(epoch + 1, EPOCHS, epoch_loss))

Starting Training.
epoch : 1/1, loss = 0.128137
