<a href="https://colab.research.google.com/github/JohEder/bachelor_thesis_audio_ml/blob/master/vit_huggingface.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install torchaudio



In [3]:


!pip install -q git+https://github.com/huggingface/transformers



  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
  Building wheel for transformers (PEP 517) ... [?25l[?25hdone


In [4]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.utils.data as data
import torchaudio
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchvision import datasets, transforms
from torchvision import transforms as T
from transformers import ViTFeatureExtractor
from transformers import ViTModel
from transformers.modeling_outputs import SequenceClassifierOutput

In [61]:
# Mount Google Drive at /content/drive; the dataset and model paths used in this notebook
# all live under "/content/drive/My Drive".
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [6]:
# List the Drive root to check that the mount worked and the `datasets` folder is visible.
# NOTE(review): the recorded output lists every file in the Drive root; clear it before sharing.
!ls "/content/drive/My Drive"

 ATT00001.gdoc
'Colab Notebooks'
'Data Science and Machine Learning.gslides'
 datasets
'ProbenBisWeihnachten (1).txt.gdoc'
 ProbenBisWeihnachten.txt.gdoc
'Project Edwinter.gdoc'
'Seminar Paper: Handout and Literature.gdoc'
'Um Antwort wird gebeten (1).gform'
'Um Antwort wird gebeten.gform'
'User Interview.gdoc'
'User Interviews Drink Mates'
 vorläufige.gdoc
 VVZafa183ad-b65e-4fbb-9681-0bac29b42558.rtf.gdoc
'Wie soll unsere App heißen?_exported_on_Tue May 05 2020 17:18:32 GMT+0530 (IST).gsheet'
'Wie soll unsere App heißen? .gform'


In [7]:
class IdmtTrafficDataSet(Dataset):
    """Map-style dataset that turns annotated audio clips into image-like tensors.

    Each item is loaded from ``<audio_dir>/<file>.wav``, resampled to
    ``target_sample_rate``, mixed down to one channel, passed through
    ``audio_transformation`` and then through ``transformation``.

    Args:
        annotations_file: Path of the annotation CSV. Column 1 holds the file
            name without extension and column 9 holds the class label.
        audio_dir: Directory that contains the ``.wav`` files.
        audio_transformation: Callable applied to the mono waveform
            (the notebook passes a mel-spectrogram transform).
        transformation: Callable applied to the result of ``audio_transformation``.
        target_sample_rate: Sample rate every clip is converted to.
    """

    # Positional indices of the annotation columns this dataset reads.
    _FILE_COLUMN = 1
    _LABEL_COLUMN = 9

    def __init__(self, annotations_file, audio_dir, audio_transformation, transformation, target_sample_rate):
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.audio_transformation = audio_transformation
        self.transformation = transformation
        self.target_sample_rate = target_sample_rate
        # The position in this list is the integer target returned by __getitem__.
        self.classes = ['None', 'C', 'T', 'M', 'B']
        # One resampler per source sample rate, built lazily and reused across items.
        self._resamplers = {}

    def __len__(self):
        """Return the number of annotated clips."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Return ``(transformed_signal, class_index)`` for the clip at ``index``."""
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)  # signal: (num_channels, samples)
        signal = self._resample(signal, sr)
        signal = self._mix_down(signal)
        signal = self.audio_transformation(signal)
        signal = self.transformation(signal)
        return signal, self.classes.index(label)

    def _resample(self, signal, sr):
        """Convert ``signal`` from ``sr`` to the target sample rate; no-op if they match."""
        if sr == self.target_sample_rate:
            return signal
        resampler = self._resamplers.get(sr)
        if resampler is None:
            # Building the resampler is the expensive part, so do it once per source rate.
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            self._resamplers[sr] = resampler
        return resampler(signal)

    def _mix_down(self, signal):
        """Average all channels into one, keeping the channel dimension."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, 0, True)
        return signal

    def _get_audio_sample_path(self, index):
        """Build the ``.wav`` path for the annotation row at ``index``."""
        file_stem = self.annotations.iloc[index, self._FILE_COLUMN]
        return os.path.join(self.audio_dir, file_stem) + '.wav'

    def _get_audio_sample_label(self, index):
        """Return the class label string for the annotation row at ``index``.

        Newer pandas releases parse the literal text ``None`` as a missing
        value by default, which would make ``classes.index`` fail for that
        class. A missing label is therefore mapped back to ``'None'``.
        """
        label = self.annotations.iloc[index, self._LABEL_COLUMN]
        if pd.isna(label):
            return 'None'
        return label



In [8]:
# Audio front-end configuration.
# NOTE(review): 22500 Hz is an unusual rate (22050 Hz is the common choice) — confirm it is intended.
SAMPLE_RATE = 22500
N_FFT = 2048  # frame size; also used as the window size
HOP_LENGTH = 1024  # half the frame size
N_MELS = 128
melspectogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
    n_mels=N_MELS,
)

# Image pipeline: single-channel spectrogram -> 3-channel 224x224 tensor.
# It is built through the `T` alias because the result is stored under the name `transforms`
# (other cells read it under that name). Using `transforms.Compose` here would make the cell
# fail on a second run, since `transforms` would no longer be the torchvision module.
# NOTE(review): ToPILImage receives the raw float mel-spectrogram; confirm its value range
# suits an 8-bit 'L' image (a log/dB scaling step may be needed).
transforms = T.Compose([
    T.ToPILImage(mode='L'),
    T.Grayscale(num_output_channels=3),
    T.Resize([224, 224]),
    T.ToTensor(),
])

In [9]:
def download_mnist_datasets():
    """Download MNIST into ``mnist_data`` and return ``(train_split, validation_split)``.

    Both splits use the notebook-level image transform stored in ``transforms``.
    """
    def _load_split(is_train):
        # The two splits differ only in the `train` flag.
        return datasets.MNIST(
            root="mnist_data",
            download=True,
            train=is_train,
            transform=transforms,
        )

    train_split = _load_split(True)
    validation_split = _load_split(False)
    return train_split, validation_split


def get_train_and_val_idmt(dataset_root="/content/drive/My Drive/datasets/IDMT_Traffic"):
    """Build the train and test datasets from the IDMT_Traffic folder.

    Args:
        dataset_root: Folder that contains ``audio/`` and ``annotation/``.
            Defaults to the location on the mounted Google Drive.

    Returns:
        Tuple ``(train_data, test_data)`` of ``IdmtTrafficDataSet`` instances that share
        the notebook-level ``melspectogram``, ``transforms`` and ``SAMPLE_RATE``.
    """
    audio_dir = os.path.join(dataset_root, "audio")
    annotation_dir = os.path.join(dataset_root, "annotation")
    train_annotations = os.path.join(annotation_dir, "eusipco_2021_train.csv")
    test_annotations = os.path.join(annotation_dir, "eusipco_2021_test.csv")
    train_data = IdmtTrafficDataSet(train_annotations, audio_dir, melspectogram, transforms, SAMPLE_RATE)
    test_data = IdmtTrafficDataSet(test_annotations, audio_dir, melspectogram, transforms, SAMPLE_RATE)
    return train_data, test_data


In [10]:
train_data, val_data = get_train_and_val_idmt()
# Taken from the dataset's label list so the classifier head size cannot drift out of sync with it.
NUMBER_OF_CLASSES = len(train_data.classes)

In [11]:
# Sanity check: dataset size and the shape of one fully transformed sample.
print(len(train_data))  # number of rows in the training annotation file
tensor, label = train_data[0]
print(tensor.shape)  # (channels, height, width) after the image transform resizes to 224x224

5872
torch.Size([3, 224, 224])


In [12]:
class ViTForImageClassification(nn.Module):
    """Pretrained ViT encoder followed by dropout and a linear classification head.

    Args:
        num_labels: Number of output classes of the linear head.
    """

    def __init__(self, num_labels=NUMBER_OF_CLASSES):
        super().__init__()
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.vit.config.hidden_size, num_labels)
        self.num_labels = num_labels

    def forward(self, pixel_values, labels=None):
        """Classify a batch of images.

        Args:
            pixel_values: Tensor of shape (batch_size, num_channels, height, width).
            labels: Optional tensor of class indices. When given, the cross-entropy
                loss is computed as well.

        Returns:
            Tuple ``(logits, loss)``. ``loss`` is ``None`` without labels; otherwise it
            is the loss tensor, still attached to the autograd graph so callers can
            call ``backward()`` on it (use ``loss.item()`` for a plain float).
        """
        outputs = self.vit(pixel_values=pixel_values)
        # last_hidden_state is (batch_size, sequence_length, hidden_size); the head
        # classifies from the first token of the sequence.
        first_token = outputs.last_hidden_state[:, 0]
        logits = self.classifier(self.dropout(first_token))

        if labels is None:
            return logits, None

        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return logits, loss



In [52]:
# Training hyperparameters.
# Both batch sizes divide their dataset sizes evenly (5872 = 16 * 367, 2856 = 51 * 56),
# so every batch produced by the loaders is a full batch.
EPOCHS = 10
BATCH_SIZE = 16  # training batch size
BATCH_SIZE_VAL = 51  # validation/test batch size
LEARNING_RATE = 0.0001  # learning rate passed to the Adam optimizer


In [53]:
# Model, preprocessing, optimisation objects and the compute device.
model = ViTForImageClassification(NUMBER_OF_CLASSES)
# Feature extractor that belongs to the same checkpoint as the ViT encoder.
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_func = nn.CrossEntropyLoss()
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [54]:
# Report the split sizes and the label vocabulary.
summary_rows = (
    ("Number of train samples: ", len(train_data)),
    ("Number of test samples: ", len(val_data)),
    ("Detected Classes are: ", train_data.classes),
)
for caption, value in summary_rows:
    print(caption, value)

Number of train samples:  5872
Number of test samples:  2856
Detected Classes are:  ['None', 'C', 'T', 'M', 'B']


In [63]:
# Both loaders shuffle; test() draws a single batch per epoch from val_loader,
# so shuffling gives it a different batch each time.
train_loader = DataLoader(
    train_data,
    shuffle=True,
    batch_size=BATCH_SIZE,
)
val_loader = DataLoader(
    val_data,
    shuffle=True,
    batch_size=BATCH_SIZE_VAL,
)

def preprocess_batch(batch, batch_size, feature_extractor):
  """Run a batch of images through the feature extractor and return one tensor.

  Args:
    batch: Batched images with the batch on the first axis (tensor or array).
    batch_size: Kept for backward compatibility and no longer used. The batch is
      split by its actual length, so a batch that is shorter than the configured
      size no longer raises.
    feature_extractor: Callable that takes a list of per-sample arrays and returns
      a mapping with a 'pixel_values' entry.

  Returns:
    Tensor with the extractor's 'pixel_values' stacked along a new first axis.
  """
  # Iterating over the first axis yields one array per sample; squeezing each sample
  # drops its singleton axes, as the previous implementation did.
  samples = [np.squeeze(sample) for sample in np.asarray(batch)]
  pixel_values = feature_extractor(samples)['pixel_values']
  return torch.tensor(np.stack(pixel_values, axis=0))

def train(model, feature_extractor, device, train_loader, optimizer, epoch, loss_func):
  """Train the model for one pass over ``train_loader``.

  Args:
    model: Module called as ``model(pixel_values, labels)`` and returning ``(logits, loss)``.
    feature_extractor: Passed on to ``preprocess_batch``.
    device: Device the model and the batches are moved to.
    train_loader: Iterable of ``(data_batch, target)`` pairs.
    optimizer: Optimizer that updates the model parameters.
    epoch: Epoch number, used only in the progress message.
    loss_func: Callable ``loss_func(logits, target)`` returning the loss to minimise.
  """
  print("Starting Training.")
  model.to(device)
  model.train()
  for batch_index, (data_batch, target) in enumerate(train_loader):
    # Use the real batch length so a shorter final batch is handled as well.
    data_batch = preprocess_batch(data_batch, len(data_batch), feature_extractor)
    data_batch = data_batch.to(device)
    target = target.to(device)

    # Labels are not handed to the model; the loss comes from loss_func so it stays
    # attached to the autograd graph.
    output, _ = model(data_batch, None)
    loss = loss_func(output, target)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch, batch_index * len(data_batch), len(train_loader.dataset),100. * batch_index / len(train_loader), loss.item()))


def test(model, device, test_loader, feature_extractor, epoch):
  """Print the accuracy on a single batch drawn from ``test_loader``.

  This is a quick per-epoch check on one batch, not a score for the whole split.

  Args:
    model: Module called as ``model(pixel_values, labels)`` and returning ``(logits, loss)``.
    device: Device the model and the batch are moved to.
    test_loader: Loader from which the first batch is taken.
    feature_extractor: Passed on to ``preprocess_batch``.
    epoch: Epoch number, used only in the printed message.
  """
  model.to(device)
  model.eval()

  with torch.no_grad():
    data_batch, target = next(iter(test_loader))
    print(f"Shape of data_batch: {data_batch.shape}")
    print(f"Shape of target_batch: {target.shape}")
    data = preprocess_batch(data_batch, len(data_batch), feature_extractor)
    data = data.to(device)
    target = target.to(device)
    # Only the logits are needed, so no labels are passed to the model.
    test_output, _ = model(data, None)
    predictions = test_output.argmax(1)
    # Divide by the number of samples actually scored, not the configured batch size.
    accuracy = (predictions == target).sum().item() / target.size(0)
    print('Epoch: ', epoch, '|| test accuracy: %.2f' % accuracy)






In [56]:
# One training pass followed by a single-batch accuracy check, for each epoch (1-based).
shared_kwargs = dict(model=model, device=device, feature_extractor=feature_extractor)
for epoch in range(1, EPOCHS + 1):
  train(train_loader=train_loader, optimizer=optimizer, loss_func=loss_func, epoch=epoch, **shared_kwargs)
  test(test_loader=val_loader, epoch=epoch, **shared_kwargs)

Starting Training.
Shape of data_batch: torch.Size([51, 3, 224, 224])
Shape of target_batch: torch.Size([51])
Epoch:  1 || test accuracy: 0.76
Starting Training.
Shape of data_batch: torch.Size([51, 3, 224, 224])
Shape of target_batch: torch.Size([51])
Epoch:  2 || test accuracy: 0.90
Starting Training.
Shape of data_batch: torch.Size([51, 3, 224, 224])
Shape of target_batch: torch.Size([51])
Epoch:  3 || test accuracy: 1.00
Starting Training.
Shape of data_batch: torch.Size([51, 3, 224, 224])
Shape of target_batch: torch.Size([51])
Epoch:  4 || test accuracy: 0.90
Starting Training.
Shape of data_batch: torch.Size([51, 3, 224, 224])
Shape of target_batch: torch.Size([51])
Epoch:  5 || test accuracy: 0.98
Starting Training.
Shape of data_batch: torch.Size([51, 3, 224, 224])
Shape of target_batch: torch.Size([51])
Epoch:  6 || test accuracy: 0.94
Starting Training.
Shape of data_batch: torch.Size([51, 3, 224, 224])
Shape of target_batch: torch.Size([51])
Epoch:  7 || test accuracy: 0.98

In [73]:
def evaluate(model, val_loader, feature_extractor, device):
  """Print the classification accuracy over every batch of ``val_loader``.

  Args:
    model: Module called as ``model(pixel_values, labels)`` and returning ``(logits, loss)``.
    val_loader: Iterable of ``(inputs, targets)`` batches.
    feature_extractor: Passed on to ``preprocess_batch``.
    device: Device the model and the batches are moved to.
  """
  correct, total = 0, 0

  model.to(device)
  model.eval()
  with torch.no_grad():
    for inputs, targets in val_loader:
      # Use the real batch length so a shorter final batch is handled as well.
      inputs = preprocess_batch(inputs, targets.size(0), feature_extractor)
      inputs, targets = inputs.to(device), targets.to(device)
      outputs, _ = model(inputs, None)
      predictions = outputs.argmax(1)
      correct += (predictions == targets).sum().item()
      total += targets.size(0)

  if total == 0:
    # Nothing was scored; avoid dividing by zero.
    print("Validation set is empty; accuracy is undefined.")
    return

  print(f"Validation Accuracy on whole validation set is: {100 * correct/total} %.")

evaluate(model, val_loader, feature_extractor, device)
      

#Todo: evaluation graphs

56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.Size([51, 3, 224, 224])
56
torch.S

In [62]:
print("Training finished! Saving Model...")
import time
from google.colab import files
current_time = time.asctime( time.localtime(time.time()) )

# Create the target folder first; saving fails if it does not exist yet.
MODEL_DIR = '/content/drive/My Drive/models/'
os.makedirs(MODEL_DIR, exist_ok=True)

# Model weights, stored under the timestamp as file name.
torch.save(model.state_dict(), MODEL_DIR + str(current_time))

# Hyperparameters of this run, stored next to the weights under the same timestamp.
with open(MODEL_DIR + 'hyper_params' + str(current_time) + '.txt', 'w') as f:
  f.write(f"Epochs: {EPOCHS}, LR: {LEARNING_RATE}, BatchSize: {BATCH_SIZE}, SAMPLE_RATE = {SAMPLE_RATE}, N_FFT/WINDOW_SIZE = {N_FFT}, HOP_LENGTH = {HOP_LENGTH}, N_MELS = {N_MELS}")


Traininig finished! Saving Model...


In [59]:
# Final step: flush and unmount Drive (presumably so the files saved above are written out —
# confirm against the Colab `drive` documentation).
drive.flush_and_unmount()
print('All changes made in this colab session should now be visible in Drive.')

All changes made in this colab session should now be visible in Drive.
