In [132]:
# Deep Learning
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio 

import os
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score,accuracy_score, precision_score, recall_score, classification_report, roc_auc_score
from scipy.special import softmax
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tqdm import tqdm

## Define Model
    ### Probably an autotagger-like model operating on the spectrogram?

In [134]:
# VAD model
class Conv_2d(nn.Module):
    """Convolutional building block: Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d.

    Args:
        input_channels: number of channels of the incoming feature map.
        output_channels: number of channels produced by the convolution.
        shape: square kernel size; the convolution is padded with ``shape // 2``.
        stride: stride of the convolution.
        pooling: kernel size handed to ``nn.MaxPool2d``.
    """

    def __init__(self, input_channels, output_channels, shape=3, stride=1, pooling=2):
        super().__init__()
        self.conv = nn.Conv2d(
            input_channels,
            output_channels,
            kernel_size=shape,
            stride=stride,
            padding=shape // 2,
        )
        self.bn = nn.BatchNorm2d(output_channels)
        self.relu = nn.ReLU()
        self.mp = nn.MaxPool2d(pooling)

    def forward(self, x):
        """Run the four layers in order and return the pooled feature map."""
        features = self.conv(x)
        features = self.bn(features)
        features = self.relu(features)
        return self.mp(features)

class VAD(nn.Module):
    """CNN voice-activity detector over a single-channel spectrogram window.

    The input is batch-normalised, passed through four ``Conv_2d`` blocks
    (1 -> 32 -> 64 -> 128 -> 256 channels), flattened, and reduced by three
    fully connected layers to a single logit per window.

    Args:
        n_mels: height (frequency bins) of each input window. Default 128.
        n_frames: width (time frames) of each input window. Default 92.
            The defaults reproduce the original hard-coded flattened size of
            10240 (= 256 * 8 * 5).

    Raises:
        ValueError: if either dimension is too small to survive four 2x poolings.
    """

    def __init__(self, n_mels=128, n_frames=92):
        super().__init__()
        if n_mels < 16 or n_frames < 16:
            raise ValueError(
                "n_mels and n_frames must both be >= 16, got "
                + str(n_mels) + " and " + str(n_frames)
            )

        self.a_norming = nn.BatchNorm2d(1)
        # TODO: convert amplitudes to dB first (torchaudio.transforms.AmplitudeToDB);
        # left disabled in the original pending a torch upgrade.

        self.conv1 = Conv_2d(1, 32)
        self.conv2 = Conv_2d(32, 64)
        self.conv3 = Conv_2d(64, 128)
        self.conv4 = Conv_2d(128, 256)

        # Each Conv_2d block (defaults: 3x3 kernel, padding 1, stride 1, 2x2 max
        # pool) keeps the spatial size in the convolution and floor-halves it in
        # the pooling, so four blocks divide each dimension by 16 (floored).
        flat_features = 256 * (n_mels // 16) * (n_frames // 16)

        self.a_fc1 = nn.Linear(flat_features, 512)
        self.a_fc2 = nn.Linear(512, 256)
        self.a_fc3 = nn.Linear(256, 128)

        self.drop = nn.Dropout(p=0.3)
        self.logits = nn.Linear(128, 1)

    def forward(self, audio_input):
        """Classify each window of the batch.

        Args:
            audio_input: tensor of shape (batch, 1, n_mels, n_frames).

        Returns:
            Tuple ``(output, logits)``, both of shape (batch, 1): ``logits`` are
            the raw scores and ``output`` is their sigmoid.
        """
        # Audio branch
        audio_norm = self.a_norming(audio_input)

        x_audio = self.conv1(audio_norm)
        x_audio = self.conv2(x_audio)
        x_audio = self.conv3(x_audio)
        x_audio = self.conv4(x_audio)

        # flatten (rather than view) also works if the tensor is non-contiguous
        x_audio = torch.flatten(x_audio, start_dim=1)
        x_audio = F.relu(self.a_fc1(x_audio))
        x_audio = F.relu(self.a_fc2(x_audio))
        x_audio = F.relu(self.a_fc3(x_audio))

        # Classification head
        x_audio = self.drop(x_audio)
        logits = self.logits(x_audio)
        output = torch.sigmoid(logits)
        return output, logits

In [4]:
# Build the VAD model, its optimizer, and the loss
def get_VAD(device):
    """Create the VAD model together with its optimizer and loss function.

    Args:
        device: ``torch.device`` (or device string) the model is moved to.

    Returns:
        Tuple ``(vad_model, optimizer, criterion)``: the model on ``device``,
        an Adam optimizer (lr=0.001, weight_decay=1e-4) over its parameters,
        and a ``BCEWithLogitsLoss`` instance.
    """
    # Move the model first so the optimizer is constructed over the parameters
    # as they live on the target device.
    vad_model = VAD().to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(vad_model.parameters(), lr=0.001, weight_decay=1e-4)
    return vad_model, optimizer, criterion

In [125]:
# Training loop
def _partition_into_windows(mel, labels, window_frames):
    """Cut one spectrogram into overlapping windows, one per labelled centre frame.

    Centre frames run from ``window_frames`` to the last frame; the first
    ``window_frames`` frames are never used as centres. The window for centre
    ``c`` starts at ``c - window_frames // 2`` and is ``window_frames`` long.
    The spectrogram is zero-padded by ``window_frames`` frames at the end so
    windows near the end stay in range.

    Args:
        mel: tensor of shape (1, 1, n_mels, n_frames).
        labels: 1-D tensor with one label per spectrogram frame.
        window_frames: number of frames in each window.

    Returns:
        ``(windows, targets)`` with shapes (n, 1, n_mels, window_frames) and
        (n, 1), where ``n = n_frames - window_frames``; both live on ``mel``'s
        device. ``(None, None)`` if the spectrogram is too short for any window.

    Raises:
        ValueError: on an unexpected ``mel`` shape, a non-positive
            ``window_frames``, or a label count that does not match.
    """
    if window_frames < 1:
        raise ValueError("window_frames must be positive, got " + str(window_frames))
    if mel.dim() != 4 or mel.shape[0] != 1 or mel.shape[1] != 1:
        raise ValueError(
            "expected a spectrogram of shape (1, 1, n_mels, n_frames), got "
            + str(tuple(mel.shape))
        )

    num_frames = mel.shape[3]
    num_windows = num_frames - window_frames
    if num_windows < 1:
        return None, None

    half_window = window_frames // 2
    padded = F.pad(mel, (0, window_frames))  # zeros appended on the time axis

    # Region that holds every window: first window starts at the first centre
    # minus half a window, consecutive windows are shifted by one frame.
    first_start = window_frames - half_window
    region = padded[:, :, :, first_start:first_start + num_windows + window_frames - 1]

    # unfold -> (1, 1, n_mels, num_windows, window_frames)
    windows = region.unfold(3, window_frames, 1)
    # -> (num_windows, 1, n_mels, window_frames)
    windows = windows[0].permute(2, 0, 1, 3).contiguous()

    targets = labels[window_frames:num_frames].reshape(-1, 1)
    targets = targets.to(device=windows.device, dtype=windows.dtype)
    if targets.shape[0] != num_windows:
        raise ValueError(
            "expected one label per spectrogram frame: need "
            + str(num_windows) + " centre-frame labels, got " + str(targets.shape[0])
        )
    return windows, targets


def train_vad(vad_model, train_loader, optimizer, criterion,
              num_epochs=None, window_frames=None, device=None):
    """Train the VAD model, treating every file as one batch of sliding windows.

    Each item of ``train_loader`` is expected to hold a single file
    (loader batch size 1): ``data[0]`` the spectrogram of shape
    (1, 1, n_mels, n_frames) and ``data[1]`` its frame-level labels. The file is
    split into windows (source comments describe them as 3-second segments) and
    the model is optimised on the label of each window's centre frame.

    Args:
        vad_model: model returning ``(probabilities, logits)``.
        train_loader: iterable of ``(spectrogram, labels)`` pairs.
        optimizer: optimizer over ``vad_model``'s parameters.
        criterion: loss applied to ``(logits, targets)``.
        num_epochs: passes over the data; defaults to the global ``NUM_EPOCHS``.
        window_frames: frames per window; defaults to the global ``FRAMES_3SEC``.
        device: device for the inputs; defaults to the device of the model's
            parameters.

    Returns:
        None. Progress and the mean loss of each epoch are printed.
    """
    # Fall back to the notebook-level configuration so the existing
    # four-argument call keeps working unchanged.
    if num_epochs is None:
        num_epochs = NUM_EPOCHS
    if window_frames is None:
        window_frames = FRAMES_3SEC
    if device is None:
        device = next(vad_model.parameters()).device

    for epoch in range(num_epochs):  # loop over the dataset multiple times
        vad_model.train()
        epoch_loss = 0.0
        num_steps = 0

        with tqdm(train_loader, unit="batch") as tepoch:
            tepoch.set_description(f"Epoch {epoch + 1}")
            for data in tepoch:
                mel_in = data[0].to(device=device, dtype=torch.float32)
                labels = torch.squeeze(data[1]).to(device)

                # Windows and targets are built on `device`, so they can be fed
                # to the model and the loss without a device mismatch.
                windows, targets = _partition_into_windows(mel_in, labels, window_frames)
                if windows is None:
                    # File too short to yield a single window: nothing to learn from.
                    continue

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward + backward + optimize
                _, logits = vad_model(windows)
                # The loss takes raw logits and float targets of the same shape.
                loss = criterion(logits, targets)
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                num_steps += 1
                tepoch.set_postfix(loss=loss.item())

        if num_steps:
            print(f"Epoch {epoch + 1}: mean loss {epoch_loss / num_steps:.4f}")

    print('Finished Training')

## Build the dataset pipeline
    ### For each sample:
        # Load the audio/spectrogram
        # Load the frame-level ground truth

In [126]:
# Defining dataset pipeline 
class VAD_Dataset(torch.utils.data.Dataset):
    """Dataset over a directory of NumPy archives, one archive per audio file.

    Every file in ``data_directory`` is loaded with ``np.load`` and must expose
    the arrays ``'mel'`` (the spectrogram) and ``'labels'`` (the ground truth).

    Args:
        data_directory: folder holding the archives; with or without a
            trailing path separator.
        device: stored on the instance; not used by the dataset itself.
    """

    def __init__(self, data_directory, device='cpu'):
        # os.listdir returns names in arbitrary order; sorting makes the
        # index -> file mapping reproducible across runs and machines.
        filenames = sorted(os.listdir(data_directory))
        self.df = pd.DataFrame(filenames)
        self.data_directory = data_directory
        self.device = device

    def __len__(self):
        """Number of files found in the data directory."""
        return len(self.df)

    def __getitem__(self, index):
        """Load one file.

        Returns:
            Tuple ``(spectrogram, label)`` of tensors. A 2-D spectrogram gets a
            leading channel dimension; ``label`` is returned as stored.
        """
        file_id = self.df.iloc[index, 0]
        path = os.path.join(self.data_directory, str(file_id))

        # The archive object keeps the file open until closed; the context
        # manager releases the handle as soon as both arrays are read.
        with np.load(path) as data:
            spectrogram = torch.from_numpy(data['mel'])
            label = torch.from_numpy(data['labels'])

        if spectrogram.dim() == 2:
            spectrogram = torch.unsqueeze(spectrogram, 0)

        return spectrogram, label

In [127]:
# initiating dataloader 
def initialize_dataloaders(trainDataDir, testDataDir):
    """Wrap the train and test directories in DataLoaders.

    Both loaders use ``batch_size=1``: each loaded file is later expanded into
    its own batch by partitioning it around a moving central frame. Only the
    training loader shuffles. No validation loader is created.

    Args:
        trainDataDir: directory handed to ``VAD_Dataset`` for training.
        testDataDir: directory handed to ``VAD_Dataset`` for testing.

    Returns:
        Tuple ``(train_loader, test_loader)``.
    """
    datasets = [VAD_Dataset(directory) for directory in (trainDataDir, testDataDir)]
    shuffle_flags = (True, False)

    train_loader, test_loader = (
        torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=flag)
        for dataset, flag in zip(datasets, shuffle_flags)
    )
    return train_loader, test_loader

## Training Loop 

In [128]:
# Locations of the prepared train/test data and of the output folders.
# NOTE(review): these are absolute, machine-specific paths; point them at your
# own copy of the data before running.
# The two data directories end with "/"; keep it if file paths are built from
# them by plain string concatenation.
trainDataDir = "/home/karim/Desktop/Sonos_Assignment/vad_train_set/data_ready/"
testDataDir = "/home/karim/Desktop/Sonos_Assignment/vad_test_set/data_ready/"
results_path = "/home/karim/Desktop/Sonos_Assignment/results/"
# Unlike the paths above, this one has no trailing "/".
model_save_path = "/home/karim/Desktop/Sonos_Assignment/saved_models"

In [None]:
# ---- Run configuration -------------------------------------------------------
NUM_EPOCHS = 5
BATCH_SIZE = 32  # NOTE(review): not referenced by the code in view; the loaders use batch_size=1
FRAMES_3SEC = 92  # frames per window (described in the training loop as 3 seconds)
SEED = 0

# Seed torch so weight initialisation, dropout and shuffling are repeatable.
torch.manual_seed(SEED)

# GPU 2 is the preferred device, but only exists on hosts with at least three
# GPUs; fall back to the first GPU, then to the CPU.
if torch.cuda.is_available():
    gpu_index = 2 if torch.cuda.device_count() > 2 else 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")
print("Using device: " + str(device))

train_loader, test_loader = initialize_dataloaders(trainDataDir, testDataDir)

vad_model, optimizer, criterion = get_VAD(device)
train_vad(vad_model, train_loader, optimizer, criterion)
# TODO: evaluate on test_loader once the testing section is implemented.

# Save the trained weights. The path is joined (model_save_path has no trailing
# separator) and the folder is created if it does not exist yet.
# TODO: replace the placeholder file name.
os.makedirs(model_save_path, exist_ok=True)
model_name = os.path.join(model_save_path, "whatToCall")
torch.save(vad_model.state_dict(), model_name)
torch.cuda.empty_cache()
print("================================================================")

## Testing section