In [None]:
"""
This notebook first trains a sequence generation model and then uses that model to generate sequences
for data augmentation of an intent classification model.
Each generated sequence is then added to the dataset as a new utterance.
After that, an intent classifier is used for classification.
"""

"""
This notebook can be run from top to bottom.
It runs a grid search over combinations of speakers and utterances,
which may take a while (sometimes up to 4 hours).
"""

In [None]:
!pip install gdown

In [None]:
#Download dataset
!gdown --fuzzy https://drive.google.com/file/d/1LIrogRWSL-4CifdzciM6vV8V30JArQG6/view?usp=sharing

In [None]:
!unzip -qn ./phonemes.zip 
!rm -rf 'phonemes/validation/pp10/spchdatadir/recording1/Untitled.ipynb' #Remove this unwantd file

In [None]:
!pip install torchsummaryX 

In [None]:
import numpy as np
import pandas as pd 
import time
import os
import os.path as osp
import glob
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchsummaryX import summary
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from tqdm import tqdm
import random
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

# Seed every RNG this notebook draws from, not only torch: Python's `random`
# is used for speaker selection and NumPy for shuffling / sequence-length
# choice, so leaving them unseeded makes runs non-reproducible.
torch.manual_seed(1)
random.seed(1)
np.random.seed(1)
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

In [None]:
# Candidate intent label sets, keyed by how many intents they contain.
# The final report uses the 2- and 4-intent classification setups.
intents_2 = ['approach', 'lift']
intents_3 = ['approach', 'grab', 'point']
intents_4 = ['approach', 'grab', 'point', 'lift']
intents_6 = ['move', 'turn', 'approach', 'grab', 'point', 'lift']


In [None]:
# Glob patterns for the phoneme files of each partition.
X_dir = 'phonemes/train/*/spchdatadir/*/*'
X_dir_val = 'phonemes/validation/*/spchdatadir/*/*'
X_dir_test = 'phonemes/test/*/spchdatadir/*/*'

X_files_train = sorted(glob.glob(X_dir))
X_files_val = sorted(glob.glob(X_dir_val))
X_files_test = sorted(glob.glob(X_dir_test))
# Build a new list instead of aliasing X_files_train: extending an alias would
# silently append the validation and test paths to the training list as well.
files = X_files_train + X_files_val + X_files_test

all_phones = []  # every phoneme occurrence, in file order

phones = set()   # distinct phonemes

# Get all phonemes occurring in the dataset
for x in files:
    f = np.load(x)
    phones.update(f)
    all_phones.extend(f)

# Phoneme vocabulary. Sorted so that each phoneme keeps the same index from
# one interpreter run to the next; plain set iteration order is not guaranteed
# to be stable across runs.
PHONEMES = sorted(phones)
vocab = PHONEMES

def records(lis, num, group_size=15):
    """Keep the first `num` entries of every consecutive group of `group_size`.

    Args:
        lis: Sequence to select from, treated as back-to-back groups of
            `group_size` items.
        num: Number of leading items to keep from each group. Values above
            `group_size` are capped so neighbouring groups never overlap,
            which would otherwise duplicate entries.
        group_size: Length of one group. Defaults to 15, the value that was
            previously hard-coded.

    Returns:
        list: The selected items in their original order.
    """
    per_group = min(num, group_size)
    recs = []
    for start in range(0, len(lis), group_size):
        recs.extend(lis[start:start + per_group])
    return recs

def intents_func(intents_lis, X_files, Y_files):
    """Filter index-aligned input/label files down to the chosen intents.

    Args:
        intents_lis: Intent names to keep.
        X_files: Input file paths, aligned by index with `Y_files`.
        Y_files: Label file paths; the full text of each file is compared
            against the entries of `intents_lis`.

    Returns:
        tuple: (kept input paths, kept label paths, intents_lis).
    """
    xfiles = []
    yfiles = []
    for i, file in enumerate(Y_files):
        # Context manager so the handle is closed even if read() raises;
        # previously every label file was left open.
        with open(file) as f:
            intent = f.read()
        if intent in intents_lis:
            xfiles.append(X_files[i])
            yfiles.append(file)

    return xfiles, yfiles, intents_lis

def choose_speakers(speakers_lis, xlis, ylis, n=7):
    """Randomly pick `n` distinct speakers and keep only their files.

    Args:
        speakers_lis: Candidate speaker directory names.
        xlis: Input file paths; a file belongs to a speaker when the speaker
            name is one of its '/'-separated path components.
        ylis: Label file paths, aligned by index with `xlis`.
        n: Number of speakers to draw. Capped at len(speakers_lis).

    Returns:
        tuple: (kept input paths, kept label paths), in their original order.
    """
    # Sample without replacement. random.choices could return the same speaker
    # several times, which both reduced the number of distinct speakers below
    # `n` and appended that speaker's files once per repetition.
    chosen = set(random.sample(speakers_lis, k=min(n, len(speakers_lis))))
    x_train_files = []
    y_train_files = []
    for i, file in enumerate(xlis):
        if chosen.intersection(file.split('/')):
            x_train_files.append(file)
            y_train_files.append(ylis[i])
    return x_train_files, y_train_files

In [None]:
import pandas as pd

# One row per phoneme occurrence, so pandas can count them.
df = pd.DataFrame({'Phones': all_phones})

# The ten most common phonemes (most frequent first) and their positions
# in the PHONEMES vocabulary.
phone_counts = df['Phones'].value_counts()
most_freq = list(phone_counts[:10].index)
most_freq_idcs = [PHONEMES.index(phone) for phone in most_freq]
most_freq_idcs

In [None]:
# Speaker directory names that choose_speakers draws from when LibriSamples
# builds the train partition.
speakers = ['pp2', 'pp3', 'pp4', 'pp5', 'pp6', 'pp7', 'pp8']

In [None]:
partition_train = "train"
partition_validate = "validation"

# Train files: glob pattern and the sorted list of matching paths.
X_dir_train = f'phonemes/{partition_train}/*/spchdatadir/*/*'
X_files_train = glob.glob(X_dir_train)
X_files_train.sort()

# Validation files: same layout under the validation partition.
X_dir_val = f'phonemes/{partition_validate}/*/spchdatadir/*/*'
X_files_val = glob.glob(X_dir_val)
X_files_val.sort()

In [None]:
# Convert every train and validation utterance into a list of vocabulary
# indices. `dataset` ends up holding one index list per utterance, train
# files first, then validation files.
dataset = []
for path in X_files_train + X_files_val:
    X = np.load(path)
    X_indices = [PHONEMES.index(phone) for phone in X]
    dataset.append(X_indices)
    

In [None]:
#Dataloader similar to HW4P1
class LanguageModelDataLoader(DataLoader):
    """Iterable yielding (inputs, targets) batches for next-token training.

    Each pass concatenates all sequences into one stream, cuts the stream into
    fixed-length chunks and pairs every chunk with the same chunk shifted one
    position to the right.

    Args:
        dataset: List of integer sequences.
        batch_size: Number of chunks per yielded batch.
        seq_lens: Two candidate chunk lengths; the first is picked with
            probability 0.95 and the second with 0.05, once per pass.
        shuffle: Whether to shuffle the sequence order before concatenating.
    """
    def __init__(self, dataset, batch_size, seq_lens=[9, 6], shuffle=True):
        # DataLoader.__init__ is intentionally not called; this class supplies
        # its own __iter__ and only stores the attributes it needs.
        self.dataset = dataset
        self.batch_size = batch_size
        self.seq_lens = seq_lens
        # Previously accepted but ignored (the data was always shuffled).
        self.shuffle = shuffle

    def __iter__(self):
        """Yield (list of input tensors, list of target tensors) per batch."""
        # Shallow copy so the caller's list keeps its original order.
        sequences = list(self.dataset)
        if not sequences:
            return
        if self.shuffle:
            np.random.shuffle(sequences)
        concatenated = np.concatenate(sequences)

        seq_len = np.random.choice(self.seq_lens, p=[0.95, 0.05])
        inputs = []
        targets = []
        for i in range(0, len(concatenated), seq_len):
            inputs.append(torch.tensor(concatenated[i:i+seq_len], dtype=torch.long))
            targets.append(torch.tensor(concatenated[i+1:i+seq_len+1], dtype=torch.long))
        # Drop the final chunk: its target (and possibly its input) is shorter
        # than seq_len, so it could not be stacked with the others.
        inputs = inputs[:-1]
        targets = targets[:-1]
        for i in range(0, len(inputs), self.batch_size):
            yield inputs[i:i+self.batch_size], targets[i:i+self.batch_size]

        
        

In [None]:

class LockedDropout(nn.Module):
    """Dropout whose mask is sampled once and reused along the first dimension.

    **Thank you** to Sales Force for their initial implementation of :class:`WeightDrop`. Here is
    their `License
    <https://github.com/salesforce/awd-lstm-lm/blob/master/LICENSE>`__.

    Args:
        p (float): Probability of an element in the dropout mask to be zeroed.
    """

    def __init__(self, p=0.5):
        self.p = p
        super().__init__()

    def forward(self, x):
        """Apply the shared mask to `x`.

        Args:
            x (:class:`torch.FloatTensor` [sequence length, batch size, rnn hidden size]): Input to
                apply dropout to.

        Returns:
            `x` itself when the module is in eval mode or `p` is falsy;
            otherwise a new tensor with the mask applied and the surviving
            entries rescaled by 1 / (1 - p).
        """
        if self.training and self.p:
            keep_prob = 1 - self.p
            # A single (1, dim1, dim2) mask, broadcast over dim 0.
            mask = x.new_empty(1, x.size(1), x.size(2), requires_grad=False)
            mask = mask.bernoulli_(keep_prob).div_(keep_prob)
            return x.clone() * mask.expand_as(x)
        return x

    def __repr__(self):
        return f"{self.__class__.__name__}(p={self.p})"

In [None]:
import numpy as np

import torch

def embedded_dropout(embed, words, dropout=0.2, scale=None):
  """Look up `words` in `embed` after dropping whole embedding rows.

  Args:
    embed: An nn.Embedding whose weight matrix is masked.
    words: Index tensor to look up.
    dropout: Probability of zeroing an entire row of the embedding matrix;
      surviving rows are rescaled by 1 / (1 - dropout). Falsy disables it.
    scale: Optional tensor, expandable to the weight's shape, multiplied
      into the (masked) weights.

  Returns:
    The embedded `words`, with a trailing embedding dimension.
  """
  weight = embed.weight
  if dropout:
    keep_prob = 1 - dropout
    # One Bernoulli draw per vocabulary row, broadcast across the row.
    mask = weight.new_empty((weight.size(0), 1)).bernoulli_(keep_prob).expand_as(weight) / keep_prob
    masked_embed_weight = mask * weight
  else:
    masked_embed_weight = weight
  # `if scale:` raised for any tensor with more than one element, because the
  # truth value of such a tensor is ambiguous.
  if scale is not None:
    masked_embed_weight = scale.expand_as(masked_embed_weight) * masked_embed_weight

  # Pass padding_idx through unchanged. Substituting -1 for None makes
  # F.embedding treat the LAST vocabulary row as padding, so that row would
  # never receive a gradient.
  X = torch.nn.functional.embedding(words, masked_embed_weight,
    embed.padding_idx, embed.max_norm, embed.norm_type,
    embed.scale_grad_by_freq, embed.sparse
  )
  return X

In [None]:
# model for sequence generation
#Similar to HW4P1

class LanguageModel(nn.Module):
    """LSTM next-token model over vocabulary indices.

    Pipeline: embedding -> 3-layer batch-first LSTM -> two-layer MLP ->
    linear projection to one logit per vocabulary entry.

    Args:
        vocab_size: Number of distinct tokens.
        embedd_size: Embedding dimension.
        hidden_size: LSTM hidden size (also the MLP's output size).
        p: Drop probability for the LockedDropout layers.
    """

    def __init__(self, vocab_size, embedd_size=256, hidden_size=512, p=0.3):
        super(LanguageModel, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedd_size)

        self.lstm = nn.LSTM(embedd_size, hidden_size, 3, batch_first=True, dropout=0.4)
        self.locked_dropout = LockedDropout(p)

        self.fc = nn.Sequential(
            nn.Linear(hidden_size, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1024, hidden_size))

        self.fc2 = nn.Linear(hidden_size, vocab_size)

        # NOTE(review): this only attaches the embedding matrix as an extra
        # attribute on the Sequential container. Sequential.forward never
        # reads `.weight`, so no weight tying takes effect. Kept so that the
        # module's parameter names stay unchanged.
        self.fc.weight = self.embedding.weight

    def forward(self, x):
        """Return logits of shape (batch, seq_len, vocab_size) for index tensor `x` of shape (batch, seq_len)."""
        # Embedding dropout is a training-time regulariser; it used to stay
        # active in eval mode and made inference stochastic.
        embed = embedded_dropout(self.embedding, x, dropout=0.2 if self.training else 0)

        # LockedDropout shares its mask along dim 0, but tensors here are
        # batch-first. Transpose so dim 0 is time and the mask is shared
        # across time steps rather than across the batch.
        embed = self.locked_dropout(embed.transpose(0, 1)).transpose(0, 1).contiguous()

        output, out_hid = self.lstm(embed)
        output = self.locked_dropout(output.transpose(0, 1)).transpose(0, 1)

        output = self.fc(output)
        output = self.fc2(output)

        return output

    


In [None]:
# model trainer

class LanguageModelTrainer:
    """Runs epochs of next-token training for a language model.

    Args:
        model: Module mapping a (batch, seq_len) index tensor to
            (batch, seq_len, vocab) logits.
        loader: Iterable yielding (inputs, targets) pairs, each a list of
            equal-length 1-D index tensors.
        max_epochs: Total epoch count, used only in the progress message.
        run_id: Identifier stored on the trainer.
    """

    def __init__(self, model, loader, max_epochs=1, run_id='exp'):
        self.model = model
        self.loader = loader
        self.train_losses = []  # mean loss of each finished epoch (floats)
        self.epochs = 0
        self.max_epochs = max_epochs
        self.run_id = run_id

        # self.optimizer = optim.ASGD(self.model.parameters(), lr= 0.002, t0=50000, weight_decay=1e-4)
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.002)
        # self.optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)
        # CrossEntropyLoss without class weights holds no tensors, so it needs
        # no device placement.
        self.criterion = torch.nn.CrossEntropyLoss()
        # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.8, min_lr=0.0004, patience=3, verbose=False)

    def train(self):
        """Run one epoch over the loader and record its mean batch loss."""
        self.model.train() # set to training mode
        epoch_loss = 0.0
        num_batches = 0
        for inputs, targets in self.loader:
            epoch_loss += self.train_batch(inputs, targets)
            num_batches += 1

        # An empty loader used to raise NameError; report NaN instead.
        epoch_loss = epoch_loss / num_batches if num_batches else float('nan')
        self.epochs += 1
        print('[TRAIN]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs, self.max_epochs, epoch_loss))
        self.train_losses.append(epoch_loss)

    def train_batch(self, inputs, targets):
        """Run one optimisation step and return the batch loss as a float."""
        # Follow the model's own device rather than a notebook-level global.
        model_device = next(self.model.parameters()).device
        inputs = torch.stack(inputs).to(model_device)
        targets = torch.stack(targets).to(model_device)
        self.optimizer.zero_grad()
        outputs = self.model(inputs)
        # CrossEntropyLoss expects the class dimension second: (batch, vocab, seq_len).
        outputs = torch.transpose(outputs, 2, 1)
        loss = self.criterion(outputs, targets)
        loss.backward()
        self.optimizer.step()

        # Return a plain float so that accumulating losses does not keep
        # autograd-tracked tensors alive.
        return loss.item()

In [None]:
# TODO: define other hyperparameters here
# Number of trainer.train() passes run for the sequence-generation model.
NUM_EPOCHS = 100
# Number of chunks per batch yielded by LanguageModelDataLoader.
BATCH_SIZE = 48


In [None]:
run_id = str(int(time.time()))
# Place the model on the notebook-wide `device` instead of hard-coding
# .cuda(), so this cell also runs on a machine without a GPU.
gen_model = LanguageModel(len(vocab)).to(device)
loader = LanguageModelDataLoader(dataset=dataset, batch_size=BATCH_SIZE, seq_lens=[40, 60], shuffle=True)
trainer = LanguageModelTrainer(model=gen_model, loader=loader, max_epochs=NUM_EPOCHS, run_id=run_id)

In [None]:

# NOTE(review): best_nll is assigned here but never read in the visible notebook.
best_nll = 1e30 
# One trainer.train() call per epoch; each call prints that epoch's mean loss.
for epoch in range(NUM_EPOCHS):
    trainer.train()
    print("\n")

"""The purpose of the project is to convert audio recordings into phonemes and then classify the phonemes into intents.
Each sequence of phonemes is mapped to one of 6 intents. The model should be able to read phoneme sequence
and output an intent.
"""

In [None]:
"""
The dataset class reads sequnce of phonemes and a correspong intent.
The phonemes are mapped into indices using the above PHONEMES list
"""
class LibriSamples(torch.utils.data.Dataset):
    """Phoneme-index utterances paired with intent labels.

    For the 'train' partition every utterance is followed (after all the
    originals) by one augmented copy produced by the module-level `gen_model`.
    Other partitions contain only the original utterances.

    Args:
        recs: Number of records kept per group (forwarded to `records`).
        intents_lst: Intent names to keep; a label is its index in this list.
        speakers_num: Number of speakers drawn for the 'train' partition.
        partition: Sub-directory of 'phonemes/' to read.
    """

    def __init__(self, recs, intents_lst, speakers_num, partition= "train"):
        self.X_dir = 'phonemes/' + partition + '/*/spchdatadir/*/*'
        self.Y_dir = 'phonemes/' + partition + '/*/framedir/*/*'

        self.X_files = sorted(glob.glob(self.X_dir))
        self.Y_files = sorted(glob.glob(self.Y_dir))

        X_files = records(self.X_files, recs)
        Y_files = records(self.Y_files, recs)

        x_files, y_files, self.intents = intents_func(intents_lst, X_files, Y_files)

        if partition == 'train':
            self.X_files, self.Y_files = choose_speakers(speakers, x_files, y_files, speakers_num)
        else:
            self.X_files, self.Y_files = x_files, y_files

        self.PHONEMES = PHONEMES
        assert(len(self.X_files) == len(self.Y_files))
        self.inputs = []
        self.targets = []
        aug_inputs = []
        aug_targets = []

        # Only the training partition is augmented, so the generator is not
        # run at all for other partitions.
        augment = partition == 'train'
        # Generate with dropout disabled, then restore the generator's mode.
        was_training = gen_model.training
        gen_model.eval()
        try:
            for X_path, Y_path in zip(self.X_files, self.Y_files):
                X_indices = [PHONEMES.index(xx) for xx in np.load(X_path)]
                # Context manager: the handle is closed even if index() raises.
                with open(Y_path) as f:
                    Y_index = self.intents.index(f.read())

                self.inputs.append(X_indices)
                self.targets.append(Y_index)
                if augment:
                    aug_inputs.append(self._generate_variant(X_indices))  # Add new utterance to dataset
                    aug_targets.append(Y_index)
        finally:
            gen_model.train(was_training)

        self.inputs.extend(aug_inputs)
        self.targets.extend(aug_targets)
        assert(len(self.inputs) == len(self.targets))

    @staticmethod
    def _generate_variant(X_indices):
        """Return a copy of `X_indices` whose tokens after the first are
        `gen_model`'s greedy next-token predictions from the true prefix."""
        variant = X_indices.copy()
        if len(X_indices) < 2:
            return variant
        # gen_model's LSTM is unidirectional, so the output at position t of a
        # single pass depends only on tokens up to t. One pass over
        # X_indices[:-1] therefore covers every prefix, replacing one forward
        # pass per prefix.
        context = torch.tensor([X_indices[:-1]]).to(device)
        with torch.no_grad():
            preds = gen_model(context)
        variant[1:] = torch.argmax(preds[0], dim=1).tolist()
        return variant

    def __len__(self):
        # Count the stored samples, not the files: for the train partition
        # `inputs` also holds the augmented utterances, which a DataLoader
        # would otherwise never index.
        return len(self.inputs)

    def __getitem__(self, ind):
        return self.inputs[ind], self.targets[ind]

    def collate_fn(self, batch):
        """Pad a batch of (indices, label) pairs.

        Returns:
            tuple: (padded inputs of shape (batch, max_len), labels, lengths).
        """
        batch_x = [torch.tensor(x) for x, _ in batch]
        batch_y = [y for _, y in batch]
        batch_x_pad = pad_sequence(batch_x, batch_first=True, padding_value=0) #Utterances have variable length
        lengths_x = [len(x) for x in batch_x] #Store lengths of all utterances

        return batch_x_pad, torch.tensor(batch_y), torch.tensor(lengths_x)


In [None]:
batch_size = 16

def data_prep(recs, intent_lst, speakers_num):
    """Build train/validation loaders for one grid-search configuration.

    Args:
        recs: Number of records per group to keep.
        intent_lst: Intent names to classify.
        speakers_num: Number of training speakers to draw.

    Returns:
        tuple: (train_loader, val_loader, x, y, lx, label_sizes), where
        x, y, lx are the first validation batch (all None if the validation
        loader is empty) and label_sizes is the number of intents.
    """
    #Load dataloaders with specified records, intents list, and number of speakers
    train_data = LibriSamples(recs, intent_lst, speakers_num, 'train')
    val_data = LibriSamples(recs, intent_lst, speakers_num, 'validation')

    train_loader = torch.utils.data.DataLoader(train_data, batch_size, shuffle=True, collate_fn=train_data.collate_fn)
    val_loader = torch.utils.data.DataLoader(val_data, batch_size, shuffle=False, collate_fn=val_data.collate_fn)

    # One output unit per intent. The previous if/elif ladder only knew the
    # sizes 2, 3, 4 and 6 and returned 0 for anything else.
    label_sizes = len(intent_lst)

    # Peek at the first validation batch without raising on an empty loader.
    x, y, lx = next(iter(val_loader), (None, None, None))

    return train_loader, val_loader, x, y, lx, label_sizes

In [None]:
#Classifier architecture
class ICASSP2CNN(nn.Module):
    """Intent classifier: embedding -> two parallel 1-D convolutions ->
    batch norm + ReLU -> LSTM -> linear layer on the last layer's final
    hidden state.

    Args:
        vocab_size: Number of distinct input indices.
        embed_size: Embedding dimension (each convolution keeps this width).
        hidden_size: LSTM hidden size.
        num_lstm_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        label_size: Number of output classes.
    """

    def __init__(self, vocab_size, embed_size=128, hidden_size=512, num_lstm_layers = 2, bidirectional = False, label_size=31):
        super().__init__()
        self.n_layers = num_lstm_layers
        self.hidden = hidden_size
        self.bidirectional = bidirectional

        self.embed = nn.Embedding(vocab_size, embed_size)

        # Two length-preserving convolutions with different receptive fields.
        self.cnn = nn.Conv1d(embed_size, embed_size, kernel_size=3, padding=1)
        self.cnn2 = nn.Conv1d(embed_size, embed_size, kernel_size=5, padding=2)

        conv_channels = 2 * embed_size
        self.batchnorm = nn.BatchNorm1d(conv_channels)

        self.lstm = nn.LSTM(input_size=conv_channels,
                            hidden_size=hidden_size,
                            num_layers=num_lstm_layers,
                            bidirectional=bidirectional)

        directions = 2 if bidirectional else 1
        self.linear = nn.Linear(in_features=directions * hidden_size,
                                out_features=label_size)

    def forward(self, x, lengths):
        """
        x: (B,T) padded LongTensor
        lengths: true length of each sequence in the batch
        Returns (B, label_size) logits.
        """
        embedded = self.embed(x)
        batch_size = embedded.size(0)

        # Conv1d wants channels before time: (B, T, E) -> (B, E, T).
        channels_first = embedded.transpose(1, 2)
        conv_features = torch.cat((self.cnn(channels_first), self.cnn2(channels_first)), dim=1)
        activated = F.relu(self.batchnorm(conv_features))

        # Back to (B, T, 2E) for packing.
        time_major_features = activated.transpose(1, 2)
        packed = nn.utils.rnn.pack_padded_sequence(time_major_features, lengths, batch_first=True, enforce_sorted=False)
        _, (hn, cn) = self.lstm(packed)

        if not self.bidirectional:
            summary = hn[-1]
        else:
            # Split the stacked states into (layer, direction, batch, hidden)
            # and join both directions of the top layer.
            top_layer = hn.view(self.n_layers, 2, batch_size, self.hidden)[-1]
            summary = torch.cat((top_layer[0], top_layer[1]), dim=1)

        return self.linear(summary)
    
# Use the notebook-wide `device` rather than a hard-coded .cuda(), so this
# cell does not fail on a machine without a GPU.
model = ICASSP2CNN(len(PHONEMES), label_size=4).to(device)


In [None]:
# NOTE(review): these module-level objects are bound to the module-level
# `model`. train() below constructs its own model, criterion, optimizer and
# scaler, so nothing in the visible notebook reads them.
criterion = torch.nn.CrossEntropyLoss() 
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0.9, weight_decay=1e-4)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

# Gradient scaler for mixed-precision training.
scaler = torch.cuda.amp.GradScaler()
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.999, min_lr=0.0005, patience=5, verbose=False)


In [None]:
def train(train_loader, val_loader,label_sizes, intent=None, k=None, s=None, p=None):  # todo: separation of train & validation. Which data should we train on?
    """Train a fresh ICASSP2CNN and return its best validation accuracy.

    Args:
        train_loader: Loader yielding (padded inputs, labels, lengths).
        val_loader: Loader with the same batch layout, used for validation.
        label_sizes: Number of output classes.
        intent, k, s, p: Values shown in the progress-bar text only.

    Returns:
        Best validation accuracy in percent seen over all epochs.
    """
    torch.cuda.empty_cache()
    # Mixed precision only applies on CUDA; on CPU autocast and the scaler
    # are switched off.
    use_amp = device == 'cuda'
    model = ICASSP2CNN(len(PHONEMES), label_size=label_sizes).to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0.9, weight_decay=1e-4)

    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
    # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.999, min_lr=0.0005, patience=5, verbose=False)
    last_improvement = 0
    best_acc = 0
    epochs = 1000
    patience_epochs = 400  # stop after this many epochs without a new best
    epoch_bar = tqdm(total=epochs, dynamic_ncols=True, leave=True, position=0, desc='Train')

    try:
        for epoch in range(epochs):
            model.train()
            num_correct = 0
            total_loss = 0
            train_seen = 0

            for x, y, input_lengths in train_loader:
                x = x.to(device)
                y = y.long().to(device)

                optimizer.zero_grad()

                with torch.cuda.amp.autocast(enabled=use_amp):
                    outputs = model(x, input_lengths)
                    loss = criterion(outputs, y)

                num_correct += int((torch.argmax(outputs,  axis=1) == y).sum())
                total_loss += float(loss)
                train_seen += y.size(0)

                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()

            model.eval()
            num_correct2 = 0
            val_seen = 0
            with torch.no_grad():
                for x, y, input_lengths in val_loader:
                    x = x.to(device)
                    y = y.to(device)
                    outputs = model(x, input_lengths)
                    num_correct2 += int((torch.argmax(outputs,  axis=1) == y).sum())
                    val_seen += y.size(0)

            last_improvement += 1

            # Divide by the number of samples actually seen.
            # len(loader) * batch_size over-counts whenever the last batch is
            # partial and therefore under-reports accuracy.
            validation_score = 100 * num_correct2 / max(val_seen, 1)
            if best_acc < validation_score:
                best_acc = validation_score
                last_improvement = 0

            if last_improvement == patience_epochs:
                # print(f"Breaking since model refused to learn for {patience_epochs} patience epochs")
                break
            m = "i={} k={} s={} p={} - Epoch {}/{}: Train Acc {:.04f}%, Train Loss {:.04f}, Learning Rate {:.04f} CurrAcc: {:.2f} BestAcc: {:.2f} currPatience: {}".format(
                intent, k, s, p, int(epoch) + 1, epochs,
                100 * num_correct / max(train_seen, 1),
                float(total_loss / max(len(train_loader), 1)),
                float(optimizer.param_groups[0]['lr']), validation_score, best_acc, patience_epochs-last_improvement )
            epoch_bar.set_description(m)
            epoch_bar.update()
    finally:
        # Close the bar on early stop and on errors alike.
        epoch_bar.close()

    return best_acc

In [None]:
# Search space: one intent set ("Two") crossed with 1-7 speakers and
# 1-7 recordings.
params = {
    'intents': zip(["Two"], [intents_2]),
    'speakers': range(1, 8),
    'recordings': range(1, 8),
}

intent_scores = {}

# Grid search: for each (recordings, speakers) pair, build fresh loaders,
# train a fresh classifier and keep its best validation accuracy.
# Result layout: intent_scores[name][recordings][speakers] -> accuracy.
for name, intent in params['intents']:
    k_scores = {}
    for k in params['recordings']:
        s_scores = {}
        for s in params['speakers']:
            train_loader, val_loader, x, y, lx, label_sizes = data_prep(recs=k, intent_lst=intent, speakers_num=s)
            s_scores[s] = train(train_loader, val_loader, label_sizes, intent=name, k=k, s=s, p=len(PHONEMES))
        k_scores[k] = s_scores
    intent_scores[name] = k_scores

In [None]:
# Display the nested results: intent-set name -> recordings -> speakers -> best validation accuracy.
intent_scores

In [None]:
import pandas as pd

# One (name, DataFrame) pair per intent set, built from its nested scores.
dfs = [(name, pd.DataFrame(scores)) for name, scores in intent_scores.items()]

# Generate one .csv file of grid search results per intent set.
for name, frame in dfs:
    frame.to_csv(f"{name}.csv")