<a href="https://colab.research.google.com/github/GeoffreyKimani/Google-Books-Android/blob/master/project_(4)_Jeff_Version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gdown



In [None]:
# Download the phoneme dataset archive from Google Drive; according to the cell
# output it is saved as /content/phonemes.zip.
!gdown --fuzzy https://drive.google.com/file/d/1LIrogRWSL-4CifdzciM6vV8V30JArQG6/view?usp=sharing

Downloading...
From: https://drive.google.com/uc?id=1LIrogRWSL-4CifdzciM6vV8V30JArQG6
To: /content/phonemes.zip
  0% 0.00/5.22M [00:00<?, ?B/s]100% 5.22M/5.22M [00:00<00:00, 200MB/s]


In [None]:
# Extract quietly (-q) and never overwrite files that already exist (-n).
!unzip -qn ./phonemes.zip
# Remove a stray notebook from the validation data: the '*/spchdatadir/*/*'
# glob patterns used later would otherwise pick it up as a phoneme file.
!rm -rf '/content/phonemes/validation/pp10/spchdatadir/recording1/Untitled.ipynb'

In [None]:
!pip install torchsummaryX 



In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import os.path as osp
import glob
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchsummaryX import summary
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from tqdm import tqdm
import random

# Seed every random number generator the notebook draws from so that a fresh
# "Restart & Run All" is repeatable: torch (weight init, DataLoader shuffling),
# Python's `random` (speaker selection in choose_speakers) and NumPy (not used
# for randomness in the visible cells, seeded for completeness).
SEED = 1
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

Device:  cuda


In [None]:
# Intent vocabularies for the 6-, 4- and 2-class experiments.
# The smaller vocabularies are subsets of the 6-intent list, in the same order.
intents_6 = ['move', 'turn', 'approach', 'grab', 'point', 'lift']
intents_4 = intents_6[2:]                    # approach, grab, point, lift
intents_2 = [intents_6[2], intents_6[-1]]    # approach, lift


In [None]:
# Glob patterns for the phoneme-sequence files of each partition.
X_dir = '/content/phonemes/train/*/spchdatadir/*/*'
X_dir_val = '/content/phonemes/validation/*/spchdatadir/*/*'
X_dir_test = '/content/phonemes/test/*/spchdatadir/*/*'

# List every partition with its OWN pattern. (All three lists used to be built
# from the training pattern, so validation/test phonemes were never scanned.)
X_files_train = sorted(glob.glob(X_dir))
X_files_val = sorted(glob.glob(X_dir_val))
X_files_test = sorted(glob.glob(X_dir_test))

# Concatenate into a new list so that X_files_train itself is left untouched.
files = X_files_train + X_files_val + X_files_test

# Collect the distinct phoneme symbols that occur in any file.
phones = set()
for x in files:
    f = np.load(x)
    phones.update(f)

# Sort so the phoneme -> index mapping is identical on every run; the iteration
# order of a set of strings can change between interpreter sessions.
PHONEMES = sorted(phones)



### Helper functions

In [None]:
def records(lis, num, group_size=15):
    """Keep the first `num` items of every consecutive block of `group_size` items.

    Args:
        lis: Sequence to sub-sample (here: a sorted list of file paths).
        num: How many leading items to keep from each block. If `num` is
            larger than `group_size` the slices overlap and items repeat.
        group_size: Length of one block. Defaults to 15, the value that was
            previously hard-coded (presumably the number of files per
            recording group in the sorted listing -- confirm against the data).

    Returns:
        A new list with the selected items in their original order.
    """
    kept = []
    for start in range(0, len(lis), group_size):
        kept.extend(lis[start:start + num])
    return kept

def intents_func(intents_lis, X_files, Y_files):
    """Keep only the (X, Y) file pairs whose intent is in `intents_lis`.

    Args:
        intents_lis: Allowed intent strings.
        X_files: Phoneme file paths, positionally aligned with `Y_files`.
        Y_files: Intent file paths; each file's full content is the intent.
            The content is compared as-is (no stripping), which matches how
            the dataset later looks the intent up.

    Returns:
        (xfiles, yfiles, intents_lis): the filtered X paths, the filtered Y
        paths, and the intent list that was passed in.
    """
    xfiles = []
    yfiles = []
    for i, y_path in enumerate(Y_files):
        # `with` closes every file; the handles were previously left open.
        with open(y_path) as fh:
            intent = fh.read()
        if intent in intents_lis:
            xfiles.append(X_files[i])
            yfiles.append(y_path)

    return xfiles, yfiles, intents_lis

def choose_speakers(speakers_lis, xlis, ylis, n=7):
    """Randomly pick `n` distinct speakers and keep only their files.

    A file belongs to a speaker when the speaker id is one of the '/'-separated
    components of its X path.

    Speakers are drawn WITHOUT replacement. The previous `random.choices` call
    drew with replacement, so fewer than `n` distinct speakers could be used
    and the files of a speaker drawn twice were appended twice.

    Args:
        speakers_lis: Candidate speaker ids (e.g. 'pp2').
        xlis: Phoneme file paths.
        ylis: Intent file paths, positionally aligned with `xlis`.
        n: Number of speakers to keep; capped at len(speakers_lis).

    Returns:
        (x_train_files, y_train_files) for the chosen speakers, in the
        original file order.
    """
    how_many = min(n, len(speakers_lis))
    chosen = set(random.sample(speakers_lis, how_many))

    x_train_files = []
    y_train_files = []
    for x_path, y_path in zip(xlis, ylis):
        if not chosen.isdisjoint(x_path.split('/')):
            x_train_files.append(x_path)
            y_train_files.append(y_path)
    return x_train_files, y_train_files

In [None]:
# Speaker ids 'pp2' .. 'pp8': the pool from which training speakers are drawn.
speakers = [f'pp{idx}' for idx in range(2, 9)]

In [None]:
len(PHONEMES)

119

"""The purpose of the project is to convert audio recordings into phonemes and then classify the phonemes into intents.
Each sequence of phonemes is mapped to one of 6 intents. The model should be able to read a phoneme sequence
and output its intent.
"""

In [None]:
"""
The dataset class reads a sequence of phonemes and its corresponding intent.
The phonemes are mapped to indices using the PHONEMES list defined above.
"""
class LibriSamples(torch.utils.data.Dataset):
    """Dataset of (phoneme-index sequence, intent index) pairs read from disk.

    Phoneme files (``spchdatadir``) and intent files (``framedir``) of one
    partition are listed in sorted order and paired by position. The listing
    is then reduced by `records` (first `recs` files of each block),
    `intents_func` (only intents in `intents_lst`) and, for the training
    partition only, `choose_speakers` (random subset of the global `speakers`).

    Args:
        recs: Number of leading files kept from each block (see `records`).
        intents_lst: Intent strings to keep; an item's label is the index of
            its intent in this list.
        speakers_num: Number of training speakers to draw; ignored for
            partitions other than 'train'.
        partition: Sub-directory of /content/phonemes to read
            ('train', 'validation', ...).
    """

    def __init__(self, recs, intents_lst, speakers_num, partition= "train"): # You can use partition to specify train or dev

        self.X_dir = '/content/phonemes/' + partition + '/*/spchdatadir/*/*'
        self.Y_dir = '/content/phonemes/' + partition + '/*/framedir/*/*'

        self.X_files = sorted(glob.glob(self.X_dir))  # phoneme files
        self.Y_files = sorted(glob.glob(self.Y_dir))  # intent files

        # X and Y are paired purely by position, so a length mismatch here
        # would silently mis-label samples further down.
        assert len(self.X_files) == len(self.Y_files), (
            "phoneme/intent listings differ in length for partition %r" % partition)

        X_files = records(self.X_files, recs)
        Y_files = records(self.Y_files, recs)

        x_files, y_files, self.intents = intents_func(intents_lst, X_files, Y_files)

        if partition == 'train':
            self.X_files, self.Y_files = choose_speakers(speakers, x_files, y_files, speakers_num)
        else:
            self.X_files, self.Y_files = x_files, y_files

        self.PHONEMES = PHONEMES
        # Constant-time phoneme lookup instead of a list scan per phoneme.
        self._phoneme_to_index = {p: i for i, p in enumerate(self.PHONEMES)}
        assert(len(self.X_files) == len(self.Y_files))

    def __len__(self):
        return len(self.X_files)

    def __getitem__(self, ind):
        """Return (LongTensor of phoneme indices, int intent index) for item `ind`."""
        X_path = self.X_files[ind]
        Y_path = self.Y_files[ind]  # file whose content is one of self.intents

        X = np.load(X_path)  # array of phoneme symbols for one recording
        try:
            X_indices = [self._phoneme_to_index[xx] for xx in X]
        except KeyError as err:
            # Same exception type that list.index raised before.
            raise ValueError(
                "%r is not in PHONEMES (file: %s)" % (err.args[0], X_path)) from err

        # `with` guarantees the file is closed even if the lookup below fails.
        with open(Y_path) as f:
            r = f.read()
        Y_index = self.intents.index(r)

        return torch.LongTensor(X_indices), Y_index

    def collate_fn(self, batch):
        """Pad a list of (sequence, label) items into batch tensors.

        Returns:
            (padded sequences of shape (B, T_max), labels of shape (B,),
            original sequence lengths of shape (B,)).
        """
        # as_tensor reuses existing tensors; torch.tensor(tensor) copies them
        # and emits a UserWarning.
        batch_x = [torch.as_tensor(x) for x, _ in batch]
        batch_y = [int(y) for _, y in batch]
        # Utterances have variable length. Note that the pad value 0 is also a
        # valid phoneme index; the returned lengths tell padding from data.
        batch_x_pad = pad_sequence(batch_x, batch_first=True, padding_value=0)
        lengths_x = [len(x) for x in batch_x]

        return batch_x_pad, torch.tensor(batch_y), torch.tensor(lengths_x)

In [None]:
def data_prep(recs, intent_lst, speakers_num, batch_size=64):
    """Build the train/validation loaders and fetch one validation batch.

    Args:
        recs: Number of leading files kept per block (passed to LibriSamples).
        intent_lst: Intent strings to keep (passed to LibriSamples).
        speakers_num: Number of training speakers to draw.
        batch_size: Batch size of both loaders; defaults to the previously
            hard-coded 64.

    Returns:
        (train_loader, val_loader, x, y, lx) where (x, y, lx) is the first
        validation batch: padded sequences, labels and sequence lengths.

    Raises:
        ValueError: If the validation set is empty, so no example batch can
            be returned (previously an UnboundLocalError at the return).
    """
    train_data = LibriSamples(recs, intent_lst, speakers_num, 'train')
    val_data = LibriSamples(recs, intent_lst, speakers_num, 'validation')

    train_loader = torch.utils.data.DataLoader(
        train_data, batch_size, shuffle=True, collate_fn=train_data.collate_fn)
    val_loader = torch.utils.data.DataLoader(
        val_data, batch_size, shuffle=False, collate_fn=val_data.collate_fn)

    # One example batch, e.g. for a shape check or as sample model input.
    try:
        x, y, lx = next(iter(val_loader))
    except StopIteration:
        raise ValueError(
            "validation set is empty for recs=%r, intents=%r" % (recs, intent_lst)) from None

    return train_loader, val_loader, x, y, lx

In [None]:
# Build the loaders once for a single configuration: the 4-intent vocabulary,
# recs=7 and 5 training speakers. data_prep also returns one validation batch
# (x, y, lx); the model-summary cell further down uses x and lx as sample input.
train_loader, val_loader, x, y, lx = data_prep(recs=7, intent_lst=intents_4, speakers_num=5)



In [None]:
class ICASSP3CNN(nn.Module):
    """Phoneme-sequence classifier: embedding -> 3 parallel Conv1d -> BatchNorm/ReLU -> LSTM -> linear.

    Args:
        vocab_size: Number of distinct phoneme indices.
        embed_size: Embedding width; also the channel count of each conv.
        hidden_size: LSTM hidden size.
        num_lstm_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM is bidirectional.
        label_size: Number of output classes (logits).
    """

    def __init__(self, vocab_size, embed_size=128, hidden_size=512, num_lstm_layers = 2, bidirectional = False, label_size=31):
        super().__init__()
        self.n_layers = num_lstm_layers
        self.hidden = hidden_size
        self.bidirectional = bidirectional

        self.embed = nn.Embedding(vocab_size, embed_size)

        # Three convolutions over the same input with kernel sizes 3/5/7; the
        # paddings 1/2/3 keep the time dimension unchanged so that the outputs
        # can be concatenated along the channel axis.
        self.cnn  = nn.Conv1d(embed_size, embed_size, kernel_size=3, padding=1)
        self.cnn2 = nn.Conv1d(embed_size, embed_size, kernel_size=5, padding=2)
        self.cnn3 = nn.Conv1d(embed_size, embed_size, kernel_size=7, padding=3)

        self.batchnorm = nn.BatchNorm1d(3 * embed_size)

        # Inter-layer dropout has no effect on a single-layer LSTM and only
        # triggers a UserWarning there, so it is enabled for stacked LSTMs only.
        self.lstm = nn.LSTM(input_size = 3 * embed_size,
                            hidden_size = hidden_size,
                            num_layers = num_lstm_layers,
                            bidirectional = bidirectional,
                            dropout = 0.2 if num_lstm_layers > 1 else 0.0)

        self.linear = nn.Linear(in_features = 2 * hidden_size if bidirectional else hidden_size,
                                out_features = label_size)


    def forward(self, x, lengths):
        """Compute class logits for a padded batch.

        Args:
            x: (B, T) padded LongTensor of phoneme indices.
            lengths: Unpadded length of every sequence (tensor or list).

        Returns:
            (B, label_size) tensor of logits.
        """
        batch_size = x.size(0)

        embedded = self.embed(x)                  # (B, T, E)
        features = embedded.transpose(1, 2)       # (B, E, T) as expected by Conv1d

        cnn_output = torch.cat(
            [self.cnn(features), self.cnn2(features), self.cnn3(features)], dim=1)  # (B, 3E, T)

        features = F.relu(self.batchnorm(cnn_output))
        features = features.transpose(1, 2)       # (B, T, 3E)

        # pack_padded_sequence requires the lengths on the CPU.
        lengths = torch.as_tensor(lengths).cpu()
        pack_tensor = nn.utils.rnn.pack_padded_sequence(
            features, lengths, batch_first=True, enforce_sorted=False)
        _, (hn, cn) = self.lstm(pack_tensor)

        if self.bidirectional:
            # hn is (layers * 2, B, H): split the direction axis and join the
            # forward and backward states of the last layer.
            h_n = hn.view(self.n_layers, 2, batch_size, self.hidden)
            h_n = torch.cat([ h_n[-1, 0,:], h_n[-1,1,:] ], dim = 1)
        else:
            h_n = hn[-1]  # final hidden state of the last layer

        logits = self.linear(h_n)

        return logits

In [None]:
#Architecture implemented in the paper
# Model Parameters Value
# Embedding Size 256
# CNN kernel size 3
# No. of CNN filters 256
# No. of LSTM layers 1 ( or 2)
# LSTM hidden size 256
# Batch Normalization False


In [None]:
# Create the model on the detected device. The hard-coded `.cuda()` crashed on
# CPU-only runtimes even though `device` already falls back to 'cpu'.
# NOTE(review): label_size keeps its default of 31 although the notebook uses at
# most 6 intents -- confirm whether this is intended.
model = ICASSP3CNN(len(PHONEMES)).to(device)
# Print the per-layer summary using the example validation batch as input.
summary(model, x.to(device), lx)

              Kernel Shape   Output Shape   Params  Mult-Adds
Layer                                                        
0_embed         [128, 119]  [64, 25, 128]    15232      15232
1_cnn        [128, 128, 3]  [64, 128, 25]    49280    1228800
2_cnn2       [128, 128, 5]  [64, 128, 25]    82048    2048000
3_cnn3       [128, 128, 7]  [64, 128, 25]   114816    2867200
4_batchnorm          [384]  [64, 384, 25]      768        384
5_lstm                   -     [787, 512]  3940352    3932160
6_linear         [512, 31]       [64, 31]    15903      15872
-------------------------------------------------------------
                        Totals
Total params           4218399
Trainable params       4218399
Non-trainable params         0
Mult-Adds             10107648


  df_sum = df.sum()


Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_embed,"[128, 119]","[64, 25, 128]",15232,15232
1_cnn,"[128, 128, 3]","[64, 128, 25]",49280,1228800
2_cnn2,"[128, 128, 5]","[64, 128, 25]",82048,2048000
3_cnn3,"[128, 128, 7]","[64, 128, 25]",114816,2867200
4_batchnorm,[384],"[64, 384, 25]",768,384
5_lstm,-,"[787, 512]",3940352,3932160
6_linear,"[512, 31]","[64, 31]",15903,15872


In [None]:
# model = Network(256, 256, 1, 6).cuda() 
# summary(model, x.to(device), lx) 

In [None]:
# Loss: cross-entropy between the model's logits and the intent index.
criterion = torch.nn.CrossEntropyLoss() 
# The active optimizer is SGD with momentum and weight decay; the Adam variants
# are left commented out as alternatives.
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=1e-4)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

# Gradient scaler for the mixed-precision (torch.cuda.amp.autocast) training step in train().
scaler = torch.cuda.amp.GradScaler()
# Learning-rate scheduler, currently disabled.
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.999, min_lr=0.0005, patience=5, verbose=False)


In [None]:
def train(train_loader, val_loader,  batch_size=64, epochs=1000):
    """Train the global `model` and return the last epoch's validation accuracy.

    Uses the module-level `model`, `criterion`, `optimizer`, `scaler` and
    `device`. Each epoch runs one pass over `train_loader` followed by one
    evaluation pass over `val_loader`.

    Two defects of the earlier version are fixed here:
      * the `return` sat inside the epoch loop, so only ONE epoch ever ran;
        pass `epochs=1` to get that behaviour back on purpose;
      * accuracies were divided by `number_of_batches * batch_size`, which
        under-reports whenever the last batch is not full (always the case for
        sets smaller than one batch). They are now divided by the number of
        samples actually seen.

    Args:
        train_loader: Loader yielding (x, y, lengths) training batches.
        val_loader: Loader yielding (x, y, lengths) validation batches.
        batch_size: Kept for backward compatibility; no longer needed because
            samples are counted directly.
        epochs: Number of epochs to run (default 1000, the value that was
            hard-coded before).

    Returns:
        Validation accuracy in percent after the final epoch (0.0 when
        `epochs` is 0).
    """
    torch.cuda.empty_cache()
    validation_score = 0.0

    for epoch in range(epochs):

        # ---- training pass ----
        model.train()
        batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')

        num_correct = 0
        num_seen = 0
        total_loss = 0.0

        for i, (x, y, input_lengths) in enumerate(train_loader):
            x = x.to(device)
            y = y.long().to(device)

            optimizer.zero_grad()

            with torch.cuda.amp.autocast():
                outputs = model(x, input_lengths)
                loss = criterion(outputs, y)

            num_correct += int((torch.argmax(outputs, axis=1) == y).sum())
            num_seen += y.size(0)
            total_loss += float(loss)

            batch_bar.set_postfix(
                acc="{:.04f}%".format(100 * num_correct / num_seen),
                loss="{:.04f}".format(float(total_loss / (i + 1))),
                num_correct=num_correct,
                lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])))

            # Scaled backward pass and optimizer step (mixed precision).
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            batch_bar.update()
        batch_bar.close()

        print("Epoch {}/{}: Train Acc {:.04f}%, Train Loss {:.04f}, Learning Rate {:.04f}".format(
            epoch + 1,
            epochs,
            100 * num_correct / max(num_seen, 1),
            float(total_loss / max(len(train_loader), 1)),
            float(optimizer.param_groups[0]['lr'])))

        # ---- validation pass ----
        model.eval()
        batch_bar = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')
        num_correct2 = 0
        num_seen2 = 0
        for x, y, input_lengths in val_loader:
            x = x.to(device)
            y = y.to(device)

            with torch.no_grad():
                outputs = model(x, input_lengths)

            num_correct2 += int((torch.argmax(outputs, axis=1) == y).sum())
            num_seen2 += y.size(0)
            batch_bar.set_postfix(acc="{:.04f}%".format(100 * num_correct2 / num_seen2))

            batch_bar.update()

        batch_bar.close()
        validation_score = 100 * num_correct2 / max(num_seen2, 1)
        print("Validation: {:.04f}%".format(validation_score))

    return validation_score

In [None]:
def train_tst():
    """Scratch helper: print 'Hello 0' through 'Hello 9', one per line."""
    for greeting in (f'Hello {step}' for step in range(10)):
        print(greeting)

# Deliberate stop so that "Run all" does not launch the long grid search below.
# A bare `break` at top level is a SyntaxError (and makes the notebook
# unparseable when exported as a script), so the stop is made explicit.
class StopExecution(Exception):
    """Raised on purpose to halt a notebook "Run all" at this cell."""


try:
    get_ipython  # only defined when running under IPython / Jupyter
except NameError:
    pass  # plain Python (e.g. exported script): nothing to halt
else:
    raise StopExecution("Stopping here; run the grid-search cells below manually.")

## Tests

In [None]:
# Grid search over the intent vocabulary, the number of recordings kept per
# block (`recs`) and the number of training speakers.
params = {
    # A list, not a bare zip: a zip iterator is exhausted after one pass.
    'intents': list(zip(["Two", "Four", "Six"], [intents_2, intents_4, intents_6])),
    'speakers': range(1, 8),
    'recordings': range(5, 12)
}

# intent_scores[name][recordings][speakers] -> validation accuracy (%)
intent_scores = {}
for name, intent in params['intents']:

    k_scores = {}
    for k in params['recordings']:

        s_scores = {}
        for s in params['speakers']:
            print(f'i: {name} k: {k} s: {s} phones: {len(PHONEMES)}')

            train_loader, val_loader, x, y, lx = data_prep(recs=k, intent_lst=intent, speakers_num=s)

            # Start every configuration from a fresh model, optimizer and
            # scaler (train() reads these globals). Reusing one model across
            # configurations would carry weights over -- even between different
            # label sets -- and make the scores incomparable. The output layer
            # is sized to the current intent vocabulary. Optimizer settings
            # mirror the ones defined in the optimizer cell above.
            model = ICASSP3CNN(len(PHONEMES), label_size=len(intent)).to(device)
            optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=1e-4)
            scaler = torch.cuda.amp.GradScaler()

            val_score = train(train_loader, val_loader)

            s_scores[s] = val_score

        k_scores[k] = s_scores
    intent_scores[name] = k_scores

In [None]:
len(PHONEMES)

In [None]:
# intent_scores

In [None]:
import pandas as pd

# One (intent-set name, score table) pair per intent vocabulary. In each table
# the columns are the `recordings` values and the rows the `speakers` values.
df = [(set_name, pd.DataFrame(scores)) for set_name, scores in intent_scores.items()]

# Show the name and the table of the first intent vocabulary.
first_name, first_table = df[0]
display(first_name)
first_table

In [None]:
# Load the first training phoneme file (in sorted path order) as a sample.
train_phoneme_paths = sorted(glob.glob('/content/phonemes/train/*/spchdatadir/*/*'))
ph = np.load(train_phoneme_paths[0])

In [None]:
# Position of every phoneme of the sample within PHONEMES.
label = list(map(PHONEMES.index, ph))

In [None]:
label

In [None]:
# embeds(torch.LongTensor(label))

In [None]:
# Validation intent files in sorted order; the cell displays how many there are.
fk = glob.glob('/content/phonemes/validation/*/framedir/*/*')
fk.sort()
len(fk)

In [None]:
# Training phoneme files in sorted order; the cell displays how many there are.
fl = glob.glob('/content/phonemes/train/*/spchdatadir/*/*')
fl.sort()
len(fl)

In [None]:
# Same clean-up as right after unzipping (stray notebook in the validation
# data); repeating it is harmless because `rm -f` ignores missing files.
!rm -rf '/content/phonemes/validation/pp10/spchdatadir/recording1/Untitled.ipynb'

In [None]:
# fl

In [None]:
# Read the content of one intent file.
# `fl` and `fk` are LISTS of paths, so `open(fl)` raised a TypeError; a single
# path has to be opened instead.
# NOTE(review): the first validation intent file (`fk[0]`) is assumed here
# because the text is looked up in an intent list further down; the `fl`
# entries are NumPy phoneme files, not text -- confirm the intended file.
# The handle stays open on purpose: the next cell closes it.
f = open(fk[0])
r = f.read()

In [None]:
# The previous cell already read the whole file into `r`. Reading again would
# return '' (the handle is at end-of-file) and overwrite the intent, so only
# close the handle and display what was read.
f.close()
r

In [None]:
# Index of the intent that was read, within the full 6-intent vocabulary.
# NOTE(review): the name `intents` is not defined in the visible notebook;
# `intents_6` is assumed because it contains every intent -- confirm.
intents_6.index(r)