# 11785-DeepLearning Sequence to Sequence Translation  `Mar 29`

In this homework you will again be working with speech data. We are going to be using
unaligned labels in this contest, which means the correlation between the features and labels
is not given explicitly and your model will have to igure this out by itself. Hence your data
will have a list of phonemes for each utterance, but not which frames correspond to which
phonemes.
Your main task for this assignment will be to predict the phonemes contained in utterances
in the test set. You are not given aligned phonemes in the training data, and you are not
asked to produce alignment for the test data.



## DataLoader
`wsj0 dataloader`: Dataloder will return a matrix of `(time step, 40)` of length `frames`

In [2]:
from __future__ import print_function, division
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import *

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

class wsj0dataset(Dataset):
    def __init__(self, file_dir, label_dir=None):
        self.X = np.load(file_dir, allow_pickle=True)
        
        self.label_dir = label_dir
        print("Successfully load the data file {0}".format(file_dir))
        self.frame_num = self.X.shape[0]
        
                  
        if label_dir:
            self.Y = np.load(label_dir, allow_pickle=True)
            print("The label set is loaded, shape = ,", self.Y.shape)
        else:
            print("The data shape is {0}".format(self.X[0].shape))
            
        for i in range(1):
            print("The first data shape is ", self.X[0].shape)
            if label_dir:
                print("The first label shape is", self.Y[0].shape)
                
    def __len__(self):  
        return self.frame_num
        
    def __getitem__(self, idx):
        if self.label_dir:
            
            return (torch.from_numpy(self.X[idx]).long(), torch.from_numpy(self.Y[idx]).long())
        else:
            return torch.from_numpy(self.X[idx]).long()

cpu


### Initialize Train and Dev and Test  dataloader

* `wsj0 train.npy`: This file contains your feature data for training the model. It will be
of the shape `(frames, time step, 40)`, where the second dimension will be variable as in
HW1P2.

* `wsj0 dev.npy`: This file is similar to wsj0 train.npy, but should be used to calculate
your validation losses and accuracy.

* `wsj0 dev.npy`: This file is similar to wsj0 train.npy, but should be used to calculate
your validation losses and accuracy.

* `wsj0 test.npy`: This file is similar to wsj0 train.npy, but should be used to predict the
phoneme labels for the final Kaggle submission.

In [3]:
from torch.nn.utils.rnn import pack_sequence
from torch.utils.data import DataLoader

def train_dev_collate(batch):
    X = [torch.FloatTensor(item[0].float()) for item in batch]
    X = pad_sequence(X)
    Y = [torch.FloatTensor(item[1].float()) for item in batch]
    Y = pad_sequence(Y, batch_first=True)
    
    X_lens = torch.FloatTensor([x.size(0) for x, _ in batch])
    Y_lens = torch.FloatTensor([y.size(0) for _, y in batch])
    return (X, Y, X_lens, Y_lens)

def test_collate(batch):
    X = [torch.FloatTensor(item.float()) for item in batch]
    X = pad_sequence(X)
    
    X_lens = torch.FloatTensor([x.size(0) for x in batch])
    return (X, X_lens)

In [4]:
import os, sys

def InitDataLoader(root_dir):
    kwargs={'num_workers':4, 'pin_memory': True}
    # Initialize tarining data
    train_path = os.path.join(root_dir, "wsj0_train")
    train_label_path = os.path.join(root_dir, "wsj0_train_merged_labels.npy")
    dev_path = os.path.join(root_dir, "wsj0_dev.npy")
    dev_label_path = os.path.join(root_dir, "wsj0_dev_merged_labels.npy")
    test_path = os.path.join(root_dir, "wsj0_test")
    
    train_dataloader = DataLoader(wsj0dataset(train_path, train_label_path), 
                                  batch_size=16,
                                  shuffle=True, 
                                  collate_fn=train_dev_collate,
                                  **kwargs)
    dev_dataloader = DataLoader(wsj0dataset(dev_path, dev_label_path), 
                                batch_size=16,
                                shuffle=True,
                                collate_fn=train_dev_collate,
                                **kwargs)
    test_dataloader = DataLoader(wsj0dataset(test_path), 
                                 batch_size=16,
                                 shuffle=False,
                                 collate_fn=test_collate,
                                 **kwargs)
    
    return train_dataloader, dev_dataloader, test_dataloader

In [5]:
root_dir = "hw3p2"
train_dataloader, dev_dataloader, test_dataloader = InitDataLoader(root_dir)

Successfully load the data file hw3p2/wsj0_train
The label set is loaded, shape = , (24724,)
The first data shape is  (548, 40)
The first label shape is (57,)
Successfully load the data file hw3p2/wsj0_dev.npy
The label set is loaded, shape = , (1106,)
The first data shape is  (958, 40)
The first label shape is (100,)
Successfully load the data file hw3p2/wsj0_test
The data shape is (535, 40)
The first data shape is  (535, 40)


### Unit Test for dataloader

In [None]:
for i, data in enumerate(test_dataloader):
    if i < 1:
        print("Test Dataloader")
        x, x_len = data
        print(x_len)
        print("the {0}th test_data is {1} and {2}".format(i, x.size(), x_len.size()))
    else:
        break
        
for i, data in enumerate(train_dataloader):
    if i < 1:
        print("Train Dataloader")
        x, y, x_len, y_len = data
        print(x_len)
        print("the {0}th train_data is {1} and {2}".format(i, x.size(), x_len.size()))
    else:
        break

for i, data in enumerate(dev_dataloader):
    if i < 1:
        print("Train Dataloader")
        x, y, x_len, y_len = data
        print("the {0}th dev_data is {1} and {2}".format(i, x.size(), x_len.size()))
    else:
        break

## LSTM model


In [None]:
import torch.nn as nn
class LSTMmodel(nn.Module):
    def __init__(self, input_size, out_size, hidden_size):
        super(LSTMmodel, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.gru = nn.LSTM(input_size,  hidden_size, num_layers = 3, bidirectional = True)
        self.output = nn.Linear(hidden_size * 2, out_size)
        
    def forward(self, X, lengths):
        print("1. Initail X size,", X.size())
        packed_X = pack_padded_sequence(X, lengths, enforce_sorted=False)
        
        print("2. packed_X size", packed_X.data.size())
        packed_out = self.gru(packed_X)[0]
        
        print("3. packed_out size", packed_out.data.size())
        out, out_lens = pad_packed_sequence(packed_out)
        
        out = self.output(out).log_softmax(2)
        return out, out_lens

In [None]:
input_size = 40
out_size = 47
hidden_size = 512

test_LSTMmodel = LSTMmodel(input_size, out_size, hidden_size)

test_LSTMmodel = test_LSTMmodel.to(device)

for i, data in enumerate(dev_dataloader):
    if i < 1:
        X, Y, X_lens, Y_lens = data
        X = X.to(device)
        Y = Y.to(device)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
        X_lens = X_lens.to(device)
        Y_lens = Y_lens.to(device)
        print("X.size() = ", X.size())
        print(len(X_lens))
        packed_out, final_state = test_LSTMmodel(X, X_lens)
        print(packed_out.size())
    else:
        break

## LSTM+CNN+FC model

In [5]:
import torch.nn as nn
class LSTM_CNN_FC_model(nn.Module):
    def __init__(self, input_size, channel1, channel2, channel3, lstm_hidden_size, fc1_num, fc2_num, out_size):
        super(LSTM_CNN_FC_model, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        self.cnn1 = nn.Conv1d(input_size, channel1, kernel_size=3, stride=1, padding=1) # [64, 1_channel, 1349] (N, C_in, L)
        self.batchnorm1 = nn.BatchNorm1d(channel1)
        self.relu1 = nn.ReLU(inplace=True)
        
        self.cnn2 = nn.Conv1d(channel1, channel2, kernel_size=3, stride=1, padding=1) # [64, 2_channel, 1349]
        self.batchnorm2 = nn.BatchNorm1d(channel2)
        self.relu2 = nn.ReLU(inplace=True)
        
        self.cnn3 = nn.Conv1d(channel2, channel3, kernel_size=3, stride=1, padding=1) # [64, 2_channel, 1349]
        self.batchnorm3 = nn.BatchNorm1d(channel3)
        self.relu3 = nn.ReLU(inplace=True)
        
        self.conv2_drop = nn.Dropout(0.3)

        self.lstm = nn.GRU(channel3, lstm_hidden_size, num_layers = 3, bidirectional = True, dropout=0.5)
        
        self.fc1 = nn.Linear(lstm_hidden_size * 2, fc1_num) # (N, *, H_in) > (N, *, H_out)
        self.fc1relu = nn.ReLU(inplace=True)
        
        self.fc2 = nn.Linear(fc1_num, fc2_num)
        self.fc2relu = nn.ReLU(inplace=True)
        
        self.output = nn.Linear(fc2_num, out_size)
        
    def forward(self, X, lengths):
#         print("1. Initial X size,", X.size())
        X = X.transpose(0,1) # [64, 1349, 40] (N, C_in, L)
        X = X.transpose(1,2) # [64, 40, 1349] (N, C_in, L)
#         print("2. Transposed X size,", X.size())
        X = self.cnn1(X)
#         print("3. CNN1 X size,", X.size())
        X = self.batchnorm1(X)
        X = self.relu1(X) # [64, 40, 1349] (N, 1_channel, L)
        
        X = self.cnn2(X)
#         print("4. CNN2 X size,", X.size())
        X = self.batchnorm2(X)
        X = self.relu2(X) # [64, 40, 1349] (N, 2_channel, L)
        
        X = self.cnn3(X)
#         print("4. CNN2 X size,", X.size())
        X = self.batchnorm3(X)
        X = self.relu3(X) # [64, 40, 1349] (N, 2_channel, L)
        
        X = self.conv2_drop(X)
        
        X = X.transpose(1,2) # [64, 1349, 40] (seq_len, batch, input_size)
#         print("4. Transpoed X size,", X.size())
        
        packed_X = pack_padded_sequence(X, lengths, batch_first = True, enforce_sorted=False)
#         print("5. packed_X size,", packed_X.data.size())
        
        packed_out = self.lstm(packed_X)[0]  # [1349, 64, 47] (seq_len, batch, class_num)
#         print("6. packed_out size,", packed_out.data.size())
       
        out, out_lens = pad_packed_sequence(packed_out, batch_first = True)
        
        out = self.fc1relu(self.fc1(out)) # [64, 1349, fc1_num]
        
        out = self.fc2relu(self.fc2(out)) # [64, 1349, fc2_num] 
        
        out = self.output(out).log_softmax(2)
        return out, out_lens


## LSTM+FC model

In [6]:
import torch.nn as nn
class LSTM_FC_model(nn.Module):
    def __init__(self, input_size, lstm_hidden_size, fc1_num, fc2_num, out_size):
        super(LSTM_FC_model, self).__init__()
        self.input_size = input_size

        self.lstm = nn.GRU(input_size, lstm_hidden_size, num_layers = 3, bidirectional = True, dropout=0.1)
        
        self.batchnorm1 = nn.BatchNorm1d(lstm_hidden_size * 2)
        
        self.fc1 = nn.Linear(lstm_hidden_size * 2, fc1_num) # (N, *, H_in) > (N, *, H_out)
        self.fc1relu = nn.ReLU(inplace=True)
        
        self.fc2 = nn.Linear(fc1_num, fc2_num)
        self.fc2relu = nn.ReLU(inplace=True)
        
        self.output = nn.Linear(fc2_num, out_size)
        
    def forward(self, X, lengths):
#         print("1. Initial X size,", X.size())
        X = X.transpose(0,1) # [64, 1349, 40] (N, C_in, L)
#         X = X.transpose(1,2) # [64, 40, 1349] (N, C_in, L)
#         print("2. Transposed X size,", X.size())
        
#         X = X.transpose(0,1) # [64, 1349, 40] (N, C_in, L)
        packed_X = pack_padded_sequence(X, lengths, batch_first = True, enforce_sorted=False)
#         print("5. packed_X size,", packed_X.data.size())
        
        packed_out = self.lstm(packed_X)[0]  # [1349, 64, 47] (seq_len, batch, class_num)
#         print("6. packed_out size,", packed_out.data.size())
       
        out, out_lens = pad_packed_sequence(packed_out, batch_first = True)
#         print("out", out.size())
        
        out = out.transpose(1,2)        
        out = self.batchnorm1(out)
#         print("out", out.size())
        out = out.transpose(1,2)
#         print("out", out.size())
        out = self.fc1relu(self.fc1(out)) # [64, 1349, fc1_num]
        
        out = self.fc2relu(self.fc2(out)) # [64, 1349, fc2_num] 
        
        out = self.output(out).log_softmax(2)
        return out, out_lens


### Unit Test For model

In [24]:
input_size = 40
out_size = 47
hidden_size = 512

torch.cuda.empty_cache()

test_LSTMmodel = LSTM_FC_model(input_size = 40,
                               lstm_hidden_size = 1024, 
                               fc1_num = 2048, 
                               fc2_num = 1024, 
                               out_size = 47)

test_LSTMmodel = test_LSTMmodel.to(device)
criterion = nn.CTCLoss()
for i, data in enumerate(dev_dataloader):
    torch.cuda.empty_cache()
    if i < 1:
        X, Y, X_lens, Y_lens = data
        X = X.to(device)
        Y = Y.to(device).int()                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
        X_lens = X_lens.to(device).int()
        Y_lens = Y_lens.to(device).int()
        print("X.size() = ", X.size())
        out, out_lens = test_LSTMmodel(X, X_lens)
        
        out = out.transpose(0,1)
        print("Log_probs size = ", out.size())
        print("Targets size = ", Y.size())
        print("Input_lengths = ", out_lens.size())
        print("Target_lengths = ", Y_lens.size())
                                       
        loss = criterion(out, Y, out_lens, Y_lens)
        print(out_lens.size())
        del X,Y,X_lens,Y_lens, out, out_lens, loss
        torch.cuda.empty_cache()
        
    else:
        break

X.size() =  torch.Size([1368, 16, 40])
2. Transposed X size, torch.Size([16, 1368, 40])
out torch.Size([16, 1368, 2048])
out torch.Size([16, 2048, 1368])
out torch.Size([16, 1368, 2048])
Log_probs size =  torch.Size([1368, 16, 47])
Targets size =  torch.Size([16, 156])
Input_lengths =  torch.Size([16])
Target_lengths =  torch.Size([16])
torch.Size([16])


In [26]:
del test_LSTMmodel
torch.cuda.empty_cache()

In [14]:
def debug_memory():
    import collections, gc, resource, torch
    print('maxrss = {}'.format(
        resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
    tensors = collections.Counter((str(o.device), o.dtype, tuple(o.shape))
                                  for o in gc.get_objects()
                                  if torch.is_tensor(o))
    for line in tensors.items():
        print('{}\t{}'.format(line[0], line[1]))

In [15]:
torch.cuda.empty_cache()
debug_memory()

maxrss = 6216816
('cuda:0', torch.float32, (512, 40, 3))	4
('cuda:0', torch.float32, (512,))	32
('cuda:0', torch.float32, (512, 512, 3))	4
('cuda:0', torch.float32, (6144, 512))	8
('cuda:0', torch.float32, (6144, 2048))	24
('cuda:0', torch.float32, (6144,))	48
('cuda:0', torch.float32, (6144, 4096))	16
('cuda:0', torch.float32, (1024, 4096))	4
('cuda:0', torch.float32, (1024,))	8
('cuda:0', torch.float32, (1024, 1024))	4
('cuda:0', torch.float32, (47, 1024))	4
('cuda:0', torch.float32, (47,))	4
('cpu', torch.float32, (1256, 32, 40))	1
('cpu', torch.float32, (32, 143))	1
('cpu', torch.float32, (32,))	4
('cuda:0', torch.float32, (1256, 32, 40))	1
('cuda:0', torch.int32, (32, 143))	1
('cpu', torch.int32, (32,))	4
('cuda:0', torch.int64, ())	4
('cuda:0', torch.float32, (32, 1256, 512))	1
('cuda:0', torch.int64, (32,))	2
('cuda:0', torch.float32, (22636, 512))	1
('cpu', torch.int64, (1256,))	1
('cuda:0', torch.float32, (6, 32, 2048))	1
('cuda:0', torch.float32, (1145, 32, 40))	1
('cuda:0', 



## Training 

https://pytorch.org/docs/stable/nn.html#ctcloss

`nn.CTCLoss` takes 4 arguments to compute the loss:
* `log_probs`: Prediction of your model at each time step.
  * Shape: (T, N, C), where T is the largest length in the batch, N is batch size, and C is number of classes (remember that it should be number of phonemes plus 1).
  * **Values must be log probabilities.** Neither probabilities nor logits will work. Make sure the output of your network is log probabilities, by adding a `nn.LogSoftmax` after the last linear layer.
* `targets`: The ground truth sequences.
  * Shape: (N, S), where N is batch size, and S is the largest length in the batch. **WARNING!** This dimension order is unconventional in PyTorch. If you use `torch.nn.utils.rnn.pad_sequence` to pad the target sequence,  **you must explicitly set `batch_first=True`**.
  * Values are indices of phonemes. Again, remember that index 0 is reserved for "blank" and should not represent any phoneme.
* `input_lengths`: Lengths of sequences in `log_probs`.
  * Shape: (N,).
  * This is not necessarily the same as lengths of input of the model. If your model uses CNNs or pyramidal RNNs, it changes the length of sequences, and you must correctly compute the lengths of its output to be used here.
* `target_lengths`: Lengths of sequences in `targets`.
  * Shape: (N,).


In [6]:
import numpy as np
import matplotlib.pyplot as plt

def train(model, input_size, out_size, criterion, decoder, PHONEME_MAP, file_name, learning_rate):
    torch.manual_seed(11785)
    loss_history = []
    log_interval = 20
    for epoch in range(800):
        loss_interval = 0
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.01*learning_rate)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.9)
        for i, data in enumerate(train_dataloader):
#             print(i)
            X, Y, X_lens, Y_lens = data
            X = X.to(device)
            Y = Y.to(device).int()
#             print("Y_lens.size()", Y_lens.size())
            Y_lens = Y_lens.int()
            X_lens = X_lens.int()

            optimizer.zero_grad()
            
            out, out_lens = model(X, X_lens)
#             print("out_lens.size()", out_lens.size())
            
            out = out.transpose(0,1)
            
            
            loss = criterion(out, Y, out_lens, Y_lens)
            loss.backward()
#             print(loss.item())
            loss_interval += loss.item()
            loss_history.append(loss.item)
            optimizer.step()
            loss_interval=loss.item()
#             try:
#                 loss = criterion(out, Y, out_lens, Y_lens)
#                 loss.backward()
#                 loss_interval += loss.item()
#                 loss_history.append(loss.item)
#                 optimizer.step()
#             except:
#                 print("Last batch, skip")
            
            if i%log_interval == 0:
                print('Epoch', epoch + 1, 'Sample', i, 'Loss',loss_interval/log_interval)
                loss_interval = 0
                torch.save(model.state_dict(), file_name)
            del X, Y, X_lens, Y_lens, out, out_lens
            torch.cuda.empty_cache()
        
        
        
        evaluate(model, 16, decoder, PHONEME_MAP)
#         scheduler.step(loss)
        scheduler.step()
        torch.save(model.state_dict(), file_name)
        print('Epoch', epoch + 1, 'Loss', loss.item())
        learning_rate = 0.1 * learning_rate
        torch.cuda.empty_cache()
            
            

## Evaluation

In [13]:
N_STATES = 138
N_PHONEMES = N_STATES // 3
PHONEME_LIST = [
    "",
    "+BREATH+",
    "+COUGH+",
    "+NOISE+",
    "+SMACK+",
    "+UH+",
    "+UM+",
    "AA",
    "AE",
    "AH",
    "AO",
    "AW",
    "AY",
    "B",
    "CH",
    "D",
    "DH",
    "EH",
    "ER",
    "EY",
    "F",
    "G",
    "HH",
    "IH",
    "IY",
    "JH",
    "K",
    "L",
    "M",
    "N",
    "NG",
    "OW",
    "OY",
    "P",
    "R",
    "S",
    "SH",
    "SIL",
    "T",
    "TH",
    "UH",
    "UW",
    "V",
    "W",
    "Y",
    "Z",
    "ZH"
]

PHONEME_MAP = [
    '',
    '_',  # "+BREATH+"
    '+',  # "+COUGH+"
    '~',  # "+NOISE+"
    '!',  # "+SMACK+"
    '-',  # "+UH+"
    '@',  # "+UM+"
    'a',  # "AA"
    'A',  # "AE"
    'h',  # "AH"
    'o',  # "AO"
    'w',  # "AW"
    'y',  # "AY"
    'b',  # "B"
    'c',  # "CH"
    'd',  # "D"
    'D',  # "DH"
    'e',  # "EH"
    'r',  # "ER"
    'E',  # "EY"
    'f',  # "F"
    'g',  # "G"
    'H',  # "HH"
    'i',  # "IH"
    'I',  # "IY"
    'j',  # "JH"
    'k',  # "K"
    'l',  # "L"
    'm',  # "M"
    'n',  # "N"
    'G',  # "NG"
    'O',  # "OW"
    'Y',  # "OY"
    'p',  # "P"
    'R',  # "R"
    's',  # "S"
    'S',  # "SH"
    '.',  # "SIL"
    't',  # "T"
    'T',  # "TH"
    'u',  # "UH"
    'U',  # "UW"
    'v',  # "V"
    'W',  # "W"
    '?',  # "Y"
    'z',  # "Z"
    'Z',  # "ZH"
]

assert len(PHONEME_LIST) == len(PHONEME_MAP)
assert len(set(PHONEME_MAP)) == len(PHONEME_MAP)

In [12]:
# !pip install python-Levenshtein



In [7]:
from Levenshtein import distance
distance('kitten', 'sitting')

3

In [9]:
def evaluate(model, batch_size, decoder, PHONEME_MAP):
    with torch.no_grad():
        batch_amount = 0
        loss = 0
        distances = 0
        overall_data_num = 0
        for i, data in enumerate(dev_dataloader):
            
            X, Y, X_lens, Y_lens = data
#             print(Y_lens.size())
            X = X.to(device)
            Y = Y.to(device).int()
            Y_lens = Y_lens.int()
            X_lens = X_lens.int()
#             print(X.type(),X_lens.type())
            
            out, out_lens = model(X, X_lens)
#             print("out.size()=",out.size())
#             print("out_lens.size()=",out_lens.size())
            out = out.transpose(0,1)
            
            loss += criterion(out, Y, out_lens, Y_lens)
            eval_Y, _, _, eval_Y_lens = decoder.decode(out.transpose(0, 1), out_lens)
#             print("eval_Y.size()=",eval_Y.size())
#             print("eval_Y_lens.size()=",eval_Y_lens.size())
#             print("Y.size()=",Y.size())
#             print("lens.size()=",Y_lens.size())
            overall_data_num += eval_Y.size(0)
            for j in range(eval_Y.size(0)):
                eval_best_idx = eval_Y[j, 0, 0:eval_Y_lens[j,0]]
                eval_best_phoneme = ""
                for phoneme_idx in eval_best_idx:
                    eval_best_phoneme+=PHONEME_LIST[phoneme_idx]

                best_idx = Y[j,0:Y_lens[j]]
                best_phoneme = ""
                for phoneme_idx in best_idx:
                    best_phoneme+=PHONEME_LIST[phoneme_idx]
#                 print("eval_best_phoneme = ", eval_best_phoneme,"best_phoneme = ", best_phoneme)
                distances += distance(eval_best_phoneme,best_phoneme)
            
            del X, Y, X_lens, Y_lens, out, out_lens
            torch.cuda.empty_cache()
            
        distances /= (overall_data_num * 2)
        print("distance = ", distances)
            
            
        av_loss = loss/batch_amount
        print("evaluation_loss = ", av_loss)

In [1]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [10]:
from ctcdecode import CTCBeamDecoder
input_size = 40
out_size = 47
hidden_size = 512
batch_size = 16
# file_name = 'cnn2_lstm3_fc2_large'
# file_name = 'cnn3_lstm3_dropout_fc2_large'
file_name = 'lstm3_dropout_fc2_large'

try: 
    del model
    torch.cuda.empty_cache()
except:
    print("No model in storage")
    
torch.cuda.empty_cache()

# model = LSTMmodel(input_size, out_size, hidden_size).to(device)
# model = LSTM_CNN_FC_model(input_size = 40,
#                            channel1 = 512, 
#                            channel2 = 512, 
#                            lstm_hidden_size = 2048, 
#                            fc1_num = 1024, 
#                            fc2_num = 1024, 
#                            out_size = 47).to(device)
# model = LSTM_CNN_FC_model(input_size = 40,
#                            channel1 = 128, 
#                            channel2 = 256, 
#                            channel3 = 512,
#                            lstm_hidden_size = 2048, 
#                            fc1_num = 1024, 
#                            fc2_num = 1024, 
#                            out_size = 47).to(device)
model = LSTM_FC_model(input_size = 40,
                      lstm_hidden_size = 2048, 
                      fc1_num = 1024, 
                      fc2_num = 1024, 
                      out_size = 47).to(device)
model.load_state_dict(torch.load(file_name, map_location=torch.device('cpu')))


<All keys matched successfully>

In [None]:
criterion = nn.CTCLoss()
learning_rate = 1e-6
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-5, weight_decay=1e-7)
decoder = CTCBeamDecoder(['$'] * (len(PHONEME_MAP)), beam_width=10, num_processes = os.cpu_count(), log_probs_input=True)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.9)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=0)

# evaluate(model, batch_size, decoder, PHONEME_MAP)
train(model, input_size, out_size, criterion, decoder, PHONEME_MAP, file_name, learning_rate)

Epoch 1 Sample 0 Loss 0.034434324502944945
Epoch 1 Sample 20 Loss 0.03113899230957031
Epoch 1 Sample 40 Loss 0.039797359704971315
Epoch 1 Sample 60 Loss 0.030459046363830566
Epoch 1 Sample 80 Loss 0.03343487977981567
Epoch 1 Sample 100 Loss 0.03189215958118439
Epoch 1 Sample 120 Loss 0.03129576444625855
Epoch 1 Sample 140 Loss 0.041374611854553225
Epoch 1 Sample 160 Loss 0.036687979102134706
Epoch 1 Sample 180 Loss 0.036330944299697875
Epoch 1 Sample 200 Loss 0.035518613457679746
Epoch 1 Sample 220 Loss 0.036587494611740115
Epoch 1 Sample 240 Loss 0.03303710520267487
Epoch 1 Sample 260 Loss 0.042043089866638184
Epoch 1 Sample 280 Loss 0.03513888418674469
Epoch 1 Sample 300 Loss 0.025820434093475342
Epoch 1 Sample 320 Loss 0.03313668072223663
Epoch 1 Sample 340 Loss 0.029886752367019653
Epoch 1 Sample 360 Loss 0.029035082459449767
Epoch 1 Sample 380 Loss 0.035458290576934816
Epoch 1 Sample 400 Loss 0.03287593424320221
Epoch 1 Sample 420 Loss 0.0379438728094101
Epoch 1 Sample 440 Loss 0.

## Decoder

`CTCBeamDecoder`:
* `phonemes`: **It doesn't need to be actual phonemes.** The only requirement is being a list of characters whose length is the number of classes (number of phonemes plus 1). 
* `beam_width`: Larger beam width produces better output, but also costs more time and memory.
* `num_processes`: Number of processes for parallel decoding. Setting it to `os.cpu_count()` is recommended as it utilizes all CPU cores.
* `log_probs_input`: Should always be True, since your model output is log probabilities.

`CTCBeamDecoder.decode` arguments:
* `probs`: Prediction from your model as log probabilities (if `log_probs_input=True`).
  * Shape: (N, T, C). where N is batch size, T is the largest length in the batch, and C is number of classes. **WARNING!** This dimension order is unconventional in PyTorch. You likely need to do `out.transpose(0, 1)` on your output.
* `len`: Lengths of sequences in `probs`.
  * Shape: (N,)


`CTCBeamDecoder.decode` return value (tuple of 4):
* First item `output`: Decoded top sequences.
  * Shape: (N, B, T), where B is the beam width. Normally we only need the best sequences, which are indexed 0 in the second (beam width) dimension.
* Second and third can be ignored.
* Last item `out_seq_len`: Length of sequences in `output`. 
  * Shape: (N, B). Lengths of best sequences are indexed 0 in the second (beam width) dimension.

In [11]:
import os

def test(model, decoder, PHONEME_LIST):
    result_phoneme = []
    with torch.no_grad():
        for i, data in enumerate(test_dataloader):
            
            X, X_lens = data
            X = X.to(device)
            X_lens = X_lens.int()
            out, out_lens = model(X, X_lens)
            out = out.cpu()
            
            test_Y, _, _, test_Y_lens = decoder.decode(out, out_lens)
            
            for j in range(test_Y.size(0)):
                best_idx = test_Y[j, 0, 0:test_Y_lens[j,0]]
                best_phoneme = ""
                for phoneme_idx in best_idx:
                    if phoneme_idx!=0:
                        best_phoneme+=PHONEME_LIST[phoneme_idx+1] #beacuse the first is the blank
                result_phoneme.append(best_phoneme)
    return result_phoneme

In [None]:
decoder = CTCBeamDecoder(['$'] * (len(PHONEME_MAP)), beam_width=10, num_processes = os.cpu_count(), log_probs_input=True)
predict_phoneme = test(model, decoder, PHONEME_MAP)

## Write Test Result

In [16]:
import pandas as pd
id = [x for x in range(0, 523)]
print("len of id = ", len(id))
print("len of phoneme = ", len(predict_phoneme))

result = {'id':id, 'Predicted':predict_phoneme}
df = pd.DataFrame(result, columns= ['id', 'Predicted'])
print(df)
df.to_csv (file_name+'.csv', index = False, header=True)

len of id =  523
len of phoneme =  523
      id                                          Predicted
0      0  .inshmhlkalWhzbestwnAdDhnAksywbhnpRizhndhnDhka...
1      1  .DhssOvIitskamhInWylkamhHAvbikhnWiTgRyntRUzfam...
2      2  .RyhfRIsklzWrfoRmmOTimpoRnErHijAnAnmithklAskhm...
3      3  .gREnzhnsYbiGDiskOlhnDhtRedhvREnhnfEvrhbhlzRiG...
4      4  .?UnyndistEtskhndrtukptUdIfAndWestrn?UROphgens...
..   ...                                                ...
518  518  .hSevRanpethththlOfyvHhntrfitITrmil?hndalrSlhs...
519  519  .RelIzWrmhtikhlstIstEjdtUIhnfoRsDhkempEnzmeshn...
520  520  .milhteIpalhsIWhztikIphtRhvhlRwtsObhnhndpRhtek...
521  521  .hnDhpOgrWIkkWORnbOthndsoRghnfoRmrzkhnRhsIvnyn...
522  522  .innyntInEtIEtkamhaprEtiGhnkamWizrRekidfoRHhnt...

[523 rows x 2 columns]
