In [None]:
import torch
import numpy as np
import phoneme_list
import ctcdecode
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
import torch.nn as nn
from torch.nn import functional as F
from tqdm.notebook import tqdm
import torch.optim as optim
import time
import Levenshtein

# Run-wide switches.
verbose, mode = True, "development"

# Probe for a GPU once; loader worker processes are only enabled when one is found.
cuda = torch.cuda.is_available()
device = torch.device("cuda") if cuda else torch.device("cpu")
num_workers = 4 if cuda else 0

In [None]:
if verbose:
    # Dump the run configuration, one "name: value" line per setting.
    settings = (
        ("mode", mode),
        ("torch version", torch.__version__),
        ("np version", np.__version__),
        ("cuda", cuda),
        ("num_workers", num_workers),
        ("device", device),
        ("verbose", verbose),
    )
    print("\n".join("%s: %s" % pair for pair in settings))

In [None]:
# Locations of the feature and label arrays, all relative to the data directory.
# NOTE(review): the train/test feature paths carry no ".npy" suffix, unlike the
# other files -- confirm this matches the on-disk names.
data_path = "../data/"

# features
train_path = f"{data_path}wsj0_train"
dev_path = f"{data_path}wsj0_dev.npy"
test_path = f"{data_path}wsj0_test"

# labels
train_merged_labels_path = f"{data_path}wsj0_train_merged_labels.npy"
dev_merged_labels_path = f"{data_path}wsj0_dev_merged_labels.npy"

In [None]:
# Only the 'actual' mode trains on the full training split; every other mode
# re-uses the dev split as training data.
train_source, train_labels_source = (
    (train_path, train_merged_labels_path)
    if mode == 'actual'
    else (dev_path, dev_merged_labels_path)
)
train = np.load(train_source, allow_pickle=True)
train_merged_labels = np.load(train_labels_source, allow_pickle=True)

# Dev and test splits are always loaded.
dev = np.load(dev_path, allow_pickle=True)
dev_merged_labels = np.load(dev_merged_labels_path, allow_pickle=True)
test = np.load(test_path, allow_pickle=True)

In [None]:
class simpleDataset(Dataset):
    """Indexable dataset over a sequence of numpy arrays with optional labels.

    Args:
        x: indexable collection of numpy arrays (one per sample).
        y: indexable collection of numpy label arrays; only read when
            ``is_test`` is False.
        is_test: when True, items are returned without labels.
    """

    def __init__(self, x, y=None, is_test=False):
        super().__init__()
        self._x, self._y = x, y
        self.is_test = is_test

    def __len__(self):
        """Number of samples, taken from ``x``."""
        return len(self._x)

    def __getitem__(self, index):
        """Return ``features`` (test) or ``(features, labels)`` as tensors.

        Features are cast to float32; labels keep the dtype of the source array.
        """
        features = torch.from_numpy(self._x[index]).float()
        if self.is_test:
            return features
        return features, torch.from_numpy(self._y[index])

# Batch container with a custom pin_memory hook for fast host-to-GPU copies.
class CustomBatch:
    """Pads a list of variable-length samples into dense batch tensors.

    Attributes:
        data: features padded along the first (length) axis, batch-first.
        data_lens: original length of every feature sequence, in batch order.
        target: padded label tensor (batch-first), or None for test batches.
        target_lens: original length of every label sequence, or None for
            test batches.

    Args:
        batch: list of ``(features, labels)`` pairs, or a list of feature
            tensors when ``is_test`` is True.
        is_test: set when the samples carry no labels.
    """

    def __init__(self, batch, is_test=False):
        # reference: https://suzyahyah.github.io/pytorch/2019/07/01/DataLoader-Pad-Pack-Sequence.html
        if is_test:
            data, target = list(batch), None
        else:
            data, target = zip(*batch)

        self.data_lens = [len(x) for x in data]
        self.data = pad_sequence(data, batch_first=True)

        # Always define the target attributes so pin_memory() (and any other
        # consumer) can test for None instead of hitting an AttributeError on
        # unlabeled batches.
        if target is None:
            self.target_lens = None
            self.target = None
        else:
            self.target_lens = [len(y) for y in target]
            self.target = pad_sequence(target, batch_first=True)

    # custom memory pinning method on custom type
    def pin_memory(self):
        """Pin the batch tensors in page-locked memory and return ``self``.

        The DataLoader invokes this hook on custom batch objects when it is
        created with ``pin_memory=True``.
        """
        self.data = self.data.pin_memory()
        if self.target is not None:
            self.target = self.target.pin_memory()
        return self

def collate_fn(batch):
    """Collate labeled samples into a padded CustomBatch."""
    return CustomBatch(batch, is_test=False)

def collate_fn_test(batch):
    """Collate unlabeled (test) samples into a padded CustomBatch."""
    padded_batch = CustomBatch(batch, is_test=True)
    return padded_batch

In [None]:
# Wrap the loaded arrays; only the test split is built without labels.
train_dataset = simpleDataset(x=train, y=train_merged_labels)
dev_dataset = simpleDataset(x=dev, y=dev_merged_labels)
test_dataset = simpleDataset(x=test, is_test=True)

In [None]:
# hyper-parameters

# -- model shape --
input_size = 40
hidden_size = 47
output_size = 47
num_layers = 2
bidirectional = True
dropout = 0.2

# -- optimisation --
batch_size = 2
lr = 0.01

# -- CTC decoding --
beam_size = 5
blank_idx = output_size - 1   # 46: the blank occupies the last class index

In [None]:
# Settings shared by all three loaders.
loader_kwargs = dict(
    batch_size=batch_size,
    pin_memory=True,            # ask the loader to pin each batch
    num_workers=num_workers,    # worker processes used for loading
)

# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset, shuffle=True, collate_fn=collate_fn, **loader_kwargs)

# Dev and test keep the dataset order.
dev_loader = DataLoader(dev_dataset, shuffle=False, collate_fn=collate_fn, **loader_kwargs)

# Test samples carry no labels, hence the label-free collate function.
test_loader = DataLoader(test_dataset, shuffle=False, collate_fn=collate_fn_test, **loader_kwargs)

In [None]:
class Baseline(nn.Module):
    """LSTM followed by a per-frame linear classifier with log-softmax output.

    Args:
        input_size: feature dimension of every input frame.
        hidden_size: LSTM hidden size per direction.
        output_size: number of output classes per frame.
        num_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability between stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, bidirectional, dropout):
        super(Baseline, self).__init__()

        self.rnn = nn.LSTM(input_size=input_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           batch_first=True,
                           # inter-layer dropout has nothing to act on with a
                           # single layer; passing 0 avoids the LSTM warning
                           dropout=dropout if num_layers > 1 else 0.0,
                           bidirectional=bidirectional
                           )

        # The LSTM emits hidden_size features per direction, so the classifier
        # width must follow the bidirectional flag instead of assuming 2.
        num_directions = 2 if bidirectional else 1
        self.linear = nn.Linear(hidden_size * num_directions, output_size)

    def forward(self, data, data_lens):
        """Run the network on a padded batch.

        Args:
            data: padded input of shape (batch_size, seq_len, input_size).
            data_lens: true length of every sequence (list or tensor), in
                batch order; it does not need to be sorted.

        Returns:
            (log_probs, output_lengths): log-probabilities of shape
            (batch_size, seq_len, output_size) and the per-sequence lengths
            reported by ``pad_packed_sequence``.
        """
        # pack_padded_sequence expects the lengths on the CPU
        if torch.is_tensor(data_lens):
            data_lens = data_lens.cpu()

        # pack for the rnn
        data_packed = pack_padded_sequence(data, data_lens, batch_first=True, enforce_sorted=False)

        output_packed, _ = self.rnn(data_packed)

        # unpack from rnn
        output_padded, output_lengths = pad_packed_sequence(output_packed, batch_first=True)

        # output shape: (batch_size, seq_len, output_size)
        output = self.linear(output_padded)

        # Log-softmax is applied per frame, so padded frames do not influence
        # the valid ones; callers are expected to mask them via output_lengths
        # (e.g. CTC loss input_lengths).
        output = F.log_softmax(output, dim=-1)

        return output, output_lengths

In [None]:
# Build the network and move it to the selected device.
model = Baseline(
    input_size,
    hidden_size,
    output_size,
    num_layers,
    bidirectional,
    dropout,
).to(device)

# Nesterov SGD over all model parameters.
optimizer = optim.SGD(
    model.parameters(),
    lr=lr,
    momentum=0.9,
    weight_decay=5e-4,
    nesterov=True,
)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer)

# CTC loss with the blank symbol at blank_idx.
criterion = nn.CTCLoss(blank=blank_idx)

In [None]:
# Single-batch training smoke test: runs one optimisation step and reports the loss.
model.train()

running_loss = 0.0
total_predictions = 0.0
correct_predictions = 0.0
num_batches = 0   # batches actually processed, used to average the loss

start_time = time.time()

for batch_idx, sample in enumerate(train_loader):
    # the model lives on `device`, so the batch tensors must be moved there too
    data, target = sample.data.to(device), sample.target.to(device)
    data_lens, target_lens = sample.data_lens, sample.target_lens
    assert data.shape[1] == max(data_lens)
    assert target.shape[1] == max(target_lens)

    # clear gradients left over from the previous step
    optimizer.zero_grad()

    outputs, output_lens = model(data, data_lens)

    # CTCLoss expects log_probs as (seq_len, batch, classes)
    loss = criterion(log_probs=outputs.permute(1, 0, 2),
                     targets=target,
                     input_lengths=output_lens,
                     target_lengths=torch.tensor(target_lens))

    running_loss += loss.item()
    num_batches += 1

    # TODO: report the edit distance here once get_accuracy (defined in a later
    # cell) is implemented; calling it at this point fails on a fresh kernel.

    loss.backward()
    optimizer.step()

    # smoke test only: stop after the first batch
    break

end_time = time.time()

# average over the batches that were really processed, not the loader length
running_loss /= max(num_batches, 1)
print('Training Loss: ', running_loss, 'Time: ',end_time - start_time, 's')

In [None]:
def get_accuracy(outputs, target, output_lens, target_lens):
    """Mean Levenshtein distance between beam-search decodes and references.

    Despite the name, the returned value is an edit distance (lower is
    better), averaged over the utterances of the batch.

    Relies on the notebook-level ``decoder`` (CTC beam decoder) and
    ``vocab_list`` (index -> symbol), which must be defined before the call.

    Args:
        outputs: log_softmax output from the model, (batch, seq_len, classes).
        target: padded reference label indices, (batch, max_target_len).
        output_lens: valid number of frames for every utterance.
        target_lens: valid number of labels for every reference.

    Returns:
        Average edit distance as a float; 0.0 for an empty batch.
    """
    num_utterances = len(target_lens)
    if num_utterances == 0:
        return 0.0

    # step1: decode outputs using CTC beamsearch
    # NOTE(review): the decoder presumably needs CPU tensors -- confirm.
    beam_result, _, _, out_seq_len = decoder.decode(outputs.detach().cpu(), seq_lens=output_lens)

    # step2: calculate Levenshtein distance between top beam and reference
    total_distance = 0
    for i in range(num_utterances):
        hyp_len = int(out_seq_len[i][0])
        ref_len = int(target_lens[i])
        pred = ''.join(vocab_list[int(tok)] for tok in beam_result[i][0][:hyp_len])
        gold = ''.join(vocab_list[int(tok)] for tok in target[i][:ref_len])
        total_distance += Levenshtein.distance(pred, gold)

    return total_distance / num_utterances

In [None]:
# TODO: batch process
def convert_to_string(tokens, vocab, seq_len):
    """Map the first ``seq_len`` token indices to symbols and concatenate them.

    Args:
        tokens: sequence of integer indices into ``vocab``.
        vocab: index -> string lookup.
        seq_len: number of leading tokens to convert.

    Returns:
        The concatenated symbol string.
    """
    kept = tokens[:seq_len]
    symbols = []
    for idx in kept:
        symbols.append(vocab[idx])
    return ''.join(symbols)

# TODO: use what to represent blank symbol ?
# '#' is appended after the phoneme symbols; presumably it stands for the CTC blank.
vocab_list = phoneme_list.PHONEME_MAP + ['#']

# TODO: use '_' as stop sign?
decoder = ctcdecode.CTCBeamDecoder(
    labels=vocab_list,
    beam_width=beam_size,
    blank_id=blank_idx,
    log_probs_input=True,
)

# Beam-search decode the model outputs left over from the training cell.
beam_result, beam_scores, timesteps, out_seq_len = decoder.decode(
    outputs,
    seq_lens=output_lens,
)

# Render the top beam of the first utterance and its reference as strings.
preds = convert_to_string(tokens=beam_result[0][0], vocab=vocab_list, seq_len=out_seq_len[0][0])
golds = convert_to_string(tokens=target[0], vocab=vocab_list, seq_len=target_lens[0])