In [None]:
# !git clone --recursive https://github.com/parlance/ctcdecode.git
# !pip install wget
# %cd ctcdecode
# !pip install .
# %cd ..
# !pip install python-Levenshtein

In [None]:
import torch
import numpy as np
import phoneme_list
import ctcdecode
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
import torch.nn as nn
from torch.nn import functional as F
from tqdm.notebook import tqdm
import torch.optim as optim
import time
import Levenshtein
import os

# Run-wide switches.
verbose = True      # print configuration / progress messages
mode = "actual"     # "actual" loads the training split; any other value reuses the dev split

# Hardware-dependent settings, all derived from CUDA availability.
cuda = torch.cuda.is_available()
if cuda:
    num_workers = 4
    device = torch.device("cuda")
else:
    num_workers = 0
    device = torch.device("cpu")

In [None]:
if verbose:
    # One line per setting, emitted with a single print call.
    print(
        "mode: %s" % mode,
        "torch version: %s" % torch.__version__,
        "np version: %s" % np.__version__,
        "cuda: %s" % cuda,
        "num_workers: %s" % num_workers,
        "device: %s" % device,
        "verbose: %s" % verbose,
        sep="\n",
    )

In [None]:
# Root directories, relative to the notebook's working directory.
pred_path = "../pred/"
data_path = "../data/"
checkpoint_path = "../checkpoint/"

# ID selects which checkpoint file is read and written.
ID = 1
checkpoint_filename = os.path.join(checkpoint_path, "checkpoint_%d.tar" % ID)
pred_filename = os.path.join(pred_path, "pred.csv")

# Feature files.
train_path = os.path.join(data_path, "wsj0_train")
dev_path = os.path.join(data_path, "wsj0_dev.npy")
test_path = os.path.join(data_path, "wsj0_test")

# Label files.
train_merged_labels_path = os.path.join(data_path, "wsj0_train_merged_labels.npy")
dev_merged_labels_path = os.path.join(data_path, "wsj0_dev_merged_labels.npy")

In [None]:
# Pick the data used for training: the training split in 'actual' mode,
# otherwise the dev split (features and labels) stands in for it.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only point these paths at trusted files.
# NOTE(review): train_path and test_path carry no '.npy' suffix while the dev
# paths do -- confirm the on-disk file names.
if mode == 'actual':
    train = np.load(train_path, allow_pickle=True)
    train_merged_labels = np.load(train_merged_labels_path, allow_pickle=True)
else:
    train = np.load(dev_path, allow_pickle=True)
    train_merged_labels = np.load(dev_merged_labels_path, allow_pickle=True)

# Dev features/labels and (label-free) test features are always loaded.
dev = np.load(dev_path, allow_pickle=True)
dev_merged_labels = np.load(dev_merged_labels_path, allow_pickle=True)
test = np.load(test_path, allow_pickle=True)

In [None]:
class simpleDataset(Dataset):
    """Index-based dataset over numpy items `x` with optional labels `y`.

    Args:
        x: indexable collection of numpy arrays (one per item).
        y: indexable collection of numpy label arrays; unused when `is_test`.
        is_test: when True, items are returned without a label.
    """

    def __init__(self, x, y=None, is_test=False):
        super().__init__()

        self._x, self._y = x, y
        self.is_test = is_test

    def __len__(self):
        return len(self._x)

    def __getitem__(self, index):
        # Features are always converted to a float tensor.
        features = torch.from_numpy(self._x[index]).float()
        if self.is_test:
            return features
        # Labels keep whatever dtype the numpy array has.
        return features, torch.from_numpy(self._y[index])

# customize pinned memory for fast host-gpu copies
class CustomBatch:
    """Padded mini-batch built from a list of variable-length samples.

    Attributes set by __init__:
        data_lens: length of every data sequence before padding.
        data: data sequences padded to a common length, batch-first.
        target_lens: length of every target before padding (labelled batches only).
        target: padded targets, batch-first, or None for test batches.
    """

    def __init__(self, batch, is_test=False):
        # reference: https://suzyahyah.github.io/pytorch/2019/07/01/DataLoader-Pad-Pack-Sequence.html
        if is_test:
            # Test samples are bare data tensors.
            data, target = batch, None
        else:
            # Labelled samples are (data, target) pairs.
            data, target = zip(*batch)

        self.data_lens = list(map(len, data))
        if target is not None:
            self.target_lens = list(map(len, target))

        self.data = pad_sequence(data, batch_first=True)
        self.target = None if target is None else pad_sequence(target, batch_first=True)

    # custom memory pinning method on custom type
    def pin_memory(self):
        self.data = self.data.pin_memory()
        if self.target is None:
            return self
        self.target = self.target.pin_memory()
        return self

def collate_fn(batch):
    """Collate labelled samples into a CustomBatch."""
    return CustomBatch(batch, is_test=False)

def collate_fn_test(batch):
    """Collate unlabelled (test) samples into a CustomBatch."""
    return CustomBatch(batch, True)

In [None]:
# Wrap the loaded arrays; the test split is built without labels.
train_dataset = simpleDataset(x=train, y=train_merged_labels)
dev_dataset = simpleDataset(x=dev, y=dev_merged_labels)
test_dataset = simpleDataset(x=test, is_test=True)

In [None]:
# hyper-parameters: network shape
input_size = 40
hidden_size = 512
output_size = 47
num_layers = 2
bidirectional = True
dropout = 0.2

# hyper-parameters: optimisation
batch_size = 128
lr = 0.05
epochs = 100

# decoding-related
beam_size = 10
blank_idx = 46
# TODO: use what to represent blank symbol ?
# '#' is appended as the last vocabulary entry.
# NOTE(review): blank_idx == 46 points at '#' only if PHONEME_MAP has exactly
# 46 entries -- confirm against phoneme_list.
vocab = phoneme_list.PHONEME_MAP + ['#']
decoder = ctcdecode.CTCBeamDecoder(
    labels=vocab,
    beam_width=beam_size,
    blank_id=blank_idx,
    log_probs_input=True,
    num_processes=os.cpu_count(),
)

In [None]:
# Only the training loader shuffles; dev and test keep dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,               # reshuffle the dataset at every epoch
    pin_memory=True,            # copy batches to CUDA pinned memory
    num_workers=num_workers,    # worker processes used for loading
    collate_fn=collate_fn,
)

dev_loader = DataLoader(
    dev_dataset,
    batch_size=batch_size,
    shuffle=False,              # keep dataset order
    pin_memory=True,            # copy batches to CUDA pinned memory
    num_workers=num_workers,    # worker processes used for loading
    collate_fn=collate_fn,
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,              # keep dataset order
    pin_memory=True,            # copy batches to CUDA pinned memory
    num_workers=num_workers,    # worker processes used for loading
    collate_fn=collate_fn_test, # test samples carry no targets
)

In [None]:
class Baseline(nn.Module):
    """LSTM encoder followed by a per-frame linear classifier with log-softmax.

    Args:
        input_size: feature dimension of every input frame.
        hidden_size: LSTM hidden size per direction.
        output_size: number of output classes per frame.
        num_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability passed to nn.LSTM.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, bidirectional, dropout):
        super(Baseline, self).__init__()

        self.rnn = nn.LSTM(input_size=input_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           batch_first=True,
                           dropout=dropout,
                           bidirectional=bidirectional
                           )

        # The LSTM emits hidden_size features per direction, so the classifier
        # width must follow `bidirectional` instead of assuming two directions.
        num_directions = 2 if bidirectional else 1
        self.linear = nn.Linear(hidden_size * num_directions, output_size)

    def forward(self, data, data_lens):
        """Run the network on a padded batch.

        Args:
            data: padded input batch, batch-first.
            data_lens: unpadded length of every sequence in `data`.

        Returns:
            (output, output_lengths): per-frame log-probabilities of shape
            (batch_size, seq_len, output_size) and the sequence lengths
            reported by pad_packed_sequence.
        """
        # pack to rnn; enforce_sorted=False accepts batches in any length order
        data_packed = pack_padded_sequence(data, data_lens, batch_first=True, enforce_sorted=False)

        output_packed, (hn, cn) = self.rnn(data_packed)

        # unpack from rnn
        output_padded, output_lengths = pad_packed_sequence(output_packed, batch_first=True)

        # output shape: (batch_size, seq_len, output_size)
        output = self.linear(output_padded)

        output = F.log_softmax(output, dim=-1)

        return output, output_lengths

In [None]:
def convert_to_string(tokens, vocab, seq_len):
    """Look up the first `seq_len` entries of `tokens` in `vocab` and concatenate them."""
    symbols = []
    for token in tokens[:seq_len]:
        symbols.append(vocab[token])
    return ''.join(symbols)

def decode_beam_result(batch_tokens, vocab, batch_seq_lens):
    """Convert every token row of a batch to a string, truncated to its own length.

    Row `i` of `batch_tokens` is cut to `batch_seq_lens[i]` tokens before the
    vocabulary lookup.
    """
    return [
        convert_to_string(row_tokens, vocab, batch_seq_lens[row])
        for row, row_tokens in enumerate(batch_tokens)
    ]


def decode(decoder, vocab, outputs, target, output_lens, target_lens):
    """Beam-search decode model outputs and, when given, stringify the targets.

    Args:
        decoder: object whose `decode(outputs, seq_lens=...)` returns
            (beam_result, beam_scores, timesteps, beam_output_lens).
        vocab: index -> symbol lookup.
        outputs: log_softmax output from the model.
        target: padded target token rows, or None.
        output_lens: lengths passed to the decoder as `seq_lens`.
        target_lens: per-row target lengths, or None.

    Returns:
        (preds, golds): predicted strings taken from beam index 0 of every
        sample, and the target strings (None when `target` or `target_lens`
        is None).
    """
    # step1: CTC beamsearch
    decoded = decoder.decode(outputs, seq_lens=output_lens)
    beam_result, _beam_scores, _timesteps, beam_output_lens = decoded

    # step2: decode -- only beam index 0 of every sample is converted
    preds = decode_beam_result(beam_result[:, 0, :], vocab, beam_output_lens[:, 0])

    if target is None or target_lens is None:
        return (preds, None)

    return (preds, decode_beam_result(target, vocab, target_lens))

def get_edit_distances(preds, golds):
    """Return the summed Levenshtein distance between each prediction and its gold string.

    `preds[i]` is compared with `golds[i]`; the per-pair distances are added up.
    """
    return sum(
        Levenshtein.distance(pred, golds[position])
        for position, pred in enumerate(preds)
    )

In [None]:
def train_epoch(model, train_loader, criterion, optimizer, device, decoder, vocab):
    """Train `model` for one pass over `train_loader`.

    `decoder` and `vocab` are accepted for signature parity with
    `evaluate_model` but are not used: no decoding is done during training,
    so the returned accuracy is always 0.

    Returns:
        (mean_loss, acc): loss averaged over the number of batches, and 0.
    """
    model.train()

    loss_sum = 0.0
    started = time.time()

    for sample in tqdm(train_loader):
        data_lens, target_lens = sample.data_lens, sample.target_lens
        # Padding must extend exactly to the longest sequence in the batch.
        assert sample.data.shape[1] == max(data_lens)
        assert sample.target.shape[1] == max(target_lens)

        data = sample.data.to(device)
        target = sample.target.to(device)

        optimizer.zero_grad()

        outputs, output_lens = model(data, data_lens)

        # The criterion receives time-major log-probs, hence the permute.
        loss = criterion(log_probs=outputs.permute(1, 0, 2),
                         targets=target,
                         input_lengths=output_lens,
                         target_lengths=torch.tensor(target_lens))
        loss_sum += loss.item()

        loss.backward()
        optimizer.step()

    elapsed = time.time() - started

    loss_sum /= len(train_loader)
    acc = 0
    print('Training Loss: ', loss_sum, 'Time: ', elapsed, 's')

    return loss_sum, acc

def evaluate_model(model, eval_loader, criterion, device, decoder, vocab):
    """Evaluate `model` on `eval_loader` without tracking gradients.

    Returns:
        (mean_loss, acc): loss averaged over the number of batches, and the
        summed edit distance divided by the number of evaluated samples.
    """
    with torch.no_grad():
        model.eval()

        loss_sum = 0.0
        sample_count = 0
        edit_distance_sum = 0

        for sample in tqdm(eval_loader):
            data_lens, target_lens = sample.data_lens, sample.target_lens
            # Padding must extend exactly to the longest sequence in the batch.
            assert sample.data.shape[1] == max(data_lens)
            assert sample.target.shape[1] == max(target_lens)

            data = sample.data.to(device)
            target = sample.target.to(device)

            outputs, output_lens = model(data, data_lens)

            # The criterion receives time-major log-probs, hence the permute.
            loss = criterion(log_probs=outputs.permute(1, 0, 2),
                             targets=target,
                             input_lengths=output_lens,
                             target_lengths=torch.tensor(target_lens)).detach()

            loss_sum += loss.item()
            sample_count += target.size(0)

            preds, golds = decode(decoder, vocab, outputs, target, output_lens, target_lens)
            edit_distance_sum += get_edit_distances(preds, golds)

        loss_sum /= len(eval_loader)
        acc = edit_distance_sum / sample_count
        print('evaluate Loss: ', loss_sum)
        print('evaluate Accuracy (edit distance): ', acc)
        return loss_sum, acc

def train_model(model, epochs, train_loader, eval_loader, criterion, optimizer, device, decoder, vocab, scheduler=None, checkpoint_filename=None):
    """Run the train / evaluate loop for `epochs` epochs.

    After every epoch the scheduler (if any) is stepped and, when
    `checkpoint_filename` is given, a checkpoint is written to that path
    (overwriting the previous one).

    Args:
        model: network to train; moved to `device` in place.
        epochs: number of epochs to run.
        train_loader, eval_loader: batch iterables for training / evaluation.
        criterion, optimizer: loss module and optimizer.
        device: device the model and batches are moved to.
        decoder, vocab: forwarded to `train_epoch` / `evaluate_model`.
        scheduler: optional StepLR or ReduceLROnPlateau instance.
        checkpoint_filename: optional path for the per-epoch checkpoint.

    Raises:
        ValueError: if `scheduler` is neither StepLR nor ReduceLROnPlateau.
    """
    model.to(device)

    for epoch in range(epochs):
        print("epoch: %d" % (epoch))

        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device, decoder, vocab)
        eval_loss, eval_acc = evaluate_model(model, eval_loader, criterion, device, decoder, vocab)

        if scheduler is not None:
            if isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
                # Plateau scheduling is driven by the monitored metric.
                scheduler.step(eval_loss)
            elif isinstance(scheduler, optim.lr_scheduler.StepLR):
                scheduler.step()
            else:
                raise ValueError("No such scheduler")

        if checkpoint_filename:
            checkpoint = {
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                # scheduler is optional; keep the key so the checkpoint layout is stable
                "scheduler_state_dict": scheduler.state_dict() if scheduler is not None else None,
            }
            torch.save(checkpoint, checkpoint_filename)
            print('model is saved to {}'.format(checkpoint_filename))

        print('=' * 20)

    return

In [None]:
def test_model(model, test_loader, device, decoder, vocab, save=False, filename="../data/test_pred.csv"):
    all_preds = []
    
    with torch.no_grad():
        model.eval()

        model.to(device)

        # no target in test dataset/data loader
        for batch_idx, sample in enumerate(tqdm(test_loader)):
            data = sample.data
            data_lens = sample.data_lens
            assert data.shape[1] == max(data_lens)

            data = data.to(device)

            outputs, output_lens = model(data, data_lens)

            preds, golds = decode(decoder, vocab, outputs, None, output_lens, None)
            
            all_preds.extend(preds)

    if save:
        result = np.concatenate([np.arange(len(all_preds)).reshape(-1, 1),
                                 np.array(all_preds).reshape(-1, 1)], axis=1)
        np.savetxt(filename, result, fmt="%s", delimiter=",", header="id,Predicted", comments="")

    return all_preds

In [None]:
# Build the network on the target device before handing its parameters to the optimizer.
model = Baseline(input_size, hidden_size, output_size, num_layers, bidirectional, dropout).to(device)
optimizer = optim.SGD(
    model.parameters(),
    lr=lr,
    momentum=0.9,
    weight_decay=5e-4,
    nesterov=True,
)
# Alternatives kept for reference:
# optimizer = optim.Adam(model.to(device).parameters(), lr=0.01)
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.98)
criterion = nn.CTCLoss(blank=blank_idx)

In [None]:
# Resume from the checkpoint when one exists. On a fresh run there is no file
# yet, so training starts from the freshly initialised model instead of
# aborting the notebook with FileNotFoundError.
if os.path.isfile(checkpoint_filename):
    checkpoint = torch.load(checkpoint_filename, map_location=device)
    # checkpoint = torch.load(checkpoint_path + "checkpoint_1_11-86.tar", map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # Tolerate checkpoints that carry no scheduler state.
    if checkpoint.get('scheduler_state_dict') is not None:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
else:
    print("no checkpoint found at %s; starting from scratch" % checkpoint_filename)

In [None]:
# Train on the training loader, evaluate on the dev loader after each epoch,
# and checkpoint to `checkpoint_filename`.
train_model(
    model=model,
    epochs=epochs,
    train_loader=train_loader,
    eval_loader=dev_loader,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    decoder=decoder,
    vocab=vocab,
    scheduler=scheduler,
    checkpoint_filename=checkpoint_filename,
)

if verbose:
    print("finished")

In [None]:
# Decode the test split and write the predictions to `pred_filename`.
predicts = test_model(
    model=model,
    test_loader=test_loader,
    device=device,
    decoder=decoder,
    vocab=vocab,
    save=True,
    filename=pred_filename,
)

if verbose:
    print("finished")

## Debugging Area

In [None]:
# to change lr in half way
# Overwrite the learning rate of every parameter group, then attach a fresh
# StepLR to the optimizer.
for params in optimizer.param_groups:
    params.update(lr=0.01)
scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=1, gamma=0.98)
scheduler.get_last_lr()

In [None]:
# scheduler.step()
# Bare expression: shows the optimizer's repr as the cell output.
optimizer

In [None]:
# Shell escape: run nvidia-smi and show its output in the notebook.
!nvidia-smi

In [None]:
# for batch_idx, sample in enumerate(tqdm(train_loader)):
#       data, target = sample.data, sample.target
#       data_lens, target_lens = sample.data_lens, sample.target_lens
#       assert data.shape[1] == max(data_lens)
#       assert target.shape[1] == max(target_lens)
      
#       data = data.to(device)
#       target = target.to(device)

#       outputs, output_lens = model(data, data_lens)

#       loss = criterion(log_probs = outputs.permute(1, 0, 2), 
#             targets = target, 
#             input_lengths = output_lens, 
#             target_lengths = torch.tensor(target_lens))

#       pred, golds = decode(decoder, vocab, 
#                                           outputs, target, 
#                                           output_lens, 
#                                           target_lens)
#       total_edit_distance = get_edit_distances(pred, golds)
#       break

In [None]:
def test_model_ensemble(model_1, model_2, test_loader, device, decoder, vocab, save=False, filename="../data/test_pred.csv"):
    all_preds = []
    
    with torch.no_grad():
        model.eval()

        model.to(device)

        # no target in test dataset/data loader
        for batch_idx, sample in enumerate(tqdm(test_loader)):
            data = sample.data
            data_lens = sample.data_lens
            assert data.shape[1] == max(data_lens)

            data = data.to(device)

            outputs_1, output_lens_1 = model_1(data, data_lens)
            outputs_2, output_lens_2 = model_2(data, data_lens)

            outputs = 0.5 * outputs_1 + 0.5 * outputs_2
            output_lens = output_lens_1

            preds, golds = decode(decoder, vocab, outputs, None, output_lens, None)
          
            all_preds.extend(preds)

        if save:
            result = np.concatenate([np.arange(len(all_preds)).reshape(-1, 1),
                                    np.array(all_preds).reshape(-1, 1)], axis=1)
            np.savetxt(filename, result, fmt="%s", delimiter=",", header="id,Predicted", comments="")

        return all_preds  

## Ensemble

In [None]:
# Checkpoints of the two ensemble members.
model_1_path = checkpoint_filename
model_2_path = checkpoint_path + "checkpoint_1_11-86.tar"

# Both members share the architecture defined by the hyper-parameters.
model1 = Baseline(input_size, hidden_size, output_size, num_layers, bidirectional, dropout)
model2 = Baseline(input_size, hidden_size, output_size, num_layers, bidirectional, dropout)

# Only the model weights are restored; optimizer/scheduler state is not needed for inference.
checkpoint = torch.load(model_1_path, map_location=device)
model1.load_state_dict(checkpoint['model_state_dict'])

checkpoint = torch.load(model_2_path, map_location=device)
model2.load_state_dict(checkpoint['model_state_dict'])

In [None]:
# Decode the test split with the two-model ensemble and write the predictions
# to `pred_filename`.
predicts = test_model_ensemble(
    model_1=model1,
    model_2=model2,
    test_loader=test_loader,
    device=device,
    decoder=decoder,
    vocab=vocab,
    save=True,
    filename=pred_filename,
)

if verbose:
    print("finished ensemble")