In [None]:
# Mount Google Drive when running on Colab. Outside Colab the `google.colab`
# package does not exist, so skip the mount instead of aborting the notebook
# (the paths cell falls back to local directories when `colab` is False).
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("google.colab not available; skipping Google Drive mount")
else:
    drive.mount('/content/drive')

In [None]:
# !cp drive/My\ Drive/11785/hw4_p2/src/dataloader.py .
# !cp drive/My\ Drive/11785/hw4_p2/src/models.py .
# !pip install python-Levenshtein

In [None]:
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
import torch.nn as nn
from torch.nn import functional as F
from tqdm.notebook import tqdm
import torch.optim as optim
import time
import Levenshtein
import os
import importlib
import dataloader
import models

# Run-wide switches: later cells compare `mode` against 'actual' to pick the
# training data and batch size, and use `colab` to pick the project root.
mode = "actual"
colab = False

# Hardware-dependent settings: use the GPU and worker processes only when CUDA exists.
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")
num_workers = 4 if cuda else 0


def reload():
    """Reload the local `dataloader` and `models` modules so on-disk edits take effect."""
    for module in (dataloader, models):
        importlib.reload(module)


# One-shot summary of the effective configuration.
print("\n".join([
    "mode: %s" % mode,
    "torch version: %s" % torch.__version__,
    "np version: %s" % np.__version__,
    "cuda: %s" % cuda,
    "num_workers: %s" % num_workers,
    "device: %s" % device,
    "colab: %s" % colab,
]))

In [None]:
def generate_mask(lens, batch_size):
    """Build a 0/1 padding mask from per-sequence lengths.

    Args:
        lens: sequence lengths, either a list/tuple of ints or a 1-D integer tensor.
        batch_size: number of rows of the returned mask. It must equal
            ``len(lens)`` (or ``len(lens)`` must be 1, in which case the single
            row is repeated); any other combination raises a RuntimeError.

    Returns:
        An int32 tensor of shape ``(batch_size, max(lens))`` whose entry
        ``[i, t]`` is 1 when ``t < lens[i]`` and 0 otherwise. It lives on the
        device of ``lens`` (CPU when ``lens`` is a Python sequence).
    """
    # as_tensor reuses an existing tensor; torch.tensor(<tensor>) would copy it
    # and emit a "copy construct" UserWarning on every batch.
    lens = torch.as_tensor(lens)
    max_len = int(lens.max())
    positions = torch.arange(max_len, device=lens.device)
    # Broadcast (1, max_len) against (N, 1) to compare every position with every length.
    mask = positions.unsqueeze(0) < lens.unsqueeze(1)
    # .int() materialises a fresh tensor, so callers can safely .view(-1) the result.
    return mask.expand(batch_size, max_len).int()

def get_avg_edit_distances(preds, golds):
    """Return the mean Levenshtein distance between predictions and references.

    Args:
        preds: sequence of predicted strings.
        golds: sequence of reference strings, paired with ``preds`` by position.

    Returns:
        The summed edit distance over the zipped pairs divided by ``len(preds)``,
        or 0.0 when ``preds`` is empty (previously a ZeroDivisionError).

    Note:
        Pairs are formed with ``zip``, so if ``golds`` is shorter than ``preds``
        the unmatched predictions contribute nothing to the sum while still
        counting in the denominator.
    """
    if len(preds) == 0:
        return 0.0
    total_distance = sum(Levenshtein.distance(pred, gold) for pred, gold in zip(preds, golds))
    return total_distance / len(preds)

def train_epoch(model, train_loader, criterion, optimizer):
    """Train ``model`` for one pass over ``train_loader`` and report the averages.

    Every batch must expose ``inputs``/``outputs``/``targets`` tensors together
    with the matching ``*_lens`` attributes. The model is called as a language
    model (no speech arguments, ``text=outputs``, ``isTrain=True``) and the
    per-position ``criterion`` output is averaged over the positions kept by
    the target-length mask.

    Relies on the notebook globals ``device``, ``dataloader``, ``index2letter``
    and ``letter2index``.

    Args:
        model: network to optimise; switched to training mode here.
        train_loader: iterable of batches that also supports ``len()``.
        criterion: loss returning one value per flattened target position.
        optimizer: optimiser that is zeroed and stepped once per batch.

    Returns:
        Tuple ``(mean masked loss, mean perplexity)``, both averaged over batches.
    """
    model.train()

    loss_total = 0.0
    perplexity_total = 0.0
    started_at = time.time()

    for step, batch in enumerate(tqdm(train_loader)):
        if step == 0:
            # Decode the first sample of the epoch as a quick look at the data.
            for label, token_ids in (("output: ", batch.outputs), ("target: ", batch.targets)):
                decoded = dataloader.transform_index_to_letter(
                    token_ids.cpu().numpy(), index2letter, [letter2index['<pad>']])
                print(label, decoded[0])

        # Each padded tensor must be exactly as wide as its longest sequence.
        for padded, lengths in ((batch.inputs, batch.inputs_lens),
                                (batch.outputs, batch.outputs_lens),
                                (batch.targets, batch.targets_lens)):
            assert padded.shape[1] == max(lengths)

        # TODO: enable torch.autograd.set_detect_anomaly(True) to get notices about gradient explosion

        speech = batch.inputs.to(device)
        decoder_inputs = batch.outputs.to(device)
        gold = batch.targets.to(device)

        optimizer.zero_grad()

        logits = model(None, None, text=decoder_inputs, isTrain=True)

        # 1 for real target positions, 0 for padding.
        keep = generate_mask(batch.targets_lens, speech.shape[0]).to(device)

        # Per-position loss over the flattened batch, then the mean over unpadded positions.
        token_losses = criterion(logits.view(-1, logits.size(2)), gold.view(-1))
        loss = torch.sum(token_losses * keep.view(-1)) / torch.sum(keep)

        loss_total += loss.item()
        perplexity_total += torch.exp(loss).item()

        loss.backward()

        # TODO: clip gradients here (torch.nn.utils.clip_grad_norm with max norm 2)

        optimizer.step()

    elapsed = time.time() - started_at

    num_batches = len(train_loader)
    mean_loss = loss_total / num_batches
    mean_perplexity = perplexity_total / num_batches

    print('Training Loss: ', mean_loss, 'Training Perplexity: ', mean_perplexity,
          'Time: ', elapsed, 's')

    return mean_loss, mean_perplexity

def evaluate_epoch(model, eval_loader, criterion):
    """Evaluate ``model`` on ``eval_loader`` without tracking gradients.

    Each batch is run through the model twice: once with ``isTrain=True`` to
    obtain the masked loss and perplexity, and once with ``isTrain=False`` to
    generate text whose arg-max decoding is compared with the targets by edit
    distance. One prediction/target pair is printed every 20 batches.

    Relies on the notebook globals ``device``, ``dataloader``, ``index2letter``
    and ``letter2index``.

    Args:
        model: network to evaluate; switched to eval mode here.
        eval_loader: iterable of batches that also supports ``len()``.
        criterion: loss returning one value per flattened target position.

    Returns:
        Tuple ``(mean loss, mean perplexity, mean edit distance)``, each
        averaged over batches.
    """
    def decode(index_tensor):
        # Turn index sequences into text, dropping <eos> and <pad> symbols.
        return dataloader.transform_index_to_letter(
            index_tensor.detach().cpu().numpy(), index2letter,
            [letter2index['<eos>'], letter2index['<pad>']])

    with torch.no_grad():
        model.eval()

        loss_total = 0.0
        perplexity_total = 0.0
        edit_distance_total = 0.0

        for step, batch in enumerate(tqdm(eval_loader)):
            # Each padded tensor must be exactly as wide as its longest sequence.
            for padded, lengths in ((batch.inputs, batch.inputs_lens),
                                    (batch.outputs, batch.outputs_lens),
                                    (batch.targets, batch.targets_lens)):
                assert padded.shape[1] == max(lengths)

            speech = batch.inputs.to(device)
            decoder_inputs = batch.outputs.to(device)
            gold = batch.targets.to(device)
            rows = speech.shape[0]

            # Pass 1 (isTrain=True): masked loss and perplexity.
            logits = model(None, None, text=decoder_inputs, isTrain=True)
            keep = generate_mask(batch.targets_lens, rows).to(device)

            token_losses = criterion(logits.view(-1, logits.size(2)), gold.view(-1))
            loss = torch.sum(token_losses * keep.view(-1)) / torch.sum(keep)

            loss_total += loss.item()
            perplexity_total += torch.exp(loss).item()

            # Pass 2 (isTrain=False): generated output.
            generated = model(None, None, text=decoder_inputs, isTrain=False, batch_size=rows)

            # TODO: use random search/greedy search here
            predictions_texts = decode(generated.argmax(-1))
            targets_texts = decode(gold)

            edit_distance_total += get_avg_edit_distances(predictions_texts, targets_texts)

            if step % 20 == 0:
                print("pred:\n", predictions_texts[0],"\ntgt:\n", targets_texts[0])

        num_batches = len(eval_loader)
        mean_loss = loss_total / num_batches
        mean_perplexity = perplexity_total / num_batches
        mean_edit_distance = edit_distance_total / num_batches

        print('Evaluate Loss: ', mean_loss,
              'Evaluate Perplexity: ', mean_perplexity,
              'Evaluate Edit Distance:', mean_edit_distance)

        return mean_loss, mean_perplexity, mean_edit_distance

def train_model(model, epochs, train_loader, eval_loader, criterion, optimizer, scheduler=None, checkpoint_filename=None):
    """Train and evaluate ``model`` for ``epochs`` epochs, optionally checkpointing.

    After every epoch the learning-rate scheduler (if any) is advanced and, when
    ``checkpoint_filename`` is given, the current state is written to that file
    (overwriting the previous epoch's checkpoint).

    Relies on the notebook global ``device`` and on ``train_epoch`` /
    ``evaluate_epoch``.

    Args:
        model: network to train; moved to ``device`` first.
        epochs: number of train + evaluate rounds.
        train_loader: batches passed to ``train_epoch``.
        eval_loader: batches passed to ``evaluate_epoch``.
        criterion: loss function forwarded to both epoch functions.
        optimizer: optimiser forwarded to ``train_epoch``.
        scheduler: ``None``, a ``StepLR`` (stepped unconditionally) or a
            ``ReduceLROnPlateau`` (stepped with the evaluation loss).
        checkpoint_filename: path for ``torch.save``; falsy disables checkpointing.

    Raises:
        ValueError: if ``scheduler`` is of an unsupported type.
    """
    model.to(device)

    for epoch in range(epochs):
        print("epoch: %d" % (epoch))

        train_epoch(model, train_loader, criterion, optimizer)
        eval_loss, _, _ = evaluate_epoch(model, eval_loader, criterion)

        if scheduler is not None:
            if isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
                # Plateau detection needs the metric being monitored.
                scheduler.step(eval_loss)
            elif isinstance(scheduler, optim.lr_scheduler.StepLR):
                scheduler.step()
            else:
                raise ValueError("No such scheduler")

        if checkpoint_filename:
            checkpoint = {
                "model_state_dict" : model.state_dict(),
                "optimizer_state_dict" : optimizer.state_dict(),
                # Keep the key present even without a scheduler so the
                # checkpoint layout is the same in both cases.
                "scheduler_state_dict" : scheduler.state_dict() if scheduler is not None else None
            }
            torch.save(checkpoint, checkpoint_filename)
            print('model is saved to {}'.format(checkpoint_filename))

        print('=' * 20)

def test_model(model, test_loader, save=False, filename="../pred/pred.csv"):
    all_preds = []
    
    with torch.no_grad():
        model.eval()

        model.to(device)

        # no target in test dataset/data loader
        for batch_idx, sample in enumerate(tqdm(test_loader)):
            assert sample.inputs.shape[1] == max(sample.inputs_lens)

            inputs, inputs_lens = sample.inputs, sample.inputs_lens

            inputs = inputs.to(device)

            # generate mode
            predictions = model(None, None, text=None, isTrain=False, batch_size=inputs.shape[0])

            # TODO: use random search/greedy search here
            predictions_texts = \
                dataloader.transform_index_to_letter(predictions.argmax(-1).detach().cpu().numpy(), index2letter, [letter2index['<eos>'], letter2index['<pad>']])            

            all_preds.extend(predictions_texts)
        

    if save:
        result = np.concatenate([np.arange(len(all_preds)).reshape(-1, 1),
                                 np.array(all_preds).reshape(-1, 1)], axis=1)
        np.savetxt(filename, result, fmt="%s", delimiter=",", header="Id,Predicted", comments="")

    return all_preds

In [None]:
# Project root: the Drive-mounted copy on Colab, the parent directory otherwise.
root_path = "drive/My Drive/11785/hw4_p2/" if colab else "../"

# Sub-directories keep their trailing slash so file names can be appended directly.
pred_path, data_path, checkpoint_path = (
    root_path + sub_dir for sub_dir in ("pred/", "data/", "checkpoint/")
)

# Run identifier embedded in the checkpoint file name.
ID = 1
checkpoint_filename = checkpoint_path + ("checkpoint_%d.tar" % ID)
pred_filename = pred_path + "pred.csv"

# Dataset files inside the data directory.
(
    train_path,
    train_transcripts_path,
    dev_path,
    dev_transcripts_path,
    test_path,
) = (
    data_path + file_name
    for file_name in (
        "train_new.npy",
        "train_transcripts.npy",
        "dev_new.npy",
        "dev_transcripts.npy",
        "test_new.npy",
    )
)

In [None]:
# NOTE: allow_pickle=True unpickles the file contents, so only load trusted .npy files.
# In any mode other than 'actual' the dev split stands in for the training split.
train = np.load(train_path if mode == 'actual' else dev_path,
                allow_pickle=True, encoding='bytes')
train_transcripts = np.load(train_transcripts_path if mode == 'actual' else dev_transcripts_path,
                            allow_pickle=True, encoding='bytes')

# Dev and test data are always loaded from their own files.
dev, dev_transcripts, test = (
    np.load(path, allow_pickle=True, encoding='bytes')
    for path in (dev_path, dev_transcripts_path, test_path)
)

In [None]:
# Output vocabulary: '<pad>' first, then A-Z, punctuation/space, and the
# start/end-of-sequence markers.
LETTER_LIST = (
    ['<pad>']
    + [chr(code) for code in range(ord('A'), ord('Z') + 1)]
    + ['-', "'", '.', '_', '+', ' ', '<sos>', '<eos>']
)

# Smaller batches outside the full ('actual') run.
if mode == 'actual':
    batch_size = 128
else:
    batch_size = 32

epochs = 100
vocab_size = len(LETTER_LIST)

# Decoder dimensions.
hidden_dim, value_size, key_size = 256, 128, 128

In [None]:
# Lookup tables between vocabulary symbols and their indices.
letter2index, index2letter = dataloader.create_dictionaries(LETTER_LIST)

# Transcripts converted to index sequences, for the train and dev splits.
train_transcripts_index, dev_transcripts_index = (
    dataloader.transform_letter_to_index(transcripts, letter2index)
    for transcripts in (train_transcripts, dev_transcripts)
)

# Train/dev datasets pair speech with transcripts; the test dataset has no text.
train_dataset, dev_dataset = (
    dataloader.Speech2TextDataset(speech, text_index)
    for speech, text_index in ((train, train_transcripts_index),
                               (dev, dev_transcripts_index))
)
test_dataset = dataloader.Speech2TextDataset(test, text=None, isTrain=False)

In [None]:
# TODO: may need to sort all data

# Settings shared by all three loaders.
common_loader_args = dict(
    batch_size=batch_size,      # Batch size
    pin_memory=cuda,            # Pinned host memory only helps host-to-GPU copies; skip it on CPU-only runs
    num_workers=num_workers,    # Number of worker processes for loading data
)

# Training data is reshuffled at every epoch.
train_loader = DataLoader(
                train_dataset,
                shuffle=True,
                collate_fn=dataloader.collate,
                **common_loader_args
               )

# Dev data keeps its original order.
dev_loader = DataLoader(
                dev_dataset,
                shuffle=False,
                collate_fn=dataloader.collate,
                **common_loader_args
               )

# Test data keeps its original order (predictions are written out by position)
# and uses the collate function for batches without transcripts.
test_loader = DataLoader(
                test_dataset,
                shuffle=False,
                collate_fn=dataloader.collate_test,
                **common_loader_args
               )

In [None]:
# language model: the decoder alone, created with isAttended=False and isLM=True
model = models.Decoder(vocab_size, hidden_dim, value_size, key_size, isAttended=False, isLM=True)
# Move the parameters to the target device before handing them to the optimiser.
model.to(device)

# reduction='none' keeps one loss value per position so padding can be masked out later.
criterion = nn.CrossEntropyLoss(reduction='none')
optimizer = optim.Adam(model.parameters(), lr=0.01)

print(model)

In [None]:
# Train on the training loader and evaluate on the dev loader after each epoch.
# No scheduler is used, and with checkpoint_filename=None nothing is saved to disk.
train_model(
    model=model,
    epochs=epochs,
    train_loader=train_loader,
    eval_loader=dev_loader,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=None,
    checkpoint_filename=None,
)

print("finished")

In [None]:
# Generate predictions for the test set and write them to the prediction CSV.
test_predicts = test_model(
    model=model,
    test_loader=test_loader,
    save=True,
    filename=pred_filename,
)

print("finished")

## DEBUG