# installation

https://github.com/bentrevett/pytorch-seq2seq
https://github.com/oxford-cs-deepnlp-2017/lectures
http://www.phontron.com/class/nn4nlp2019/assignments.html

In [0]:
!pip install torch==1.0.0

Collecting torch==1.0.0
  Downloading https://files.pythonhosted.org/packages/7e/60/66415660aa46b23b5e1b72bc762e816736ce8d7260213e22365af51e8f9c/torch-1.0.0-cp36-cp36m-manylinux1_x86_64.whl (591.8MB)
     |████████████████████████████████| 591.8MB 31kB/s 
ERROR: torchvision 0.4.2 has requirement torch==1.3.1, but you'll have torch 1.0.0 which is incompatible.
Installing collected packages: torch
  Found existing installation: torch 1.3.1
    Uninstalling torch-1.3.1:
      Successfully uninstalled torch-1.3.1
Successfully installed torch-1.0.0


In [0]:
# Sanity check: ask TensorFlow which compute devices are visible in this runtime.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 8265018910091821816, name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 1824534260472105775
 physical_device_desc: "device: XLA_CPU device", name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 3221333456112038677
 physical_device_desc: "device: XLA_GPU device", name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 15956161332
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 16314864333158252133
 physical_device_desc: "device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0"]

In [0]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130


# utils.py

In [0]:
import math
from typing import List

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

In [0]:
def pad_sents_char(sents: List[List[List[int]]], char_pad_token: int) -> List[List[List[int]]]:
    """ Pad (or truncate) every word to a fixed character length and every sentence
    to the length of the longest sentence in the batch.
    @param sents (list[list[list[int]]]): list of sentences, result of `words2charindices()`
        from `vocab.py`
    @param char_pad_token (int): index of the character-padding token
    @returns sents_padded (list[list[list[int]]]): list of sentences where sentences/words shorter
        than the max length sentence/word are padded out with the appropriate pad token, such that
        each sentence in the batch now has same number of words and each word has an equal
        number of characters. Words longer than max_word_length are truncated.
        Output shape: (batch_size, max_sentence_length, max_word_length)
        An empty batch yields an empty list.

    In this function: batch_size is the number of sentences in `sents`
                      max_sentence_length is the length of the longest sentence among sentences
                      within the batch
                      max_word_length is pre-defined (21)

    The input lists are never modified; every returned word is a fresh list.
    """
    # Words longer than 21 characters are truncated
    max_word_length = 21

    max_sent_len = max((len(sent) for sent in sents), default=0)

    padded_sents = []
    for sent in sents:
        # Slicing truncates over-long words; a non-positive repeat count yields
        # an empty list, so one expression covers both padding and truncation.
        padded_words = [
            word[:max_word_length] + [char_pad_token] * (max_word_length - len(word))
            for word in sent
        ]
        # Pad the sentence with all-pad words. Each pad word is its own list so
        # that mutating one row of the result cannot affect another.
        padded_words.extend(
            [char_pad_token] * max_word_length
            for _ in range(max_sent_len - len(sent))
        )
        padded_sents.append(padded_words)

    return padded_sents

In [0]:
def pad_sents(sents: List[List[int]], pad_token: int):
    """ Pad list of sentences according to the longest sentence in the batch.
    @param sents (list[list[int]]): list of sentences, where each sentence
                                    is represented as a list of words
    @param pad_token (int): padding token
    @returns sents_padded (list[list[int]]): list of sentences where sentences shorter
        than the max length sentence are padded out with the pad_token, such that
        each sentence in the batch now has equal length.
        Output shape: (batch_size, max_sentence_length)
        An empty batch yields an empty list. The input lists are not modified.
    """
    max_len_sents = max((len(sent) for sent in sents), default=0)
    # `sent + [...]` always builds a new list, even when no padding is needed
    return [sent + [pad_token] * (max_len_sents - len(sent)) for sent in sents]

In [0]:
def read_corpus(file_path, source):
    """ Read file, where each sentence is delineated by a newline.
    @param file_path (str): path to file containing corpus
    @param source (str): "tgt" or "src" indicating whether text
        is of the source language or target language
    @returns data (list[list[str]]): one list of space-separated tokens per line;
        target sentences are wrapped in `<s>` ... `</s>`
    """
    data = []
    # 'utf-8-sig' transparently drops a leading BOM if the file has one.
    # The context manager guarantees the file is closed, even on error.
    with open(file_path, encoding = 'utf-8-sig') as corpus_file:
        for line in corpus_file:
            sent = line.strip().split(' ')
            # only append <s> and </s> to the target sentence
            if source == 'tgt':
                sent = ['<s>'] + sent + ['</s>']
            data.append(sent)

    return data

In [0]:
def batch_iter(data, batch_size, shuffle=False):
    """ Generate (src_sents, tgt_sents) batches; inside each batch the pairs are
    ordered by source sentence length, longest first.
    @param data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
    @param batch_size (int): batch size
    @param shuffle (boolean): whether to randomly shuffle the dataset
    """
    n_examples = len(data)
    n_batches = math.ceil(n_examples / batch_size)

    order = list(range(n_examples))
    if shuffle:
        np.random.shuffle(order)

    for batch_no in range(n_batches):
        start = batch_no * batch_size
        batch = [data[pos] for pos in order[start: start + batch_size]]

        # Stable in-place sort, longest source sentence first
        batch.sort(key=lambda pair: len(pair[0]), reverse=True)

        yield [pair[0] for pair in batch], [pair[1] for pair in batch]

# vocab.py

In [0]:
from collections import Counter
from docopt import docopt
from itertools import chain
import json
import torch
from typing import List

In [0]:
class VocabEntry(object):
    """ Vocabulary of a single language: word <-> index and character <-> index tables,
    plus helpers that turn batches of sentences into padded index tensors.
    """
    def __init__(self, word2id = None):
        """ Init VocabEntry Instance.
        @param word2id (dict): dictionary mapping words 2 indices. When it is None
                               (or empty) a fresh table holding only the four
                               special tokens is created.
        """
        if word2id:
            self.word2id = word2id
        else:
            self.word2id = dict()
            self.word2id['<pad>'] = 0   # Pad Token
            self.word2id['<s>'] = 1 # Start Token
            self.word2id['</s>'] = 2    # End Token
            self.word2id['<unk>'] = 3   # Unknown Token
        self.unk_id = self.word2id['<unk>']
        self.id2word = {v: k for k, v in self.word2id.items()}

        ## Additions to the A4 code:
        self.char_list = list("""0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]“”＂""")
        self.char_list.extend(['a','ă','â','b','c','d','đ','e','ê','g','h','i','k','l',
                               'm','n','o','ô','ơ','p','q','r','s','t','u','ư','v','x','y'])
        self.char_list.extend(['á','à','ả','ã','ạ','é','è','ẻ','ẽ','ẹ','ế','ề','ể','ễ','ệ',
                               'ó','ò','ỏ','õ','ọ','ố','ồ','ổ','ỗ','ộ','ớ','ờ','ở','ỡ','ợ',
                               'ú','ù','ủ','ũ','ụ','ứ','ừ','ử','ữ','ự','ý','ỳ','ỷ','ỹ','ỵ',
                               'í','ì','ỉ','ĩ','ị','ấ','ầ','ẩ','ẫ','ậ','ắ','ằ','ẳ','ẵ','ặ'])
        self.char_list.extend(['A','B','C','D','Đ','E','Ê','G','H','I','K','L','M','N',
                               'O','Ô','Ơ','P','Q','R','S','T','U','Ư','V','X','Y','Ă','Â'])
        self.char_list.extend(['Á','À','Ả','Ã','Ạ','É','È','Ẻ','Ẽ','Ẹ','Ế','Ề','Ể','Ễ','Ệ',
                                'Ó','Ò','Ỏ','Õ','Ọ','Ố','Ồ','Ổ','Ỗ','Ộ','Ớ','Ờ','Ở','Ỡ','Ợ',
                                'Ú','Ù','Ủ','Ũ','Ụ','Ứ','Ừ','Ử','Ữ','Ự','Ý','Ỳ','Ỷ','Ỹ','Ỵ',
                                'Í','Ì','Ỉ','Ĩ','Ị','Ấ','Ầ','Ẩ','Ẫ','Ậ','Ắ','Ằ','Ẳ','Ẵ','Ặ'])

        self.char2id = dict() # Converts characters to integers
        self.char2id['<pad>'] = 0
        self.char2id['{'] = 1   # start-of-word marker
        self.char2id['}'] = 2   # end-of-word marker
        self.char2id['<unk>'] = 3

        # Each character receives the next free index after the special entries
        for c in self.char_list:
            self.char2id[c] = len(self.char2id)

        self.char_unk = self.char2id['<unk>']
        self.start_of_word = self.char2id["{"]
        self.end_of_word = self.char2id["}"]

        assert self.start_of_word + 1 == self.end_of_word

        self.id2char = {v: k for k, v in self.char2id.items()} # Converts integers to characters
        ## End additions to the A4 code

    def __getitem__(self, word):
        """ Retrieve word's index. Return the index for the unk
        token if the word is out of vocabulary.
        @param word (str): word to look up.
        @returns index (int): index of word
        """
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        """ Check if word is captured by VocabEntry.
        @param word (str): word to look up
        @returns contains (bool): whether word is contained
        """
        return word in self.word2id

    def __setitem__(self, key, value):
        """ Raise error, if one tries to edit the VocabEntry.
        """
        raise ValueError('vocabulary is readonly')

    def __len__(self):
        """ Compute number of words in VocabEntry.
        @returns len (int): number of words in VocabEntry
        """
        return len(self.word2id)

    def __repr__(self):
        """ Representation of VocabEntry to be used
        when printing the object.
        """
        return 'Vocabulary[size=%d]' % len(self)

    # NOTE(review): `__init__` stores a dict under the same name (`self.id2word`),
    # so on instances that dict shadows this method and `entry.id2word(wid)` is
    # not callable; use `entry.id2word[wid]` or `indices2words` instead. The
    # method is kept only so the class interface stays unchanged.
    def id2word(self, wid):
        """ Return mapping of index to word.
        @param wid (int): word index
        @returns word (str): word corresponding to index
        """
        return self.id2word[wid]

    def add(self, word):
        """ Add word to VocabEntry, if it is previously unseen.
        @param word (str): word to add to VocabEntry
        @return index (int): index that the word has been assigned
        """
        if word not in self:
            wid = self.word2id[word] = len(self)
            self.id2word[wid] = word
            return wid
        else:
            return self[word]

    def words2charindices(self, sents: List[List[str]]) -> List[List[List[int]]]:
        """ Convert list of sentences of words into list of list of list of character indices.

        Every word is wrapped in the start-of-word '{' and end-of-word '}' markers
        before lookup. Characters missing from the character table map to the
        `<unk>` character index instead of raising KeyError.

        @param sents (list[list[str]]): sentence(s) in words
        @return char_ids (list[list[list[int]]]): sentence(s) in indices
        """
        unk = self.char_unk
        return [[[self.char2id.get(char, unk) for char in '{' + word + '}']
                 for word in sent]
                for sent in sents]

    def words2indices(self, sents: List[List[str]]) -> List[List[int]]:
        """ Convert list of sentences of words into list of list of indices.
        @param sents (list[list[str]]): sentence(s) in words
        @return word_ids (list[list[int]]): sentence(s) in indices
        """
        return [[self[w] for w in s] for s in sents]

    def indices2words(self, word_ids: List[int]) -> List[str]:
        """ Convert list of indices into words.
        @param word_ids (list[int]): list of word ids
        @return sents (list[str]): list of words
        """
        return [self.id2word[w_id] for w_id in word_ids]

    def to_input_tensor_char(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into a character-index tensor with
        necessary padding for shorter sentences and words.
        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU
        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)

        max_sentence_length is the number of words including <pad>'s
        max_word_length is the number of characters including <pad>'s
        batch_size is the number of sentences

        The sentence dimension is moved to the front so the layout is
        (longest_len, batch, ...) rather than batch-first.
        """
        char_ids = self.words2charindices(sents)
        sents_t = pad_sents_char(char_ids, self.char2id['<pad>']) # (batch_size, max_sent_len, max_word_len)
        chars_var = torch.tensor(sents_t, dtype = torch.long, device = device)

        # torch.t() only transposes tensors of at most 2 dimensions, hence permute
        chars_var = chars_var.permute(1, 0, 2) # (max_sent_len, batch_size, max_word_len)

        return chars_var

    def to_input_tensor(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size)
        """
        word_ids = self.words2indices(sents)
        sents_t = pad_sents(word_ids, self['<pad>'])

        sents_var = torch.tensor(sents_t, dtype = torch.long, device = device) # (batch_size, max_sent_length)

        return torch.t(sents_var) # Transpose

    @staticmethod
    def from_corpus(corpus, size, freq_cutoff = 2):
        """ Given a corpus construct a Vocab Entry.
        @param corpus (list[list[str]]): corpus of text produced by read_corpus function
        @param size (int): maximum # of corpus words added to the vocabulary
                           (the four special tokens are always present in addition)
        @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word
        @returns vocab_entry (VocabEntry): VocabEntry instance produced from provided corpus
        """
        vocab_entry = VocabEntry()
        # Flatten all sentences into one token stream and count each word
        word_freq = Counter(chain(*corpus))
        valid_words = [w for w, v in word_freq.items() if v >= freq_cutoff]

        print('number of word types: {}, number of word types w/ frequency >= {}: {}'
              .format(len(word_freq), freq_cutoff, len(valid_words)))

        # Most frequent words first; the sort is stable, so ties keep first-seen order
        top_k_words = sorted(valid_words, key=lambda w: word_freq[w], reverse=True)[:size]

        for word in top_k_words:
            vocab_entry.add(word)

        return vocab_entry


In [0]:
class Vocab(object):
    """ Vocab encapsulating src and target languages.
    """
    def __init__(self, src_vocab: VocabEntry, tgt_vocab: VocabEntry):
        """ Init Vocab.
        @param src_vocab (VocabEntry): VocabEntry for source language
        @param tgt_vocab (VocabEntry): VocabEntry for target language
        """
        self.src = src_vocab
        self.tgt = tgt_vocab

    def get_word2id(self):
        """ Return the word-to-index tables of both languages.
        @returns (tuple(dict, dict)): (source word2id, target word2id)
        """
        return self.src.word2id, self.tgt.word2id

    @staticmethod
    def build(src_sents, tgt_sents, vocab_size, freq_cutoff) -> 'Vocab':
        """ Build Vocabulary.
        @param src_sents (list[list[str]]): Source sentences provided by read_corpus() function
        @param tgt_sents (list[list[str]]): Target sentences provided by read_corpus() function
        @param vocab_size (int): Size of vocabulary for both source and target languages
        @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word.
        @returns (Vocab): vocabulary holding one VocabEntry per language
        """
        assert len(src_sents) == len(tgt_sents)

        print('initialize source vocabulary ..')
        src = VocabEntry.from_corpus(src_sents, vocab_size, freq_cutoff)

        print('initialize target vocabulary ..')
        tgt = VocabEntry.from_corpus(tgt_sents, vocab_size, freq_cutoff)

        return Vocab(src, tgt)

    def save(self, file_path):
        """ Save Vocab to file as JSON dump.
        Only the two word2id tables are written.
        @param file_path (str): file path to vocab file
        """
        payload = dict(src_word2id=self.src.word2id, tgt_word2id=self.tgt.word2id)
        # Context manager flushes and closes the file even if serialization fails
        with open(file_path, 'w') as vocab_file:
            json.dump(payload, vocab_file, indent=2)

    @staticmethod
    def load(file_path):
        """ Load vocabulary from JSON dump.
        @param file_path (str): file path to vocab file
        @returns Vocab object loaded from JSON dump
        """
        with open(file_path, 'r') as vocab_file:
            entry = json.load(vocab_file)
        src_word2id = entry['src_word2id']
        tgt_word2id = entry['tgt_word2id']

        return Vocab(VocabEntry(src_word2id), VocabEntry(tgt_word2id))

    def __repr__(self):
        """ Representation of Vocab to be used
        when printing the object.
        """
        return 'Vocab(source %d words, target %d words)' % (len(self.src), len(self.tgt))

# source_model_embeddings.py

In [0]:
from typing import List, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F

In [0]:
import numpy as np

import torch
import torch.nn.functional as F
from torch import nn

In [0]:
class SourceModelEmbeddings(nn.Module):
    """
    Class that converts source-language word indices to their embeddings.
    """
    def __init__(self, embed_size, vocab):
        """
        Init the Embedding layers.

        @param embed_size (int): Embedding size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        """
        super(SourceModelEmbeddings, self).__init__()
        self.embed_size = embed_size

        src_pad_token_idx = vocab.src['<pad>']

        # One row per source word; the <pad> row is passed as padding_idx
        self.source = nn.Embedding(num_embeddings = len(vocab.src), embedding_dim = self.embed_size,
                                    padding_idx =  src_pad_token_idx)

    def forward(self, data):
        """
        Look up the embeddings of source word indices.

        @param data (Tensor): tensor of integer source word indices
        @returns (Tensor): embeddings, with one extra trailing dimension of size embed_size
        """
        return self.source(data)

# target_model_embeddings.py

In [0]:
import torch.nn as nn

In [0]:
class TargetModelEmbeddings(nn.Module):
    """
    Class that converts target-language word indices to their embeddings.
    """
    def __init__(self, embed_size, vocab):
        """
        Init the Embedding layers.

        @param embed_size (int): Embedding size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        """
        super(TargetModelEmbeddings, self).__init__()
        self.embed_size = embed_size

        tgt_pad_token_idx = vocab.tgt['<pad>']

        # The table must have one row per *target* word: sizing it with the
        # source vocabulary makes large target indices fall outside the table
        # whenever the target vocabulary is the bigger one.
        self.target = nn.Embedding(num_embeddings = len(vocab.tgt), embedding_dim = self.embed_size,
                                    padding_idx =  tgt_pad_token_idx)

    def forward(self, data):
        """
        Look up the embeddings of target word indices.

        @param data (Tensor): tensor of integer target word indices
        @returns (Tensor): embeddings, with one extra trailing dimension of size embed_size
        """
        return self.target(data)

# Mounting Google Drive

In [0]:
from google.colab import drive
# Mount the user's Google Drive at /content/drive inside the Colab runtime.
drive.mount('/content/drive')

# Base directory on the mounted Drive.
root = '/content/drive/My Drive/'

# nmt_model.py

In [0]:
from collections import namedtuple
import sys
from typing import List, Tuple, Dict, Set, Union
import torch
import torch.nn as nn
import torch.nn.utils
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

# Lightweight record with two fields, `value` and `score`
# (presumably a decoded sentence and its score -- confirm against the search code).
Hypothesis = namedtuple('Hypothesis', ['value', 'score'])
import random

In [0]:
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidrectional LSTM Encoder
        - Unidirection LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, no_char_decoder=False):
        """ Build every layer of the NMT model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        @param no_char_decoder (bool): when true, no CharDecoder is created and
                                       `self.charDecoder` is None
        """
        super(NMT, self).__init__()

        self.model_embeddings_source = SourceModelEmbeddings(embed_size, vocab)
        self.model_embeddings_target = TargetModelEmbeddings(embed_size, vocab)

        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # Layers are created in a fixed order so that parameter registration
        # (and random initialisation) order stays stable.

        # Encoder: single-layer bidirectional LSTM over word embeddings,
        # fed with ((seq_len, batch, embed_size), h, c)
        self.encoder = nn.LSTM(input_size = embed_size, hidden_size = hidden_size,
                               num_layers = 1, bias = True, bidirectional = True)

        # Decoder cell: input is y_bar_t, the word embedding concatenated
        # with the previous combined output
        self.decoder = nn.LSTMCell(input_size = embed_size + hidden_size,
                                   hidden_size = hidden_size, bias = True)

        # The notes write W * x + b; nn.Linear computes x * W^T + b

        # Project the concatenated (forward, backward) final encoder states
        # down to the decoder's initial hidden and cell state
        self.h_projection = nn.Linear(2 * hidden_size, hidden_size, bias = False)
        self.c_projection = nn.Linear(2 * hidden_size, hidden_size, bias = False)

        # Project encoder hidden states for the attention scores
        self.att_projection = nn.Linear(2 * hidden_size, hidden_size, bias = False)

        # Mix decoder hidden state and attention output into the combined output
        self.combined_output_projection = nn.Linear(3 * hidden_size, hidden_size, bias = False)

        # Scores over the target vocabulary
        self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias = False)

        self.dropout = nn.Dropout(p = dropout_rate)

        # Optional character-level decoder
        self.charDecoder = None if no_char_decoder else CharDecoder(hidden_size, target_vocab=vocab.tgt)

    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): the log-probabilities of the gold target words, with padding
                                  positions masked out, summed over the whole batch into a single
                                  scalar. When a character decoder is attached, the value returned
                                  by its `train_forward` is subtracted from that sum.
        """
        # Compute sentence lengths before applying padding
        source_lengths = [len(s) for s in source]

        # Convert list of lists into padded word-index tensors
        source_padded = self.vocab.src.to_input_tensor(source, device = self.device) # (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(target, device = self.device) # (tgt_len, b)

        enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded)

        # (tgt_len - 1, b, tgt_vocab_size): decode() consumes target_padded[:-1]
        P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1)

        # Zero out probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Log probability of the true target words; the gold word for step t is
        # target_padded[t + 1], hence the [1:] shift that skips the leading `<s>`
        target_gold_words_log_prob = torch.gather(P, index=target_padded[1:].unsqueeze(-1),
                                                  dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum() # mhahn2 Small modification from A4 code.

        if self.charDecoder is not None:
            # Character tensor is only needed (and only built) for the char decoder
            target_padded_chars = self.vocab.tgt.to_input_tensor_char(target, device = self.device) # (tgt_len, b, max_word_len)
            max_word_len = target_padded_chars.shape[2]

            # Drop the first position (`<s>`) and flatten (position, batch) into one axis
            target_chars = target_padded_chars[1:].contiguous().view(-1, max_word_len)
            target_outputs = combined_outputs.view(-1, self.hidden_size)

            # Each combined output serves as both the initial hidden and cell
            # state of the character decoder for the word at that position
            oovs_losses = self.charDecoder.train_forward(
                target_chars.t(),
                (target_outputs.unsqueeze(0), target_outputs.unsqueeze(0)))
            scores = scores - oovs_losses

        return scores

    def encode(self, source_padded: torch.Tensor, source_lengths: List[int]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.
        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b, max_word_length), where
                                        b = batch_size, src_len = maximum source sentence length. Note that
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                        b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                hidden state and cell.
        """
        enc_hiddens, dec_init_state = None, None

        # Look up word embeddings from an embedding matrix
        X = self.model_embeddings_source.source(source_padded)
        # Because RNN module takes PackedSequence as input
        # https://www.kdnuggets.com/2018/06/taming-lstms-variable-sized-mini-batches-pytorch.html
        packed_X = pack_padded_sequence(X, lengths = source_lengths)
        enc_hiddens, (last_hidden, last_cell) = self.encoder(packed_X)

        # Inverse operation of pack_padded_sequence
        # Returned value: (padded sentence: tuple of Tensor, list of lengths of each sentence in the batch: Tensor)
        padded_enc_hiddens, _ = pad_packed_sequence(sequence = enc_hiddens, batch_first = True)
        # If input of encoder (nn.LSTM) is PackedSequence object, the output must be unpacked
        enc_hiddens = padded_enc_hiddens

        forwards, backwards = last_hidden[0][:], last_hidden[1][:]
        concatenated_last_hidden = torch.cat(tensors = (forwards, backwards), dim = 1)
        init_decoder_hidden = self.h_projection(concatenated_last_hidden)

        forwards, backwards = last_cell[0][:], last_cell[1][:]
        concatenated_last_cell = torch.cat(tensors = (forwards, backwards), dim = 1)
        init_decoder_cell = self.c_projection(concatenated_last_cell)

        dec_init_state = (init_decoder_hidden, init_decoder_cell)

        return enc_hiddens, dec_init_state

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor, dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor:
        """Run the decoder over the gold target words and collect the combined output of every step.
        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target word indices, sequence-first
                                       (first dimension tgt_len = maximum target sentence length).
        @returns combined_outputs (Tensor): combined outputs stacked along a new first dimension,
                                        one entry per decoding step (tgt_len - 1 steps), each of
                                        shape (b, h), where b = batch_size, h = hidden size
        """
        # The final position is never fed to the decoder: drop it
        # (for maximum-length sentences this is the <END> token)
        decoder_inputs = target_padded[:-1]

        # The previous combined output o_{t-1} starts out as zeros
        o_prev = torch.zeros(enc_hiddens.size(0), self.hidden_size, device=self.device)

        # Attention projection of the encoder states is shared by all steps
        enc_hiddens_proj = self.att_projection(enc_hiddens)

        # Target word embeddings for every decoding step
        Y = self.model_embeddings_target(decoder_inputs)

        dec_state = dec_init_state
        step_outputs = []
        for Y_t in Y.split(1, dim = 0):
            # Each chunk has a leading dimension of size 1: drop it, then
            # append the previous combined output to form the decoder input
            Ybar_t = torch.cat((Y_t.squeeze(0), o_prev), dim = 1)
            dec_state, o_prev, _ = self.step(Ybar_t, dec_state, enc_hiddens,
                                             enc_hiddens_proj, enc_masks)
            step_outputs.append(o_prev)

        return torch.stack(step_outputs)

    def step(self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor, torch.Tensor], enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor, enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        The padding mask is converted to a boolean tensor when the installed torch provides
        `Tensor.bool()` (newer releases reject uint8 masks in `masked_fill_`); on older
        releases such as 1.0.0 the uint8 mask is used unchanged.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len), where b = batch size,
                                    src_len is maximum source length. Non-zero entries mark positions to exclude
                                    from attention. May be None, in which case no position is masked.
        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len) holding the raw attention scores; masked positions are -inf.
                                Returned mainly so callers can sanity check the attention computation.
        """
        # Advance the decoder cell; dec_state is now (new hidden, new cell)
        dec_state = self.decoder(Ybar_t, dec_state)
        dec_hidden, _ = dec_state

        # Attention scores: (b, src_len, h) x (b, h, 1) -> (b, src_len)
        e_t = torch.bmm(enc_hiddens_proj, dec_hidden.unsqueeze(2)).squeeze(2)

        # Set e_t to -inf wherever enc_masks is non-zero so softmax gives those positions zero weight
        if enc_masks is not None:
            pad_mask = enc_masks.byte()
            if hasattr(pad_mask, 'bool'):
                # Tensor.bool() does not exist on torch 1.0.0; newer versions need a bool mask
                pad_mask = pad_mask.bool()
            e_t.data.masked_fill_(pad_mask, -float('inf'))

        # Attention distribution over source positions
        alpha_t = F.softmax(e_t, dim=1)

        # Attention output: (b, 1, src_len) x (b, src_len, 2h) -> (b, 2h)
        a_t = torch.bmm(alpha_t.unsqueeze(1), enc_hiddens).squeeze(1)

        # Combine decoder hidden state with the attention output, project, squash and regularise
        U_t = torch.cat((dec_hidden, a_t), dim=1)
        V_t = self.combined_output_projection(U_t)
        combined_output = self.dropout(torch.tanh(V_t))

        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor:
        """ Build the padding masks that accompany the encoder hidden states.

        Each row starts as zeros; every position at or beyond that sentence's true
        length is set to 1. The mask is assembled on the CPU and then moved to the
        model's device.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size.
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.

        @returns enc_masks (Tensor): float Tensor of sentence masks of shape (b, src_len),
                                    where b = batch size, src_len = max source length.
        """
        n_rows = enc_hiddens.size(0)
        n_cols = enc_hiddens.size(1)
        pad_flags = torch.zeros(n_rows, n_cols, dtype=torch.float)

        # Everything past the real sentence end is padding and gets flagged with 1
        for row, true_len in enumerate(source_lengths):
            pad_flags[row, true_len:] = 1

        return pad_flags.to(self.device)

    def beam_search(self, src_sent: List[str], beam_size: int=5,
                    max_decoding_time_step: int=70) -> List[Hypothesis]:
        """ Given a single source sentence, perform beam search, yielding translations in the target language.

        Top-k candidate positions index a flattened (hypothesis, target word) score table. They are
        split back into a hypothesis index and a word id with `divmod` on plain Python ints, so the
        indices stay integers regardless of how the installed torch divides integer tensors.

        When the word-level decoder emits `<unk>`, the word is temporarily tagged `<unk>N` and, if a
        character decoder is available, replaced by the word it greedily decodes from the combined
        output vector of that step.

        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a list of hypothesis, sorted by descending score; each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        tgt_vocab_size = len(self.vocab.tgt)

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            # Share the single source encoding across all live hypotheses (no copy)
            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))

            exp_src_encodings_att_linear = src_encodings_att_linear.expand(hyp_num,
                                                                           src_encodings_att_linear.size(1),
                                                                           src_encodings_att_linear.size(2))

            # Feed the last word of every live hypothesis as the next decoder input
            y_tm1 = self.vocab.tgt.to_input_tensor([[hyp[-1]] for hyp in hypotheses],
                                                   device=self.device)
            y_t_embed = self.model_embeddings_target(y_tm1)
            y_t_embed = torch.squeeze(y_t_embed, dim=0)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(x, h_tm1,
                                                exp_src_encodings, exp_src_encodings_att_linear,
                                                enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            continuing_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(continuing_hyp_scores, k=live_hyp_num)

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            decoderStatesForUNKsHere = []
            for cand_pos, cand_new_hyp_score in zip(top_cand_hyp_pos.tolist(), top_cand_hyp_scores.tolist()):
                # Flat position = hypothesis index * vocab size + word id
                prev_hyp_id, hyp_word_id = divmod(cand_pos, tgt_vocab_size)

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]

                # Record output layer in case UNK was generated
                if hyp_word == "<unk>":
                    hyp_word = "<unk>" + str(len(decoderStatesForUNKsHere))
                    decoderStatesForUNKsHere.append(att_t[prev_hyp_id])

                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    # Strip the leading <s> and trailing </s> from finished sentences
                    completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1],
                                                           score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(decoderStatesForUNKsHere) > 0 and self.charDecoder is not None: # decode UNKs
                decoderStatesForUNKsHere = torch.stack(decoderStatesForUNKsHere, dim=0)
                # The same vector seeds both the hidden and the cell state of the character decoder
                decodedWords = self.charDecoder.decode_greedy((decoderStatesForUNKsHere.unsqueeze(0), decoderStatesForUNKsHere.unsqueeze(0)), max_length=21, device=self.device)
                assert len(decodedWords) == decoderStatesForUNKsHere.size()[0], "Incorrect number of decoded words"
                for hyp in new_hypotheses:
                    if hyp[-1].startswith("<unk>"):
                        # The digits after "<unk>" index into the decoded words
                        hyp[-1] = decodedWords[int(hyp[-1][5:])]

            if len(completed_hypotheses) == beam_size:
                break

            # Keep only the decoder states belonging to hypotheses that are still alive
            live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)

        # Nothing finished within the step budget: fall back to the first live hypothesis
        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:],
                                                   score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)
        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ Report the device (CPU or GPU) on which the attention projection weights live.

        Used throughout the model as the device on which new Tensors are placed.
        """
        projection_weight = self.att_projection.weight
        return projection_weight.device

    @staticmethod
    def load(model_path: str, no_char_decoder=False):
        """ Rebuild an NMT model from a checkpoint file.

        The checkpoint must contain the keys 'args', 'vocab' and 'state_dict',
        i.e. the layout written by `save`.

        @param model_path (str): path of the checkpoint file
        @param no_char_decoder (bool): forwarded unchanged to the NMT constructor
        @returns model (NMT): model with the stored parameters loaded
        """
        # The identity map_location leaves every storage where deserialisation put it
        checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
        model_args = checkpoint['args']

        restored = NMT(vocab=checkpoint['vocab'], no_char_decoder=no_char_decoder, **model_args)
        restored.load_state_dict(checkpoint['state_dict'])
        return restored

    def save(self, path: str):
        """ Write the model to a checkpoint file.

        The checkpoint bundles the constructor arguments (embed size, hidden size,
        dropout rate), the vocabulary and the parameter state dict.

        @param path (str): destination of the checkpoint
        """
        print('save model parameters to [%s]' % path)

        constructor_args = {
            'embed_size': self.model_embeddings_source.embed_size,
            'hidden_size': self.hidden_size,
            'dropout_rate': self.dropout_rate,
        }
        checkpoint = {
            'args': constructor_args,
            'vocab': self.vocab,
            'state_dict': self.state_dict(),
        }

        torch.save(checkpoint, path)


# char_decoder.py

In [0]:
from typing import Tuple
import torch
import torch.nn as nn

In [0]:
class CharDecoder(nn.Module):
    """ Character-level LSTM decoder that spells out a word one character at a time,
    starting from an initial (hidden, cell) state supplied by the caller.
    """

    def __init__(self, hidden_size, char_embedding_size = 50, target_vocab = None):
        """ Init Character Decoder.

        @param hidden_size (int): Hidden size of the decoder LSTM
        @param char_embedding_size (int): dimensionality of character embeddings
        @param target_vocab (VocabEntry): vocabulary for the target language. See vocab.py for documentation.
                                          Required in practice: its `char2id` table is read here.
        """
        super(CharDecoder, self).__init__()

        self.hidden_size = hidden_size
        self.tgt_vocab = target_vocab
        self.vocab_size = len(target_vocab.char2id)

        # Unidirectional LSTM
        self.charDecoder = nn.LSTM(char_embedding_size, self.hidden_size)
        # W_dec and b_dec to compute final value before applying softmax
        self.char_output_projection = nn.Linear(self.hidden_size, self.vocab_size, bias=True)
        # Char embeddings; the <pad> row is registered as the padding index
        tgt_pad_token_idx = target_vocab.char2id['<pad>']
        self.decoderCharEmb = nn.Embedding(self.vocab_size, char_embedding_size,
                                           padding_idx=tgt_pad_token_idx)

    def forward(self, input: torch.Tensor, dec_hidden = None):
        """ Forward pass of character decoder.

        @param input: tensor of integer indices, shape (seq_length, batch)
        @param dec_hidden: internal state of the LSTM before reading the input characters. A tuple of two tensors of shape (1, batch, hidden_size)

        @returns scores: called s_t in the PDF, shape (seq_length, batch, self.vocab_size)
        @returns dec_hidden: internal state of the LSTM after reading the input characters. A tuple of two tensors of shape (1, batch, hidden_size)
        """
        chr_embeddings = self.decoderCharEmb(input)  # (seq_length, batch, char_embed_size)
        hidden_states, (last_hidden, last_cell) = self.charDecoder(chr_embeddings, dec_hidden)
        scores = self.char_output_projection(hidden_states)  # (seq_length, batch, vocab_size)

        return scores, (last_hidden, last_cell)

    def train_forward(self, char_sequence, dec_hidden = None):
        """ Forward computation during training.

        The decoder reads every position except the last and is scored on predicting
        every position except the first, i.e. each character predicts its successor.

        @param char_sequence: tensor of integers, shape (length, batch). Note that "length" here and in forward() need not be the same.
        @param dec_hidden: initial internal state of the LSTM, obtained from the output of the word-level decoder.
        A tuple of two tensors of shape (1, batch, hidden_size)

        @returns The cross-entropy loss, computed as the *sum* of cross-entropy losses of all the words in the batch.
        """
        # Summed loss; <pad> targets contribute nothing
        loss_fn = nn.CrossEntropyLoss(ignore_index=self.tgt_vocab.char2id['<pad>'],
                                      reduction='sum')

        # Inputs drop the final position, targets drop the first one
        scores, _ = self.forward(char_sequence[:-1], dec_hidden)

        # CrossEntropyLoss expects (batch, classes, positions) scores and (batch, positions) targets
        logits = scores.permute(1, 2, 0)
        targets = char_sequence[1:].t()

        return loss_fn(logits, targets)

    def decode_greedy(self, initial_states: Tuple[torch.Tensor, torch.Tensor], device: torch.device, max_length=21):
        """ Greedy decoding

        Always unrolls `max_length` steps for the whole batch, then cuts every word at its
        first end-of-word character.

        @param initial_states: initial internal state of the LSTM, a tuple of two tensors of size (1, batch, hidden_size)
        @param device: torch.device (indicates whether the model is on CPU or GPU)
        @param max_length: maximum length of words to decode; values <= 0 yield empty strings

        @returns decoded_words: a list (of length batch) of strings, each of which has length <= max_length.
                              The decoded strings do NOT contain the start-of-word and end-of-word characters.
        """
        start_idx = self.tgt_vocab.start_of_word
        end_idx = self.tgt_vocab.end_of_word
        id2char = self.tgt_vocab.id2char

        dec_hiddens = initial_states
        batch_size = dec_hiddens[0].shape[1]

        # Nothing to decode: avoids concatenating an empty list (and a never-ending loop for negatives)
        if max_length <= 0:
            return [''] * batch_size

        current_char = torch.tensor([[start_idx] * batch_size], device=device)

        step_outputs = []
        for _ in range(max_length):
            scores, dec_hiddens = self.forward(current_char, dec_hiddens)
            # Most likely character per batch element, shape (1, batch); fed back in as next input
            current_char = scores.argmax(-1)
            step_outputs.append(current_char)

        # (max_length, batch) -> one row of character ids per word
        char_id_rows = torch.cat(step_outputs).t().tolist()

        decoded_words = []
        for char_ids in char_id_rows:
            chars = []
            for char_idx in char_ids:
                if char_idx == end_idx:
                    break
                chars.append(id2char[char_idx])
            decoded_words.append(''.join(chars))

        return decoded_words

# run.py

In [0]:
import math
import sys
import pickle
import time

from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
import numpy as np
from typing import List, Tuple, Dict, Set, Union
from tqdm import tqdm

import torch
import torch.nn.utils

from collections import namedtuple
# One decoded translation: `value` is the list of output words, `score` its log-likelihood
Hypothesis = namedtuple('Hypothesis', ['value', 'score'])

In [0]:
def evaluate_ppl(model, dev_data, batch_size=32):
    """ Compute the perplexity of the model on the dev sentences.

    The model is switched to eval mode for the measurement and put back into
    train mode afterwards if that is how it arrived.

    @param model (NMT): NMT Model
    @param dev_data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
    @param batch_size (int): number of sentence pairs per evaluation batch
    @returns ppl (float): exp of the summed negative log-likelihood divided by the number of target words to predict
    """
    restore_train_mode = model.training
    model.eval()

    total_nll = 0.
    total_tgt_words = 0.

    # Gradients are not needed for evaluation
    with torch.no_grad():
        for src_batch, tgt_batch in batch_iter(dev_data, batch_size):
            # The model returns per-sentence log-likelihoods; negate and sum them
            batch_nll = -model(src_batch, tgt_batch).sum()
            total_nll += batch_nll.item()

            # The leading `<s>` is never predicted, so it is not counted
            total_tgt_words += sum(len(sent[1:]) for sent in tgt_batch)

        ppl = np.exp(total_nll / total_tgt_words)

    if restore_train_mode:
        model.train()

    return ppl

In [0]:
def compute_corpus_level_bleu_score(references: List[List[str]], hypotheses: List[Hypothesis]) -> float:
    """ Score decoding results against reference sentences with corpus-level BLEU.

    If the first reference starts with '<s>', the first and last token of every
    reference are dropped before scoring. No smoothing function is applied.

    @param references (List[List[str]]): a list of gold-standard reference target sentences
    @param hypotheses (List[Hypothesis]): a list of hypotheses, one for each reference
    @returns bleu_score: corpus-level BLEU score
    """
    has_sentence_markers = references[0][0] == '<s>'
    if has_sentence_markers:
        references = [ref[1:-1] for ref in references]

    # corpus_bleu takes a list of reference *sets*; here each set holds one reference
    reference_sets = [[ref] for ref in references]
    candidates = [hyp.value for hyp in hypotheses]

    return corpus_bleu(reference_sets, candidates)

In [0]:
def train():
    """ Train the NMT Model.

    Every setting is read from the module-level `args` dictionary (paths, batch size,
    model sizes, learning-rate schedule, ...).

    Schedule:
      * every `--log-every` iterations a progress line is written to stderr;
      * every `--valid-niter` iterations dev perplexity is computed. An improvement saves the
        model and the optimizer state; otherwise the patience counter grows. When it reaches
        `--patience`, the best checkpoint is reloaded and the learning rate is multiplied by
        `--lr-decay`; after `--max-num-trial` such trials training stops early;
      * training also stops once `--max-epoch` full passes over the training data are done.

    Both stop conditions return from the function. The epoch limit is checked after the
    last batch of an epoch, so the final epoch is trained completely.
    """
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = args['--batch-size']

    clip_grad = args['--clip-grad']
    valid_niter = args['--valid-niter']
    log_every = args['--log-every']
    model_save_path = args['--save-to']

    vocab = Vocab.load(args['--vocab'])

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=vocab, no_char_decoder=args['--no-char-decoder'])
    model.train()

    # Optionally overwrite every parameter with uniform noise in [-u, +u]
    uniform_init = args['--uniform-init']
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init))
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr = args['--lr'])

    num_trial = 0
    # report_* counters cover the span since the last log line, cum_* since the last validation
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('Begin Maximum Likelihood training')
    print(80*'-+')

    while True:
        epoch += 1
        for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(src_sents)

            example_losses = -model(src_sents, tgt_sents) # (batch_size,)
            batch_loss = example_losses.sum()
            # Optimise the mean loss per sentence, but report the summed loss
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                         report_loss / report_examples,
                                                                                         math.exp(report_loss / report_tgt_words),
                                                                                         cum_examples,
                                                                                         report_tgt_words / (time.time() - train_time),
                                                                                         time.time() - begin_time), file=sys.stderr)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter,
                                                                                         cum_loss / cum_examples,
                                                                                         np.exp(cum_loss / cum_tgt_words),
                                                                                         cum_examples), file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...')

                # compute dev. ppl; lower perplexity is better, so the metric is its negation
                dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)   # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl))

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' % model_save_path)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')
                elif patience < args['--patience']:
                    patience += 1
                    print('hit patience %d' % patience)

                    if patience == args['--patience']:
                        num_trial += 1
                        print('hit #%d trials' % num_trial)
                        if num_trial == args['--max-num-trial']:
                            print('early stop!')
                            # Return rather than exit(): the caller (e.g. a notebook kernel) stays alive
                            return

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * args['--lr-decay']
                        print('load previously best model and decay learning rate to %f' % lr)

                        # load model
                        params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers')
                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                        # set new lr (the restored optimizer state carries the old one)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

        # Checked once per finished epoch, so the last epoch is trained in full and an
        # empty training set cannot spin forever.
        if epoch == args['--max-epoch']:
            print('reached maximum number of epochs!', file=sys.stderr)
            return

In [0]:
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.

    The top hypothesis of every source sentence is written to `args['OUTPUT_FILE']`,
    one sentence per line, encoded as UTF-8 regardless of the platform default.

    @param args (Dict): settings; reads 'TEST_SOURCE_FILE', 'TEST_TARGET_FILE', 'MODEL_PATH',
                        'OUTPUT_FILE', '--no-char-decoder', '--cuda', '--beam-size' and
                        '--max-decoding-time-step'
    """
    # Load test data, source sentences and/or target sentences
    print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']), file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']), file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    # Load model
    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'], no_char_decoder=args['--no-char-decoder'])

    # Move the model to the GPU if requested
    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    # One list of hypotheses per source sentence, found with beam search
    hypotheses = beam_search(model, test_data_src,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(args['--max-decoding-time-step']))
    print('hypotheses: ', hypotheses)

    # With gold targets available, score the best hypothesis of every sentence
    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    # Write translated sentences to file; explicit UTF-8 because translations are not ASCII-only
    with open(args['OUTPUT_FILE'], 'w', encoding='utf-8') as f:
        for hyps in hypotheses:
            top_hyp = hyps[0]
            f.write(' '.join(top_hyp.value) + '\n')

In [0]:
def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int, max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Translate a list of source sentences, one at a time, with the model's beam search.

    The model runs in eval mode without gradient tracking and is switched back to
    train mode afterwards if it was training before the call.

    @param model (NMT): NMT Model
    @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    resume_training = model.training
    model.eval()

    with torch.no_grad():
        # tqdm renders the progress bar on stdout
        hypotheses = [
            model.beam_search(src_sent, beam_size=beam_size,
                              max_decoding_time_step=max_decoding_time_step)
            for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout)
        ]

    if resume_training:
        model.train(resume_training)

    return hypotheses

In [0]:
# Load the vocabulary file (the same path the training args use for '--vocab') and unpack
# the pair returned by get_word2id() for inspection in the next cells.
# NOTE(review): presumably `a` is the source and `b` the target word->id table — confirm in Vocab.
vocab = Vocab.load(root + 'Deep learning in NLP/Nom Translator/corpora/nom_vlatin_vocab.json')
a, b = vocab.get_word2id()

In [0]:
# Notebook display of the first value returned by vocab.get_word2id()
a

In [0]:
# Notebook display of the second value returned by vocab.get_word2id()
b

# Training

In [0]:
     # Seed the random number generators (torch CPU, torch CUDA, NumPy) before training.
     # NOTE(review): this cell is indented; that presumably runs inside a notebook cell but
     # would be an IndentationError at module level in a plain .py file — consider dedenting.
    seed = 103
    torch.manual_seed(seed)

    # Also seed the CUDA generator when a GPU run is intended
    cuda = True
    if cuda:
        torch.cuda.manual_seed(seed)
    # NumPy gets its own seed derived from the base value (103 * 13 // 7 == 191)
    np.random.seed(seed * 13 // 7)


In [0]:
# Settings read by train() through the module-level name `args`.
args = {
    # Parallel training / dev corpora (source = nom, target = vlatin)
    '--train-src': root + 'Deep learning in NLP/Nom Translator/corpora/1. train.nom',
    '--train-tgt': root + 'Deep learning in NLP/Nom Translator/corpora/2. train.vlatin',
    '--dev-src': root + 'Deep learning in NLP/Nom Translator/corpora/3. dev.nom',
    '--dev-tgt': root + 'Deep learning in NLP/Nom Translator/corpora/4. dev.vlatin',
    # Sentence pairs per training batch and the max gradient norm used for clipping
    '--batch-size': 32,
    '--clip-grad': 5.0,
    # Validate every N iterations / log progress every N iterations
    '--valid-niter': 100,
    '--log-every': 10,
    # Best checkpoint path; the optimizer state is stored next to it with a '.optim' suffix
    '--save-to': root + 'Deep learning in NLP/Nom Translator/models/nom_vlatin_model_13.bin',
    '--vocab': root + 'Deep learning in NLP/Nom Translator/corpora/nom_vlatin_vocab.json',
    # Model hyper-parameters passed to the NMT constructor
    '--embed-size': 1024,
    '--hidden-size': 512,
    '--dropout': .3,
    # Parameters are re-initialised uniformly in [-0.1, +0.1]; 0 would skip that step
    '--uniform-init': .1,
    '--cuda': True,
    # Adam learning rate
    '--lr': .001 ,
    '--max-epoch': 10,
    # Validations without improvement before the learning rate is decayed
    '--patience': 5,
    # Factor applied to the learning rate at each decay
    '--lr-decay': .5,
    # Number of decay trials after which training stops early
    '--max-num-trial': 5,
    '--no-char-decoder': False
}

In [0]:
# Run training with the settings from the `args` dictionary defined above
train()

# Inference

In [0]:
# Settings for decode(); this rebinds `args`, replacing the training settings.
# The model path matches the '--save-to' checkpoint written during training.
args = {'TEST_SOURCE_FILE': root + 'Deep learning in NLP/Nom Translator/corpora/5. test.nom',
        # Gold targets; when set, decode() also reports corpus BLEU
        'TEST_TARGET_FILE': root + 'Deep learning in NLP/Nom Translator/corpora/6. test.vlatin',
        'MODEL_PATH': root + 'Deep learning in NLP/Nom Translator/models/nom_vlatin_model_13.bin',
        '--no-char-decoder': False,
        '--cuda': True,
        '--beam-size': 5,
        '--max-decoding-time-step': 70,
        # Best hypothesis per source sentence is written here, one per line
        'OUTPUT_FILE': root + 'Deep learning in NLP/Nom Translator/results/test_outputs_10.txt'}

In [0]:
# Translate the test set with the saved model and write the outputs
decode(args)