# HW 3: Neural Machine Translation

In this homework you will build a full neural machine translation system using an attention-based encoder-decoder network to translate from German to English. The encoder-decoder network with attention forms the backbone of many current text generation systems. See [Neural Machine Translation and Sequence-to-sequence Models: A Tutorial](https://arxiv.org/pdf/1703.01619.pdf) for an excellent tutorial that also contains many modern advances.

## Goals


1. Build a non-attentional baseline model (pure seq2seq as in [ref](https://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf)). 
2. Incorporate attention into the baseline model ([ref](https://arxiv.org/abs/1409.0473) but with dot-product attention as in class notes).
3. Implement beam search: review/tutorial [here](http://www.phontron.com/slides/nlp-programming-en-13-search.pdf)
4. Visualize the attention distribution for a few examples. 

Consult the papers provided for hyperparameters, and the course notes for formal definitions.

This will be the most demanding assignment in terms of both difficulty and training time, so we recommend that you get started early!

In [2]:
import torch
from namedtensor import ntorch, NamedTensor
import numpy as np
import random

In [6]:
from load_data import DataLoader
#from models import LSTMTranslator, AttentionTranslator

# Build the project-local data loader, passing 'cpu' as its only argument
# (presumably the target device -- confirm against load_data.DataLoader).
loader = DataLoader('cpu')
# get_iters() yields four objects: the training and validation batch iterators,
# and the German (DE) / English (EN) objects whose `.vocab` sizes the embeddings.
train_iter, val_iter, DE, EN = loader.get_iters()

Loading data...
building vocab...
initializing iterators...


In [7]:
class LSTMEncoder(ntorch.nn.Module):
    """LSTM encoder over the source sentence.

    Args:
        DE: Source-language object exposing ``vocab`` (its length sizes the
            embedding table).
        emb_dim: Source embedding size.
        hid_dim: LSTM hidden size per direction.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability for the embeddings and between LSTM layers.
        attention: When true the LSTM is bidirectional and ``forward`` returns
            every per-position output; when false the source is reversed and
            only the final recurrent state is returned.
    """

    def __init__(self, DE, emb_dim, hid_dim, n_layers, dropout, attention):
        super().__init__()
        self.attention = attention
        self.embedding = ntorch.nn.Embedding(len(DE.vocab), emb_dim).spec('srcSeqlen','embedding')
        self.rnn = ntorch.nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout, bidirectional = self.attention).spec("embedding", "srcSeqlen", "lstm")
        self.dropout = ntorch.nn.Dropout(dropout)

    def forward(self, src):
        """Encode ``src``.

        Returns:
            ``{'src': outputs}`` when attention is enabled, otherwise the
            recurrent state returned by the LSTM.
        """
        if not self.attention:
            # Feed the source in reverse order. A plain slice(-1, 0) cannot
            # express a reversal, so flip the raw tensor along the srcSeqlen
            # axis and re-wrap it with the same dimension names.
            # NOTE(review): if batches are right-padded, the padding now comes
            # first -- confirm that is acceptable for this data pipeline.
            names = tuple(src.shape.keys())
            src = NamedTensor(src.values.flip(names.index('srcSeqlen')), names)

        # run net
        x = self.embedding(src)
        x = self.dropout(x)
        outputs, hidden = self.rnn(x)
        if self.attention:
            return {'src':outputs}
        else:
            return hidden

class LSTMDecoder(ntorch.nn.Module):
    """Plain (non-attentional) LSTM decoder that emits vocabulary logits.

    Args:
        EN: Target-language object exposing ``vocab``.
        emb_dim: Target embedding size.
        hid_dim: LSTM hidden size.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability for the embeddings and between LSTM layers.
    """

    def __init__(self, EN, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        vocab_size = len(EN.vocab)
        self.embedding = ntorch.nn.Embedding(vocab_size, emb_dim).spec('trgSeqlen', 'embedding')
        self.rnn = ntorch.nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout).spec("embedding", "trgSeqlen", "lstm")
        self.out = ntorch.nn.Linear(hid_dim, vocab_size).spec("lstm", "logit")
        self.dropout = ntorch.nn.Dropout(dropout)

    def forward(self, trg, hidden):
        """Run the decoder over ``trg`` starting from the recurrent state ``hidden``.

        Returns:
            A ``(logits, new_hidden)`` pair.
        """
        embedded = self.dropout(self.embedding(trg))
        states, hidden = self.rnn(embedded, hidden)
        return self.out(states), hidden

class AttentionDecoder(ntorch.nn.Module):
    """LSTM decoder with dot-product attention over the encoder outputs.

    Args:
        EN: Target-language object exposing ``vocab``.
        emb_dim: Target embedding size.
        hid_dim: LSTM hidden size; it is contracted against the ``lstm``
            dimension of the encoder outputs, so the two must agree.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability for the embeddings and between LSTM layers.
    """

    def __init__(self, EN, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        vocab_size = len(EN.vocab)
        self.embedding = ntorch.nn.Embedding(vocab_size, emb_dim).spec('trgSeqlen', 'embedding')
        self.rnn = ntorch.nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout).spec("embedding", "trgSeqlen", "lstm")
        # The projection consumes the context vector concatenated with the
        # decoder state, hence twice the hidden size.
        self.out = ntorch.nn.Linear(2 * hid_dim, vocab_size).spec("lstm", "logit")
        self.dropout = ntorch.nn.Dropout(dropout)

    def forward(self, trg, hidden):
        """Decode ``trg`` while attending over ``hidden['src']``.

        Args:
            trg: Target tokens to feed to the decoder.
            hidden: Dict holding the encoder outputs under ``'src'`` and,
                after the first call, the decoder LSTM state under
                ``'rnn_state'``.

        Returns:
            A ``(logits, new_hidden)`` pair; ``new_hidden`` has the same dict
            layout with the updated LSTM state.
        """
        encoder_outputs = hidden['src']
        prev_state = hidden.get('rnn_state')

        embedded = self.dropout(self.embedding(trg))
        # On the first step there is no decoder state yet, so let the LSTM
        # fall back to its default initial state.
        rnn_inputs = (embedded,) if prev_state is None else (embedded, prev_state)
        states, next_state = self.rnn(*rnn_inputs)

        # Dot-product attention: score each source position, normalise over
        # the source axis, then take the weighted sum of encoder outputs.
        scores = states.dot('lstm', encoder_outputs)
        weights = scores.softmax('srcSeqlen')
        context = weights.dot('srcSeqlen', encoder_outputs)

        logits = self.out(ntorch.cat([context, states], dim = 'lstm'))
        return logits, {'src': encoder_outputs, 'rnn_state': next_state}

class Translator(ntorch.nn.Module):
    """Base encoder-decoder translator with a built-in training loop.

    Subclasses are expected to set ``self.encoder`` and ``self.decoder``.

    Args:
        teacher_forcing: Default probability, per decoding step, of feeding the
            gold target token instead of the model's previous prediction.
        device: Device the model is moved to at the start of ``fit``.
    """

    def __init__(self, teacher_forcing, device):
        super().__init__()
        self.teacher_forcing = teacher_forcing
        self.device = device
        # Start at +inf so the first early-stopping comparison in fit() is
        # well defined before any validation pass has run.
        self.val_loss = float("inf")

    def forward(self, src, trg, teacher_forcing_ratio=None):
        """Decode ``trg`` step by step and return the per-step logits.

        Args:
            src: Source batch passed to the encoder.
            trg: Target batch; position 0 seeds the decoder and positions
                ``1..`` are the tokens being predicted.
            teacher_forcing_ratio: Optional override for the teacher-forcing
                probability; ``None`` uses ``self.teacher_forcing``.

        Returns:
            Logits for target positions ``1..`` concatenated along ``trgSeqlen``.
        """
        if teacher_forcing_ratio is None:
            teacher_forcing_ratio = self.teacher_forcing

        # get src encoding
        hidden = self.encoder(src)

        # output_tokens[t] is the token fed at step t when not teacher forcing;
        # it is seeded with the first target token.
        output_tokens = [trg[{'trgSeqlen':slice(0,1)}]]
        output_distributions = []

        for t in range(trg.shape['trgSeqlen']-1):
            # pick the decoder input: gold token or the previous prediction
            if random.random() < teacher_forcing_ratio:
                inp = trg[{'trgSeqlen':slice(t,t+1)}]
            else:
                inp = output_tokens[t]
            out, hidden = self.decoder(inp, hidden)

            # store the logits and the greedy prediction for the next step
            output_distributions.append(out)
            _, top1 = out.max("logit")
            output_tokens.append(top1)

        return ntorch.cat(output_distributions, dim = 'trgSeqlen')

    def _batch_loss(self, criterion, src, trg, teacher_forcing_ratio=None):
        """Return the loss of one batch against target positions ``1..``."""
        out = self(src, trg, teacher_forcing_ratio=teacher_forcing_ratio)
        gold = trg[{"trgSeqlen":slice(1,trg.shape["trgSeqlen"])}]
        return criterion(
            out.transpose("batch", "logit", "trgSeqlen").values,
            gold.transpose("batch", "trgSeqlen").values,
        )

    def _validate(self, criterion, val_iter):
        """Return the mean validation loss, or ``None`` if there are no batches."""
        total, count = 0.0, 0
        self.eval()
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for data in val_iter:
                # Decode without teacher forcing during validation.
                loss = self._batch_loss(criterion, data.src, data.trg, teacher_forcing_ratio=0)
                total += loss.item()
                count += 1
        return total / count if count else None

    def fit(self, train_iter, val_iter=None, lr=1e-2, verbose=True,
            batch_size=128, epochs=10, interval=1, early_stopping=False):
        """Train with Adam and cross-entropy, validating after each epoch.

        Args:
            train_iter: Iterable of batches exposing ``src`` and ``trg``; its
                ``batch_size`` attribute is overwritten with ``batch_size``.
            val_iter: Optional iterable of validation batches. When it is
                ``None`` or yields nothing, validation is skipped.
            lr: Initial learning rate; multiplied by 0.8 after every epoch.
            verbose: Print running training loss and validation loss/perplexity.
            batch_size: Batch size assigned to ``train_iter``.
            epochs: Maximum number of passes over ``train_iter``.
            interval: Report the mean training loss every ``interval`` batches.
            early_stopping: Stop as soon as the validation loss gets worse
                than the previous epoch's.
        """
        self.to(self.device)
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=float(lr))
        train_iter.batch_size = batch_size

        for epoch in range(epochs):  # loop over the dataset multiple times
            self.train()
            running_loss = 0.0
            for i, data in enumerate(train_iter):
                optimizer.zero_grad()
                loss = self._batch_loss(criterion, data.src, data.trg)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()

                # print statistics every `interval` mini-batches
                if i % interval == interval - 1:
                    if verbose:
                        print(f"[epoch: {epoch + 1}, batch: {i + 1}] loss: {running_loss / interval}")
                    running_loss = 0.0

            if val_iter is not None:
                mean_val_loss = self._validate(criterion, val_iter)
                if mean_val_loss is not None:
                    prev_loss = self.val_loss
                    self.val_loss = mean_val_loss
                    if verbose:
                        print(f'Val loss: {self.val_loss}, PPL: {np.exp(self.val_loss)}')
                    if self.val_loss > prev_loss and early_stopping:
                        break

            # Decay the learning rate explicitly on the optimizer rather than
            # relying on it aliasing a tensor that is mutated in place.
            for group in optimizer.param_groups:
                group['lr'] *= .8

class LSTMTranslator(Translator):
    """Non-attentional seq2seq translator.

    Pairs an ``LSTMEncoder`` built with attention disabled with an
    ``LSTMDecoder``; both share ``hid_dim``, ``n_layers`` and ``dropout``.
    """

    def __init__(
        self,
        DE,
        EN,
        src_emb_dim,
        trg_emb_dim,
        hid_dim,
        n_layers = 4,
        dropout = 0.5,
        teacher_forcing = 0.75,
        device = 'cpu',
    ):
        super().__init__(teacher_forcing, device)
        shared = (hid_dim, n_layers, dropout)
        self.encoder = LSTMEncoder(DE, src_emb_dim, *shared, False)
        self.decoder = LSTMDecoder(EN, trg_emb_dim, *shared)

class AttentionTranslator(Translator):
    """Attention-based translator.

    Pairs an ``LSTMEncoder`` built with attention enabled with an
    ``AttentionDecoder`` whose hidden size is twice ``hid_dim``.
    """

    def __init__(
        self,
        DE,
        EN,
        src_emb_dim,
        trg_emb_dim,
        hid_dim,
        n_layers = 4,
        dropout = 0.5,
        teacher_forcing = 0.75,
        device = 'cpu',
    ):
        super().__init__(teacher_forcing, device)
        # The decoder is given double the encoder's hidden size.
        decoder_hid_dim = hid_dim*2
        self.encoder = LSTMEncoder(DE, src_emb_dim, hid_dim, n_layers, dropout, True)
        self.decoder = AttentionDecoder(EN, trg_emb_dim, decoder_hid_dim, n_layers, dropout)

In [8]:
# Pull a single training batch to exercise the modules below by hand.
batch = next(iter(train_iter))

In [9]:
# Smoke test: non-attentional encoder (emb 300, hidden 200, 4 layers,
# dropout 0.5); with attention off it returns the LSTM's recurrent state.
enc = LSTMEncoder(DE, 300, 200, 4, 0.5, False)
hidden = enc(batch.src)

In [10]:
# Smoke test: plain decoder (emb 400, hidden 200, 4 layers, dropout 0.5)
# fed the first target position and the encoder state from the previous cell.
dec = LSTMDecoder(EN, 400, 200, 4, 0.5)
out, hidden = dec(batch.trg[{'trgSeqlen':slice(0,1)}], hidden)

In [11]:
# Smoke test: attention-mode encoder (bidirectional LSTM); it returns a dict
# holding the per-position outputs under 'src'.
enc = LSTMEncoder(DE, 300, 200, 4, 0.5, True)
hidden = enc(batch.src)

In [12]:
# Smoke test: attention decoder (emb 400, hidden 400, 4 layers, dropout 0.5)
# fed the first target position and the encoder outputs dict.
dec = AttentionDecoder(EN, 400, 400, 4, 0.5)
# The decoder returns (logits, state); unpack both so `out` holds the logits,
# mirroring the plain LSTMDecoder cell.
out, hidden = dec(batch.trg[{'trgSeqlen':slice(0,1)}], hidden)

In [13]:
# Train the non-attentional baseline: source/target embeddings of size 300,
# hidden size 200, remaining hyperparameters left at the class defaults.
model = LSTMTranslator(DE, EN, 300, 300, 200)
model.fit(train_iter,val_iter)

[epoch: 1, batch: 1] loss: 9.359054565429688
[epoch: 1, batch: 2] loss: 8.621251106262207
[epoch: 1, batch: 3] loss: 6.06719446182251
[epoch: 1, batch: 4] loss: 5.063589572906494
[epoch: 1, batch: 5] loss: 4.440306663513184
[epoch: 1, batch: 6] loss: 4.90449333190918
[epoch: 1, batch: 7] loss: 4.6908135414123535
[epoch: 1, batch: 8] loss: 5.064265251159668
[epoch: 1, batch: 9] loss: 4.687571048736572


KeyboardInterrupt: 

In [14]:
# Train the attention model with the same embedding and hidden sizes as the
# baseline, remaining hyperparameters left at the class defaults.
model = AttentionTranslator(DE, EN, 300, 300, 200)
model.fit(train_iter,val_iter)

[epoch: 1, batch: 1] loss: 9.34449291229248
[epoch: 1, batch: 2] loss: 5.320554733276367


KeyboardInterrupt: 