In [23]:
import time
import numpy as np
from __future__ import print_function
import uuid
import subprocess


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [5]:
import numpy as np
import codecs
import nltk

In [6]:
from corpus import ParallelCorpus
from torch.utils.data import Dataset, DataLoader

In [11]:
# Define hyperparameters
num_epochs = 10
batch_size = 64
learning_rate = 4e-4
embedding_dim = 256
hidden_dim = embedding_dim*2
#hidden_dim = 2 * embedding_dim
max_allowed_sentence_len = 50
drop = 0.2
#force = 1
bidirectional = False
LSTM_instead = False
volatile = False
position_based = False
context = True
force = 0.5

name= str(uuid.uuid4())
print(name)
save_dir = "models/"
save_dir2 = "dicts/"

5861ada2-57c8-4611-a9e3-60d7fc438ea2


In [12]:
# CHANGE: CORRECT DATA LOCATIONS.

training_set = ParallelCorpus(
        source_path="./data/train/train_bpe5k.fr", target_path="./data/train/train_bpe5k.en",
        max_sentence_length=max_allowed_sentence_len
    )

validation_set = ParallelCorpus(
        source_path="./data/val/val_bpe5k.fr", target_path="./data/val/val_bpe5k.en",
        max_sentence_length=max_allowed_sentence_len, use_indices_from=training_set
    )

test_set = ParallelCorpus(
        source_path="./data/test/test_2017_flickr_bpe5k.fr", target_path="./data/test/test_2017_flickr_bpe5k.en",
        max_sentence_length=max_allowed_sentence_len, use_indices_from=training_set
    )


In [13]:
class Seq2Seq(nn.Module):
    """A Vanilla Sequence to Sequence (Seq2Seq) model with LSTMs.
    Ref: Sequence to Sequence Learning with Neural Nets
    https://arxiv.org/abs/1409.3215
    """

    def __init__(
        self, trg_emb_dim,
        trg_vocab_size, trg_hidden_dim,
        pad_token_trg, drop, context=True,
        LSTM_instead=False, bidirectional=False,
        nlayers_trg=1,
    ):
        """Initialize Seq2Seq Model."""
        super(Seq2Seq, self).__init__()
        self.trg_vocab_size = trg_vocab_size
        self.trg_emb_dim = trg_emb_dim
        self.trg_hidden_dim = trg_hidden_dim
        self.bidirectional = bidirectional
        self.nlayers_trg = nlayers_trg
        self.pad_token_trg = pad_token_trg
        self.attn_soft = nn.Softmax(dim=2)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(drop)
        self.LSTM_instead=LSTM_instead
        self.context = context

        # Word Embedding look-up table for the target language
        self.trg_embedding = nn.Embedding(
            self.trg_vocab_size,
            self.trg_emb_dim,
            self.pad_token_trg,
        )


        # Decoder GRU // LSTM
        if (not self.LSTM_instead):
        
            self.decoder = nn.GRU(
                self.trg_emb_dim,
                self.trg_hidden_dim,
                self.nlayers_trg,
                batch_first=True
            )
        else:
            self.decoder = nn.LSTM(
                self.trg_emb_dim,
                self.trg_hidden_dim,
                self.nlayers_trg,
                batch_first=True
            )
        
#         self.scaler = nn.Linear(
#             self.trg_hidden_dim,
#             self.trg_emb_dim*2,
#         )
        
#         self.scaler2 = nn.Linear(
#             self.trg_hidden_dim,
#             self.trg_emb_dim,
#         )
        
        # Projection layer from decoder hidden states to target language vocabulary
        
        if (not self.context):
            self.decoder2vocab = nn.Linear(trg_hidden_dim, trg_vocab_size)
        else:
            self.decoder2vocab = nn.Linear(trg_hidden_dim + trg_emb_dim*2, trg_vocab_size)

    def forward(self, encoder_out, h_t, input_trg, source_lengths, teacher ):
        trg_emb = self.trg_embedding(input_trg)
        trg_emb = self.dropout(trg_emb)
        
        h_t = h_t.unsqueeze(0).expand(self.nlayers_trg, h_t.size(0), h_t.size(1))
        if (self.LSTM_instead):
            h_t = (h_t,h_t)
        
        hidden = []
        outputs = []
        trg_in = trg_emb[:,0,:].unsqueeze(1)
        for i in range(input_trg.size(1)):
#             print( " ")
            if (teacher):
                trg_in = trg_emb[:,i,:].unsqueeze(1)
#             print(trg_in.size())
            trg_h, h_t = self.decoder(trg_in, h_t)
            hidden.append(h_t.squeeze())
#             print(h_t.size())

            if (self.context):
                context = self.attention(h_t.squeeze().unsqueeze(1),encoder_out,source_lengths, encoder_out.size(1))
                trg_h = torch.cat((trg_h,context),2)
            
            trg_h_reshape = trg_h.contiguous().view(
            trg_h.size(0) * trg_h.size(1), trg_h.size(2)
            )
            # Affine transformation of all decoder hidden states
            decoder2vocab = self.decoder2vocab(trg_h_reshape)
            # Reshape
            decoder2vocab = decoder2vocab.view(
                trg_h.size(0), trg_h.size(1), decoder2vocab.size(1)
            )

        
            
            outputs.append(decoder2vocab.squeeze())
            
            if (not teacher):
                _, word = torch.max(decoder2vocab,2)
                trg_in = self.trg_embedding(word)
            
        outputs = torch.stack(outputs,1)
        return outputs    
            
            
            
#             if (not teacher):
#                 trg_in = self.scaler2(decoder_out)
#                 trg_in = self.relu(trg_in)
            
        # hiddens = torch.stack(hidden,1)
        # trg_h = torch.stack(outputs,1)
#         print(trg_h.size())
#         print(hiddens.size())
        #if torch.cuda.is_available()
        #    trg_
        #print(asdfs)
        
            
        #hiddens = self.scaler(hiddens) 
        #hiddens = self.relu(hiddens)
#         if (self.context):
#             context = self.attention(hiddens,encoder_out,source_lengths, encoder_out.size(1))
#             print(hiddens.size())
#             print(encoder_out.size())

#             trg_h = torch.cat((trg_h,context),2)
        
        
#         # Initialize the decoder GRU with the last hidden state of the encoder and 
#         # run target inputs through the decoder.
        
#         # Merge batch and time dimensions to pass to a linear layer
#         trg_h_reshape = trg_h.contiguous().view(
#             trg_h.size(0) * trg_h.size(1), trg_h.size(2)
#         )
        
#         # Affine transformation of all decoder hidden states
#         decoder2vocab = self.decoder2vocab(trg_h_reshape)
        
#         # Reshape
#         decoder2vocab = decoder2vocab.view(
#             trg_h.size(0), trg_h.size(1), decoder2vocab.size(1)
#         )
    
    
    def attention(self, hidden_to_attn, encoder_outputs, source_lengths, max_len):
                
        
#         print(hidden_to_attn.size())
#         print(encoder_outputs.size())
        # repeat the lstm out in third dimension and the encoder outputs in second dimension so we can make a meshgrid
        # so we can do elementwise mul for all possible combinations of h_j and s_i
        h_j = encoder_outputs.unsqueeze(1).repeat(1,hidden_to_attn.size(1),1,1)
        s_i = hidden_to_attn.unsqueeze(2).repeat(1,1,encoder_outputs.size(1),1)
#         print(h_j.size())
#         print(s_i.size())
        # get the dot product between the two to get the energy
        # the unsqueezes are there to emulate transposing. so we can use matmul as torch.dot doesnt accept matrices
        energy = s_i.unsqueeze(3).matmul(h_j.unsqueeze(4)).squeeze(4)
        
#         # this is concat attention, its a different form then the ones we need
#         cat = torch.cat((s_i,h_j),3)
        
#         energy = self.attn_layer(cat)

        # reshaping the encoder outputs for later
        encoder_outputs = encoder_outputs.unsqueeze(1)
        encoder_outputs = encoder_outputs.repeat(1,energy.size(1),1,1)
    
        # apply softmax to the energys 
        allignment = self.attn_soft(energy)
        
        # create a mask like : [1,1,1,0,0,0] whos goal is to multiply the attentions of the pads with 0, rest with 1
        idxes = torch.arange(0,max_len).unsqueeze(0).long().cuda()
        #print(idxes.size())
        mask = Variable((idxes<source_lengths.unsqueeze(1)).float())
        
        # format the mask to be same size() as the attentions
        mask = mask.unsqueeze(1).unsqueeze(3).repeat(1,allignment.size(1),1,1)
        
        # apply mask
        masked = allignment * mask
        
        # now we have to rebalance the other values so they sum to 1 again
        # this is done by dividing each value by the sum of the sequence
        # calculate sums
        msum = masked.sum(-2).repeat(1,1,masked.size(2)).unsqueeze(3)
        
        # rebalance
        attentions = masked.div(msum)
        
        # now we shape the attentions to be similar to context in size
        allignment = allignment.repeat(1,1,1,encoder_outputs.size(3))

        # make context vector by element wise mul
        context = attentions * encoder_outputs
        

        context2 = torch.sum(context,2)
        
        
        return context2
    
#     def attention2(self, hidden_to_attn, encoder_outputs, source_lengths, max_len):
#         # repeat the lstm out in third dimension and the encoder outputs in second dimension so we can make a meshgrid
#         # so we can do elementwise mul for all possible combinations of h_j and s_i
#         h_j = encoder_outputs.unsqueeze(1).repeat(1, hidden_to_attn.size(1), 1, 1)
#         s_i = hidden_to_attn.unsqueeze(2).repeat(1, 1, encoder_outputs.size(1), 1)

#         # get the dot product between the two to get the energy
#         # the unsqueezes are there to emulate transposing. so we can use matmul as torch.dot doesnt accept matrices
#         energy = s_i.unsqueeze(3).matmul(h_j.unsqueeze(4)).squeeze(4)

#         #         # this is concat attention, its a different form then the ones we need
#         #         cat = torch.cat((s_i,h_j),3)

#         #         energy = self.attn_layer(cat)

#         # reshaping the encoder outputs for later
#         encoder_outputs = encoder_outputs.unsqueeze(1)
#         encoder_outputs = encoder_outputs.repeat(1, energy.size(1), 1, 1)

#         # apply softmax to the energys
#         allignment = self.attn_soft(energy)

#         # create a mask like : [1,1,1,0,0,0] whos goal is to multiply the attentions of the pads with 0, rest with 1

#         idxes = torch.arange(0, max_len).unsqueeze(0).long()

#         if torch.cuda.is_available():
#             idxes = idxes.cuda()

#         mask = Variable((idxes < source_lengths.unsqueeze(1)).float())

#         # format the mask to be same size() as the attentions
#         mask = mask.unsqueeze(1).unsqueeze(3).repeat(1, allignment.size(1), 1, 1)

#         # apply mask
#         masked = allignment * mask

#         # now we have to rebalance the other values so they sum to 1 again
#         # this is done by dividing each value by the sum of the sequence
#         # calculate sums
#         msum = masked.sum(-2).repeat(1, 1, masked.size(2)).unsqueeze(3)

#         # rebalance
#         attentions = masked.div(msum)

#         # now we shape the attentions to be similar to context in size
#         allignment = allignment.repeat(1, 1, 1, encoder_outputs.size(3))

#         # make context vector by element wise mul
#         context = attentions * encoder_outputs

#         context2 = torch.sum(context, 2)

#         return context2
    
#     def decode(self, decoder2vocab):
#         # Turn decoder output into a probabiltiy distribution over vocabulary
#         decoder2vocab_reshape = decoder2vocab.view(-1, decoder2vocab.size(2))
#         word_probs = F.softmax(decoder2vocab_reshape)
#         word_probs = word_probs.view(
#             decoder2vocab.size(0), decoder2vocab.size(1), decoder2vocab.size(2)
#         )

#         return word_probs

In [14]:
class Encoder(nn.Module):
    def __init__(
        self, src_emb_dim,
        src_vocab_size,
        src_hidden_dim,
        pad_token_src,
        drop,
        position_based=True,
        LSTM_instead=False,
        bidirectional=False,
        nlayers_src=1
        
    ):
        super(Encoder, self).__init__()
        self.src_vocab_size = src_vocab_size
        self.src_emb_dim = src_emb_dim
        self.src_hidden_dim = src_hidden_dim
        self.pad_token_src = pad_token_src
        self.bidirectional = bidirectional
        self.nlayers_src = nlayers_src
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(drop)
        self.LSTM_instead=LSTM_instead
        self.position_based=position_based
 
        
        
        # Word Embedding look-up table for the soruce language
        self.src_embedding = nn.Embedding(
            self.src_vocab_size,
            self.src_emb_dim,
            self.pad_token_src,
        )
        self.pos_embedding = nn.Embedding(
            self.src_vocab_size,
            self.src_emb_dim,
            0,
        )
#         self.scaler = nn.Linear(
#             self.src_hidden_dim,
#             self.src_emb_dim*2,
#         )
        self.scale_h0 = nn.Linear(
            self.src_emb_dim*2, self.src_hidden_dim
        )
        
        
        
        # Encoder GRU
        self.encoder = nn.GRU(
            self.src_emb_dim // 2 if self.bidirectional else self.src_emb_dim,
            self.src_hidden_dim,
            self.nlayers_src,
            bidirectional=bidirectional,
            batch_first=True,
        )
        
    def forward(self, input_src, src_lengths, positions):
        src_emb = self.src_embedding(input_src) # BxSxE
        
        if (not self.position_based):
            src_emb = self.dropout(src_emb)
            src_emb = pack_padded_sequence(src_emb, src_lengths, batch_first=True)
            packed_output , src_h_t = self.encoder(src_emb) # out:
            h_t = torch.cat((src_h_t[-1], src_h_t[-2]), 1) if self.bidirectional else src_h_t[-1] # BxH
            out, _ = pad_packed_sequence(packed_output, batch_first=True)
            #out = self.scaler(out)
        else:
            src_pos = self.pos_embedding(positions)
            out = torch.cat((src_pos,src_emb),2)
            out = self.dropout(out)
            hidden = torch.mean(out,1)
            h_t = self.scale_h0(hidden)
            h_t = self.relu(h_t)
            
            
        # out = BxSxH
        return out, h_t

In [9]:
# def  get_parallel_minibatchget_par (lines, src_word2id, trg_word2id, index, batch_size, volatile=False):
        
#         # Get source sentences for this minibatch
#         src_lines = [
#             ['<s>'] + list(line[1]) + ['</s>']
#             for line in lines[index: index + batch_size]
#         ]

#         # Get target sentences for this minibatch
#         trg_lines = [
#             ['<s>'] + line[0].split() + ['</s>']
#             for line in lines[index: index + batch_size]
#         ]
        
#         # Sort source sentences by length for length masking in RNNs
#         src_lens = [len(line) for line in src_lines]
#         sorted_indices = np.argsort(src_lens)[::-1]
        
#         # Reorder sentences based on source lengths
#         sorted_src_lines = [src_lines[idx] for idx in sorted_indices]
#         sorted_trg_lines = [trg_lines[idx] for idx in sorted_indices]
        
#         # Compute new sentence lengths
#         sorted_src_lens = [len(line) for line in sorted_src_lines]
#         sorted_trg_lens = [len(line) for line in sorted_trg_lines]
        
#         # Get max source and target lengths to pad input and output sequences
#         max_src_len = max(sorted_src_lens)
#         max_trg_len = max(sorted_trg_lens)
        
#         # Construct padded source input sequence
#         input_lines_src = [
#             [src_word2id[w] if w in src_word2id else src_word2id['<unk>'] for w in line] +
#             [src_word2id['<pad>']] * (max_src_len - len(line))
#             for line in sorted_src_lines
#         ]

#         # Construct padded target input sequence
#         input_lines_trg = [
#             [trg_word2id[w] if w in trg_word2id else trg_word2id['<unk>'] for w in line[:-1]] +
#             [trg_word2id['<pad>']] * (max_trg_len - len(line))
#             for line in sorted_trg_lines
#         ]

#         # Construct padded target output sequence (Note: Output sequence is just the input shifted by 1 position)
#         # This is for teacher-forcing
#         output_lines_trg = [
#             [trg_word2id[w] if w in trg_word2id else trg_word2id['<unk>'] for w in line[1:]] +
#             [trg_word2id['<pad>']] * (max_trg_len - len(line))
#             for line in sorted_trg_lines
#         ]

#         input_lines_src = Variable(torch.LongTensor(input_lines_src), volatile=volatile)
#         input_lines_trg = Variable(torch.LongTensor(input_lines_trg), volatile=volatile)
#         output_lines_trg = Variable(torch.LongTensor(output_lines_trg), volatile=volatile)

#         return {
#             'input_src': input_lines_src,
#             'input_trg': input_lines_trg,
#             'output_trg': output_lines_trg,
#             'src_lens': sorted_src_lens
#         }

In [10]:
cuda_available = torch.cuda.is_available()

In [11]:
training_loader = DataLoader(training_set, batch_size=batch_size)
validation_loader = DataLoader(validation_set, batch_size=batch_size)
test_loader = DataLoader(test_set, batch_size=batch_size)

In [12]:
seq2seq = Seq2Seq(
    trg_emb_dim=embedding_dim,
    trg_vocab_size=training_set.target_vocab_size,
    trg_hidden_dim=hidden_dim,
    pad_token_trg=training_set.target_pad,
    drop=drop,
    bidirectional=bidirectional,
    context=context
)

encoder = Encoder(
    src_emb_dim=embedding_dim,
    src_vocab_size=training_set.source_vocab_size,
    src_hidden_dim=hidden_dim,
    pad_token_src=training_set.source_pad,
    drop=drop,
    bidirectional=bidirectional,
    position_based=position_based
)


if cuda_available:
    seq2seq = seq2seq.cuda()
    encoder = encoder.cuda()

In [13]:
optimizer = optim.Adam(list(seq2seq.parameters()) + list(encoder.parameters()), lr=learning_rate)
weight_mask = torch.ones(training_set.target_vocab_size)
if cuda_available:
    weight_mask = weight_mask.cuda()
weight_mask[training_set.target_pad] = 0
loss_criterion = nn.CrossEntropyLoss(weight=weight_mask)

In [14]:
output_dict = {"train_loss":[],
               "val_loss":[], 
               "test_loss":[],
               "best_epoch" : 0, 
               "best_val_loss" : 999,
               "num_epochs" : num_epochs,
               "batch_size" : batch_size,
               "learning_rate" : learning_rate,
               "embedding_dim" : embedding_dim,
               "hidden_dim" : hidden_dim,
               #hidden_dim = 2 * embedding_dim,
               "max_allowed_sentence_len" : max_allowed_sentence_len,
               "drop" : drop,
               #force = 1,
               "bidirectional" : bidirectional,
               "LSTM_instead" : LSTM_instead,
               "volatile" : volatile,
               "position_based" : position_based,
               "context" : context
               "force" : force
              }

In [15]:
iterations = len(training_loader)
for epoch in range(0, num_epochs):
    losses = []
    start = time.time()
    
    batch = 0
    for source_batch, target_batch, source_lengths, target_lengths, batch_positions in training_loader:
        batch_start = time.time()
        use_teacher_forcing = True if random.random() <= force else False
        
        source_batch = Variable(source_batch, volatile=volatile)
        output_batch = Variable(target_batch[:,1:], volatile=volatile)
        target_batch = Variable(target_batch[:,:-1], volatile=volatile)
        batch_positions = Variable(batch_positions, volatile=volatile)
        #print(target_batch.size())
        #print(output_batch.size())
        
        if cuda_available:
            source_batch.cuda()
            source_batch.cuda()
            output_batch.cuda()
            
        encoder_out, h_t = encoder(
            input_src=source_batch, src_lengths=source_lengths, positions=batch_positions,
        )    
            
        decoder_out = seq2seq(
            encoder_out=encoder_out, h_t=h_t,
            input_trg=target_batch,source_lengths=source_lengths, teacher=use_teacher_forcing
        )
        
#         print(decoder_out.contiguous().view(-1, decoder_out.size(2)).size())
#         print(output_batch.contiguous().view(-1).size())
        
        loss = loss_criterion(
            decoder_out.contiguous().view(-1, decoder_out.size(2)),
            output_batch.contiguous().view(-1)
        )
        optimizer.zero_grad()
        loss.backward()
        # Gradient clipping to avoid exploding gradients
        torch.nn.utils.clip_grad_norm(seq2seq.parameters(), 5.)
        optimizer.step()
        losses.append(loss.item())
        

        
        batch_time = time.time() - batch_start
        print('\r[Epoch {:03d}/{:03d}] Batch {:06d}/{:06d} [{:.1f}/s] '.format(epoch+1, num_epochs, batch+1, iterations, batch_time), end='')
        batch +=1
    dev_nll = []
    for source_batch, target_batch, source_lengths, target_lengths, batch_positions in validation_loader:
        use_teacher_forcing = False
        
        source_batch = Variable(source_batch, volatile=volatile)
        output_batch = Variable(target_batch[:,1:], volatile=volatile)
        target_batch = Variable(target_batch[:,:-1], volatile=volatile)
        #print(target_batch.size())
        #print(output_batch.size())
        
        if cuda_available:
            source_batch.cuda()
            source_batch.cuda()
            output_batch.cuda()
            
        encoder_out, h_t = encoder(
            input_src=source_batch, src_lengths=source_lengths, positions=batch_positions, 
        )    
            
        decoder_out = seq2seq(
            encoder_out=encoder_out, h_t=h_t,
            input_trg=target_batch,source_lengths=source_lengths, teacher=use_teacher_forcing
        )
        
        #print(decoder_out.contiguous().view(-1, decoder_out.size(2)).size())
        #print(output_batch.contiguous().view(-1).size())
        
        loss = loss_criterion(
            decoder_out.contiguous().view(-1, decoder_out.size(2)),
            output_batch.contiguous().view(-1)
        )
        dev_nll.append(loss.item())
        
        
    test_nll = []    
    for source_batch, target_batch, source_lengths, target_lengths, batch_positions in test_loader:
        use_teacher_forcing = False
        
        source_batch = Variable(source_batch, volatile=volatile)
        output_batch = Variable(target_batch[:,1:], volatile=volatile)
        target_batch = Variable(target_batch[:,:-1], volatile=volatile)
        #print(target_batch.size())
        #print(output_batch.size())
        
        if cuda_available:
            source_batch.cuda()
            source_batch.cuda()
            output_batch.cuda()
            
        encoder_out, h_t = encoder(
            input_src=source_batch, src_lengths=source_lengths, positions=batch_positions, 
        )    
            
        decoder_out = seq2seq(
            encoder_out=encoder_out, h_t=h_t,
            input_trg=target_batch,source_lengths=source_lengths, teacher=use_teacher_forcing
        )
        
        #print(decoder_out.contiguous().view(-1, decoder_out.size(2)).size())
        #print(output_batch.contiguous().view(-1).size())
        
        loss = loss_criterion(
            decoder_out.contiguous().view(-1, decoder_out.size(2)),
            output_batch.contiguous().view(-1)
        )
        test_nll.append(loss.item())
    
    output_dict["train_loss"].append(np.mean(losses))
    output_dict["val_loss"].append(np.mean(dev_nll))
    output_dict["test_loss"].append(np.mean(test_nll))
    
    if np.mean(dev_nll) < output_dict["best_val_loss"]:
        
        output_dict["best_epoch"] = epoch
        output_dict["best_val_loss"] = np.mean(dev_nll)
        torch.save(seq2seq, "{}{}_{}.model".format(save_dir, seq2seq.__class__.__name__.lower(), name))
        torch.save(encoder, "{}{}_{}.model".format(save_dir, encoder.__class__.__name__.lower(), name))
    
    
    print(time.time() - start)
    print('Epoch : %d Training Loss : %.3f' % (epoch+1, np.mean(losses)))
    print('Epoch : %d Dev Loss : %.3f' % (epoch+1, np.mean(dev_nll)))
    print('Epoch : %d Test Loss : %.3f' % (epoch+1, np.mean(test_nll)))
    print('-------------------------------------------------------------')

np.save( "{}{}.npy".format(save_dir2, name),output_dict)   



[Epoch 001/010] Batch 000453/000453 [0.2/s] 

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


227.02750492095947
Epoch : 1 Training Loss : 3.699
Epoch : 1 Dev Loss : 8.195
Epoch : 1 Test Loss : 7.962
-------------------------------------------------------------
[Epoch 002/010] Batch 000453/000453 [0.2/s] 227.58037781715393
Epoch : 2 Training Loss : 2.564
Epoch : 2 Dev Loss : 7.137
Epoch : 2 Test Loss : 7.221
-------------------------------------------------------------
[Epoch 003/010] Batch 000453/000453 [0.2/s] 225.52018070220947
Epoch : 3 Training Loss : 2.135
Epoch : 3 Dev Loss : 6.899
Epoch : 3 Test Loss : 6.972
-------------------------------------------------------------
[Epoch 004/010] Batch 000453/000453 [0.2/s] 225.6797547340393
Epoch : 4 Training Loss : 1.887
Epoch : 4 Dev Loss : 6.641
Epoch : 4 Test Loss : 6.862
-------------------------------------------------------------
[Epoch 005/010] Batch 000453/000453 [0.2/s] 234.45887112617493
Epoch : 5 Training Loss : 1.714
Epoch : 5 Dev Loss : 6.667
Epoch : 5 Test Loss : 6.803
-----------------------------------------------

In [24]:
_, sentence = torch.max(decoder_out[3],1)
test_pred = [training_set.target_i2w[word] for word in sentence.cpu().numpy()]
print(test_pred)
test_real = [training_set.target_i2w[word] for word in output_batch[3].cpu().numpy()]
print(test_real)

['a', 'person', 'is', 'sleeping', 'in', 'the', 'street', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>', '<eos>']
['a', 'person', 'sleeping', 'on', 'the', 'street', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [17]:
len(training_loader)

453

In [21]:
def evaluate(encoder, decoder, eval_set, target_path, reference_file_path):
    data_loader = DataLoader(eval_set, batch_size=5)
    softmax = nn.Softmax(dim=1)
    idx2word = eval_set.target_i2w
    sorted_sentence_ids = eval_set.target_sentence_ids.cpu().numpy()
    translated_sentences = []

    # Decode
    # for source_batch, target_batch, source_lengths, target_lengths, batch_positions in data_loader:
    for source_batch, target_batch, source_lengths, target_lengths, batch_positions in data_loader:

        encoder_out, h_t = encoder(
            input_src=source_batch, src_lengths=source_lengths, positions=batch_positions,
        )
        decoder_out = decoder(
            encoder_out=encoder_out, h_t=h_t,
            input_trg=target_batch, source_lengths=source_lengths, teacher=False
        )

        # Get predicted word for every batch instance
        normalized_output = softmax(decoder_out)
        predictions = normalized_output.max(2)[1]  # Only get indices

        for sentence_index in range(predictions.shape[0]):
            token_indices = list(predictions[sentence_index, :].cpu().numpy())

            tokens = list(map(lambda idx: idx2word[idx], token_indices))
            eos_index = len(tokens)
            if "<eos>" in tokens:
                eos_index = tokens.index("<eos>")

            tokens = tokens[:eos_index]  # Cut off after first end of sentence token

            translated_sentence = " ".join(tokens).replace("@@ ", "").replace("@@", "")
            print(translated_sentence)
            translated_sentences.append(translated_sentence)

    # Bring sentence back into the order they were in the test set
    resorted_sentences = [None] * len(translated_sentences)
    for target_id, sentence in zip(sorted_sentence_ids, translated_sentences):
        resorted_sentences[target_id] = sentence

    # Write to file
    with codecs.open(target_path, "wb", "utf-8") as target_file:
        for sentence in resorted_sentences:
            target_file.write("{}\n".format(sentence))

    out = subprocess.getoutput(
        "perl ./multi-bleu.perl {} < {}".format(reference_file_path, target_path)
    )
    print(out[out.index("BLEU"):])

In [24]:
name = "2dfa87d0-ace2-4e7f-a25b-23c6de00ba56"
encoder = torch.load("{}{}_{}.model".format(save_dir, "encoder", name))
decoder = torch.load("{}{}_{}.model".format(save_dir, "seq2seq", name))
evaluate(
    encoder, decoder, test_set, target_path="./eval_out_{}.txt".format(name),
    reference_file_path="./data/test/test_2017_flickr_truecased.en"
)



a mountain aker ning is giving her skponmer ers outside the mountains reground .
a young blond man in front bright outfit be and
one man in all sunshirt black attire stands tenby another cowboy ach k store
a surfer lake is surrounded by his plversing time
this ustaking blue pole with blond-hair sort in &quot; shes and
one small white goalls c is riding top table full .
one small boy sits asleep inside front bed holding umbrella lot olr ing ment m bag
many few is jumping ching on snowy cannked with a cup alley skirt that
an older man is swinging weeping who top to of talking talking talking talking talking different different different different different different different ! ! ! ! him ,,,,,,,,waves waves waves waves waves waves waves waves waves waves waves waves waves
an woman with no blue shirt and white dogs riding for
a bunch of motorcyclist ycles sts ride with lights eyes on on over over
people are carrying ving sort and he dog looking him
two people are staring up each air towar

this skateboarder tenfoot works feet for each edge and legs occes er in front background toward .
pilmotorcycle rider is blowing a corner n with its on
this male riding picture of water with no other , smiling foot out top fli
one guy riding bikes surfboard adler erskating lades jumps high the microphone urve e sign with holding .
two old in white shirt walking along the ball .
the small girl in long pink piece suit jacket black shirt climbing their
one Minger with no ropes ered outfit riding horses top surfboard chair for for in . .
an group ane with no purple sh cap his mountain l
four men on sunny track surrounded which flowers gold biching ders out around from from side al s s ery et et et M lot s s ely ely umpwhite shirts .
children are putting as front field alley ter outfit
this female is walking to microphone sitting on top motorcycle board with .
this motorcycle is inger to some forest umbrella
i yellow dog leaps running along top dirt road V
an bunch urrted colored surfing fo

people dressed up truck cowboy leaolr ates yard game
two people are around each metal ealen tree midair air time .
oversnowboarder olger is sitting on top huge brusch by the couple irboard .
the people urger performs setting up off of dog shot
the girl is walking his hair up book for
a naprepares walking amhis surfboard g order um sunny cross in in in in in in in through through into into into into work trees trees trees trees trees trees trees trees trees trees trees B / / / / / / / / / / /
1 man is sitting on top street and he to his phone with .
1 guy in jeans bikini be ls erskating lades hair jumping onto bikes skateboard urrying kler screen
this flower ric is Asian going top wall with two cars eyes outside .
Indian mountain ack sses man is taking up a field &apos;s
two older gentleman push is holding onto some other flag
an happy ertod dogs people bacake white pants running ching the sky
a man is giving alls with adult surfboard se for
several people riding their cart top crowded 

a nais painting to food platop sort sh .
two women are sitting next to a piece while some work .
an older women in jeans bikini selling riding display skateboard by . ss for
they urry selols asleep the sky of
one surfer sue takes front sweatshirt sits by another and .
a boy is walking to take other of onlookers woods player
one female riding cake sidewalk , out the same of
ping few itted waits for their same cal sitting on sunset table full sit woman woman woman
a bunch of girls are riding top volleyball court .
Indian country icyclists racing kids red umbrella stands about the wooden ble l store
this woman walking through front marathon group filled .
many women with brown ats are standing outside front of a building .
1 guy with no bunch attoo inside the woods ach coat throwing bikini urred time
people outside in the air shirt after .
the sun is getting an skateboard tenss surrounded .
six elderly person walking down the snowy g stands while fsitting sitting sitting smagainst against

one person is walking down the sidewalk with each few sign that .
a bunch performer rides attempting across top edge with drinks eyes midair air .
a bunch splayer jumping for hit ball shot
an elderly women sitting on top triarve d next to each building .
Indian photographomsh ter is six shopping urses ated on the snowy .
I surfboard ice ht ly gets putting playing with each red ball on the mountains .
she girl jumping high while front marathon match .
an bunch is getting various irssling around front snowy . table .
Indian person is performing their cigarette jump on the motorcycle avemd walkway .
two people laying bikes skateboard olcery outfit the sidewalk posing .
a train is performing on railroad ders tree
this volleyball player on top field covered surrounded with . . . . . .
the tennis player is walking at something ball against .
this guy riing m opsts of people white house with wood few door setting on in in . . . above above above above above into into ! ! ! ! B B B B B B B B B

one naon jeans cut listening alf en gets some piece door
three people sit sitting at each table smoking front bikini err e ation ard bag .
a lake is walking on the ground urrt event
Indian couple sit on the edge of river city scene
a couple usette dog leaps swinging on the parade throom store
a small girl is listening ding to get other no lot bowl ving ely l scooter floor with poses while while while into
Indian group dds is running place top flower alley filled that
group of construction umpvess wear eyes eatoting Sarms
a young girl wearing no tective gear are
many lot building with no man leaning riding something top stage alls y day se .
three people inside front blue apron ft suit top surfboard river surrounded that he he he work work work him him him over s s h h h ) ) ) 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
one person is walking down a blue staof of his head .
four men madancing drinks few cks ent ce and pointing eyes board over .
a bunch gets driving various the woods sert cloth

one nais performing an instrument &apos;s front purple room .
Indian lot mondriving no dog laying each top m of in in on
man in orange gray jacket is running towards .
a bunch of children having fun instruments
little boy pustaring food blue umbrella bottle .
tefew biker riding the woods coat
one man works the bridge throwing with adult cane swing outside to to . . . .
a young child is jumping at the other against
one guy with no cane costume pole sunset top of the woods st
a nastands outside the ocean ouving l attire surrounded
a girl of people nariding his horse wall
one guy is surfing cake dog jump some fence .
one bunch climbing their some sort number of on . . . . . . . . ememememememh h h B B cal cal E E E E E ent ent ent ent ent ent ent ent ent ent ent ent ent ent ent
a black and no legs leaps preaway of
this female with no row of people piece olwn country by ss .
a man appears performing up its edge of several ocean team .
four people are pulling some picture br
young people ar

a nasitting on top piece with each eyes &apos;s
a bunch of people are standing around top street corner
this woman and no few are running across top grassy area . &quot; for
three men gets pulling ammidair air after of of of through
an beautiful player runs speaking after the other .
a bridge is skating wing their some sort .
an group sports car is coming their cigarette umbrella .
a man is speaking alling asleep his beard ball are over
a bunch girl is walking barely foot on the beach .
a young girl is walking shorts big maving orn store
two hiplayers are away the ball against
girls playing baseball ry ers re painside over over over .
Indian female team ocim s er gets riding their top stage for
this girl posing black stands sitting on top street &apos;s
a bunch leim s er is jumping selling vegetables for
the dogs are running around field of
children walking down a busy surrounded
two women are staring through the streets surrounded a city scene that
a guy walking with his eyes in midai

a bunch surrounded by six white chasing to in in
two women are pulling through an lot irpping event is is
a mountain biker andal cade e basketball vegetables distance net with just off off over over over down down through through through another another another another ) ) fountain fountain fountain fountain machine machine machine backpack q / / / / / / / / / /
this female is walking some blue umbrella with
this biker carries carrying no bikini outfit looks to at at through . . . .
this nais skiing up an snowy of people athas . .
this girl lot front of the woods &apos;s are to to to to
BLEU = 0.17, 8.7/0.2/0.0/0.0 (BP=1.000, ratio=1.193, hyp_len=13471, ref_len=11292)


In [None]:
output_dict["train_loss"]