In [1]:
import time
import numpy as np
from __future__ import print_function
import uuid

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [3]:
import numpy as np
import codecs
import nltk

In [4]:
from corpus import ParallelCorpus
from torch.utils.data import Dataset, DataLoader

In [5]:
# Define hyperparameters
num_epochs = 20
batch_size = 64
learning_rate = 4e-4
embedding_dim = 256
hidden_dim = embedding_dim*2
#hidden_dim = 2 * embedding_dim
max_allowed_sentence_len = 50
drop = 0.2
#force = 1
bidirectional = False
LSTM_instead = False
volatile = False
position_based = True
context = True

name= str(uuid.uuid4())
print(name)
save_dir = "models/"
save_dir2 = "dicts/"

9e24c7a3-5781-4f9f-a612-cd3cbaf18282


In [6]:
# CHANGE: CORRECT DATA LOCATIONS.

training_set = ParallelCorpus(
        source_path="./data2/train_bpe.fr", target_path="./data2/train_bpe.en",
        max_sentence_length=max_allowed_sentence_len
    )

validation_set = ParallelCorpus(
        source_path="./data2/val_bpe.fr", target_path="./data2/val_bpe.en",
        max_sentence_length=max_allowed_sentence_len, use_indices_from=training_set
    )

test_set = ParallelCorpus(
        source_path="./data2/test_2017_flickr_bpe.fr", target_path="./data2/test_2017_flickr_bpe.en",
        max_sentence_length=max_allowed_sentence_len, use_indices_from=training_set
    )


In [7]:
class Seq2Seq(nn.Module):
    """A Vanilla Sequence to Sequence (Seq2Seq) model with LSTMs.
    Ref: Sequence to Sequence Learning with Neural Nets
    https://arxiv.org/abs/1409.3215
    """

    def __init__(
        self, trg_emb_dim,
        trg_vocab_size, trg_hidden_dim,
        pad_token_trg, drop, context=True,
        LSTM_instead=False, bidirectional=False,
        nlayers_trg=1,
    ):
        """Initialize Seq2Seq Model."""
        super(Seq2Seq, self).__init__()
        self.trg_vocab_size = trg_vocab_size
        self.trg_emb_dim = trg_emb_dim
        self.trg_hidden_dim = trg_hidden_dim
        self.bidirectional = bidirectional
        self.nlayers_trg = nlayers_trg
        self.pad_token_trg = pad_token_trg
        self.attn_soft = nn.Softmax(dim=2)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(drop)
        self.LSTM_instead=LSTM_instead
        self.context = context

        # Word Embedding look-up table for the target language
        self.trg_embedding = nn.Embedding(
            self.trg_vocab_size,
            self.trg_emb_dim,
            self.pad_token_trg,
        )


        # Decoder GRU // LSTM
        if (not self.LSTM_instead):
        
            self.decoder = nn.GRU(
                self.trg_emb_dim,
                self.trg_hidden_dim,
                self.nlayers_trg,
                batch_first=True
            )
        else:
            self.decoder = nn.LSTM(
                self.trg_emb_dim,
                self.trg_hidden_dim,
                self.nlayers_trg,
                batch_first=True
            )
        
#         self.scaler = nn.Linear(
#             self.trg_hidden_dim,
#             self.trg_emb_dim*2,
#         )
        
#         self.scaler2 = nn.Linear(
#             self.trg_hidden_dim,
#             self.trg_emb_dim,
#         )
        
        # Projection layer from decoder hidden states to target language vocabulary
        
        if (not self.context):
            self.decoder2vocab = nn.Linear(trg_hidden_dim, trg_vocab_size)
        else:
            self.decoder2vocab = nn.Linear(trg_hidden_dim + trg_emb_dim*2, trg_vocab_size)

    def forward(self, encoder_out, h_t, input_trg, source_lengths, teacher ):
        trg_emb = self.trg_embedding(input_trg)
        trg_emb = self.dropout(trg_emb)
        
        h_t = h_t.unsqueeze(0).expand(self.nlayers_trg, h_t.size(0), h_t.size(1))
        if (self.LSTM_instead):
            h_t = (h_t,h_t)
        
        hidden = []
        outputs = []
        trg_in = trg_emb[:,0,:].unsqueeze(1)
        for i in range(input_trg.size(1)):
#             print( " ")
            if (teacher):
                trg_in = trg_emb[:,i,:].unsqueeze(1)
#             print(trg_in.size())
            trg_h, h_t = self.decoder(trg_in, h_t)
            hidden.append(h_t.squeeze())
#             print(h_t.size())

            if (self.context):
                context = self.attention(h_t.squeeze().unsqueeze(1),encoder_out,source_lengths, encoder_out.size(1))
                trg_h = torch.cat((trg_h,context),2)
            
            trg_h_reshape = trg_h.contiguous().view(
            trg_h.size(0) * trg_h.size(1), trg_h.size(2)
            )
            # Affine transformation of all decoder hidden states
            decoder2vocab = self.decoder2vocab(trg_h_reshape)
            # Reshape
            decoder2vocab = decoder2vocab.view(
                trg_h.size(0), trg_h.size(1), decoder2vocab.size(1)
            )

        
            
            outputs.append(decoder2vocab.squeeze())
            
            if (not teacher):
                _, word = torch.max(decoder2vocab,2)
                trg_in = self.trg_embedding(word)
            
        outputs = torch.stack(outputs,1)
        return outputs    
            
            
            
#             if (not teacher):
#                 trg_in = self.scaler2(decoder_out)
#                 trg_in = self.relu(trg_in)
            
        # hiddens = torch.stack(hidden,1)
        # trg_h = torch.stack(outputs,1)
#         print(trg_h.size())
#         print(hiddens.size())
        #if torch.cuda.is_available()
        #    trg_
        #print(asdfs)
        
            
        #hiddens = self.scaler(hiddens) 
        #hiddens = self.relu(hiddens)
#         if (self.context):
#             context = self.attention(hiddens,encoder_out,source_lengths, encoder_out.size(1))
#             print(hiddens.size())
#             print(encoder_out.size())

#             trg_h = torch.cat((trg_h,context),2)
        
        
#         # Initialize the decoder GRU with the last hidden state of the encoder and 
#         # run target inputs through the decoder.
        
#         # Merge batch and time dimensions to pass to a linear layer
#         trg_h_reshape = trg_h.contiguous().view(
#             trg_h.size(0) * trg_h.size(1), trg_h.size(2)
#         )
        
#         # Affine transformation of all decoder hidden states
#         decoder2vocab = self.decoder2vocab(trg_h_reshape)
        
#         # Reshape
#         decoder2vocab = decoder2vocab.view(
#             trg_h.size(0), trg_h.size(1), decoder2vocab.size(1)
#         )
    
    
    def attention(self, hidden_to_attn, encoder_outputs, source_lengths, max_len):
                
        
#         print(hidden_to_attn.size())
#         print(encoder_outputs.size())
        # repeat the lstm out in third dimension and the encoder outputs in second dimension so we can make a meshgrid
        # so we can do elementwise mul for all possible combinations of h_j and s_i
        h_j = encoder_outputs.unsqueeze(1).repeat(1,hidden_to_attn.size(1),1,1)
        s_i = hidden_to_attn.unsqueeze(2).repeat(1,1,encoder_outputs.size(1),1)
#         print(h_j.size())
#         print(s_i.size())
        # get the dot product between the two to get the energy
        # the unsqueezes are there to emulate transposing. so we can use matmul as torch.dot doesnt accept matrices
        energy = s_i.unsqueeze(3).matmul(h_j.unsqueeze(4)).squeeze(4)
        
#         # this is concat attention, its a different form then the ones we need
#         cat = torch.cat((s_i,h_j),3)
        
#         energy = self.attn_layer(cat)

        # reshaping the encoder outputs for later
        encoder_outputs = encoder_outputs.unsqueeze(1)
        encoder_outputs = encoder_outputs.repeat(1,energy.size(1),1,1)
    
        # apply softmax to the energys 
        allignment = self.attn_soft(energy)
        
        # create a mask like : [1,1,1,0,0,0] whos goal is to multiply the attentions of the pads with 0, rest with 1
        idxes = torch.arange(0,max_len).unsqueeze(0).long().cuda()
        #print(idxes.size())
        mask = Variable((idxes<source_lengths.unsqueeze(1)).float())
        
        # format the mask to be same size() as the attentions
        mask = mask.unsqueeze(1).unsqueeze(3).repeat(1,allignment.size(1),1,1)
        
        # apply mask
        masked = allignment * mask
        
        # now we have to rebalance the other values so they sum to 1 again
        # this is done by dividing each value by the sum of the sequence
        # calculate sums
        msum = masked.sum(-2).repeat(1,1,masked.size(2)).unsqueeze(3)
        
        # rebalance
        attentions = masked.div(msum)
        
        # now we shape the attentions to be similar to context in size
        allignment = allignment.repeat(1,1,1,encoder_outputs.size(3))

        # make context vector by element wise mul
        context = attentions * encoder_outputs
        

        context2 = torch.sum(context,2)
        
        
        return context2
    
#     def attention2(self, hidden_to_attn, encoder_outputs, source_lengths, max_len):
#         # repeat the lstm out in third dimension and the encoder outputs in second dimension so we can make a meshgrid
#         # so we can do elementwise mul for all possible combinations of h_j and s_i
#         h_j = encoder_outputs.unsqueeze(1).repeat(1, hidden_to_attn.size(1), 1, 1)
#         s_i = hidden_to_attn.unsqueeze(2).repeat(1, 1, encoder_outputs.size(1), 1)

#         # get the dot product between the two to get the energy
#         # the unsqueezes are there to emulate transposing. so we can use matmul as torch.dot doesnt accept matrices
#         energy = s_i.unsqueeze(3).matmul(h_j.unsqueeze(4)).squeeze(4)

#         #         # this is concat attention, its a different form then the ones we need
#         #         cat = torch.cat((s_i,h_j),3)

#         #         energy = self.attn_layer(cat)

#         # reshaping the encoder outputs for later
#         encoder_outputs = encoder_outputs.unsqueeze(1)
#         encoder_outputs = encoder_outputs.repeat(1, energy.size(1), 1, 1)

#         # apply softmax to the energys
#         allignment = self.attn_soft(energy)

#         # create a mask like : [1,1,1,0,0,0] whos goal is to multiply the attentions of the pads with 0, rest with 1

#         idxes = torch.arange(0, max_len).unsqueeze(0).long()

#         if torch.cuda.is_available():
#             idxes = idxes.cuda()

#         mask = Variable((idxes < source_lengths.unsqueeze(1)).float())

#         # format the mask to be same size() as the attentions
#         mask = mask.unsqueeze(1).unsqueeze(3).repeat(1, allignment.size(1), 1, 1)

#         # apply mask
#         masked = allignment * mask

#         # now we have to rebalance the other values so they sum to 1 again
#         # this is done by dividing each value by the sum of the sequence
#         # calculate sums
#         msum = masked.sum(-2).repeat(1, 1, masked.size(2)).unsqueeze(3)

#         # rebalance
#         attentions = masked.div(msum)

#         # now we shape the attentions to be similar to context in size
#         allignment = allignment.repeat(1, 1, 1, encoder_outputs.size(3))

#         # make context vector by element wise mul
#         context = attentions * encoder_outputs

#         context2 = torch.sum(context, 2)

#         return context2
    
#     def decode(self, decoder2vocab):
#         # Turn decoder output into a probabiltiy distribution over vocabulary
#         decoder2vocab_reshape = decoder2vocab.view(-1, decoder2vocab.size(2))
#         word_probs = F.softmax(decoder2vocab_reshape)
#         word_probs = word_probs.view(
#             decoder2vocab.size(0), decoder2vocab.size(1), decoder2vocab.size(2)
#         )

#         return word_probs

In [8]:
class Encoder(nn.Module):
    def __init__(
        self, src_emb_dim,
        src_vocab_size,
        src_hidden_dim,
        pad_token_src,
        drop,
        position_based=True,
        LSTM_instead=False,
        bidirectional=False,
        nlayers_src=1
        
    ):
        super(Encoder, self).__init__()
        self.src_vocab_size = src_vocab_size
        self.src_emb_dim = src_emb_dim
        self.src_hidden_dim = src_hidden_dim
        self.pad_token_src = pad_token_src
        self.bidirectional = bidirectional
        self.nlayers_src = nlayers_src
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(drop)
        self.LSTM_instead=LSTM_instead
        self.position_based=position_based
 
        
        
        # Word Embedding look-up table for the soruce language
        self.src_embedding = nn.Embedding(
            self.src_vocab_size,
            self.src_emb_dim,
            self.pad_token_src,
        )
        self.pos_embedding = nn.Embedding(
            self.src_vocab_size,
            self.src_emb_dim,
            0,
        )
#         self.scaler = nn.Linear(
#             self.src_hidden_dim,
#             self.src_emb_dim*2,
#         )
        self.scale_h0 = nn.Linear(
            self.src_emb_dim*2, self.src_hidden_dim
        )
        
        
        
        # Encoder GRU
        self.encoder = nn.GRU(
            self.src_emb_dim // 2 if self.bidirectional else self.src_emb_dim,
            self.src_hidden_dim,
            self.nlayers_src,
            bidirectional=bidirectional,
            batch_first=True,
        )
        
    def forward(self, input_src, src_lengths, positions):
        src_emb = self.src_embedding(input_src) # BxSxE
        
        if (not self.position_based):
            src_emb = self.dropout(src_emb)
            src_emb = pack_padded_sequence(src_emb, src_lengths, batch_first=True)
            packed_output , src_h_t = self.encoder(src_emb) # out:
            h_t = torch.cat((src_h_t[-1], src_h_t[-2]), 1) if self.bidirectional else src_h_t[-1] # BxH
            out, _ = pad_packed_sequence(packed_output, batch_first=True)
            #out = self.scaler(out)
        else:
            src_pos = self.pos_embedding(positions)
            out = torch.cat((src_pos,src_emb),2)
            out = self.dropout(out)
            hidden = torch.mean(out,1)
            h_t = self.scale_h0(hidden)
            h_t = self.relu(h_t)
            
            
        # out = BxSxH
        return out, h_t

In [9]:
# def  get_parallel_minibatchget_par (lines, src_word2id, trg_word2id, index, batch_size, volatile=False):
        
#         # Get source sentences for this minibatch
#         src_lines = [
#             ['<s>'] + list(line[1]) + ['</s>']
#             for line in lines[index: index + batch_size]
#         ]

#         # Get target sentences for this minibatch
#         trg_lines = [
#             ['<s>'] + line[0].split() + ['</s>']
#             for line in lines[index: index + batch_size]
#         ]
        
#         # Sort source sentences by length for length masking in RNNs
#         src_lens = [len(line) for line in src_lines]
#         sorted_indices = np.argsort(src_lens)[::-1]
        
#         # Reorder sentences based on source lengths
#         sorted_src_lines = [src_lines[idx] for idx in sorted_indices]
#         sorted_trg_lines = [trg_lines[idx] for idx in sorted_indices]
        
#         # Compute new sentence lengths
#         sorted_src_lens = [len(line) for line in sorted_src_lines]
#         sorted_trg_lens = [len(line) for line in sorted_trg_lines]
        
#         # Get max source and target lengths to pad input and output sequences
#         max_src_len = max(sorted_src_lens)
#         max_trg_len = max(sorted_trg_lens)
        
#         # Construct padded source input sequence
#         input_lines_src = [
#             [src_word2id[w] if w in src_word2id else src_word2id['<unk>'] for w in line] +
#             [src_word2id['<pad>']] * (max_src_len - len(line))
#             for line in sorted_src_lines
#         ]

#         # Construct padded target input sequence
#         input_lines_trg = [
#             [trg_word2id[w] if w in trg_word2id else trg_word2id['<unk>'] for w in line[:-1]] +
#             [trg_word2id['<pad>']] * (max_trg_len - len(line))
#             for line in sorted_trg_lines
#         ]

#         # Construct padded target output sequence (Note: Output sequence is just the input shifted by 1 position)
#         # This is for teacher-forcing
#         output_lines_trg = [
#             [trg_word2id[w] if w in trg_word2id else trg_word2id['<unk>'] for w in line[1:]] +
#             [trg_word2id['<pad>']] * (max_trg_len - len(line))
#             for line in sorted_trg_lines
#         ]

#         input_lines_src = Variable(torch.LongTensor(input_lines_src), volatile=volatile)
#         input_lines_trg = Variable(torch.LongTensor(input_lines_trg), volatile=volatile)
#         output_lines_trg = Variable(torch.LongTensor(output_lines_trg), volatile=volatile)

#         return {
#             'input_src': input_lines_src,
#             'input_trg': input_lines_trg,
#             'output_trg': output_lines_trg,
#             'src_lens': sorted_src_lens
#         }

In [10]:
cuda_available = torch.cuda.is_available()

In [11]:
training_loader = DataLoader(training_set, batch_size=batch_size)
validation_loader = DataLoader(validation_set, batch_size=batch_size)
test_loader = DataLoader(test_set, batch_size=batch_size)

In [12]:
seq2seq = Seq2Seq(
    trg_emb_dim=embedding_dim,
    trg_vocab_size=training_set.target_vocab_size,
    trg_hidden_dim=hidden_dim,
    pad_token_trg=training_set.target_pad,
    drop=drop,
    bidirectional=bidirectional,
    context=context
)

encoder = Encoder(
    src_emb_dim=embedding_dim,
    src_vocab_size=training_set.source_vocab_size,
    src_hidden_dim=hidden_dim,
    pad_token_src=training_set.source_pad,
    drop=drop,
    bidirectional=bidirectional,
    position_based=position_based
)


if cuda_available:
    seq2seq = seq2seq.cuda()
    encoder = encoder.cuda()

In [13]:
optimizer = optim.Adam(list(seq2seq.parameters()) + list(encoder.parameters()), lr=learning_rate)
weight_mask = torch.ones(training_set.target_vocab_size)
if cuda_available:
    weight_mask = weight_mask.cuda()
weight_mask[training_set.target_pad] = 0
loss_criterion = nn.CrossEntropyLoss(weight=weight_mask)

In [14]:
output_dict = {"train_loss":[],
               "val_loss":[], 
               "test_loss":[],
               "best_epoch" : 0, 
               "best_val_loss" : 999,
               "num_epochs" : num_epochs,
               "batch_size" : batch_size,
               "learning_rate" : learning_rate,
               "embedding_dim" : embedding_dim,
               "hidden_dim" : hidden_dim,
               #hidden_dim = 2 * embedding_dim,
               "max_allowed_sentence_len" : max_allowed_sentence_len,
               "drop" : drop,
               #force = 1,
               "bidirectional" : bidirectional,
               "LSTM_instead" : LSTM_instead,
               "volatile" : volatile,
               "position_based" : position_based,
               "context" : context 
              }

In [15]:
iterations = len(training_loader)
for epoch in range(0, num_epochs):
    losses = []
    start = time.time()
    
    batch = 0
    for source_batch, target_batch, source_lengths, target_lengths, batch_positions in training_loader:
        batch_start = time.time()
        force = True
        
        source_batch = Variable(source_batch, volatile=volatile)
        output_batch = Variable(target_batch[:,1:], volatile=volatile)
        target_batch = Variable(target_batch[:,:-1], volatile=volatile)
        batch_positions = Variable(batch_positions, volatile=volatile)
        #print(target_batch.size())
        #print(output_batch.size())
        
        if cuda_available:
            source_batch.cuda()
            source_batch.cuda()
            output_batch.cuda()
            
        encoder_out, h_t = encoder(
            input_src=source_batch, src_lengths=source_lengths, positions=batch_positions,
        )    
            
        decoder_out = seq2seq(
            encoder_out=encoder_out, h_t=h_t,
            input_trg=target_batch,source_lengths=source_lengths, teacher=force
        )
        
#         print(decoder_out.contiguous().view(-1, decoder_out.size(2)).size())
#         print(output_batch.contiguous().view(-1).size())
        
        loss = loss_criterion(
            decoder_out.contiguous().view(-1, decoder_out.size(2)),
            output_batch.contiguous().view(-1)
        )
        optimizer.zero_grad()
        loss.backward()
        # Gradient clipping to avoid exploding gradients
        torch.nn.utils.clip_grad_norm(seq2seq.parameters(), 5.)
        optimizer.step()
        losses.append(loss.item())
        

        
        batch_time = time.time() - batch_start
        print('\r[Epoch {:03d}/{:03d}] Batch {:06d}/{:06d} [{:.1f}/s] '.format(epoch+1, num_epochs, batch+1, iterations, batch_time), end='')
        batch +=1
    dev_nll = []
    for source_batch, target_batch, source_lengths, target_lengths, batch_positions in validation_loader:
        force = False
        
        source_batch = Variable(source_batch, volatile=volatile)
        output_batch = Variable(target_batch[:,1:], volatile=volatile)
        target_batch = Variable(target_batch[:,:-1], volatile=volatile)
        #print(target_batch.size())
        #print(output_batch.size())
        
        if cuda_available:
            source_batch.cuda()
            source_batch.cuda()
            output_batch.cuda()
            
        encoder_out, h_t = encoder(
            input_src=source_batch, src_lengths=source_lengths, positions=batch_positions, 
        )    
            
        decoder_out = seq2seq(
            encoder_out=encoder_out, h_t=h_t,
            input_trg=target_batch,source_lengths=source_lengths, teacher=force
        )
        
        #print(decoder_out.contiguous().view(-1, decoder_out.size(2)).size())
        #print(output_batch.contiguous().view(-1).size())
        
        loss = loss_criterion(
            decoder_out.contiguous().view(-1, decoder_out.size(2)),
            output_batch.contiguous().view(-1)
        )
        dev_nll.append(loss.item())
        
        
    test_nll = []    
    for source_batch, target_batch, source_lengths, target_lengths, batch_positions in test_loader:
        force = False
        
        source_batch = Variable(source_batch, volatile=volatile)
        output_batch = Variable(target_batch[:,1:], volatile=volatile)
        target_batch = Variable(target_batch[:,:-1], volatile=volatile)
        #print(target_batch.size())
        #print(output_batch.size())
        
        if cuda_available:
            source_batch.cuda()
            source_batch.cuda()
            output_batch.cuda()
            
        encoder_out, h_t = encoder(
            input_src=source_batch, src_lengths=source_lengths, positions=batch_positions, 
        )    
            
        decoder_out = seq2seq(
            encoder_out=encoder_out, h_t=h_t,
            input_trg=target_batch,source_lengths=source_lengths, teacher=force
        )
        
        #print(decoder_out.contiguous().view(-1, decoder_out.size(2)).size())
        #print(output_batch.contiguous().view(-1).size())
        
        loss = loss_criterion(
            decoder_out.contiguous().view(-1, decoder_out.size(2)),
            output_batch.contiguous().view(-1)
        )
        test_nll.append(loss.item())
    
    output_dict["train_loss"].append(np.mean(losses))
    output_dict["val_loss"].append(np.mean(dev_nll))
    output_dict["test_loss"].append(np.mean(test_nll))
    
    if np.mean(dev_nll) < output_dict["best_val_loss"]:
        
        output_dict["best_epoch"] = epoch
        output_dict["best_val_loss"] = np.mean(dev_nll)
        torch.save(seq2seq, "{}{}_{}.model".format(save_dir, seq2seq.__class__.__name__.lower(), name))
        torch.save(encoder, "{}{}_{}.model".format(save_dir, encoder.__class__.__name__.lower(), name))
    
    
    print(time.time() - start)
    print('Epoch : %d Training Loss : %.3f' % (epoch, np.mean(losses)))
    print('Epoch : %d Dev Loss : %.3f' % (epoch, np.mean(dev_nll)))
    print('Epoch : %d Test Loss : %.3f' % (epoch, np.mean(test_nll)))
    print('-------------------------------------------------------------')

np.save( "{}{}.npy".format(save_dir2, name),output_dict)   



[Epoch 001/020] Batch 000453/000453 [0.2/s] 

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


394.5670876502991
Epoch : 0 Training Loss : 4.240
Epoch : 0 Dev Loss : 8.290
Epoch : 0 Test Loss : 8.231
-------------------------------------------------------------
[Epoch 002/020] Batch 000453/000453 [0.3/s] 394.66402769088745
Epoch : 1 Training Loss : 3.285
Epoch : 1 Dev Loss : 7.716
Epoch : 1 Test Loss : 7.789
-------------------------------------------------------------
[Epoch 003/020] Batch 000453/000453 [0.8/s] 451.659255027771
Epoch : 2 Training Loss : 2.838
Epoch : 2 Dev Loss : 7.552
Epoch : 2 Test Loss : 7.688
-------------------------------------------------------------
[Epoch 004/020] Batch 000453/000453 [0.2/s] 394.215811252594
Epoch : 3 Training Loss : 2.578
Epoch : 3 Dev Loss : 7.439
Epoch : 3 Test Loss : 7.588
-------------------------------------------------------------
[Epoch 005/020] Batch 000014/000453 [0.8/s] 

KeyboardInterrupt: 

In [None]:
_, sentence = torch.max(decoder_out[0],1)
test_pred = [training_set.target_i2w[word] for word in sentence.cpu().numpy()]
print(test_pred)
test_real = [training_set.target_i2w[word] for word in output_batch[0].cpu().numpy()]
print(test_real)

In [None]:
len(training_loader)

In [None]:
#output_dict = {"train_loss":[], "val_loss":[], "test_loss":[], "best_epoch" : 0, "best_val_loss" : 999}
output_dict["best_epoch"]

In [None]:
print(output_dict)

In [None]:
output_dict["train_loss"]