In [1]:
import time
import numpy as np
from __future__ import print_function

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence

In [4]:
import numpy as np
import codecs
import nltk

In [5]:
from corpus import ParallelCorpus
from torch.utils.data import Dataset, DataLoader

In [7]:
# --- Hyperparameters ---------------------------------------------------------

# Data
max_allowed_sentence_len = 50   # forwarded to ParallelCorpus as max_sentence_length
batch_size = 64                 # sentence pairs per minibatch

# Model
embedding_dim = 512             # NOTE: used as the GRU hidden size when the model is built

# Optimisation
num_epochs = 1
learning_rate = 4e-4            # step size given to Adam

# Legacy autograd flag handed to every Variable(...) wrapper in the training cell
volatile = False

In [9]:
# TODO: adjust these paths so they point at the BPE-encoded data on this machine.

# Training corpus; it is also handed to the two held-out corpora below.
training_set = ParallelCorpus(
    source_path="./data/train/train_bpe.fr",
    target_path="./data/train/train_bpe.en",
    max_sentence_length=max_allowed_sentence_len,
)

# Validation and test corpora are built with `use_indices_from=training_set`
# (presumably so that they share the training vocabulary -- confirm in corpus.py).
validation_set = ParallelCorpus(
    source_path="./data/val/val_bpe.fr",
    target_path="./data/val/val_bpe.en",
    max_sentence_length=max_allowed_sentence_len,
    use_indices_from=training_set,
)

test_set = ParallelCorpus(
    source_path="./data/test/test_2017_flickr_bpe.fr",
    target_path="./data/test/test_2017_flickr_bpe.en",
    max_sentence_length=max_allowed_sentence_len,
    use_indices_from=training_set,
)

# Sanity check: size of the target word->index map vs. the reported vocabulary size.
print(len(training_set.target_w2i), training_set.target_vocab_size, sep="\n")

1137
1136


In [10]:
# Sanity check: size of the target word->index map vs. the reported vocabulary size.
print(len(training_set.target_w2i), training_set.target_vocab_size, sep="\n")

1137
1136


In [11]:
class Seq2Seq(nn.Module):
    """A vanilla sequence-to-sequence (Seq2Seq) model built from GRUs.

    The source sentence is embedded and encoded by an (optionally
    bidirectional) GRU. The encoder's final hidden state initialises a GRU
    decoder that is run over the target input tokens, and every decoder state
    is projected to logits over the target vocabulary.

    Ref: Sequence to Sequence Learning with Neural Networks
    https://arxiv.org/abs/1409.3215 (the paper uses LSTMs; this class uses GRUs).
    """

    def __init__(
        self, src_emb_dim, trg_emb_dim, src_vocab_size,
        trg_vocab_size, src_hidden_dim, trg_hidden_dim,
        pad_token_src, pad_token_trg, bidirectional=False,
        nlayers_src=1, nlayers_trg=1
    ):
        """Initialize the Seq2Seq model.

        Args:
            src_emb_dim: width of the source word embeddings.
            trg_emb_dim: width of the target word embeddings.
            src_vocab_size: number of rows in the source embedding table.
            trg_vocab_size: number of rows in the target embedding table and
                number of output classes of the projection layer.
            src_hidden_dim: size of the encoder state handed to the decoder.
                With ``bidirectional=True`` each direction gets half of it.
            trg_hidden_dim: hidden size of the decoder GRU. The encoder state
                is used directly as the decoder's initial state, so this has
                to equal ``src_hidden_dim`` for ``forward`` to succeed.
            pad_token_src: index of the padding token in the source vocabulary.
            pad_token_trg: index of the padding token in the target vocabulary.
            bidirectional: whether the encoder reads the source in both
                directions.
            nlayers_src: number of stacked encoder GRU layers.
            nlayers_trg: number of stacked decoder GRU layers.

        Raises:
            ValueError: if ``bidirectional`` is set and ``src_hidden_dim`` is
                odd, since it could not be split evenly across two directions.
        """
        super(Seq2Seq, self).__init__()
        if bidirectional and src_hidden_dim % 2 != 0:
            raise ValueError(
                "src_hidden_dim must be even for a bidirectional encoder, got %d"
                % src_hidden_dim
            )

        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.src_emb_dim = src_emb_dim
        self.trg_emb_dim = trg_emb_dim
        self.src_hidden_dim = src_hidden_dim
        self.trg_hidden_dim = trg_hidden_dim
        self.bidirectional = bidirectional
        self.nlayers_src = nlayers_src
        self.nlayers_trg = nlayers_trg
        self.pad_token_src = pad_token_src
        self.pad_token_trg = pad_token_trg

        # Word embedding look-up table for the source language
        self.src_embedding = nn.Embedding(
            self.src_vocab_size,
            self.src_emb_dim,
            padding_idx=self.pad_token_src,
        )

        # Word embedding look-up table for the target language
        self.trg_embedding = nn.Embedding(
            self.trg_vocab_size,
            self.trg_emb_dim,
            padding_idx=self.pad_token_trg,
        )

        # Encoder GRU. Its input is always the full source embedding; when the
        # encoder is bidirectional, the *hidden* size is halved per direction
        # so that the concatenated final state is src_hidden_dim wide.
        encoder_hidden_dim = (
            self.src_hidden_dim // 2 if self.bidirectional else self.src_hidden_dim
        )
        self.encoder = nn.GRU(
            self.src_emb_dim,
            encoder_hidden_dim,
            self.nlayers_src,
            bidirectional=bidirectional,
            batch_first=True,
        )

        # Decoder GRU
        self.decoder = nn.GRU(
            self.trg_emb_dim,
            self.trg_hidden_dim,
            self.nlayers_trg,
            batch_first=True
        )

        # Projection layer from decoder hidden states to target language vocabulary
        self.decoder2vocab = nn.Linear(trg_hidden_dim, trg_vocab_size)

    def forward(self, input_src, input_trg, src_lengths):
        """Compute target-vocabulary logits for every target position.

        Args:
            input_src: (batch, src_len) tensor of source token indices.
            input_trg: (batch, trg_len) tensor of target input token indices.
            src_lengths: true length of each source sentence. The batch has to
                be ordered by decreasing source length, as required by
                ``pack_padded_sequence``.

        Returns:
            Tensor of shape (batch, trg_len, trg_vocab_size) holding
            unnormalised scores (logits).
        """
        # Look up word embeddings for the source and target minibatch
        src_emb = self.src_embedding(input_src)
        trg_emb = self.trg_embedding(input_trg)

        # Pack the padded source so the encoder ignores padding positions
        packed_src = pack_padded_sequence(src_emb, src_lengths, batch_first=True)

        # Only the final hidden states of the encoder are needed
        _, src_h_t = self.encoder(packed_src)

        # Last layer's final state; both directions are concatenated if present
        if self.bidirectional:
            h_t = torch.cat((src_h_t[-1], src_h_t[-2]), 1)
        else:
            h_t = src_h_t[-1]

        # Every decoder layer starts from the same encoder state. expand()
        # yields a non-contiguous view when nlayers_trg > 1, so materialise it
        # before handing it to the RNN.
        decoder_init = h_t.unsqueeze(0).expand(
            self.nlayers_trg, h_t.size(0), h_t.size(1)
        ).contiguous()
        trg_h, _ = self.decoder(trg_emb, decoder_init)

        # nn.Linear acts on the last dimension, so (batch, time, hidden) maps
        # straight to (batch, time, vocab) without manual reshaping.
        return self.decoder2vocab(trg_h)

    def decode(self, decoder2vocab):
        """Turn logits into a probability distribution over the vocabulary.

        Args:
            decoder2vocab: logits as returned by ``forward``; the vocabulary
                is the last dimension.

        Returns:
            Tensor of the same shape whose last dimension sums to one.
        """
        return F.softmax(decoder2vocab, dim=-1)

In [12]:
# def get_parallel_minibatch(lines, src_word2id, trg_word2id, index, batch_size, volatile=False):
        
#         # Get source sentences for this minibatch
#         src_lines = [
#             ['<s>'] + list(line[1]) + ['</s>']
#             for line in lines[index: index + batch_size]
#         ]

#         # Get target sentences for this minibatch
#         trg_lines = [
#             ['<s>'] + line[0].split() + ['</s>']
#             for line in lines[index: index + batch_size]
#         ]
        
#         # Sort source sentences by length for length masking in RNNs
#         src_lens = [len(line) for line in src_lines]
#         sorted_indices = np.argsort(src_lens)[::-1]
        
#         # Reorder sentences based on source lengths
#         sorted_src_lines = [src_lines[idx] for idx in sorted_indices]
#         sorted_trg_lines = [trg_lines[idx] for idx in sorted_indices]
        
#         # Compute new sentence lengths
#         sorted_src_lens = [len(line) for line in sorted_src_lines]
#         sorted_trg_lens = [len(line) for line in sorted_trg_lines]
        
#         # Get max source and target lengths to pad input and output sequences
#         max_src_len = max(sorted_src_lens)
#         max_trg_len = max(sorted_trg_lens)
        
#         # Construct padded source input sequence
#         input_lines_src = [
#             [src_word2id[w] if w in src_word2id else src_word2id['<unk>'] for w in line] +
#             [src_word2id['<pad>']] * (max_src_len - len(line))
#             for line in sorted_src_lines
#         ]

#         # Construct padded target input sequence
#         input_lines_trg = [
#             [trg_word2id[w] if w in trg_word2id else trg_word2id['<unk>'] for w in line[:-1]] +
#             [trg_word2id['<pad>']] * (max_trg_len - len(line))
#             for line in sorted_trg_lines
#         ]

#         # Construct padded target output sequence (Note: Output sequence is just the input shifted by 1 position)
#         # This is for teacher-forcing
#         output_lines_trg = [
#             [trg_word2id[w] if w in trg_word2id else trg_word2id['<unk>'] for w in line[1:]] +
#             [trg_word2id['<pad>']] * (max_trg_len - len(line))
#             for line in sorted_trg_lines
#         ]

#         input_lines_src = Variable(torch.LongTensor(input_lines_src), volatile=volatile)
#         input_lines_trg = Variable(torch.LongTensor(input_lines_trg), volatile=volatile)
#         output_lines_trg = Variable(torch.LongTensor(output_lines_trg), volatile=volatile)

#         return {
#             'input_src': input_lines_src,
#             'input_trg': input_lines_trg,
#             'output_trg': output_lines_trg,
#             'src_lens': sorted_src_lens
#         }

In [13]:
# Sanity check: size of the target word->index map vs. the reported vocabulary size.
print(len(training_set.target_w2i), training_set.target_vocab_size, sep="\n")

1137
1136


In [14]:
# Detect once whether CUDA is usable; later cells branch on this flag to move
# the model, the loss weights and the batches to the GPU.
cuda_available = torch.cuda.is_available()

In [15]:
# One batched loader per split, all with the same batch size and the
# DataLoader's default sequential order (no shuffling).
# NOTE(review): Seq2Seq.forward packs the source batch by length, which its own
# comment says needs length-sorted input -- confirm how the corpus is ordered
# before enabling shuffling here.
training_loader, validation_loader, test_loader = (
    DataLoader(split, batch_size=batch_size)
    for split in (training_set, validation_set, test_set)
)

In [16]:
# Sanity check: size of the target word->index map vs. the reported vocabulary size.
print(len(training_set.target_w2i), training_set.target_vocab_size, sep="\n")

1137
1136


In [17]:
# Build the translation model. Both embedding tables are 128-dimensional, while
# the encoder and decoder GRUs use `embedding_dim` as their hidden size.
seq2seq = Seq2Seq(
    src_vocab_size=training_set.source_vocab_size,
    trg_vocab_size=training_set.target_vocab_size,
    src_emb_dim=128,
    trg_emb_dim=128,
    src_hidden_dim=embedding_dim,
    trg_hidden_dim=embedding_dim,
    pad_token_src=training_set.source_pad,
    pad_token_trg=training_set.target_pad,
)

# Keep the model on the GPU whenever one is available.
seq2seq = seq2seq.cuda() if cuda_available else seq2seq

In [18]:
# Adam over every trainable parameter of the model.
optimizer = optim.Adam(seq2seq.parameters(), lr=learning_rate)

# Per-class loss weights. CrossEntropyLoss needs exactly one weight per output
# class, so the length is taken from the model's own output size instead of
# being derived from the word->index map with a manual offset.
weight_mask = torch.ones(seq2seq.trg_vocab_size)
# A zero weight on the padding class keeps padded positions out of the loss.
weight_mask[training_set.target_pad] = 0
if cuda_available:
    weight_mask = weight_mask.cuda()
loss_criterion = nn.CrossEntropyLoss(weight=weight_mask)

In [19]:
#from corpus import ParallelCorpus

# Sanity check: size of the target word->index map vs. the reported vocabulary size.
print(len(training_set.target_w2i), training_set.target_vocab_size, sep="\n")

1137
1136


In [20]:
def _prepare_batch(source_batch, target_batch):
    """Wrap one minibatch and place it on the same device as the model.

    Returns a tuple ``(source, decoder_input, decoder_target)`` where
    ``decoder_input`` is the target batch without its last column and
    ``decoder_target`` is the target batch without its first column, i.e. the
    decoder is asked to predict the next token at every position.
    """
    source = Variable(source_batch, volatile=volatile)
    decoder_target = Variable(target_batch[:, 1:], volatile=volatile)
    decoder_input = Variable(target_batch[:, :-1], volatile=volatile)

    if cuda_available:
        # .cuda() returns a copy instead of moving in place, so the result
        # has to be rebound; all three tensors must follow the model.
        source = source.cuda()
        decoder_input = decoder_input.cuda()
        decoder_target = decoder_target.cuda()

    return source, decoder_input, decoder_target


def _batch_loss(decoder_out, output_batch):
    """Cross-entropy over all (batch, time) positions of one minibatch."""
    return loss_criterion(
        decoder_out.contiguous().view(-1, decoder_out.size(2)),
        output_batch.contiguous().view(-1)
    )


# NOTE: the loops stay at module level (rather than inside a function) so that
# `decoder_out` and `output_batch` of the last processed batch remain available
# to the inspection cell that follows.
for epoch in range(0, num_epochs):
    # ---- training pass ------------------------------------------------------
    seq2seq.train()
    losses = []
    for source_batch, target_batch, source_lengths, target_lengths, batch_positions in training_loader:
        source_batch, target_batch, output_batch = _prepare_batch(source_batch, target_batch)

        # source_lengths is deliberately left where the loader put it; it is
        # only used to pack the padded source inside the model.
        decoder_out = seq2seq(
            input_src=source_batch, input_trg=target_batch, src_lengths=source_lengths
        )
        loss = _batch_loss(decoder_out, output_batch)

        optimizer.zero_grad()
        loss.backward()
        # Gradient clipping to avoid exploding gradients
        torch.nn.utils.clip_grad_norm_(seq2seq.parameters(), 5.)
        optimizer.step()
        losses.append(loss.item())

    # ---- evaluation passes (no parameter updates, no autograd graph) --------
    seq2seq.eval()
    with torch.no_grad():
        dev_nll = []
        for source_batch, target_batch, source_lengths, target_lengths, batch_positions in validation_loader:
            source_batch, target_batch, output_batch = _prepare_batch(source_batch, target_batch)
            decoder_out = seq2seq(
                input_src=source_batch, input_trg=target_batch, src_lengths=source_lengths
            )
            dev_nll.append(_batch_loss(decoder_out, output_batch).item())

        test_nll = []
        for source_batch, target_batch, source_lengths, target_lengths, batch_positions in test_loader:
            source_batch, target_batch, output_batch = _prepare_batch(source_batch, target_batch)
            decoder_out = seq2seq(
                input_src=source_batch, input_trg=target_batch, src_lengths=source_lengths
            )
            test_nll.append(_batch_loss(decoder_out, output_batch).item())

    print('Epoch : %d Training Loss : %.3f' % (epoch, np.mean(losses)))
    print('Epoch : %d Dev Loss : %.3f' % (epoch, np.mean(dev_nll)))
    print('Epoch : %d Test Loss : %.3f' % (epoch, np.mean(test_nll)))
    print('-------------------------------------------------------------')



Epoch : 0 Training Loss : 4.154
Epoch : 0 Dev Loss : 4.179
Epoch : 0 Test Loss : 4.340
-------------------------------------------------------------


In [18]:
# Compare the arg-max prediction with the reference for the first sentence of
# the last batch processed by the loop above (a batch from the test loader).
target_i2w = training_set.target_i2w

# max over dim 1 returns (values, indices); only the token indices are needed.
sentence = decoder_out[0].max(1)[1]
test_pred = [target_i2w[word] for word in sentence.cpu().numpy()]
print(test_pred)

test_real = [target_i2w[word] for word in output_batch[0].cpu().numpy()]
print(test_real)

['a', 'of', 'people', 'waiting', 'for', 'the', 'train', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.']
['group', 'of', 'people', 'waiting', 'for', 'the', 'subway', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [21]:
# Persist the trained model to disk. The module object itself is passed to
# torch.save, so the whole object is pickled rather than just its parameters.
# NOTE(review): loading such a file presumably needs the Seq2Seq class to be
# importable under the same name; saving seq2seq.state_dict() would be more
# portable, but would change the file format for existing loaders -- confirm
# before switching.
torch.save(seq2seq, "seq2seq_10_alpha_cpu.model")

  "type " + obj.__name__ + ". It won't be checked "
