In [2]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class VanillaEncoder(nn.Module):

    def __init__(self, vocab_size, embedding_size, output_size):
        """Define layers for a vanilla rnn encoder"""
        super(VanillaEncoder, self).__init__()

        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.gru = nn.GRU(embedding_size, output_size)

    def forward(self, input_seqs, input_lengths, hidden=None):
        embedded = self.embedding(input_seqs)
        packed = pack_padded_sequence(embedded, input_lengths)
        packed_outputs, hidden = self.gru(packed, hidden)
        outputs, output_lengths = pad_packed_sequence(packed_outputs)
        return outputs, hidden

    def forward_a_sentence(self, inputs, hidden=None):
        """Deprecated, forward 'one' sentence at a time which is bad for gpu utilization"""
        embedded = self.embedding(inputs)
        outputs, hidden = self.gru(embedded, hidden)
        return outputs, hidden

In [2]:
class VanillaDecoder(nn.Module):

    def __init__(self, hidden_size, output_size, max_length, teacher_forcing_ratio, sos_id, use_cuda):
        """Define layers for a vanilla rnn decoder"""
        super(VanillaDecoder, self).__init__()

        self.hidden_size = hidden_size
        self.output_size = output_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.log_softmax = nn.LogSoftmax()  # work with NLLLoss = CrossEntropyLoss

        self.max_length = max_length
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.sos_id = sos_id
        self.use_cuda = use_cuda

    def forward_step(self, inputs, hidden):
        # inputs: (time_steps=1, batch_size)
        batch_size = inputs.size(1)
        embedded = self.embedding(inputs)
        embedded.view(1, batch_size, self.hidden_size)  # S = T(1) x B x N
        rnn_output, hidden = self.gru(embedded, hidden)  # S = T(1) x B x H
        rnn_output = rnn_output.squeeze(0)  # squeeze the time dimension
        output = self.log_softmax(self.out(rnn_output))  # S = B x O
        return output, hidden

    def forward(self, context_vector, targets):

        # Prepare variable for decoder on time_step_0
        target_vars, target_lengths = targets
        batch_size = context_vector.size(1)
        decoder_input = Variable(torch.LongTensor([[self.sos_id] * batch_size]))

        # Pass the context vector
        decoder_hidden = context_vector

        max_target_length = max(target_lengths)
        decoder_outputs = Variable(torch.zeros(
            max_target_length,
            batch_size,
            self.output_size
        ))  # (time_steps, batch_size, vocab_size)

        if self.use_cuda:
            decoder_input = decoder_input.cuda()
            decoder_outputs = decoder_outputs.cuda()

        use_teacher_forcing = True if random.random() > self.teacher_forcing_ratio else False

        # Unfold the decoder RNN on the time dimension
        for t in range(max_target_length):
            decoder_outputs_on_t, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs[t] = decoder_outputs_on_t
            if use_teacher_forcing:
                decoder_input = target_vars[t].unsqueeze(0)
            else:
                decoder_input = self._decode_to_index(decoder_outputs_on_t)
            return decoder_outputs, decoder_hidden

In [3]:
class Seq2Seq(nn.Module):

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, inputs, targets):
        input_vars, input_lengths = inputs
        encoder_outputs, encoder_hidden = self.encoder.forward(input_vars, input_lengths)
        decoder_outputs, decoder_hidden = self.decoder.forward(context_vector=encoder_hidden, targets=targets)
        return decoder_outputs, decoder_hidden

In [4]:
import config
class Trainer(object):

    def __init__(self, model, data_transformer, learning_rate, use_cuda,
                 checkpoint_name=config.checkpoint_name,
                 teacher_forcing_ratio=config.teacher_forcing_ratio):

        self.model = model

        # record some information about dataset
        self.data_transformer = data_transformer
        self.vocab_size = self.data_transformer.vocab_size
        self.PAD_ID = self.data_transformer.PAD_ID
        self.use_cuda = use_cuda

        # optimizer setting
        self.learning_rate = learning_rate
        self.optimizer= torch.optim.Adam(self.model.parameters(), lr=learning_rate)
        self.criterion = torch.nn.NLLLoss(ignore_index=self.PAD_ID, size_average=True)

        self.checkpoint_name = checkpoint_name

    def train(self, num_epochs, batch_size, pretrained=False):

        if pretrained:
            self.load_model()

        for epoch in range(0, num_epochs):
            mini_batches = self.data_transformer.mini_batches(batch_size=batch_size)
            for input_batch, target_batch in mini_batches:
                self.optimizer.zero_grad()
                decoder_outputs, decoder_hidden = self.model(input_batch, target_batch)
                # calculate the loss and back prop.
                cur_loss = self.get_loss(decoder_outputs, target_batch[0])
                cur_loss.backward()
                # optimize
                self.optimizer.step()

        self.save_model()

    def get_loss(self, decoder_outputs, targets):
        b = decoder_outputs.size(1)
        t = decoder_outputs.size(0)
        targets = targets.contiguous().view(-1)  # S = (B*T)
        decoder_outputs = decoder_outputs.view(b * t, -1)  # S = (B*T) x V
        return self.criterion(decoder_outputs, targets)

AttributeError: module 'config' has no attribute 'checkpoint_name'

In [27]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

embedding = nn.Embedding(51,3)
input = torch.LongTensor([[9,5,9,1,2],[4,1,2,2,2],[40,40,40,40,1],[4,4,1,2,2]])
embedded=embedding(input)
print(embedded)


leng = torch.tensor([4,2,5,3])
packed = pack_padded_sequence(embedded, leng, batch_first=True, enforce_sorted=False)
print('\n\n')
print(packed)

tensor([[[ 0.6583, -2.0731, -0.9316],
         [ 0.1076,  0.7563, -0.0412],
         [ 0.6583, -2.0731, -0.9316],
         [-1.6494,  1.2978, -0.6735],
         [-0.8738,  0.0227, -1.7554]],

        [[ 0.5711,  0.2388, -0.3003],
         [-1.6494,  1.2978, -0.6735],
         [-0.8738,  0.0227, -1.7554],
         [-0.8738,  0.0227, -1.7554],
         [-0.8738,  0.0227, -1.7554]],

        [[ 0.0391, -0.2575,  0.7341],
         [ 0.0391, -0.2575,  0.7341],
         [ 0.0391, -0.2575,  0.7341],
         [ 0.0391, -0.2575,  0.7341],
         [-1.6494,  1.2978, -0.6735]],

        [[ 0.5711,  0.2388, -0.3003],
         [ 0.5711,  0.2388, -0.3003],
         [-1.6494,  1.2978, -0.6735],
         [-0.8738,  0.0227, -1.7554],
         [-0.8738,  0.0227, -1.7554]]], grad_fn=<EmbeddingBackward>)



PackedSequence(data=tensor([[ 0.0391, -0.2575,  0.7341],
        [ 0.6583, -2.0731, -0.9316],
        [ 0.5711,  0.2388, -0.3003],
        [ 0.5711,  0.2388, -0.3003],
        [ 0.0391, -0.2575,  0.73

In [28]:
gru = nn.GRU(3, 3)

In [31]:
packed_outputs, hidden=gru(packed, None)
print(packed_outputs)
print('---------------')
print(hidden)

(PackedSequence(data=tensor([[ 0.0489,  0.0753, -0.2225],
         [ 0.5938,  0.7414,  0.2691],
         [ 0.1382,  0.1494, -0.0715],
         [ 0.1382,  0.1494, -0.0715],
         [ 0.0879,  0.0883, -0.3015],
         [ 0.3219,  0.5126, -0.1140],
         [ 0.2215,  0.2143, -0.0895],
         [ 0.0871, -0.2598, -0.7272],
         [ 0.1154,  0.0813, -0.3280],
         [ 0.6988,  0.8279,  0.2641],
         [ 0.1423, -0.2186, -0.7265],
         [ 0.1338,  0.0706, -0.3354],
         [ 0.3549,  0.3150, -0.5995],
         [ 0.1121, -0.3513, -0.7876]], grad_fn=<CatBackward>), batch_sizes=tensor([4, 4, 3, 2, 1]), sorted_indices=tensor([2, 0, 3, 1]), unsorted_indices=tensor([1, 3, 0, 2])),
 tensor([[[ 0.3549,  0.3150, -0.5995],
          [ 0.0871, -0.2598, -0.7272],
          [ 0.1121, -0.3513, -0.7876],
          [ 0.1423, -0.2186, -0.7265]]], grad_fn=<IndexSelectBackward>))