In [1]:
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import jieba
import unicodedata
import codecs
import string
import re
import random
import time
import math
import numpy as np

import torch as t
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable as V
import torch.utils.data as Data
import torch.nn.functional as F
from torchvision import datasets, models
from torchvision import transforms as T
import torch.optim as optim

### 1.Load data

#### indexing words

In [2]:
PAD_token = 0
SOS_token = 1
EOS_token = 2

class Lang:
    
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2count = {}
        self.word2index = {}
        self.index2word = {0: "PAD", 1:"SOS", 2:"EOS"}
        self.n_words = 3  # Count default tokens
    
    
    def index_words(self, sentence, chi=False):
        if chi:
            for word in list(jieba.cut(sentence)):
                self.index_word(word)
        else:        
            for word in sentence.split(' '):
                self.index_word(word)
    
    def index_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
    
    # Remove words below a certain count threshold
    def trim(self, min_count):
        if self.trimmed: 
            return 
        # self.trimmed = True
        
        keep_words = []
        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)
        
        print("keep_words %d / %d = %.4f" % \
              (len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index) ))
        
        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "PAD", 1: "SOS", 2: "EOS"}
        self.n_words = 3       # Count default tokens
        for word in keep_words:
            self.index_word(word)     

#### Reading and decoding files

In [3]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([,.!?])", r" \1 ", s)
    s = re.sub(r"[^a-zA-Z,.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

In [4]:
def read_langs(lang1, lang2, reverse=False):
    print("Reading lines...")
    
    # Read the file and split into lines
    filename = './data/eng-chi/%s-%s.txt' % (lang1, lang2)
    # lines = codecs.open(filename, 'r', 'utf-8').read().strip().split('\n')
    with open(filename, 'r') as f:
        lines = f.readlines()
    
    # Split every line into pairs and normalize
    # pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]
    pairs = [[s for s in l.split('\t')] for l in lines]
    
    
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
    
    return input_lang, output_lang, pairs  

In [5]:
input_lang, output_lang, pairs = read_langs('eng', 'chi')

Reading lines...


In [6]:
MIN_LENGTH = 5
MAX_LENGTH = 15

def filter_pairs(pairs):
    filtered_pairs = []
    for pair in pairs:
        if len(pair[0].split(' ')) >= MIN_LENGTH and len(pair[0].split(' ')) <= MAX_LENGTH \
         and len(list(jieba.cut(pair[1]))) >= MIN_LENGTH and len(list(jieba.cut(pair[1]))) <= MAX_LENGTH:
            filtered_pairs.append(pair)
    return filtered_pairs

In [7]:
def prepare_data(lang1_name, lang2_name, reverse=False):
    input_lang, output_lang, pairs = read_langs(lang1_name, lang2_name, reverse)
    print("Read %d sentence pairs" % len(pairs))
    
    pairs = filter_pairs(pairs)
    print("Filtered to %d pairs" % len(pairs))
    
    print("Indexing words...")
    for pair in pairs:
        input_lang.index_words(pair[0], chi=False)
        output_lang.index_words(pair[1], chi=True)
    print("Indexed %d words in input language, %d words in output" % 
          (input_lang.n_words, output_lang.n_words))
    return input_lang, output_lang, pairs
  

In [8]:
input_lang, output_lang, pairs = prepare_data('eng', 'chi')

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.287 seconds.


Reading lines...
Read 19777 sentence pairs


Prefix dict has been built succesfully.


Filtered to 15687 pairs
Indexing words...
Indexed 9318 words in input language, 12208 words in output


#### Filter words 

If don't filter words, there are 3226 words in input language, 3438 words in output language

In [9]:
MIN_COUNT= 2    # the word at least occurs times

input_lang.trim(MIN_COUNT)
output_lang.trim(MIN_COUNT)

keep_words 4831 / 9315 = 0.5186
keep_words 5222 / 12205 = 0.4279


In [10]:
print("input language words: ", input_lang.n_words)
print("output language words: ", output_lang.n_words)

input language words:  4834
output language words:  5225


#### Filtering pairs

Now we will go back to the set of all sentence pairs and remove those with unknown words.

In [11]:
# print("pairs length: ", len(pairs))
keep_pairs = []

for pair in pairs:
    input_sentence = pair[0]
    output_sentence = pair[1]
    
    keep_input = True
    keep_output = True
    
    for word in input_sentence.split(' '):
        if word not in input_lang.word2index:
            keep_input = False
            break


    for word in list(jieba.cut(output_sentence)):
        if word not in output_lang.word2index:
            keep_output = False
            break
    
    if keep_input and keep_output:
        keep_pairs.append(pair)
print("Trimmed from %d pairs to %d, %.4f of total" % \
      (len(pairs), len(keep_pairs), len(keep_pairs) / len(pairs)))

Trimmed from 15687 pairs to 8942, 0.5700 of total


In [12]:
pair = random.choice(keep_pairs)
print("original input(English): ", pair[0])
print("translate output(Chinese): ", pair[1])

original input(English):  I have to go to sleep.
translate output(Chinese):  我该去睡觉了。



#### Turning training data into Tensors

To train we need to turn the sentences into something the neural network can understand, which of course means numbers. Each sentence will be split into words and turned into a LongTensor which represents the index (from the Lang indexes made earlier) of each word.While creating these tensors we will also append the EOS token to signal that the sentence is over.

In [13]:
# Return a list of indexes, one for each word in the sentence, plus EOS
def indexes_from_sentence(lang, sentence, chi=False):
    if chi:
        return [lang.word2index[word] for word in list(jieba.cut(sentence))] + [EOS_token]
    else:
        return [lang.word2index[word] for word in sentence.split(' ')] + [EOS_token]


We can make better use of the GPU by training on batches of many sequences at once, but doing so brings up the question of how to deal with sequences of varying lengths. The simple solution is to 'pad' the shorter sentences with some padding symbol(in this case 0), and ignore these padded spots when calculating the loss.

In [14]:
# Pad a with the PAD symbol
def pad_seq(seq, max_length):
    seq += [PAD_token for i in range(max_length - len(seq))]
    return seq

* * *
To create a Variable for a full batch of inputs(and targets) we get a random sample of sequences and pad them all to the length of the longest sequence.We'll keep track of the lengths of each batch in order to un-pad later.

Initializing a LongTensor with an array (batches) of arrays (sequences) gives us a (batch_size \* max_len) tensor - selecting the first dimension gives you a simple batch, which is a full sequence. When training the model we'll want a simple time step at once, so we'll transpose (max_len \* batch_size). Now selecting along the first dimension returns a single time step across batches.
* * *

In [15]:
print("input lang: ", input_lang)
print("output lang: ", output_lang)

input lang:  <__main__.Lang instance at 0x7f52e1f2d440>
output lang:  <__main__.Lang instance at 0x7f52e1f2d4d0>


In [16]:
def random_batch(batch_size):
    input_seqs = []
    target_seqs = []
    
    # Choose random pairs
    for i in range(batch_size):
        pair = random.choice(keep_pairs)
        input_seqs.append(indexes_from_sentence(input_lang, pair[0], chi=False))
        target_seqs.append(indexes_from_sentence(output_lang, pair[1], chi=True))
    
    # Zip into pairs, sort by length (descending), unzip
    seq_pairs = sorted(zip(input_seqs, target_seqs), key=lambda p: len(p[0]), reverse=True)
    input_seqs, target_seqs = zip(*seq_pairs)
    
    # For input and target sequences, get array of lengths and pad with 0s to max length
    input_lengths = map(len, input_seqs)
    input_max_length = max(input_lengths)
    input_padded = [pad_seq(s, input_max_length) for s in input_seqs]
    
    
    target_lengths = map(len, target_seqs)
    target_max_length = max(target_lengths)
    target_padded = [pad_seq(s, target_max_length) for s in target_seqs]
    
    # Turn padded arrays into (batch_size * max_len) tensors, transpose into (max_len * batch_size)
    input_var = V(t.LongTensor(input_padded)).transpose(0, 1)
    target_var = V(t.LongTensor(target_padded)).transpose(0, 1)
    
    # if t.cuda.is_available():
    #     input_var = input_var.cuda()
    #     target_var = target_var.cuda()
    return input_var, input_lengths, target_var, target_lengths
      
input_var, input_lengths, target_var, target_lengths = random_batch(3)

In [17]:
print("input variable: ", input_var)
print("input lengths: ", input_lengths)
print("target variable: ", target_var)
print("target lengths: ", target_lengths)

input variable:  Variable containing:
 2376  1804  3902
 4243  3063  2612
 3308   654   943
 2805   625   654
 3606   615  1418
 3663  1762  4223
  424     2     2
    2     0     0
[torch.LongTensor of size 8x3]

input lengths:  [8, 7, 7]
target variable:  Variable containing:
 4882  3768  3284
 3072  3868  4467
  492   500   825
 2010  3927  5080
 2779  2962  4683
 4911  4673  4673
 2553  3263  3263
 4673     2     2
 3263     0     0
    2     0     0
[torch.LongTensor of size 10x3]

target lengths:  [10, 8, 8]


## Building the models

#### The Encoder
 * * * 
The encoder will take a batch of word sequences, a LongTensor of size (max_len \* batch_size), and output an encoding for each word, a FloatTensor of size (max_len \* batch_size \* hidden_size)

The word inputs are fed through an embedding layer nn.Embedding to create an embedding for each word, with size seq_len \* hidden_size(as if it was a batch of words).This is resized to seq_len \* 1 \* hidden_size to fit the expected input of the GRU layer(nn.GRU).The GRU will return both an output sequence of size seq_len \* hidden_size

* * *

In [18]:
class EncoderRNN(nn.Module):
    
    def __init__(self, input_size, hidden_size, batch_size, 
                 num_layers=1, dropout=0.1, bidirectional=True):
        super(EncoderRNN, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.bidirectional = bidirectional
        
        self.embedding = nn.Embedding(num_embeddings=input_size, 
                                      embedding_dim=hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, 
                          hidden_size=hidden_size, 
                          num_layers=num_layers, 
                          dropout=dropout, 
                          bidirectional=bidirectional)
    
    def forward(self, input_seqs, input_lengths, hidden=None):
        # Note: we run this all at once (over multiple batches of multiple sequences)
        print("input sequence size: ", input_seqs.size())
        embedded = self.embedding(input_seqs)
        print("embedded size: ", embedded.size())
        
        packed = pack_padded_sequence(embedded, input_lengths) 
        print("packed data size: ", packed.data.size())
        print("packed batch_sizes: ", packed.batch_sizes)
        outputs, hidden = self.gru(packed, hidden)
        
        outputs, output_lengths = pad_packed_sequence(outputs)  # unpack (back to padded)
        print("outputs size: ", outputs.size())
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]   
        return outputs, hidden
    
    def init_hidden(self):
        return V(t.zeros(self.num_layers * (self.bidirectional + 1), self.batch_size, self.hidden_size))
        

In [19]:
INPUT_SIZE = input_lang.n_words
HIDDEN_SIZE = 23
BATCH_SIZE = 3
# print("input size: ", INPUT_SIZE)
# print("hidden size: ", HIDDEN_SIZE)

encoder = EncoderRNN(input_size = INPUT_SIZE, 
                     hidden_size = HIDDEN_SIZE,
                     batch_size = BATCH_SIZE,
                     num_layers = 1,
                     dropout = 0.1
                     )
print("encoder: ", encoder)


encoder:  EncoderRNN(
  (embedding): Embedding(4834, 23)
  (gru): GRU(23, 23, dropout=0.1, bidirectional=True)
)


In [20]:
# print("input variable: ", input_var)
print("input_lengths: ", input_lengths)

encoder_hidden = encoder.init_hidden()
# print("encoder hidden: ", encoder_hidden)

outputs, hidden = encoder(input_var, input_lengths, encoder_hidden)

# print("outputs size: ", outputs.size())
# print("hidden size: ", hidden.size())

input_lengths:  [8, 7, 7]
input sequence size:  torch.Size([8, 3])
embedded size:  torch.Size([8, 3, 23])
packed data size:  torch.Size([22, 23])
packed batch_sizes:  [3, 3, 3, 3, 3, 3, 3, 1]
outputs size:  torch.Size([8, 3, 46])


#### Attention Decoder

##### Interpreting the Bahdanau et al.model

Each decoder output is conditioned on the previous outputs and some x, where x consists of the current hidden state (which takes into account previous outputs) and the attention 
"context", which is calculated below.The function g is fully-connected layer with a nonlinear activation, which takes as input the values $y_{i-1}$, $s_i$, and $c_i$ concatenated.

$$p(y_i|\{y_1, ..., y_{i-1}\}, x) = g(y_{i-1}, s_i, c_i)$$
In the code, the RNN will be a nn.GRU layer, the hidden state $s_i$ will be called *hidden*, the output $y_i$ called *output*, and context $c_i$ called context.

$$s_i = f(s_{i-1}, y_{i-1}, c_i)$$

The context vector $c_i$ is a weighted sum of all encoder outputs, where each weight $\alpha_{ij}$ is the the amount of "attention" paid to the corresponding encoder output $h_j$.

$$c_i = \sum_{j=1}^{T_x}\alpha_{ij}h_j$$

...where each weight $\alpha_{ij}$ is a normalized (over all steps) attention "energy"$e_{ij}$...

$$\alpha_{ij}=\frac{exp(e_{ij})}{\sum_{k=1}^{T}exp(e_{ik})}$$

...where each attention energy is calculated with some function $\alpha$(such as another linear layer) using the last hidden state $s_{i-1}$ and that particular encoder output $h_j$

$$e_{ij} = \alpha(s_{i-1},h_j)$$


##### Interpreting the Luong et al. models

A few more attention models that offer improvements and simplifications.They describe a few "global attention" models, the distinction between them being the way the attention scores are calculated.

The general form of the attention calculation relies on the target(decoder) side hidden state and corresponding source(encoder) side state, normalized over all states to get values summing to 1:

$$\alpha_t(s)=align(h_t, \bar{h_s}) = \frac{exp(score(h_t, \bar{h_s}))}{\sum_{s'}exp(score(h_t, \bar{h_{s'}}))}$$

The specific "score" function that compares two states is either *dot*, a simple dot product between the states; general, a dot product between the decoder hidden state and a linear transform of the encoder state; or concat, a dot product between a new parameter $v_{\alpha}$ and a linear transform of the states concatenated together.

$$\begin{equation}
score(h_t, \bar{h_s}) = \left\{
\begin{aligned}
h_t^T\bar{h_s} & & dot    \\
h_t^TW_{\alpha}\bar{h_s} & & general    \\
v_{\alpha}^TW_{\alpha}[h_t;\bar{h_s}] & & concat   \\
\end{aligned}
\right.
\end{equation}$$

The modular definition of these scoring functions gives us an opportunity to build specific attention module that can switch between the different score methods.
The input to this module is always the hidden state (of the decoder RNN) and set of encoder outputs.

#### Implementing an attention module

In [21]:
class Attn(nn.Module):
    
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        
        self.method = method
        self.hidden_size = hidden_size
        
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, self.hidden_size)
        if self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, self.hidden_size)
            self.v = nn.Parameter(t.FloatTensor(1, self.hidden_size))
    
    def forward(self, hidden, encoder_outputs):
        
        # hidden---> size: []
        # encoder_outputs---> size: [10, 3, 46]([seq_len, batch_size, hidden_size])
        max_len = encoder_outputs.size(0)
        this_batch_size = encoder_outputs.size(1)
        
        # Create variable to store attention energies
        attn_energies = V(t.zeros(this_batch_size, max_len))  # B * S
        
        # if t.cuda.is_available():
        #     attn_energies = attn_energies.cuda()
        for b in range(this_batch_size):
            # Calculate energy for each encoder output
            for i in range(max_len):            #              [1 * hidden_size]
                attn_energies[b, i] = self.score(hidden[:, b], encoder_outputs[i, b].unsqueeze(0))
        
        # Normalize energies to weights in range 0 to 1, resize to 1 * B * S
        return F.softmax(attn_energies).unsqueeze(1)

    def score(self, hidden, encoder_output):
        
        if self.method == 'dot':
            energy = hidden.dot(encoder_output)
            return energy
        elif self.method == 'general':
            energy = self.attn(encoder_output)
            energy = hidden.dot(energy)
            return energy
        elif self.method == 'concat':
            energy = self.attn(t.cat((hidden, encoder_output), 1))
            energy = self.v.dot(energy)
            return energy  

#### Implementing the Bahdanau et al. model

In summary our decoder should consist of four main parts - an embedding layer turning an input word into vector; a layer to calculate the attention energy per encoder output; a RNN layer; and an output layer.

The decoder's inputs are the last RNN hidden state $s_{i-1}$, last output $y_{i-1}$, and all encoder outputs $h$

- embedding layer with inputs $y_{i-1}$
  - embedded = embedding(last_rnn_output)

- attention layer $\alpha$ with inputs ($s_{i-1}$, $h_j$) and outputs $e_{ij}$, normalized to create $\alpha_{ij}$
  - attn_energies[j] = attn_layer(last_hidden, encoder_outputs[j])
  - attn_weights = normalize(attn_energies)

- context vector $c_i$ as an attention-weighted average of encoder outputs
  - context = sum(attn_weights * encoder_outputs)

- RNN layer(s) f with inputs ($s_{i-1}$, $y_{i-1}$, $c_i$) and internal hidden state, outputting $s_i$
  - rnn_input = concat(embedded, context)
  - rnn_output, rnn_hidden = rnn(rnn_input, last_hidden)

- an output layer g with inputs ($y_{i-1}$, $s_i$, $c_i$), outputting $y_i$
  - output = out(embedded, rnn_output, context)


In [22]:
class BahdanauAttnDecoderRNN(nn.Module):
    
    def __init__(self, hidden_size, output_size, num_layers=1, dropout_p=0.1):
        super(BahdanauAttnDecoderRNN, self).__init__()
        
        # Define parameters
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.dropout_p = dropout_p
        self.max_length = max_length
        
        # Define layers
        self.embedding = nn.Embedding(num_embeddings=output_size, 
                                      embedding_dim=hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.attn = Attn(method='concat', 
                         hidden_size=hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, 
                          hidden_size=hidden_size,
                          num_layers=num_layers,
                          dropout=dropout_p)
        self.out = nn.Linear(hidden_size, output_size)
        
    def forward(self, word_input, last_hidden, encoder_outputs):
        # Note: we run this one step at a time
        # TODO: FIX BATCHING
        
        # Get the embedding of the current input word (last output word)
        word_embedded = self.embedding(word_input).view(1, 1, -1)  # S = 1 * B * N
        word_embedded = self.dropout(word_embedded)
        
        # Calculate attention weigths and apply to encoder outputs
        attn_weights = self.attn(last_hidden[-1], encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B * 1 * N
        context = context.transpose(0, 1)   # 1 * B * N
        
        # Combine embedded input word and attended context, run through RNN
        rnn_input = t.cat((word_embedded, context), 2)
        output, hidden = self.gru(rnn_input, last_hidden)
        
        # Final output layer
        output = output.squeeze(0)
        output = F.log_softmax(self.out(t.cat((output, context), 1)))
        
        # Return final output, hidden state, and attention weights (for visualization)
        return output, hidden, attn_weights
    

Now we can build a decoder that plugs this Attn module in after the RNN to calculate attention weight, and apply those weights to the encoder output to get a context vector.

In [23]:
class LuongAttnDecoderRNN(nn.Module):
    
    def __init__(self, attn_model, hidden_size, output_size, num_layers=1, dropout_p=0.1):
        super(LuongAttnDecoderRNN, self).__init__()
        
        # Define parameters
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.dropout_p = dropout_p
        
        # Define layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.embedding_dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers, dropout_p)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        
        # Choose attention model
        if attn_model != None:
            self.attn = Attn(attn_model, hidden_size)
    
    def forward(self, input_seq, last_hidden, encoder_outputs):
        # Note: we run this one step at a time
        
        # Get the embedding of the current input word (last output word)
        batch_size = input_seq.size(0)
        embedded = self.embedding(input_seq)
        embedded = self.embedding_dropout(embedded)
        embedded = embedded.view(1, batch_size, self.hidden_size)    # S = 1 * B * N
            
        # Get current hidden state from input word and last hidden state
        rnn_output, hidden = self.gru(embedded, last_hidden)
        
        # Calculate attention from current RNN state and all encoder outputs:
        # apply to encoder outputs to get weighted average
        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))  # B * (S=1) * N
        
        # Attention vector using the RNN hidden state and context vector
        # concatenated together (Luong eq. 5)
        rnn_output = rnn_output.squeeze(0)     # (S=1) * B * N  -----> B * N
        context = context.squeeze(1)           # B * (S=1) * N  -----> B * N
        concat_input = t.cat((rnn_output, context), 1)
        
        concat_output = F.tanh(self.concat(concat_input))
        
        # Finally predict next token （Luong eq. 6, without softmax)
        output = self.out(concat_output)
        
        # Return final output, hidden state, and attention weights (for visualization)
        return output, hidden, attn_weights

#### Testing the models

To make sure the encoder and decoder modules are working (and working together) we'll do a full test with a small batch.

In [24]:
small_batch_size = 3

input_batches, input_lengths, target_batches, target_lengths = random_batch(small_batch_size)

print("input batches: ", input_batches.size())
print("target batches: ", target_batches.size())

input batches:  torch.Size([8, 3])
target batches:  torch.Size([9, 3])


In [30]:
print("input seqs: ", ' '.join([input_lang.index2word[eachnum] \
                                for eachnum in input_batches[:, 1].data.numpy().tolist()]))
print("target seqs: ", ''.join([output_lang.index2word[eachnum] \
                                for eachnum in target_batches[:, 1].data.numpy().tolist()]))

input seqs:  I want to travel with you. EOS PAD
target seqs:  我想和你去旅行。
EOS


Create models with a small size(a good idea for eyeball inspection):

In [33]:
small_hidden_size = 8
small_n_layers = 2

input_size = input_lang.n_words
output_size = output_lang.n_words
print("input size: ", input_size)
print("output size: ", output_size)

encoder_test = EncoderRNN(input_size=input_size, 
                          hidden_size=small_hidden_size, 
                          batch_size=small_batch_size,
                          num_layers=small_n_layers,
                          dropout=0.1,
                          bidirectional=True
                          )
decoder_test = LuongAttnDecoderRNN(attn_model='general', 
                                   hidden_size=small_hidden_size, 
                                   output_size=output_size, 
                                   num_layers=small_n_layers,
                                   dropout_p=0.1)
# if t.cuda.is_available():
#     encoder_test.cuda()
#     decoder_test.cuda()

print("encoder: ", encoder_test)
print("decoder: ", decoder_test)

input size:  4834
output size:  5225
encoder:  EncoderRNN(
  (embedding): Embedding(4834, 8)
  (gru): GRU(8, 8, num_layers=2, dropout=0.1, bidirectional=True)
)
decoder:  LuongAttnDecoderRNN(
  (embedding): Embedding(5225, 8)
  (embedding_dropout): Dropout(p=0.1)
  (gru): GRU(8, 8, num_layers=2, bias=0.1)
  (concat): Linear(in_features=16, out_features=8)
  (out): Linear(in_features=8, out_features=5225)
  (attn): Attn(
    (attn): Linear(in_features=8, out_features=8)
  )
)


To test the encoder, run the input batch through to get per-batch encoder outputs:

In [37]:
encoder_outputs, encoder_hidden = encoder_test(input_batches, input_lengths, hidden=None)
print("***************************************************")
print("input_batches: ", input_batches.size())
print("encoder outputs: ", encoder_outputs.size())   # max_len * batch_size * hidden_size
print("encoder hidden: ", encoder_hidden.size())     # (n_layers * 2) * batch_size * hidden_size

input sequence size:  torch.Size([8, 3])
embedded size:  torch.Size([8, 3, 8])
packed data size:  torch.Size([22, 8])
packed batch_sizes:  [3, 3, 3, 3, 3, 3, 3, 1]
outputs size:  torch.Size([8, 3, 16])
***************************************************
input_batches:  torch.Size([8, 3])
encoder outputs:  torch.Size([8, 3, 8])
encoder hidden:  torch.Size([4, 3, 8])


Then starting with a SOS token, run word tokens through the decoder to get each next word token. Instead of doing this with the whole sequence, it is done one at a time, to support using it's own predictions to make the next predition. This will be one time step at a time, but batched per time step. In order to get this to work for short padded sequences, the batch size is going to get smaller each time.

In [38]:
max_target_length = max(target_lengths)
print("target max length: ", max_target_length)


# Prepare decoder input and outputs
decoder_input = V(t.LongTensor([SOS_token] * small_batch_size))
decoder_hidden = encoder_hidden[:decoder_test.num_layers] 
# Use last (forward) hidden state from encoder
all_decoder_outputs = V(t.zeros(max_target_length, small_batch_size, decoder_test.output_size))

if t.cuda.is_available():
    all_decoder_outputs = all_decoder_outputs.cuda()
    decoder_input = decoder_input.cuda()

# Run through decoder one time step at a time
for t in range(max_target_length):
    decoder_output, decoder_hidden, decoder_attn = \
        decoder_test(decoder_input, decoder_hidden, encoder_outputs)
    
    all_decoder_outputs[t] = decoder_output   # Store this step's outputs
    # Teacher forcing
    decoder_input = target_batches[t]         # Next input is current target

# Test masked cross entropy loss
loss = 

target max length:  9
