We'll see how to get the last hidden states of RNNs in TensorFlow and PyTorch.

In [49]:
__author__ = "kyubyong"
__address__ = "https://github.com/kyubyong/nlp_made_easy"
__email__ = "kbpark.linguist@gmail.com"

In [1]:
import numpy as np
import tensorflow as tf
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [2]:
tf.__version__  # TensorFlow version used to produce the outputs shown in this notebook

'1.5.0'

In [3]:
torch.__version__  # PyTorch version used to produce the outputs shown in this notebook

'1.0.1'

# Tensorflow

### Uni-directional

In [4]:
# Clear the default graph so the variables built below start from a clean slate.
tf.reset_default_graph()

In [5]:
def rnn(x, bidirectional=False, seqlens=None, reuse=False):
    '''
    Run a single-unit GRU over `x` and return its outputs and last hidden state.

    x: input tensor fed to the dynamic RNN
    bidirectional: if True, run a forward and a backward GRU and concatenate
        both their outputs and their final states along the last axis
    seqlens: optional real sequence lengths, forwarded as `sequence_length`
    reuse: forwarded to `tf.variable_scope` so repeated calls can share variables

    returns
    (outputs, last_hidden)
    '''
    if bidirectional:
        with tf.variable_scope("birnn", reuse=reuse):
            fw_cell = tf.contrib.rnn.GRUCell(1)
            bw_cell = tf.contrib.rnn.GRUCell(1)
            step_outputs, final_states = tf.nn.bidirectional_dynamic_rnn(
                fw_cell, bw_cell, x, sequence_length=seqlens, dtype=tf.float32)
            # Both results are (forward, backward) pairs; merge each pair on the feature axis.
            return tf.concat(step_outputs, -1), tf.concat(final_states, -1)

    with tf.variable_scope("rnn", reuse=reuse):
        gru_cell = tf.contrib.rnn.GRUCell(1)
        step_outputs, final_state = tf.nn.dynamic_rnn(
            gru_cell, x, sequence_length=seqlens, dtype=tf.float32)
    return step_outputs, final_state

In [6]:
def onehot(arry, size):
    '''
    One-hot encode an integer array along a new trailing axis.

    arry: integer array (or nested list) of any shape, e.g. 2-d array of n, t
    size: output dimensions (number of classes)

    returns
    float32 array of shape arry.shape + (size,), e.g. 3-d array of (n, t, size).
    Entries outside [0, size) are encoded as an all-zero row.
    '''
    arry = np.asarray(arry)
    # Broadcasting every id against 0..size-1 builds the encoding directly in its
    # final shape, so no in-place `.shape` assignment on a transposed view is needed.
    return (arry[..., np.newaxis] == np.arange(size)).astype('float32')

In [7]:
# One sequence of three token ids, one-hot encoded to shape (1, 3, 4).
_x1 = onehot(np.expand_dims(np.array([1, 2, 3], np.int32), 0), 4)

# The same sequence with a trailing padding id (0 means padding), shape (1, 4, 4).
_x2 = onehot(np.expand_dims(np.array([1, 2, 3, 0], np.int32), 0), 4)

In [8]:
# 1. no padding
x1 = tf.convert_to_tensor(_x1)
outputs1, last_hidden1 = rnn(x1)

In [9]:
# 2. zero padding, no seqlens
x2 = tf.convert_to_tensor(_x2)
outputs2, last_hidden2 = rnn(x2, reuse=True) # We want to sync the variables up to compare the results.

In [10]:
# 3. zero padding with explicit seqlens
outputs3, last_hidden3 = rnn(x2, seqlens=[3,], reuse=True) # The real sequence length is 3 because the trailing 0 is padding.

In [11]:
# Session
init_op = tf.global_variables_initializer()
sess = tf.InteractiveSession()  # installs itself as the default session, which the bare .eval() calls below rely on
sess.run(init_op)

In [12]:
outputs1.eval()  # per-step outputs for the unpadded sequence

array([[[ 0.01561858],
        [ 0.19808015],
        [-0.00724778]]], dtype=float32)

In [13]:
outputs2.eval() # The last (padding) step has a non-zero output. This is not what we usually want.

array([[[ 0.01561858],
        [ 0.19808015],
        [-0.00724778],
        [-0.2128418 ]]], dtype=float32)

In [14]:
outputs3.eval() # The last (padding) step is masked to zeros. This is usually what we want.

array([[[ 0.01561858],
        [ 0.19808015],
        [-0.00724778],
        [ 0.        ]]], dtype=float32)

In [15]:
last_hidden1.eval()  # same value as the last step of outputs1

array([[-0.00724778]], dtype=float32)

In [16]:
last_hidden2.eval()  # taken from the padding step, so it differs from case 1

array([[-0.2128418]], dtype=float32)

In [17]:
last_hidden3.eval() # Now we have the same result as in case 1.

array([[-0.00724778]], dtype=float32)

△ Comment: Padding is mostly added to build mini-batches from multiple samples of variable length. We therefore typically want the same results as if each sample were processed individually, without padding. To that end, whenever you pad, also pass `seqlens`: the outputs at the padded steps are then masked to zeros and the last hidden state is taken from the last real step.

### Bi-directional

In [99]:
# Close the session opened for the uni-directional demo before discarding its graph:
# the TF 1.x docs describe resetting the default graph while a session is still
# active as undefined behaviour, and the open session would otherwise be leaked
# when the next InteractiveSession is created below.
sess.close()
tf.reset_default_graph()

In [100]:
# 1. no padding
x1 = tf.convert_to_tensor(_x1)
outputs1, last_hidden1 = rnn(x1, bidirectional=True)  # first call creates the variables in scope "birnn"

In [101]:
# 2. zero padding, no seqlens
x2 = tf.convert_to_tensor(_x2)
outputs2, last_hidden2 = rnn(x2, bidirectional=True, reuse=True)  # reuse=True shares the variables created in case 1

In [102]:
# 3. zero padding with explicit seqlens
outputs3, last_hidden3 = rnn(x2, bidirectional=True, seqlens=[3,], reuse=True)  # only the first 3 steps are real; the trailing 0 is padding

In [103]:
# Session
init_op = tf.global_variables_initializer()
sess = tf.InteractiveSession()  # installs itself as the default session, which the bare .eval() calls below rely on
sess.run(init_op)

In [104]:
outputs1.eval()  # last axis holds the [forward, backward] outputs for each step

array([[[-0.01149251,  0.16380599],
        [ 0.09828447,  0.043238  ],
        [-0.03214497,  0.04969781]]], dtype=float32)

In [105]:
outputs2.eval() # Again, this is not what we want: the padding step is processed like a real one.

array([[[-0.01149251,  0.15307006],
        [ 0.09828447,  0.02596181],
        [-0.03214497,  0.02305467],
        [-0.10564233, -0.03366997]]], dtype=float32)

In [106]:
outputs3.eval() # Again, note that the last (padding) step is masked to zeros in both directions.

array([[[-0.01149251,  0.16380599],
        [ 0.09828447,  0.043238  ],
        [-0.03214497,  0.04969781],
        [ 0.        ,  0.        ]]], dtype=float32)

In [107]:
last_hidden1.eval()  # [forward final state, backward final state]

array([[-0.03214497,  0.16380599]], dtype=float32)

In [108]:
last_hidden2.eval()  # both states differ from case 1 because the padding step was processed

array([[-0.10564233,  0.15307006]], dtype=float32)

In [109]:
last_hidden3.eval()  # matches case 1 again

array([[-0.03214497,  0.16380599]], dtype=float32)

△ Note that in bidirectional rnns, the last_hidden state of the forward rnn (=-0.03214497) is from the rightmost step in the sequence, while the last hidden state of the backward rnn (=0.16380599) is from the leftmost step.

# PyTorch

## Uni-directional

In [25]:
class Rnn(torch.nn.Module):
    """Single-layer GRU that returns per-step outputs and the last hidden state.

    Args:
        bidirectional: if True, run a forward and a backward GRU; their features
            are concatenated along the last axis.
        input_size: number of features per time step (default 4, the one-hot
            width used in this notebook).
        hidden_size: number of GRU units per direction (default 1).
    """

    def __init__(self, bidirectional=False, input_size=4, hidden_size=1):
        super().__init__()
        self.rnn = nn.GRU(input_size, hidden_size, batch_first=True, bidirectional=bidirectional)

    @staticmethod
    def _flatten_last_hidden(last_hidden):
        """Turn (num_directions, batch, hidden) into (batch, num_directions * hidden)."""
        # transpose + reshape keeps each direction's units together ([forward | backward]),
        # the same layout as the last axis of `outputs`. Unlike `view` on a permuted
        # tensor, it also works when hidden_size > 1.
        last_hidden = last_hidden.transpose(0, 1)
        return last_hidden.reshape(last_hidden.size(0), -1)

    def forward(self, x, seqlens=None):
        """Run the GRU over a batch-first input.

        Args:
            x: float tensor of shape (batch, time, input_size); the GRU is built
                with batch_first=True.
            seqlens: optional real (unpadded) length of every sequence in the
                batch, as a list or 1-D tensor. When given, padded steps are
                skipped: their outputs are zeros and the last hidden state comes
                from the last real step. This plays the role of the
                `sequence_length` argument in TensorFlow.

        Returns:
            outputs: (batch, time, num_directions * hidden_size)
            last_hidden: (batch, num_directions * hidden_size)
        """
        if seqlens is None:
            outputs, last_hidden = self.rnn(x)
            return outputs, self._flatten_last_hidden(last_hidden)

        # pack_padded_sequence documents `lengths` as a list or an int64 tensor, so build
        # an int64 tensor rather than an int32 IntTensor.
        # NOTE(review): newer PyTorch releases appear to reject int32 length tensors -- confirm.
        seqlens = torch.as_tensor(seqlens, dtype=torch.int64)

        # Packing expects the batch sorted by decreasing length; remember how to undo the sort.
        seqlens_sorted, perm_idx = seqlens.sort(0, descending=True)
        _, unperm_idx = perm_idx.sort(0)  # for recovery
        x = x[perm_idx]

        # packing -> rnn -> unpacking -> position recovery
        packed_input = pack_padded_sequence(x, seqlens_sorted, batch_first=True)
        packed_outputs, last_hidden = self.rnn(packed_input)
        # total_length pads the result back to the full time dimension of the input.
        outputs, _ = pad_packed_sequence(packed_outputs, batch_first=True, total_length=x.size(1))
        outputs = outputs[unperm_idx]

        # last hidden, restored to the caller's batch order
        last_hidden = self._flatten_last_hidden(last_hidden)[unperm_idx]

        return outputs, last_hidden

In [28]:
# 1. no padding
x1 = torch.from_numpy(_x1)
model1 = Rnn()  # randomly initialised; the models in cases 2 and 3 take over these weights
outputs1, last_hidden1 = model1(x1)

In [29]:
# 2. zero padding, no seqlens
x2 = torch.from_numpy(_x2)
model2 = Rnn()
# Copy model1's weights so the results are comparable. load_state_dict copies the values
# instead of making both models share the same parameter storage.
model2.load_state_dict(model1.state_dict())
outputs2, last_hidden2 = model2(x2)

In [30]:
# 3. zero padding with explicit seqlens
model3 = Rnn()
# Copy model1's weights so the results are comparable. load_state_dict copies the values
# instead of making both models share the same parameter storage.
model3.load_state_dict(model1.state_dict())
outputs3, last_hidden3 = model3(x2, seqlens=[3,])  # the real sequence length is 3; the trailing 0 is padding

In [31]:
outputs1  # per-step outputs for the unpadded sequence

tensor([[[0.4626],
         [0.6538],
         [0.7014]]], grad_fn=<TransposeBackward0>)

In [32]:
outputs2  # the padding step (last row) produces a non-zero output

tensor([[[0.4626],
         [0.6538],
         [0.7014],
         [0.7997]]], grad_fn=<TransposeBackward0>)

In [33]:
outputs3  # the padding step (last row) is zeroed out

tensor([[[0.4626],
         [0.6538],
         [0.7014],
         [0.0000]]], grad_fn=<IndexBackward>)

In [425]:
last_hidden1  # final hidden state for the unpadded sequence

tensor([[0.7014]], grad_fn=<ViewBackward>)

In [426]:
last_hidden2  # final hidden state when the padding step is processed as well

tensor([[0.7997]], grad_fn=<ViewBackward>)

In [427]:
last_hidden3  # final hidden state when the padding step is skipped via seqlens

tensor([[0.7014]], grad_fn=<IndexBackward>)

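△ PyTorch's RNN modules have no `seqlens` argument, so we emulate it: sort the batch by length, pack it with `pack_padded_sequence`, run the RNN on the packed input, unpack with `pad_packed_sequence`, and restore the original batch order. As in TensorFlow, the padded step is masked to zeros and the last hidden state matches the unpadded case.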

## Bi-directional

In [40]:
# 1. no padding
model1 = Rnn(bidirectional=True)  # x1 and x2 are reused from the uni-directional section
outputs1, last_hidden1 = model1(x1)

In [41]:
# 2. zero padding without seqlens
model2 = Rnn(bidirectional=True)
# Copy model1's weights so the results are comparable. load_state_dict copies the values
# instead of making both models share the same parameter storage.
model2.load_state_dict(model1.state_dict())
outputs2, last_hidden2 = model2(x2)

In [42]:
# 3. zero padding with explicit seqlens
model3 = Rnn(bidirectional=True)
# Copy model1's weights so the results are comparable. load_state_dict copies the values
# instead of making both models share the same parameter storage.
model3.load_state_dict(model1.state_dict())
outputs3, last_hidden3 = model3(x2, seqlens=[3,])  # the real sequence length is 3; the trailing 0 is padding

In [43]:
outputs1  # last axis holds the [forward, backward] outputs for each step

tensor([[[-0.5312, -0.4415],
         [ 0.0176,  0.1163],
         [ 0.2241,  0.0834]]], grad_fn=<TransposeBackward0>)

In [44]:
outputs2  # the padding step is processed like a real one, which changes the backward outputs

tensor([[[-0.5312, -0.5033],
         [ 0.0176, -0.0370],
         [ 0.2241, -0.1416],
         [-0.3388, -0.3100]]], grad_fn=<TransposeBackward0>)

In [45]:
outputs3  # the padding step (last row) is zeroed out in both directions

tensor([[[-0.5312, -0.4415],
         [ 0.0176,  0.1163],
         [ 0.2241,  0.0834],
         [ 0.0000,  0.0000]]], grad_fn=<IndexBackward>)

In [46]:
last_hidden1  # [forward final state, backward final state]

tensor([[ 0.2241, -0.4415]], grad_fn=<ViewBackward>)

In [47]:
last_hidden2  # both states differ from case 1 because the padding step was processed

tensor([[-0.3388, -0.5033]], grad_fn=<ViewBackward>)

In [48]:
last_hidden3  # matches case 1 again

tensor([[ 0.2241, -0.4415]], grad_fn=<IndexBackward>)

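△ Same here: as in TensorFlow, the last hidden state of the forward RNN (=0.2241) comes from the rightmost real step of the sequence, while the last hidden state of the backward RNN (=-0.4415) comes from the leftmost step.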