We'll see how to get the last hidden states of RNNs in TensorFlow and PyTorch.

In [1]:
__author__ = "kyubyong"
__address__ = "https://github.com/kyubyong/nlp_made_easy"
__email__ = "kbpark.linguist@gmail.com"

In [2]:
import numpy as np
import tensorflow as tf
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [3]:
tf.__version__

'1.14.0'

In [4]:
torch.__version__

'1.2.0'

# TensorFlow

### Uni-directional

In [5]:
tf.reset_default_graph()

In [6]:
def rnn(x, bidirectional=False, seqlens=None, reuse=False):
    '''
    Runs a single-unit GRU over `x` and returns every step's output together
    with the final hidden state.

    x: tensor fed to the rnn as its input sequence
    bidirectional: if True, a forward and a backward GRU are run and their
        results are concatenated along the last axis
    seqlens: optional sequence lengths, forwarded as `sequence_length`
    reuse: forwarded to `tf.variable_scope` so that repeated calls can share
        the variables created by an earlier call

    returns
    (outputs, last_hidden)
    '''
    if bidirectional:
        with tf.variable_scope("birnn", reuse=reuse):
            fw_cell = tf.contrib.rnn.GRUCell(1)
            bw_cell = tf.contrib.rnn.GRUCell(1)
            step_outputs, final_states = tf.nn.bidirectional_dynamic_rnn(
                fw_cell, bw_cell, x, sequence_length=seqlens, dtype=tf.float32)
            # Both results are (forward, backward) pairs; join each pair on the last axis.
            outputs = tf.concat(step_outputs, -1)
            last_hidden = tf.concat(final_states, -1)
    else:
        with tf.variable_scope("rnn", reuse=reuse):
            gru = tf.contrib.rnn.GRUCell(1)
            outputs, last_hidden = tf.nn.dynamic_rnn(
                gru, x, sequence_length=seqlens, dtype=tf.float32)

    return outputs, last_hidden

In [7]:
def onehot(arry, size):
    '''
    One-hot encodes integer ids along a new trailing axis.

    arry: array-like of integer ids, typically a 2-d array of (n, t)
    size: output dimensions (number of classes)

    returns
    float32 array of arry's shape plus a trailing axis of length `size`,
    e.g. 3-d array of (n, t, size). Ids outside [0, size) give an all-zero vector.
    '''
    ids = np.asarray(arry)
    # Broadcasting (..., 1) against (size,) marks the slot equal to each id,
    # so no transposed view has to be reshaped in place.
    return (ids[..., np.newaxis] == np.arange(size)).astype('float32')

In [8]:
# One unpadded sample of token ids, one-hot encoded over 4 classes.
_x1 = onehot(np.array([[1, 2, 3]], np.int32), 4)

# The same sample followed by one padding id (0 means padding).
_x2 = onehot(np.array([[1, 2, 3, 0]], np.int32), 4)

In [9]:
# 1. no padding
x1 = tf.convert_to_tensor(_x1)
outputs1, last_hidden1 = rnn(x1)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [10]:
# 2. zero padding, no seqlens
x2 = tf.convert_to_tensor(_x2)
outputs2, last_hidden2 = rnn(x2, reuse=True) # We want to sync the variables up to compare the results.



In [11]:
# 3. zero padding with explicit seqlens
outputs3, last_hidden3 = rnn(x2, seqlens=[3,], reuse=True) # Real sequence length is 3 as the last 0 is a padding.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [12]:
# Session
init_op = tf.global_variables_initializer()
sess = tf.InteractiveSession()
sess.run(init_op)

In [13]:
outputs1.eval()

array([[[-0.31044602],
        [-0.1496363 ],
        [ 0.0102744 ]]], dtype=float32)

In [14]:
outputs2.eval() # the last step has non-zero outputs. This is not what we usually want.

array([[[-0.31044602],
        [-0.1496363 ],
        [ 0.0102744 ],
        [-0.2648457 ]]], dtype=float32)

In [15]:
outputs3.eval() # the last step is masked to zeros. This is usually correct.

array([[[-0.31044602],
        [-0.1496363 ],
        [ 0.0102744 ],
        [ 0.        ]]], dtype=float32)

In [16]:
last_hidden1.eval()

array([[0.0102744]], dtype=float32)

In [17]:
last_hidden2.eval()

array([[-0.2648457]], dtype=float32)

In [18]:
last_hidden3.eval() # Now we have the same results as # 1.

array([[0.0102744]], dtype=float32)

△ Comment: Padding is mostly added to build mini-batches from multiple samples of variable lengths. We therefore typically want the same results as if each sample were processed individually, without padding. To that end, whenever you add padding, also pass `seqlens`: the outputs at the padded steps are then masked to zeros, and the last hidden state is taken from the last real step.

### Bi-directional

In [19]:
# Close the session opened for the uni-directional graph first: resetting the
# default graph while a session is still active is documented as undefined
# behavior, and the old session would otherwise stay open next to the new one.
sess.close()
tf.reset_default_graph()

In [20]:
# 1. no padding
x1 = tf.convert_to_tensor(_x1)
outputs1, last_hidden1 = rnn(x1, bidirectional=True)

Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API


In [21]:
# 2. zero padding, no seqlens
x2 = tf.convert_to_tensor(_x2)
outputs2, last_hidden2 = rnn(x2, bidirectional=True, reuse=True)



In [22]:
# 3. zero padding with explicit seqlens
outputs3, last_hidden3 = rnn(x2, bidirectional=True, seqlens=[3,], reuse=True)



In [23]:
# Session
init_op = tf.global_variables_initializer()
sess = tf.InteractiveSession()
sess.run(init_op)



In [24]:
outputs1.eval()

array([[[ 0.03057988, -0.36812052],
        [ 0.11028282, -0.25675917],
        [ 0.03307126,  0.01558547]]], dtype=float32)

In [25]:
outputs2.eval() # Again, this is not what we want.

array([[[ 0.03057988, -0.36047477],
        [ 0.11028282, -0.24538502],
        [ 0.03307126,  0.03696197],
        [ 0.1797748 ,  0.02438327]]], dtype=float32)

In [26]:
outputs3.eval() # Again, note that the last step is masked to zeros.

array([[[ 0.03057988, -0.36812052],
        [ 0.11028282, -0.25675917],
        [ 0.03307126,  0.01558547],
        [ 0.        ,  0.        ]]], dtype=float32)

In [27]:
last_hidden1.eval()

array([[ 0.03307126, -0.36812052]], dtype=float32)

In [28]:
last_hidden2.eval()

array([[ 0.1797748 , -0.36047477]], dtype=float32)

In [29]:
last_hidden3.eval()

array([[ 0.03307126, -0.36812052]], dtype=float32)

△ Note that in bidirectional rnns, the last_hidden state of the forward rnn (=0.03307126) is from the rightmost step in the sequence, while the last hidden state of the backward rnn (=-0.36812052) is from the leftmost step.

# PyTorch

## Uni-directional

In [30]:
class Rnn(torch.nn.Module):
    '''
    Batch-first GRU that returns every step's output plus the last hidden
    state flattened to one vector per sample.

    bidirectional: if True, a forward and a backward GRU are run
    input_size: size of each input step (defaults to 4)
    hidden_size: number of GRU units per direction (defaults to 1)
    '''
    def __init__(self, bidirectional=False, input_size=4, hidden_size=1):
        super().__init__()
        self.rnn = nn.GRU(input_size, hidden_size, batch_first=True, bidirectional=bidirectional)

    def forward(self, x, seqlens=None):
        '''
        x: float tensor of (batch, time, input_size)
        seqlens: optional list of real (unpadded) lengths, one per sample

        returns
        outputs: (batch, time, hidden_size * num_directions)
        last_hidden: (batch, num_directions * hidden_size), ordered as
            [forward state, backward state] like the last axis of outputs
        '''
        if seqlens is not None:
            # packing -> rnn -> unpacking -> position recovery
            packed_input = pack_padded_sequence(x, seqlens, batch_first=True, enforce_sorted=False)
            outputs, last_hidden = self.rnn(packed_input)
            outputs, _ = pad_packed_sequence(outputs, batch_first=True, total_length=x.size(1))
        else:
            outputs, last_hidden = self.rnn(x)

        # (num_directions, batch, hidden) -> (batch, num_directions, hidden).
        # The transposed tensor is not contiguous, so make it contiguous before
        # `view`; otherwise flattening fails once hidden_size > 1 with two directions.
        last_hidden = last_hidden.transpose(0, 1).contiguous()
        last_hidden = last_hidden.view(last_hidden.size(0), -1) # to (batch, num_directions*hidden)

        return outputs, last_hidden

In [31]:
# 1. no padding
x1 = torch.from_numpy(_x1)
model1 = Rnn()
outputs1, last_hidden1 = model1(x1)

In [32]:
# 2. zero padding, no seqlens
x2 = torch.from_numpy(_x2)
model2 = Rnn()
# Sync up the variables by copying model1's weights; this keeps model2's
# parameters independent instead of aliasing model1's storage through `.data`.
model2.load_state_dict(model1.state_dict())
outputs2, last_hidden2 = model2(x2)

In [33]:
# 3. zero padding with explicit seqlens
model3 = Rnn()
# Copy model1's weights so the results are comparable, without aliasing its
# parameter storage through `.data`.
model3.load_state_dict(model1.state_dict())
outputs3, last_hidden3 = model3(x2, seqlens=[3,]) # Real sequence length is 3 as the last 0 is a padding.

In [34]:
outputs1

tensor([[[0.0022],
         [0.1187],
         [0.2030]]], grad_fn=<TransposeBackward1>)

In [35]:
outputs2

tensor([[[ 0.0022],
         [ 0.1187],
         [ 0.2030],
         [-0.0466]]], grad_fn=<TransposeBackward1>)

In [36]:
outputs3

tensor([[[0.0022],
         [0.1187],
         [0.2030],
         [0.0000]]], grad_fn=<IndexSelectBackward>)

In [37]:
last_hidden1

tensor([[0.2030]], grad_fn=<ViewBackward>)

In [38]:
last_hidden2

tensor([[-0.0466]], grad_fn=<ViewBackward>)

In [39]:
last_hidden3

tensor([[0.2030]], grad_fn=<ViewBackward>)

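△ PyTorch's RNN modules take no `seqlens` argument, so a trick is used instead: the padded batch is packed with `pack_padded_sequence` before the GRU and unpacked with `pad_packed_sequence` afterwards. This gives the same results as the unpadded case.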

## Bi-directional

In [40]:
# 1. no padding
model1 = Rnn(bidirectional=True)
outputs1, last_hidden1 = model1(x1)

In [41]:
# 2. zero padding without seqlens
model2 = Rnn(bidirectional=True)
# Sync up the variables by copying model1's weights; this keeps model2's
# parameters independent instead of aliasing model1's storage through `.data`.
model2.load_state_dict(model1.state_dict())
outputs2, last_hidden2 = model2(x2)

In [42]:
# 3. zero padding with explicit seqlens
model3 = Rnn(bidirectional=True)
# Copy model1's weights so the results are comparable, without aliasing its
# parameter storage through `.data`.
model3.load_state_dict(model1.state_dict())
outputs3, last_hidden3 = model3(x2, seqlens=[3,])

In [43]:
outputs1

tensor([[[ 0.1541,  0.1542],
         [ 0.1018,  0.1953],
         [ 0.3250, -0.2467]]], grad_fn=<TransposeBackward1>)

In [44]:
outputs2

tensor([[[ 0.1541,  0.1687],
         [ 0.1018,  0.2188],
         [ 0.3250, -0.1759],
         [ 0.2979,  0.0983]]], grad_fn=<TransposeBackward1>)

In [45]:
outputs3

tensor([[[ 0.1541,  0.1542],
         [ 0.1018,  0.1953],
         [ 0.3250, -0.2467],
         [ 0.0000,  0.0000]]], grad_fn=<IndexSelectBackward>)

In [46]:
last_hidden1

tensor([[0.3250, 0.1542]], grad_fn=<ViewBackward>)

In [47]:
last_hidden2

tensor([[0.2979, 0.1687]], grad_fn=<ViewBackward>)

In [48]:
last_hidden3

tensor([[0.3250, 0.1542]], grad_fn=<ViewBackward>)

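△ Same here: with `seqlens`, the padded step is zeroed and the last hidden state matches the unpadded case. As in TensorFlow, the forward state (=0.3250) comes from the rightmost real step, while the backward state (=0.1542) comes from the leftmost step.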