In [1]:
import numpy as np
import tensorflow as tf
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [2]:
tf.__version__

'1.5.0'

In [3]:
torch.__version__

'1.0.1'

# Check whether the last hidden state is correct when the input sequences are padded.

## Tensorflow

### Uni-directional

In [4]:
def rnn(x, seqlens=None, reuse=False):
    """Run a 2-unit GRU over `x` inside the "rnn" variable scope.

    x: input tensor handed to `tf.nn.dynamic_rnn`
    seqlens: optional per-example lengths, forwarded as `sequence_length`
    reuse: forwarded to `tf.variable_scope`, so repeated calls can share weights

    returns
    the (outputs, last_hidden) pair produced by `tf.nn.dynamic_rnn`
    """
    with tf.variable_scope("rnn", reuse=reuse):
        gru_cell = tf.contrib.rnn.GRUCell(2)
        rnn_result = tf.nn.dynamic_rnn(
            gru_cell,
            x,
            sequence_length=seqlens,
            dtype=tf.float32,
        )
    all_outputs, final_state = rnn_result
    return all_outputs, final_state

In [5]:
# no padding: one sequence of three tokens, one-hot encoded with depth 4
x1 = tf.one_hot(tf.expand_dims(tf.constant([1, 2, 3]), 0), 4)
outputs1, last_hidden1 = rnn(x1)

In [6]:
# zero padding, no seqlens: same tokens followed by a trailing 0, weights shared via reuse
x2 = tf.one_hot(tf.expand_dims(tf.constant([1, 2, 3, 0]), 0), 4)
outputs2, last_hidden2 = rnn(x2, reuse=True)

In [7]:
# zero padding with explicit seqlens: same padded input, true length 3 supplied
outputs3, last_hidden3 = rnn(
    x2,
    seqlens=[3],
    reuse=True,
)

In [8]:
# Open the session first, then build and run the variable initializer so the
# GRU weights created by the rnn(...) calls above have values to evaluate.
sess = tf.InteractiveSession()
init_op = tf.global_variables_initializer()
sess.run(init_op)


In [9]:
outputs1.eval()

array([[[ 0.11338668,  0.02229878],
        [-0.16635834, -0.19894187],
        [ 0.00041986, -0.24142244]]], dtype=float32)

In [10]:
outputs2.eval()

array([[[ 0.11338668,  0.02229878],
        [-0.16635834, -0.19894187],
        [ 0.00041986, -0.24142244],
        [-0.00597491, -0.20334211]]], dtype=float32)

In [11]:
outputs3.eval()

array([[[ 0.11338668,  0.02229878],
        [-0.16635834, -0.19894187],
        [ 0.00041986, -0.24142244],
        [ 0.        ,  0.        ]]], dtype=float32)

In [12]:
last_hidden1.eval()

array([[ 0.00041986, -0.24142244]], dtype=float32)

In [13]:
last_hidden2.eval()

array([[-0.00597491, -0.20334211]], dtype=float32)

In [14]:
last_hidden3.eval()

array([[ 0.00041986, -0.24142244]], dtype=float32)

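Conclusion: when the input is padded, you must pass the sequence lengths (`seqlens`) to get the correct last hidden state; without them the padding step is processed like a real token. With `seqlens`, the outputs at padded positions are masked to zeros.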

### Bi-directional

In [15]:
# Close the session opened for the uni-directional experiment before wiping the
# graph: resetting the default graph while a session is still active is
# documented as undefined behaviour, and the old session would otherwise leak.
sess.close()
tf.reset_default_graph()
def birnn(x, seqlens=None, reuse=False):
    """Run a bidirectional GRU (1 unit per direction) inside the "birnn" variable scope.

    x: input tensor handed to `tf.nn.bidirectional_dynamic_rnn`
    seqlens: optional per-example lengths, forwarded as `sequence_length`
    reuse: forwarded to `tf.variable_scope`, so repeated calls can share weights

    returns
    (outputs, last_hidden), each being the forward and backward results
    concatenated along the last axis
    """
    with tf.variable_scope("birnn", reuse=reuse):
        fw_cell = tf.contrib.rnn.GRUCell(1)
        bw_cell = tf.contrib.rnn.GRUCell(1)
        direction_outputs, direction_states = tf.nn.bidirectional_dynamic_rnn(
            fw_cell,
            bw_cell,
            x,
            sequence_length=seqlens,
            dtype=tf.float32,
        )
    merged_outputs = tf.concat(direction_outputs, -1)
    merged_states = tf.concat(direction_states, -1)
    return merged_outputs, merged_states

In [16]:
# no padding: one sequence of three tokens, one-hot encoded with depth 4
x1 = tf.one_hot(tf.expand_dims(tf.constant([1, 2, 3]), 0), 4)
outputs1, last_hidden1 = birnn(x1)

In [17]:
# zero padding with explicit seqlens: trailing 0 is padding, true length is 3
x2 = tf.one_hot(tf.expand_dims(tf.constant([1, 2, 3, 0]), 0), 4)
outputs2, last_hidden2 = birnn(x2, seqlens=[3], reuse=True)

In [18]:
# Open a session on the rebuilt graph, then initialise the birnn variables.
sess = tf.InteractiveSession()
init_op = tf.global_variables_initializer()
sess.run(init_op)

In [19]:
outputs1.eval()

array([[[ 0.24943233, -0.08515487],
        [ 0.05219585, -0.21929769],
        [-0.21960783,  0.08647518]]], dtype=float32)

In [20]:
outputs2.eval()

array([[[ 0.24943233, -0.08515487],
        [ 0.05219585, -0.21929769],
        [-0.21960783,  0.08647518],
        [ 0.        ,  0.        ]]], dtype=float32)

In [21]:
last_hidden1.eval()

array([[-0.21960783, -0.08515487]], dtype=float32)

In [22]:
last_hidden2.eval()

array([[-0.21960783, -0.08515487]], dtype=float32)

Note that in bidirectional RNNs, the last hidden state of the forward RNN (-0.21960783), which is taken at the rightmost real step of the sequence, is concatenated with the last hidden state of the backward RNN (-0.08515487), which is taken at the leftmost step.

## PyTorch

### Uni-directional

In [409]:
def onehot(arry, size):
    '''
    One-hot encode an integer array along a new trailing axis.

    arry: 2-d array of n, t
    size: output dimensions

    returns
    3-d float32 array of (n, t, size); entries whose value is outside
    0..size-1 become all-zero rows
    '''
    class_ids = np.arange(size)
    # Append a length-1 axis so the comparison broadcasts every element
    # against every class id.
    matches = arry.reshape(arry.shape + (1,)) == class_ids
    return matches.astype('float32')

In [410]:
class Rnn(torch.nn.Module):
    '''
    Uni-directional GRU (4 input features, 2 hidden units, batch first)
    that can optionally skip padded time steps.
    '''
    def __init__(self):
        super().__init__()
        self.rnn = nn.GRU(4, 2, batch_first=True)

    def forward(self, x, seqlens=None):
        '''
        x: float tensor of (n, t, 4)
        seqlens: optional list with the true length of each of the n sequences

        returns
        outputs: (n, t, 2); zero at padded steps when seqlens is given
        last_hidden: (n, 2) final hidden state of each sequence
        '''
        if seqlens is None:
            # Plain run: every time step, padding included, is processed.
            outputs, final_state = self.rnn(x)
            final_state = final_state.permute(1, 2, 0)
            return outputs, final_state.view(final_state.size(0), -1)

        # Order the batch from longest to shortest before packing, and keep
        # the inverse permutation to undo it afterwards.
        lengths = torch.IntTensor(seqlens)
        sorted_lengths, order = lengths.sort(0, descending=True)
        restore = order.sort(0)[1]

        # pack -> rnn -> unpack back to the full padded length
        packed = pack_padded_sequence(x[order], sorted_lengths, batch_first=True)
        packed_outputs, final_state = self.rnn(packed)
        padded_outputs = pad_packed_sequence(
            packed_outputs, batch_first=True, total_length=x.size(1)
        )[0]

        # Put the batch dimension first, flatten, then restore the input order.
        final_state = final_state.permute(1, 2, 0)
        final_state = final_state.view(final_state.size(0), -1)
        return padded_outputs[restore], final_state[restore]

In [411]:
# Unpadded input: one sequence of three tokens, one-hot encoded with depth 4.
x1 = torch.from_numpy(onehot(np.expand_dims(np.array([1, 2, 3]), 0), 4))

In [412]:
# Padded input: the same three tokens followed by a trailing 0 used as padding.
x2 = torch.from_numpy(onehot(np.expand_dims(np.array([1, 2, 3, 0]), 0), 4))

In [418]:
# no padding
# Baseline model: its randomly initialised weights are the ones the later
# comparison models are given, so every run uses identical parameters.
model1 = Rnn()
outputs1, last_hidden1 = model1(x1)

In [419]:
# zero padding, no seqlens
model2 = Rnn()
# Give model2 the same weights as model1. load_state_dict copies the values
# into model2's own parameters, so the two models do not share storage and an
# in-place change to one cannot silently alter the other.
model2.load_state_dict(model1.state_dict())
outputs2, last_hidden2 = model2(x2)

In [420]:
# zero padding with explicit seqlens
model3 = Rnn()
# Give model3 the same weights as model1. load_state_dict copies the values
# into model3's own parameters, so the two models do not share storage and an
# in-place change to one cannot silently alter the other.
model3.load_state_dict(model1.state_dict())
outputs3, last_hidden3 = model3(x2, seqlens=[3])

In [421]:
outputs1

tensor([[[-0.1044,  0.3741],
         [-0.3837,  0.3580],
         [-0.3023,  0.4439]]], grad_fn=<TransposeBackward0>)

In [422]:
outputs2

tensor([[[-0.1044,  0.3741],
         [-0.3837,  0.3580],
         [-0.3023,  0.4439],
         [-0.1070,  0.2071]]], grad_fn=<TransposeBackward0>)

In [423]:
outputs3

tensor([[[-0.1044,  0.3741],
         [-0.3837,  0.3580],
         [-0.3023,  0.4439],
         [ 0.0000,  0.0000]]], grad_fn=<IndexBackward>)

In [425]:
last_hidden1

tensor([[-0.3023,  0.4439]], grad_fn=<ViewBackward>)

In [426]:
last_hidden2

tensor([[-0.1070,  0.2071]], grad_fn=<ViewBackward>)

In [427]:
last_hidden3

tensor([[-0.3023,  0.4439]], grad_fn=<IndexBackward>)

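Same here. Because PyTorch RNNs have no `seqlens`-style argument, a workaround is used: the batch is sorted by length, packed with `pack_padded_sequence`, run through the RNN, unpacked with `pad_packed_sequence`, and finally restored to its original order.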

### Bi-directional

In [428]:
class BiRnn(torch.nn.Module):
    '''
    Single-layer bidirectional GRU (batch first) that can optionally skip
    padded time steps.

    input_size: number of features per time step (default 4)
    hidden_size: hidden units per direction (default 1)
    '''
    def __init__(self, input_size=4, hidden_size=1):
        super().__init__()
        self.rnn = nn.GRU(input_size, hidden_size, batch_first=True, bidirectional=True)

    @staticmethod
    def _merge_directions(last_hidden):
        '''
        last_hidden: (2, n, hidden) final states as returned by the GRU

        returns
        (n, 2 * hidden) tensor: forward state followed by backward state
        '''
        # Move the batch axis first while keeping each direction's hidden
        # vector intact. Flattening a (n, hidden, 2) layout instead would
        # interleave the two directions and cannot be viewed without a copy
        # once hidden > 1.
        last_hidden = last_hidden.transpose(0, 1).contiguous()
        return last_hidden.view(last_hidden.size(0), -1)

    def forward(self, x, seqlens=None):
        '''
        x: float tensor of (n, t, input_size)
        seqlens: optional list with the true length of each of the n sequences

        returns
        outputs: (n, t, 2 * hidden_size); zero at padded steps when seqlens is given
        last_hidden: (n, 2 * hidden_size); the forward final state followed by
            the backward final state
        '''
        if seqlens is None:
            # Plain run: every time step, padding included, is processed.
            outputs, last_hidden = self.rnn(x)
            return outputs, self._merge_directions(last_hidden)

        # sorting by seqlens (the batch is packed longest sequence first)
        seqlens = torch.IntTensor(seqlens)
        seqlens_sorted, perm_idx = seqlens.sort(0, descending=True)
        _, unperm_idx = perm_idx.sort(0)  # for recovery

        # packing -> rnn -> unpacking -> position recovery
        packed_input = pack_padded_sequence(x[perm_idx], seqlens_sorted, batch_first=True)
        outputs, last_hidden = self.rnn(packed_input)
        outputs, _ = pad_packed_sequence(outputs, batch_first=True, total_length=x.size(1))
        outputs = outputs[unperm_idx]

        # last hidden, restored to the caller's batch order
        last_hidden = self._merge_directions(last_hidden)[unperm_idx]
        return outputs, last_hidden

In [429]:
# no padding
# NOTE: `model1` is rebound here from the uni-directional Rnn to a BiRnn;
# `x1` is still the unpadded length-3 input built earlier.
model1 = BiRnn()
outputs1, last_hidden1 = model1(x1)

In [430]:
# zero padding with explicit seqlens
model2 = BiRnn()
# Give model2 the same weights as model1. load_state_dict copies the values
# into model2's own parameters, so the two models do not share storage and an
# in-place change to one cannot silently alter the other.
model2.load_state_dict(model1.state_dict())
outputs2, last_hidden2 = model2(x2, seqlens=[3])

In [431]:
outputs1

tensor([[[-0.2045,  0.9008],
         [-0.5576,  0.8139],
         [-0.5149,  0.4902]]], grad_fn=<TransposeBackward0>)

In [432]:
outputs2

tensor([[[-0.2045,  0.9008],
         [-0.5576,  0.8139],
         [-0.5149,  0.4902],
         [ 0.0000,  0.0000]]], grad_fn=<IndexBackward>)

In [433]:
last_hidden1

tensor([[-0.5149,  0.9008]], grad_fn=<ViewBackward>)

In [434]:
last_hidden2

tensor([[-0.5149,  0.9008]], grad_fn=<IndexBackward>)

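Same here: with explicit seqlens, the padded input reproduces the outputs and last hidden state of the unpadded input, and the output at the padded step is zero.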

In [435]:
# Batch of two padded sequences (true lengths 2 and 3); this rebinds `x2`.
x2 = torch.from_numpy(onehot(np.array([[1, 2, 0, 0], [3, 2, 1, 0]]), 4))

In [436]:
# zero padding with explicit seqlens
model3 = BiRnn()
# Give model3 the same weights as model1. load_state_dict copies the values
# into model3's own parameters, so the two models do not share storage and an
# in-place change to one cannot silently alter the other.
model3.load_state_dict(model1.state_dict())
outputs2, last_hidden2 = model3(x2, seqlens=[2, 3])

In [437]:
outputs2

tensor([[[-0.2045,  0.8737],
         [-0.5576,  0.7316],
         [ 0.0000,  0.0000],
         [ 0.0000,  0.0000]],

        [[-0.0955,  0.7689],
         [-0.4913,  0.8491],
         [-0.5968,  0.6703],
         [ 0.0000,  0.0000]]], grad_fn=<IndexBackward>)

In [438]:
last_hidden2

tensor([[-0.5576,  0.8737],
        [-0.5968,  0.7689]], grad_fn=<IndexBackward>)