In [55]:
import torch
import torch.nn as nn

from torch.nn.utils.rnn import pad_sequence

## Simple Example: Batch with single sequence

In [5]:
# This is a batch consisting of a single sequence of length 8
seq_single = torch.tensor(
    [
        [
            [1, 2, 1],  # <-- Head of seq
            [3, 2, 1],  # <-- Each step with input dim being 3
            [4, 4, 3],
            [0, 2, 1],
            [3, 1, 1],
            [5, 2, 1],
            [1, 9, 1],
            [0, 0, 1],  # <-- Tail of seq
        ],
    ],
    dtype=torch.float32,
)

# Thus, this is a batch with only ONE sample
#   Batch_Size x Seq_Length x Else (e.g., dim of word vector)
#   i.e. 1 x 8 x 3 here
seq_single.shape

torch.Size([1, 8, 3])

In [34]:
# Fix the RNG seed so the randomly initialised weights are the same
# every time this cell is run
torch.manual_seed(0)

# Initialize a basic rnn model
rnn_single = nn.RNN(
    input_size=3,      # <-- 3D input, e.g. each word vec is of dim 3
    hidden_size=2,     # <-- Each step is mapped to a 2-D hidden vector (NOT a scalar)
    num_layers=3,      # <-- 3 stacked recurrent (hidden) layers
    batch_first=True,  # <-- Default to false; in our sample, the first data dimension is the batch dim
)

# Feed in our batch
#   Returns the per-step outputs and the final hidden states (inspected in the next cells)
out_single, hid_single = rnn_single(seq_single)

In [35]:
# This would be the output of the last hidden layer at EACH STEP
#   The shape is Batch_Size x Seq_Length x Dim_Hidden, i.e. 1 x 8 x 2 here
out_single

tensor([[[0.4499, 0.2454],
         [0.6372, 0.2465],
         [0.3604, 0.3726],
         [0.4987, 0.4451],
         [0.6867, 0.3746],
         [0.4908, 0.4466],
         [0.3319, 0.5678],
         [0.7690, 0.3942]]], grad_fn=<TransposeBackward1>)

In [36]:
# The output of EACH of the three hidden layers at the LAST STEP (single dir)
#   The shape would be Num_Hidden_Layer x Batch_Size x Dim_Hidden
#   Note how the last entry (the last layer) matches the last step of <out_single>
hid_single

tensor([[[-0.6758, -0.7543]],

        [[-0.5398, -0.4446]],

        [[ 0.7690,  0.3942]]], grad_fn=<StackBackward>)

In [42]:
# Indexing with -1 on the first axis selects the LAST (3rd) LAYER's output at the last step
print(hid_single[-1, 0, :])  # <-- Returns ONLY the first sample in the batch, with ALL hidden dimensions
print(hid_single[-1])        # <-- Returns the WHOLE batch (here a batch of one sample)

tensor([0.7690, 0.3942], grad_fn=<SliceBackward>)
tensor([[0.7690, 0.3942]], grad_fn=<SelectBackward>)


## A Slightly More Complex One: Multiple Sequences

In [49]:
# A size-4 batch: four sequences, each with three steps of 2-D input
seq_multi = torch.tensor(
    [
        [[1, 2], [3, 2], [4, 4]],  # Seq 1: head -> tail
        [[0, 2], [1, 3], [0, 2]],  # Seq 2
        [[3, 1], [5, 2], [1, 9]],  # Seq 3
        [[0, 0], [3, 1], [2, 0]],  # Seq 4: head -> tail
    ],
    dtype=torch.float32,
)

# Batch_Size x Seq_Length x Else
#   Therefore 4 x 3 x 2
seq_multi.shape

torch.Size([4, 3, 2])

Note that I intentionally made the sequences the same length here, which is **NOT TRUE** in most cases. This is because a mini-batch of samples is represented as a single `torch.tensor` rather than a `list`, so every sequence in the batch must have the same length. Later in this tutorial I will show how to use `torch.nn.utils.rnn.pad_sequence` to make the sequences of equal length.

In [50]:
# Fix the RNG seed so the randomly initialised weights are the same
# every time this cell is run
torch.manual_seed(0)

# Initialize a basic rnn model
rnn_multi = nn.RNN(
    input_size=2,      # <-- 2D input
    hidden_size=2,     # <-- Hidden vector is also 2-D, so the feature dimension doesn't change
    num_layers=3,      # <-- 3 stacked recurrent (hidden) layers
    batch_first=True,  # <-- The first data dimension is the batch dim
)

# Feed in our batch
#   Returns the per-step outputs and the final hidden states (inspected in the next cells)
out_multi, hid_multi = rnn_multi(seq_multi)

In [51]:
# Since the seq length is three, each of the four sequences yields three outputs
#   The shape is Batch_Size x Seq_Length x Dim_Hidden, i.e. 4 x 3 x 2 here
out_multi

tensor([[[ 0.7996,  0.5762],
         [ 0.7158,  0.3885],
         [ 0.7258,  0.3503]],

        [[ 0.8350,  0.5161],
         [ 0.7691,  0.2316],
         [ 0.8361, -0.0117]],

        [[ 0.7375,  0.6521],
         [ 0.6912,  0.5084],
         [ 0.7789,  0.3409]],

        [[ 0.8241,  0.5449],
         [ 0.6293,  0.4592],
         [ 0.7589,  0.4235]]], grad_fn=<TransposeBackward1>)

In [52]:
# Note that each row of the last hidden layer's output (the last 4 x 2 block below)
#   corresponds to the last row of one of the four sequences in <out_multi>
hid_multi

tensor([[[ 0.9260,  0.8400],
         [ 0.1617,  0.7666],
         [ 0.8454,  0.9899],
         [ 0.4740,  0.1308]],

        [[ 0.0110, -0.3907],
         [ 0.3394,  0.0455],
         [ 0.0873, -0.2406],
         [ 0.1557, -0.4205]],

        [[ 0.7258,  0.3503],
         [ 0.8361, -0.0117],
         [ 0.7789,  0.3409],
         [ 0.7589,  0.4235]]], grad_fn=<StackBackward>)

In [53]:
# Extract the outputs of the last hidden layer (at the last time step) for the whole batch
hid_multi[-1]

tensor([[ 0.7258,  0.3503],
        [ 0.8361, -0.0117],
        [ 0.7789,  0.3409],
        [ 0.7589,  0.4235]], grad_fn=<SelectBackward>)

In [54]:
# Extract the last hidden layer's outputs at the LAST TIME STEP of every sequence
#   The values are the same as hid_multi[-1] shown in the previous cell
out_multi[:, -1]

tensor([[ 0.7258,  0.3503],
        [ 0.8361, -0.0117],
        [ 0.7789,  0.3409],
        [ 0.7589,  0.4235]], grad_fn=<SelectBackward>)

## Padding

A more detailed tutorial can be found [here](https://suzyahyah.github.io/pytorch/2019/07/01/DataLoader-Pad-Pack-Sequence.html). In this notebook I will just show the very basic idea of padding.

In [64]:
# Suppose we have a batch of three sequences of different lengths (3, 6 and 2),
# stored in a list; every step is a 1-d vector
seq_raw_1d = [
    [[1], [0], [0]],
    [[0], [2], [4], [5], [7], [10]],
    [[2], [3]],
]

# Try with different values :)
pad_sequence(
    [torch.tensor(s) for s in seq_raw_1d],
    batch_first=True,    # Again, first dim denotes each sample in batch
    padding_value=-200,  # Pad the shortage with value -200
)



tensor([[[   1],
         [   0],
         [   0],
         [-200],
         [-200],
         [-200]],

        [[   0],
         [   2],
         [   4],
         [   5],
         [   7],
         [  10]],

        [[   2],
         [   3],
         [-200],
         [-200],
         [-200],
         [-200]]])

In [70]:
# Flat integer sequences work as well, and this is the usual case
seq_raw_1d = [
    [1, 0, 0],
    [0, 2, 4, 6, 7, 8, 10],
    [2, 3],
]

# Note how the shape differs from the previous output:
#   every sequence is now a 1-d tensor, so the padded batch is 2-d
pad_sequence(
    list(map(torch.tensor, seq_raw_1d)),
    batch_first=True,
    padding_value=200,
)

tensor([[  1,   0,   0, 200, 200, 200, 200],
        [  0,   2,   4,   6,   7,   8,  10],
        [  2,   3, 200, 200, 200, 200, 200]])

In [68]:
# Suppose we have a batch of three sequences of different lengths (3, 5 and 1),
# stored in a list; every step is a 2-d vector
seq_raw_2d = [
    # Seq 1
    [[1, 2],   # <-- Head seq 1
     [3, 2],
     [4, 4]],  # <-- Tail seq 1

    # Seq 2
    [[0, 2],
     [1, 3],
     [0, 2],
     [3, 1],
     [5, 2]],

    # Seq 3
    [[1, 9]],
]

pad_sequence(
    [torch.tensor(s) for s in seq_raw_2d],
    batch_first=True,    # Again, first dim denotes each sample in batch
    padding_value=-200,  # Pad the shortage with value -200
)

tensor([[[   1,    2],
         [   3,    2],
         [   4,    4],
         [-200, -200],
         [-200, -200]],

        [[   0,    2],
         [   1,    3],
         [   0,    2],
         [   3,    1],
         [   5,    2]],

        [[   1,    9],
         [-200, -200],
         [-200, -200],
         [-200, -200],
         [-200, -200]]])

I think there are several details I should mention here. Firstly, the `padding_value` has to be a scalar. So, even though the 2-d example works, we are not able to pad the sequences with a desired **Embedding Vector**, say `[-200, -314]`. Further, the **first** 1-d example and the 2-d example can be interpreted as *a batch of three sequences with each word/timestep having a 1-d/2-d embedding.* However, this is **NOT** a common practice. Instead, the **second** example is what people usually adopt, and it can be interpreted as *a batch of three sequences encoded by the corresponding **Index in Vocabulary**.* So, in that example, the number 200 is the index of a padding token, such as `<PAD>`, which has its own **Embedding**. In another tutorial, I will show how to convert a batch of index sequences into a batch of word vector sequences using an **Embedding Layer**. Lastly, more on padding can be found [here](https://suzyahyah.github.io/pytorch/2019/07/01/DataLoader-Pad-Pack-Sequence.html).