# RNN basics



In [5]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
import time
import random

## Table of contents

1. Hidden state
2. Input
3. RNN layer
4. Send data to RNN
5. Word embedding using nn.Embedding
6. Word embedding using nn.Linear
7. Testing Linear layer - one hot encoding
8. Testing Linear layer - Integer
9. Testing embedding
10. Linear layer- one hot encoding - batch size 1
11. RNN score computation - one forward pass
12. Bidirectional RNN


# Hidden state

h_0 of shape (num_layers * num_directions, batch, hidden_size): 

tensor containing the initial hidden state for each element in the batch. 

Defaults to zero if not provided. 

If the RNN is bidirectional, num_directions should be 2, else it should be 1. 

In [None]:
num_layers = 1
num_directions = 1
batch_size = 2
hidden_size = 5

In [None]:
h_init = torch.zeros (num_layers * num_directions, batch_size, hidden_size)
print(h_init)
print(h_init.size())

# Input

Input of shape (seq_len, batch, input_size): 

tensor containing the features of the input sequence. 

The input can also be a packed variable length sequence.

seq_length = along row, batch size = along column, input_size = size of the word embedding

In [None]:
seq_length = 2
batch_size = 2
input_size = 3

In [None]:
input = torch.rand(seq_length,batch_size,input_size)
print(input)
print(input.size())

# RNN layer

 h_t = tanh(W_{ih} x_t + b_{ih} + W_{hh} h_{(t-1)} + b_{hh})

https://pytorch.org/docs/stable/generated/torch.nn.RNN.html

nonlinearity – The non-linearity to use. Can be either 'tanh' or 'relu'. Default: 'tanh'

In [None]:
input_size = 3
hidden_size = 5

In [None]:
rnn = nn.RNN(input_size,hidden_size,nonlinearity = 'tanh',bias = True)
print(rnn)

# Send data to RNN

output of shape (seq_len, batch, num_directions * hidden_size): tensor containing the output features (h_t) from the last layer of the RNN, for each t. If a torch.nn.utils.rnn.PackedSequence has been given as the input, the output will also be a packed sequence.

For the unpacked case, the directions can be separated using output.view(seq_len, batch, num_directions, hidden_size), with forward and backward being direction 0 and 1 respectively. Similarly, the directions can be separated in the packed case.

h_n of shape (num_layers * num_directions, batch, hidden_size): tensor containing the hidden state for t = seq_len.

In [None]:
h_seq, h_final = rnn(input,h_init)
print(h_seq)
print(h_final)
print(h_seq.size(),"\n", h_final.size())

# Word embedding using nn.Embedding

https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html

num_embeddings (int) – size of the dictionary of embeddings

embedding_dim (int) – the size of each embedding vector


In [11]:
num_embeddings = 6
embedding_dim = 5

In [12]:
seq_length = 3
batch_size = 2

In [15]:
w_emb = nn.Embedding(num_embeddings, embedding_dim)
print("weight",w_emb.weight)
input = torch.LongTensor([[1,2,3],[3,5,5]]).view(seq_length,batch_size)
print("input",input)
output = w_emb(input)
print(output)
print(output.size())

weight Parameter containing:
tensor([[-0.5925, -0.8228,  0.2863,  0.8048,  0.1445],
        [-0.4658,  0.8823, -0.2044, -1.0068,  0.1941],
        [ 1.4252,  0.2355, -0.9920, -0.3669,  0.3220],
        [ 1.4867,  0.3734,  0.1834, -2.3481, -0.7169],
        [-0.3742, -2.8740, -0.3611,  0.1126, -0.1899],
        [-1.6323,  0.2826,  0.8102,  0.4583,  0.0803]], requires_grad=True)
input tensor([[1, 2],
        [3, 3],
        [5, 5]])
tensor([[[-0.4658,  0.8823, -0.2044, -1.0068,  0.1941],
         [ 1.4252,  0.2355, -0.9920, -0.3669,  0.3220]],

        [[ 1.4867,  0.3734,  0.1834, -2.3481, -0.7169],
         [ 1.4867,  0.3734,  0.1834, -2.3481, -0.7169]],

        [[-1.6323,  0.2826,  0.8102,  0.4583,  0.0803],
         [-1.6323,  0.2826,  0.8102,  0.4583,  0.0803]]],
       grad_fn=<EmbeddingBackward>)
torch.Size([3, 2, 5])


# Word embedding using nn.Linear

https://pytorch.org/docs/stable/generated/torch.nn.Linear.html

In [None]:
in_features = num_embeddings
out_features = embedding_dim
#in_features =5
#out_features = 4


In [None]:
linear = nn.Linear(in_features, out_features, bias = False)
print(linear)
print(linear.weight)

In [None]:
#seq_length = 1
#batch_size = 2
input_size = out_features

In [None]:
# one hot encoding
batch_1 = torch.Tensor([[1,0,0,0],[0,0,1,0],[0,0,0,1]])
batch_2 = torch.Tensor([[0,0,1,0],[0,1,0,0],[1,0,0,0]])

sequence = torch.cat((batch_1,batch_2),dim = 0).view(seq_length,batch_size,in_features)
print(sequence)
print(sequence.size())
#print(inputs.size())
output = linear(sequence)
print(output.size())
#output = output.view(seq_length,batch_size,input_size)
print(output)

# Testing - Linear layer - One hot encoding

In [None]:
# Linear layer

in_features = 4
out_features = 5
seq_length = 3
batch_size = 2

# RNN layer

num_layers = 1
num_directions = 1
hidden_size = 5
input_size = out_features

# Linear layer
linear = nn.Linear(in_features, out_features, bias = False)
print(linear)
print(linear.weight)

# one hot encoding

batch_1 = torch.Tensor([[1,0,0,0],[0,0,1,0],[0,0,0,1]])
batch_2 = torch.Tensor([[0,0,1,0],[0,1,0,0],[1,0,0,0]])

input_data = torch.cat((batch_1,batch_2),dim = 0).view(seq_length,batch_size,in_features)
print("input_data",input_data)
print("Size of the input data",input_data.size())

output = linear(input_data)
print("Size of the Linear layer output",output.size())
#output = output.view(seq_length,batch_size,input_size)
print("linear layer output", output)

# Memory state
h_init = torch.zeros (num_layers * num_directions, batch_size, hidden_size)
print("Memory state initialisation", h_init)
print("Memory state size",h_init.size())

rnn = nn.RNN(input_size,hidden_size,nonlinearity = 'tanh',bias = True)
print("RNN",rnn)

h_seq, h_final = rnn(output,h_init)
print("H_seq",h_seq)
print("H_final",h_final)
print(h_seq.size(),"\n", h_final.size())

# Testing - Linear layer - Integer

In [None]:
# Linear layer

in_features = 4
out_features = 5
seq_length = 3
batch_size = 2

# RNN layer

num_layers = 1
num_directions = 1
hidden_size = 5
input_size = out_features

# Linear layer
linear = nn.Linear(out_features,in_features, bias = False)
print(linear)
print(linear.weight)

# one hot encoding

batch_1 = torch.LongTensor([1,2,3])
batch_2 = torch.LongTensor([3,2,1])

input_data = torch.cat((batch_1,batch_2),dim = 0).view(seq_length,batch_size)
print("input_data",input_data)
print("Size of the input data",input_data.size())

output = linear.weight.data[input_data]
print("Size of the Linear layer output",output.size())
#output = output.view(seq_length,batch_size,input_size)
print("linear layer output", output)

# Memory state
h_init = torch.zeros (num_layers * num_directions, batch_size, hidden_size)
print("Memory state initialisation", h_init)
print("Memory state size",h_init.size())

rnn = nn.RNN(input_size,hidden_size,nonlinearity = 'tanh',bias = True)
print("RNN",rnn)

h_seq, h_final = rnn(output,h_init)
print("H_seq",h_seq)
print("H_final",h_final)
print(h_seq.size(),"\n", h_final.size())

# Testing - Embedding

In [3]:
# Linear layer

in_features = 4
out_features = 5
seq_length = 3
batch_size = 2

# RNN layer

num_layers = 1
num_directions = 1
hidden_size = 5
input_size = out_features

# Word embedding

num_embeddings = 4
embedding_dim = 5

w_emb = nn.Embedding(num_embeddings, embedding_dim)
print("weight",w_emb.weight)

input = torch.LongTensor([[1,2,3],[3,2,1]])
input = torch.transpose(input,0,1)
#print("input",input.size())

output = w_emb(input)
print("Size of the Embedding layer output",output.size())
#output = output.view(seq_length,batch_size,input_size)
print("Embedding layer output", output)


# Memory state
h_init = torch.zeros (num_layers * num_directions, batch_size, hidden_size)
print("Memory state initialisation", h_init)
print("Memory state size",h_init.size())

rnn = nn.RNN(input_size,hidden_size,nonlinearity = 'tanh',bias = True)
print("RNN",rnn)

h_seq, h_final = rnn(output,h_init)
print("H_seq",h_seq)
print("H_final",h_final)
print(h_seq.size(),"\n", h_final.size())

weight Parameter containing:
tensor([[-1.2872, -1.5725,  0.9193,  0.8244,  0.8569],
        [ 0.1559,  0.9764, -1.3576,  1.1315, -1.7053],
        [ 0.4271,  0.9379, -0.1343, -0.0158,  0.5403],
        [-2.2810, -0.0063, -0.5422, -0.7179,  0.6794]], requires_grad=True)
input torch.Size([3, 2])
Size of the Embedding layer output torch.Size([3, 2, 5])
Embedding layer output tensor([[[ 0.1559,  0.9764, -1.3576,  1.1315, -1.7053],
         [-2.2810, -0.0063, -0.5422, -0.7179,  0.6794]],

        [[ 0.4271,  0.9379, -0.1343, -0.0158,  0.5403],
         [ 0.4271,  0.9379, -0.1343, -0.0158,  0.5403]],

        [[-2.2810, -0.0063, -0.5422, -0.7179,  0.6794],
         [ 0.1559,  0.9764, -1.3576,  1.1315, -1.7053]]],
       grad_fn=<EmbeddingBackward>)
Memory state initialisation tensor([[[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]]])
Memory state size torch.Size([1, 2, 5])
RNN RNN(5, 5)
H_seq tensor([[[ 0.2328, -0.2686, -0.3293, -0.6375, -0.6561],
         [ 0.8115, -0.1449, -0.1975, -0

# Linear layer- one hot encoding - batch size 1

In [None]:
# Linear layer

in_features = 4
out_features = 7
seq_length = 3
batch_size = 1

# RNN layer

num_layers = 1
num_directions = 1
hidden_size = 5
input_size = out_features

# Linear layer
linear = nn.Linear(in_features, out_features, bias = False)
print(linear)
print(linear.weight)

# one hot encoding

batch_1 = torch.Tensor([[1,0,0,0],[0,0,1,0],[0,0,0,1]])
#batch_2 = torch.Tensor([[0,0,1,0],[0,1,0,0],[1,0,0,0]])

input_data = batch_1.view(seq_length,batch_size,in_features)
print("input_data",input_data)
print("Size of the input data",input_data.size())

output = linear(input_data)
print("Size of the Linear layer output",output.size())
#output = output.view(seq_length,batch_size,input_size)
print("linear layer output", output)

# Memory state
h_init = torch.zeros (num_layers * num_directions, batch_size, hidden_size)
print("Memory state initialisation", h_init)
print("Memory state size",h_init.size())

rnn = nn.RNN(input_size,hidden_size,nonlinearity = 'tanh',bias = True)
print("RNN",rnn)

h_seq, h_final = rnn(output,h_init)
print("H_seq",h_seq)
print("H_final",h_final)
print(h_seq.size(),"\n", h_final.size())

# RNN score computation - one forward pass

In [None]:
# Linear layer

in_features = 4
out_features = 5
seq_length = 3
batch_size = 2

# RNN layer

num_layers = 1
num_directions = 1
hidden_size = 5
input_size = out_features

# Word embedding

num_embeddings = 4
embedding_dim = 5

w_emb = nn.Embedding(num_embeddings, embedding_dim)
print("weight",w_emb.weight)

input = torch.LongTensor([[1,2,3],[3,2,1]])
input = torch.transpose(input,0,1)

print("input",input)

labels = torch.LongTensor([[1,2,3],[3,2,1]])

output = w_emb(input)
print("Size of the Embedding layer output",output.size())
#output = output.view(seq_length,batch_size,input_size)
print("Embedding layer output", output)


# Memory state
h_init = torch.zeros (num_layers * num_directions, batch_size, hidden_size)
print("Memory state initialisation", h_init)
print("Memory state size",h_init.size())

rnn = nn.RNN(input_size,hidden_size,nonlinearity = 'tanh',bias = True)
print("RNN",rnn)

h_seq, h_final = rnn(output,h_init)
print("H_seq",h_seq)
print("H_final",h_final)
print(h_seq.size(),"\n", h_final.size())

### Linear layer - for next word prediction 

score_layer = nn.Linear(hidden_size,num_embeddings)

output_score = score_layer(h_seq).view(batch_size*seq_length,num_embeddings)
print(output_score)

labels = labels.view(batch_size*seq_length)
print(labels)

criterion = nn.CrossEntropyLoss()

loss= criterion(output_score,labels)

print("loss",loss)

#F = torch.softmax(output_score,dim=1)

#print(F)
#print(F[0].sum())

# Bidirectional RNN

In [None]:
# Linear layer

in_features = 4
out_features = 5
seq_length = 3
batch_size = 2

# RNN layer

num_layers = 1
num_directions = 2
hidden_size = 5
input_size = out_features

# Word embedding

num_embeddings = 4
embedding_dim = 5

w_emb = nn.Embedding(num_embeddings, embedding_dim)
print("weight",w_emb.weight)

input = torch.LongTensor([[1,2,3],[3,2,1]])
input = torch.transpose(input,0,1)
print("input",input)

output = w_emb(input)
print("Size of the Embedding layer output",output.size())
#output = output.view(seq_length,batch_size,input_size)
print("Embedding layer output", output)


# Memory state
h_init = torch.zeros (num_layers * num_directions, batch_size, hidden_size)
print("Memory state initialisation", h_init)
print("Memory state size",h_init.size())

rnn = nn.RNN(input_size,hidden_size,nonlinearity = 'tanh',bias = True, bidirectional = True)
print("RNN",rnn)

h_seq, h_final = rnn(output,h_init)
print("H_seq",h_seq)
print("H_final",h_final)
print(h_seq.size(),"\n", h_final.size())