In [93]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter

In [106]:
# Stacked (10-layer) unidirectional RNN: look at the shapes it returns and at
# how PyTorch names the parameters of each layer.
input = torch.rand(1, 32, 128)  # N * L * D
layer = nn.RNN(input_size=128, hidden_size=256, num_layers=10, batch_first=True)
output = layer(input)  # tuple: (per-timestep output, final hidden states)
for part in output:
    print(part.shape)
for param_name, _ in layer.named_parameters():
    print(param_name)

torch.Size([1, 32, 256])
torch.Size([10, 1, 256])
weight_ih_l0
weight_hh_l0
bias_ih_l0
bias_hh_l0
weight_ih_l1
weight_hh_l1
bias_ih_l1
bias_hh_l1
weight_ih_l2
weight_hh_l2
bias_ih_l2
bias_hh_l2
weight_ih_l3
weight_hh_l3
bias_ih_l3
bias_hh_l3
weight_ih_l4
weight_hh_l4
bias_ih_l4
bias_hh_l4
weight_ih_l5
weight_hh_l5
bias_ih_l5
bias_hh_l5
weight_ih_l6
weight_hh_l6
bias_ih_l6
bias_hh_l6
weight_ih_l7
weight_hh_l7
bias_ih_l7
bias_hh_l7
weight_ih_l8
weight_hh_l8
bias_ih_l8
bias_hh_l8
weight_ih_l9
weight_hh_l9
bias_ih_l9
bias_hh_l9


In [107]:
# Same stack, now bidirectional: print the shape of the per-timestep output
# and of the final hidden states.
input = torch.rand(1, 32, 128)  # N * L * D
layer = nn.RNN(
    input_size=128,
    hidden_size=256,
    num_layers=10,
    batch_first=True,
    bidirectional=True,
)
output = layer(input)  # tuple: (per-timestep output, final hidden states)
print(*(part.shape for part in output), sep="\n")


torch.Size([1, 32, 512])
torch.Size([20, 1, 256])


In [105]:
# one layer

def rnn_forward(input, W_ih, W_hh, B_ih, B_hh, h_0):
    """Run a single-layer, single-direction tanh RNN over a batch-first input.

    At every timestep the hidden state is updated as
    ``h_t = tanh(x_t @ W_ih.T + B_ih + h_{t-1} @ W_hh.T + B_hh)``.

    Args:
        input: Tensor sliced per timestep as ``input[:, t, :]``, i.e. laid out
            as ``(N, L, D)`` (batch, sequence length, features).
        W_ih: Input-to-hidden weight, applied as ``x_t @ W_ih.T``;
            ``W_ih.shape[0]`` is taken as the hidden size ``H``.
        W_hh: Hidden-to-hidden weight, applied as ``h @ W_hh.T``.
        B_ih: Input-to-hidden bias.
        B_hh: Hidden-to-hidden bias.
        h_0: Initial hidden state. Anything that broadcasts against the
            ``(N, H)`` pre-activation works; a leading singleton dimension
            (e.g. ``(1, 1, H)``) is carried through to the returned state.

    Returns:
        A tuple ``(output, h_n)``. ``output`` has shape ``(N, L, H)`` and holds
        the hidden state of every timestep. ``h_n`` is the hidden state after
        the last timestep (``h_0`` itself when ``L == 0``).
    """
    N, L = input.shape[0], input.shape[1]
    # Allocate the buffer with the input's dtype and device. A bare
    # torch.zeros(...) would silently downcast float64 results to float32 and
    # keep the output on the default device.
    output = input.new_zeros((N, L, W_ih.shape[0]))
    h_prev = h_0

    for t in range(L):
        x_t = input[:, t, :]
        h_prev = torch.tanh(x_t @ W_ih.T + B_ih + h_prev @ W_hh.T + B_hh)
        output[:, t, :] = h_prev

    return output, h_prev

# Compare the hand-written single-layer forward pass with nn.RNN using the
# same weights. The reference results get their own names so they are still
# available for the numeric check at the end.
input = torch.rand(2, 32, 128) # N * L * D
h_0 = torch.zeros(1, 1, 256)  # broadcasts over the batch inside rnn_forward
layer = nn.RNN(128, 256, 1, batch_first=True)
torch_output, torch_h_n = layer(input)
print(torch_output)
print(torch_h_n)
output, h_n = rnn_forward(input, layer.weight_ih_l0, layer.weight_hh_l0, layer.bias_ih_l0, layer.bias_hh_l0, h_0)
print(output)

# Eyeballing truncated tensor dumps is error-prone; check the match numerically.
assert torch.allclose(output, torch_output, rtol=1e-4, atol=1e-5)
assert torch.allclose(h_n, torch_h_n, rtol=1e-4, atol=1e-5)


tensor([[[ 1.6322e-01,  2.4584e-02,  5.7583e-04,  ...,  4.0266e-02,
           1.5913e-01,  4.9608e-01],
         [ 4.3029e-02,  9.0179e-02, -6.4765e-02,  ...,  3.9127e-01,
           2.9561e-01,  5.1282e-01],
         [-9.0240e-02,  1.1956e-01, -6.9290e-02,  ...,  6.4814e-02,
           3.1744e-01,  5.1586e-01],
         ...,
         [-1.0407e-01,  1.6431e-01, -3.0555e-02,  ...,  2.4383e-01,
           1.5339e-01,  3.1201e-01],
         [-7.1087e-02,  2.3048e-01,  1.4084e-01,  ...,  1.3087e-02,
           1.8814e-01,  4.8251e-01],
         [ 3.8113e-02, -1.9510e-01, -2.2524e-01,  ...,  7.2509e-02,
           2.0123e-01,  4.0140e-01]],

        [[ 9.8481e-02,  4.8401e-02, -1.3581e-01,  ...,  2.9438e-01,
          -5.0514e-02,  2.4868e-01],
         [ 4.4125e-04,  1.5117e-01,  1.6450e-01,  ...,  2.0371e-01,
           2.7291e-01,  4.5843e-01],
         [-2.2823e-01,  5.2925e-03,  6.3953e-03,  ...,  5.8448e-02,
           3.5168e-02,  3.3025e-01],
         ...,
         [-1.8976e-01,  9

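**Outputs of the two directions are concatenated along the feature (last) dimension, not the sequence-length dimension. The reverse-direction output must first be flipped back along the sequence dimension so that its timesteps line up with the forward output.**
**Final hidden states are concatenated along dimension 0 (the `num_layers * num_directions` dimension), not the batch dimension. The reverse hidden state is not flipped: it is a single state per sequence, reached after reading the sequence backwards, so it has no time axis to reverse.**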

In [75]:
# one layer

def bidirectional_rnn_forward(input, W_ih, W_hh, B_ih, B_hh, h_0, W_reverse_ih, W_reverse_hh, B_reverse_ih, B_reverse_hh, h_reverse_0):
    """Run one bidirectional RNN layer by combining two ``rnn_forward`` passes.

    The forward direction reads ``input`` as given. The reverse direction
    reads ``input`` flipped along dimension 1 (the sequence dimension).

    Args:
        input: Batch-first input; dimension 1 is the sequence dimension.
        W_ih, W_hh, B_ih, B_hh: Parameters of the forward direction.
        h_0: Initial hidden state of the forward direction.
        W_reverse_ih, W_reverse_hh, B_reverse_ih, B_reverse_hh: Parameters of
            the reverse direction.
        h_reverse_0: Initial hidden state of the reverse direction.

    Returns:
        A tuple ``(combined_output, h_n)``. ``combined_output`` is the forward
        output followed by the re-flipped reverse output along the last
        dimension. ``h_n`` is the forward final state followed by the reverse
        final state, concatenated along dimension 0.
    """
    output, h_prev = rnn_forward(input, W_ih, W_hh, B_ih, B_hh, h_0)
    output_reverse, h_prev_reverse = rnn_forward(
        torch.flip(input, [1]),
        W_reverse_ih, W_reverse_hh, B_reverse_ih, B_reverse_hh,
        h_reverse_0,
    )
    # The per-timestep output is flipped back so position t of both halves
    # refers to the same input timestep. The final hidden state is not flipped.
    output_reverse = torch.flip(output_reverse, [1])

    # Concatenate instead of copying into a torch.zeros(...) buffer: the result
    # keeps the dtype and device of the per-direction outputs rather than
    # being forced to the defaults.
    combined_output = torch.cat([output, output_reverse], dim=2)
    return combined_output, torch.cat([h_prev, h_prev_reverse], dim=0)

# Compare the hand-written bidirectional layer with nn.RNN using the same
# weights. The reference results get their own names so they are still
# available for the numeric check at the end.
input = torch.rand(2, 32, 128) # N * L * D
h_0 = torch.zeros(1, 1, 256)
h_0_reverse = torch.zeros(1, 1, 256)
layer = nn.RNN(128, 256, 1, batch_first=True, bidirectional=True)
torch_output, torch_h_n = layer(input)
print(torch_output)
print(torch_h_n)
output, h_n = bidirectional_rnn_forward(
    input,
    layer.weight_ih_l0, layer.weight_hh_l0, layer.bias_ih_l0, layer.bias_hh_l0, h_0,
    layer.weight_ih_l0_reverse, layer.weight_hh_l0_reverse,
    layer.bias_ih_l0_reverse, layer.bias_hh_l0_reverse, h_0_reverse,
)
print(output)
print(h_n)

# Eyeballing truncated tensor dumps is error-prone; check the match numerically.
assert torch.allclose(output, torch_output, rtol=1e-4, atol=1e-5)
assert torch.allclose(h_n, torch_h_n, rtol=1e-4, atol=1e-5)

tensor([[[ 0.0568,  0.1580,  0.0554,  ...,  0.0472, -0.3300, -0.3740],
         [ 0.1640,  0.1496, -0.1464,  ..., -0.1184, -0.2303, -0.2452],
         [-0.1713,  0.0829, -0.1965,  ...,  0.0542, -0.3465, -0.2295],
         ...,
         [ 0.0247, -0.0193, -0.3233,  ...,  0.0061, -0.3055, -0.2373],
         [ 0.0373,  0.0874,  0.0355,  ...,  0.0684, -0.2353, -0.3872],
         [-0.1355,  0.1523, -0.2585,  ...,  0.2241, -0.1634, -0.6356]],

        [[ 0.2082, -0.0737, -0.0546,  ...,  0.0469,  0.0127, -0.2052],
         [-0.0714, -0.1095, -0.2451,  ..., -0.0404, -0.4353, -0.2885],
         [ 0.2210, -0.0547, -0.1427,  ..., -0.1025, -0.2274, -0.4809],
         ...,
         [-0.0228,  0.0934, -0.0999,  ..., -0.1799, -0.3475, -0.3763],
         [-0.0653,  0.1755, -0.1629,  ..., -0.0107, -0.5088, -0.5674],
         [ 0.2802, -0.2470, -0.0702,  ...,  0.0027, -0.0331, -0.3965]]],
       grad_fn=<TransposeBackward1>)
tensor([[[-0.1355,  0.1523, -0.2585,  ..., -0.0304,  0.1876,  0.0636],
        

In [91]:
# multilayers
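# The final hidden state of the previous layer is not passed to the next layer:
# every layer starts from a zero hidden state and only receives the previous
# layer's per-timestep output as its input.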

def rnn_forward_multilayers(input, W_ih, W_hh, B_ih, B_hh):
    """Run a stack of unidirectional RNN layers, feeding each output upward.

    Every layer starts from an all-zero hidden state. Only the per-timestep
    output of a layer is passed on as the next layer's input.

    Args:
        input: Batch-first input to the first layer.
        W_ih, W_hh, B_ih, B_hh: Per-layer parameter sequences, indexed by
            layer. ``len(W_ih)`` determines the number of layers and
            ``W_ih[0].shape[0]`` the hidden size used for the initial state.

    Returns:
        A tuple ``(output, h_n)``. ``output`` is the per-timestep output of the
        last layer. ``h_n`` is every layer's final hidden state concatenated
        along dimension 0.
    """
    # Build the initial state from the input so it shares dtype and device
    # with the data. A bare torch.zeros(...) fails with float64 or non-CPU
    # weights.
    h_0 = input.new_zeros((1, 1, W_ih[0].shape[0]))
    h_n = []

    for layer in range(len(W_ih)):
        output, h_prev = rnn_forward(input, W_ih[layer], W_hh[layer], B_ih[layer], B_hh[layer], h_0)
        input = output  # the next layer consumes this layer's output
        h_n.append(h_prev)
    return output, torch.cat(h_n, dim=0)



# Compare the stacked hand-written forward pass with nn.RNN using the same
# weights.
input = torch.rand(2, 32, 128) # N * L * D
layer = nn.RNN(128, 256, 3, batch_first=True)
torch_output, torch_h_n = layer(input)

# Collect the per-layer parameters by attribute name so the lists follow
# layer.num_layers instead of being hard-coded for exactly three layers.
W_ih = [getattr(layer, f"weight_ih_l{i}") for i in range(layer.num_layers)]
W_hh = [getattr(layer, f"weight_hh_l{i}") for i in range(layer.num_layers)]
B_ih = [getattr(layer, f"bias_ih_l{i}") for i in range(layer.num_layers)]
B_hh = [getattr(layer, f"bias_hh_l{i}") for i in range(layer.num_layers)]
my_output, my_h_n = rnn_forward_multilayers(input, W_ih, W_hh, B_ih, B_hh)

print(torch_output)
print(my_output)

print(torch_h_n)
print(my_h_n)

# Eyeballing truncated tensor dumps is error-prone; check the match numerically.
assert torch.allclose(my_output, torch_output, rtol=1e-4, atol=1e-5)
assert torch.allclose(my_h_n, torch_h_n, rtol=1e-4, atol=1e-5)

tensor([[[ 0.0260,  0.0707, -0.0743,  ..., -0.1762, -0.2624,  0.0388],
         [-0.1117,  0.3146, -0.0922,  ..., -0.1021, -0.1647,  0.0743],
         [-0.0768,  0.3284,  0.0390,  ..., -0.2411, -0.1844,  0.1068],
         ...,
         [-0.0696,  0.1940,  0.0810,  ..., -0.0991, -0.1461,  0.0283],
         [-0.1369,  0.2195,  0.0966,  ..., -0.2227, -0.1920,  0.0370],
         [-0.0826,  0.2015,  0.1103,  ..., -0.1661, -0.2656,  0.0510]],

        [[ 0.0208,  0.0494, -0.0766,  ..., -0.2300, -0.2538,  0.0535],
         [-0.0821,  0.2166, -0.0123,  ..., -0.2222, -0.2386,  0.0378],
         [-0.0468,  0.2966,  0.1014,  ..., -0.1403, -0.1236, -0.0315],
         ...,
         [ 0.0252,  0.3453,  0.1335,  ..., -0.0645, -0.1282,  0.0203],
         [-0.0182,  0.3488,  0.1221,  ..., -0.0972, -0.2039,  0.1223],
         [ 0.0043,  0.3280,  0.0614,  ..., -0.2298, -0.1836, -0.0687]]],
       grad_fn=<TransposeBackward1>)
tensor([[[ 0.0260,  0.0707, -0.0743,  ..., -0.1762, -0.2624,  0.0388],
        

In [95]:
def bidirectional_rnn_forward_multilayers(input, W_ih, W_hh, B_ih, B_hh, W_reverse_ih, W_reverse_hh, B_reverse_ih, B_reverse_hh):
    """Run a stack of bidirectional RNN layers, feeding each output upward.

    Both directions of every layer start from the same all-zero hidden state.
    The combined output of a layer becomes the input of the next layer.

    Args:
        input: Batch-first input to the first layer.
        W_ih, W_hh, B_ih, B_hh: Per-layer parameter sequences of the forward
            direction, indexed by layer. ``len(W_ih)`` determines the number
            of layers and ``W_ih[0].shape[0]`` the hidden size used for the
            initial state.
        W_reverse_ih, W_reverse_hh, B_reverse_ih, B_reverse_hh: Per-layer
            parameter sequences of the reverse direction.

    Returns:
        A tuple ``(output, h_n)``. ``output`` is the combined output of the
        last layer. ``h_n`` is the final hidden states returned by each layer,
        concatenated along dimension 0 in layer order.
    """
    # Build the initial state from the input so it shares dtype and device
    # with the data. A bare torch.zeros(...) fails with float64 or non-CPU
    # weights.
    h_0 = input.new_zeros((1, 1, W_ih[0].shape[0]))
    h_n = []

    for layer in range(len(W_ih)):
        output, h_prev = bidirectional_rnn_forward(
            input,
            W_ih[layer], W_hh[layer], B_ih[layer], B_hh[layer], h_0,
            W_reverse_ih[layer], W_reverse_hh[layer], B_reverse_ih[layer], B_reverse_hh[layer], h_0,
        )
        input = output  # the next layer consumes both directions' outputs
        h_n.append(h_prev)
    return output, torch.cat(h_n, dim=0)

# Compare the stacked bidirectional hand-written forward pass with nn.RNN
# using the same weights.
input = torch.rand(2, 32, 128) # N * L * D
layer = nn.RNN(128, 256, 3, batch_first=True, bidirectional=True)
torch_output, torch_h_n = layer(input)

# Collect the per-layer parameters by attribute name so the lists follow
# layer.num_layers instead of being hard-coded for exactly three layers.
W_ih = [getattr(layer, f"weight_ih_l{i}") for i in range(layer.num_layers)]
W_hh = [getattr(layer, f"weight_hh_l{i}") for i in range(layer.num_layers)]
B_ih = [getattr(layer, f"bias_ih_l{i}") for i in range(layer.num_layers)]
B_hh = [getattr(layer, f"bias_hh_l{i}") for i in range(layer.num_layers)]
W_reverse_ih = [getattr(layer, f"weight_ih_l{i}_reverse") for i in range(layer.num_layers)]
W_reverse_hh = [getattr(layer, f"weight_hh_l{i}_reverse") for i in range(layer.num_layers)]
B_reverse_ih = [getattr(layer, f"bias_ih_l{i}_reverse") for i in range(layer.num_layers)]
B_reverse_hh = [getattr(layer, f"bias_hh_l{i}_reverse") for i in range(layer.num_layers)]
my_output, my_h_n = bidirectional_rnn_forward_multilayers(input, W_ih, W_hh, B_ih, B_hh, W_reverse_ih, W_reverse_hh, B_reverse_ih, B_reverse_hh)

print(torch_output)
print(my_output)

print(torch_h_n)
print(my_h_n)

# Eyeballing truncated tensor dumps is error-prone; check the match numerically.
assert torch.allclose(my_output, torch_output, rtol=1e-4, atol=1e-5)
assert torch.allclose(my_h_n, torch_h_n, rtol=1e-4, atol=1e-5)

tensor([[[-2.2871e-02,  8.2766e-02,  9.8542e-02,  ...,  3.1438e-01,
          -1.8367e-01,  3.2801e-01],
         [-2.1307e-01,  2.2806e-01,  4.6400e-03,  ...,  1.4595e-01,
          -5.3105e-02,  1.0537e-01],
         [-2.6231e-01,  3.4964e-01,  5.0635e-02,  ...,  1.2524e-01,
          -3.6468e-01,  4.3890e-02],
         ...,
         [-3.0108e-01,  4.6312e-01, -2.1220e-01,  ...,  1.8264e-01,
           1.1459e-02,  2.7888e-01],
         [-3.3551e-01,  3.8755e-01, -2.7315e-02,  ...,  9.4507e-02,
          -7.3615e-04,  3.2940e-01],
         [-1.8453e-01,  4.9384e-01, -6.3438e-02,  ...,  8.1001e-02,
          -1.0529e-01,  1.8432e-01]],

        [[ 2.4146e-04,  1.4694e-01, -1.5009e-01,  ...,  3.8884e-02,
          -2.2971e-02,  2.8263e-01],
         [-1.0530e-01,  1.9030e-01,  8.2592e-02,  ...,  3.6136e-01,
          -8.9375e-02,  2.4751e-01],
         [-3.6790e-01,  2.6136e-01,  2.8915e-02,  ...,  7.5365e-02,
          -3.1295e-01,  4.0544e-01],
         ...,
         [-2.9997e-01,  3

In [96]:
# Print the shape of every per-layer tensor, one parameter group at a time.
for group in (W_reverse_ih, W_hh, B_ih, B_hh):
    for tensor in group:
        print(tensor.shape)

torch.Size([256, 128])
torch.Size([256, 512])
torch.Size([256, 512])
torch.Size([256, 256])
torch.Size([256, 256])
torch.Size([256, 256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])


In [100]:
class RNN(nn.Module):
    """Stacked tanh RNN, optionally bidirectional, with a hand-written forward pass.

    Parameters are stored per layer in ``nn.ParameterList`` attributes named
    ``W_ih``, ``W_hh``, ``B_ih`` and ``B_hh``. With ``bidirectional=True`` the
    reverse direction gets ``W_ih_reverse``, ``W_hh_reverse``,
    ``B_ih_reverse`` and ``B_hh_reverse`` as well.

    Args:
        dims: Feature size of the input to the first layer.
        hidden_dims: Hidden size of every layer and direction.
        num_layers: Number of stacked layers; must be at least 1.
        bidirectional: When true, every layer also runs a reverse direction
            and layers above the first take ``2 * hidden_dims`` input features.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """

    def __init__(self, dims, hidden_dims, num_layers, bidirectional=False):
        super(RNN, self).__init__()

        if num_layers < 1:
            raise ValueError("num_layers must be at least 1, got {}".format(num_layers))

        self.bidirectional = bidirectional

        # Layers above the first consume the previous layer's output, which is
        # twice as wide when both directions are concatenated.
        stacked_dims = 2 * hidden_dims if bidirectional else hidden_dims
        in_dims = [dims] + [stacked_dims] * (num_layers - 1)

        # Same scheme as nn.RNN: U(-1/sqrt(H), 1/sqrt(H)). Plain torch.rand
        # gives all-positive weights, which drives tanh into saturation.
        bound = hidden_dims ** -0.5 if hidden_dims > 0 else 0.0

        def new_param(*shape):
            # Wrap explicitly so registration does not rely on ParameterList
            # converting raw tensors.
            return nn.Parameter(torch.empty(*shape).uniform_(-bound, bound))

        def new_direction():
            # One full set of per-layer parameters for a single direction.
            return (
                nn.ParameterList([new_param(hidden_dims, d) for d in in_dims]),
                nn.ParameterList([new_param(hidden_dims, hidden_dims) for _ in in_dims]),
                nn.ParameterList([new_param(hidden_dims) for _ in in_dims]),
                nn.ParameterList([new_param(hidden_dims) for _ in in_dims]),
            )

        if bidirectional:
            # Registered before the forward-direction lists so that
            # named_parameters() keeps its original ordering.
            (self.W_ih_reverse, self.W_hh_reverse,
             self.B_ih_reverse, self.B_hh_reverse) = new_direction()

        self.W_ih, self.W_hh, self.B_ih, self.B_hh = new_direction()

    def forward(self, x):
        """Run the stack on ``x`` and return ``(output, h_n)``.

        ``x`` is handed unchanged to ``rnn_forward_multilayers`` or, when the
        module is bidirectional, ``bidirectional_rnn_forward_multilayers``,
        together with this module's parameter lists.
        """
        if self.bidirectional:
            return bidirectional_rnn_forward_multilayers(x, self.W_ih, self.W_hh, self.B_ih, self.B_hh, self.W_ih_reverse, self.W_hh_reverse, self.B_ih_reverse, self.B_hh_reverse)
        else:
            return rnn_forward_multilayers(x, self.W_ih, self.W_hh, self.B_ih, self.B_hh)
        
         

In [104]:
# Smoke-test the custom module: run a bidirectional 3-layer stack once, then
# print the result shapes and the registered parameter names.
input = torch.rand(2, 32, 128)  # N * L * D
layer = RNN(128, 256, 3, bidirectional=True)
my_output, my_h_n = layer(input)
for result in (my_output, my_h_n):
    print(result.shape)

for param_name, _ in layer.named_parameters():
    print(param_name)

torch.Size([2, 32, 512])
torch.Size([6, 2, 256])
W_ih_reverse.0
W_ih_reverse.1
W_ih_reverse.2
W_hh_reverse.0
W_hh_reverse.1
W_hh_reverse.2
B_ih_reverse.0
B_ih_reverse.1
B_ih_reverse.2
B_hh_reverse.0
B_hh_reverse.1
B_hh_reverse.2
W_ih.0
W_ih.1
W_ih.2
W_hh.0
W_hh.1
W_hh.2
B_ih.0
B_ih.1
B_ih.2
B_hh.0
B_hh.1
B_hh.2
