In [568]:
import torch
import torch.nn as nn
import numpy as np

<h1><b>Single LSTM Cell (With PyTorch)</b></h1>

In [569]:
model = nn.LSTMCell(input_size=20,hidden_size=100)

In [570]:
print(model.weight_hh.shape)
print(model.weight_ih.shape)
print(model.bias_hh.shape)
print(model.bias_ih.shape)

torch.Size([400, 100])
torch.Size([400, 20])
torch.Size([400])
torch.Size([400])


<h1><b>Single LSTM Cell (Without PyTorch)</b></h1>

In [571]:
def sigmoid(x):
  return 1 / (1 + np.exp(-x))

def lstm_cell(x,h,c,W_hh,W_ih,b):
  i,f,g,o = np.split(W_hh@h + W_ih@x + b , 4)
  i,f,g,o = sigmoid(i),sigmoid(f),np.tanh(g),sigmoid(o)
  c_t = f*c + i*g
  h_t = o * np.tanh(c_t)
  return h_t , c_t

<h2><b>LSTM Cell Validity Check</b></h2>

In [572]:
x0 = np.random.randn(1,20).astype(np.float32)
h0 = np.random.randn(1,100).astype(np.float32)
c0 = np.random.randn(1,100).astype(np.float32)

h_ , c_ = model(torch.tensor(x0) , (torch.tensor(h0) , torch.tensor(c0)))

In [573]:
h , c = lstm_cell(x0[0],h0[0],c0[0],
                model.weight_hh.detach().numpy(),
                model.weight_ih.detach().numpy(),
                (model.bias_hh + model.bias_ih).detach().numpy())

In [574]:
print(np.linalg.norm(c - c_[0].detach().numpy()))
print(np.linalg.norm(h - h_[0].detach().numpy()))

4.2234015e-07
2.483791e-07


<h1><b>Sequence LSTM (With PyTorch)</b></h1>

In [575]:
model = nn.LSTM(input_size=20,hidden_size=100,num_layers=1)

<h1><b>Sequence LSTM (Without PyTorch)</b></h1>

In [576]:
# Following the convention of PyTorch we will return the hidden state of
# all time steps up until now and the current memory state
def lstm(X,h,c,W_hh,W_ih,b):
  H = np.zeros((X.shape[0], h.shape[0]))
  for t in range(X.shape[0]):
    h , c = lstm_cell(X[t],h,c,W_hh,W_ih,b)
    H[t] = h
  return H , c

<h2><b>Sequence LSTM validity check</b></h2>

In [577]:
X = np.random.randn(50,20).astype(np.float32)
h0 = np.random.randn(1,100).astype(np.float32)
c0 = np.random.randn(1,100).astype(np.float32)

In [578]:
H,c = lstm(X,h0[0],c0[0],
            model.weight_hh_l0.detach().numpy(),
            model.weight_ih_l0.detach().numpy(),
           (model.bias_hh_l0 + model.bias_ih_l0).detach().numpy())

In [579]:
H_,(h_,c_) = model(torch.tensor(X),(torch.tensor(h0),torch.tensor(c0)))

In [580]:
np.linalg.norm(H - H_.detach().numpy())

np.float64(1.5254268484843015e-06)

<h1><b>Batching</b></h1>


> A true LSTM model, as is the case with PyTorch, takes a batch of sequences as input.




Looking at the definition of the LSTM model in PyTorch documentation we will realize that the input it takes by default is a tensor of shape `(T,B,n)` (unless you set `batch_first = true`) where :<br>
<h4><b>first dimension corresponds to the input at time t</b></h4>
<h4><b>Second dimension corresponds to the sequence in batch</b></h4>
<h4><b>Third dimension corresponds to positions in an input vector</b></h4><br>

However from intuition, input representation of shape `(B,T,n)` is more logical and matches the <b>top-down</b> structure of the training dataset that we input to the LSTM.

- <h2>Why ?</h2>

With the representation in the form `(B,T,n)` the matrix of input vectors at time step `t` accessed through `X[:,t,:]` will not be <u>retrieved contiguously and thus inefficient</u>.However using the `(T,B,n)` representation the matrix of inputs at time t will be accessed through `X[t,:,:]` in a way that all the elements are in a <b>contiguous</b> block of memory. Moreover these matrices themselves are also stored contiguously ` X[0,:,:], X[1,:,:] , X[2,:,:] , ...`.

<h1><b>Full LSTM model (Batch Version)</b></h1>

Here the only difference from the sequence LSTM cell is that now we have the `x_t,h_t-1 and c_t-1` not just from **one sequence** but from **a batch of sequences** and thus `x, h, c` would be matrices. So when splitting we have to specify with respect to which axis are we gonna split it.



> Only a slight change in sequence LSTM cell and LSTM model is needed.






In [581]:

def lstm_cell(x, h, c, W_hh, W_ih, b):
  i,f,g,o = np.split(W_hh@(h.T) + W_ih@(x.T) + b,4, axis = 0)
  i,f,g,o = sigmoid(i),sigmoid(f),np.tanh(g),sigmoid(o)
  i,f,g,o = i.T,f.T,g.T,o.T
  c_t = f*c + i*g
  h_t = o*np.tanh(c_t)

  return h_t , c_t


def lstm(X,h,c,W_hh,W_ih,b):
  H = np.zeros((X.shape[0],X.shape[1],h.shape[1]))
  for t in range(X.shape[0]):
    h , c = lstm_cell(X[t],h,c,W_hh,W_ih,b)
    H[t] = h
  return H , c

<h2><b>Batch LSTM Validity Check</b></h2>

In [582]:
X = np.random.randn(50,128,20).astype(np.float32)
h0 = np.random.randn(1,128,100).astype(np.float32)
c0 = np.random.randn(1,128,100).astype(np.float32)

In [583]:
H_,(h_,c_) = model(torch.tensor(X),(torch.tensor(h0),torch.tensor(c0)))

In [584]:
H , c = lstm(X,h0[0],c0[0],
             model.weight_hh_l0.detach().numpy(),
             model.weight_ih_l0.detach().numpy(),
             (model.bias_hh_l0 + model.bias_ih_l0)[:,None].detach().numpy())

In [585]:
print(np.linalg.norm(H - H_.detach().numpy()))

9.928616607572253e-06
