In [1]:
import numpy as np


In [2]:
# implement softmax
def sftmax(x):
    e=np.exp(x-np.max(x))
    return e/np.sum(e, axis=0)

In [3]:
# implement tanh function
def tanh(x):
    ex=np.exp(x)
    ex1=np.exp(-x)
    return (ex-ex1)/(ex+ex1)

In [53]:
# implement sigmoid function
def sigmoid(x):
    e=np.exp(-x)
    return 1/(1+e)

In [42]:
# implement rnn forward propagation
def fwdProp(hprev, xt, params):
    cache=[]
    # get the parameters
    U=params['U']
    W=params['W']
    V=params['V']
    bx=params['bx']
    by=params['by']
    # calculate the hidden state
    ht=tanh(np.dot(U.T,xt)+np.dot(W,hprev)+ bx)
    yt=sftmax(np.dot(V, ht)+by)
    cache.append((hprev, ht, xt, params))
    return ht, yt, cache

In [43]:
# test the code 
xt=np.random.randn(500, 200)
hprev=np.random.randn(500,200)
params={}
params['U']=np.random.rand(200, 500)
params['W']=np.random.rand(200, 500)
params['V']=np.random.rand(200,200)
params['bx']=np.random.randn(200)
params['by']=np.random.randn(200)

In [44]:
fwdProp(hprev, xt, params)

ValueError: shapes (500,200) and (500,200) not aligned: 200 (dim 1) != 500 (dim 0)

In [50]:
# compute the fwd prop for whole sequence
def fwdPropSeq(x,h0,params):
    Nx,M,T=x.shape
    
    y_n, h_n=params['V'].shape
    h_n,h_n=params['W'].shape
    Ypred=np.zeros((y_n,M,T))
    H=np.zeros((h_n,M,T))
    caches=[]
    
    h_next=h0
    for t in range(T):
        h_next,yt, cache=fwdProp(h_next, x[:,:,t], params)
        Ypred[:,:,t]=yt
        H[:,:,t]=h_next
        caches.append(cache)
    return Ypred, H, caches

In [51]:
# test the code 
x=np.random.randn(500, 200, 10)
h0=np.random.randn(300,200)
params={}
params['U']=np.random.rand(500, 300)
params['W']=np.random.rand(300, 300)
params['V']=np.random.rand(200,300)
params['bx']=np.random.randn(300,1)
params['by']=np.random.randn(200,1)
fwdPropSeq(x,h0, params)

(array([[[1.01451435e-06, 1.38291644e-04, 1.38291644e-04, ...,
          1.38291644e-04, 1.38291644e-04, 1.38291644e-04],
         [4.33373819e-04, 1.38291644e-04, 1.38291644e-04, ...,
          1.38291644e-04, 1.38291644e-04, 1.38291644e-04],
         [2.62201357e-05, 2.33176966e-08, 2.33176966e-08, ...,
          2.33176966e-08, 2.33176966e-08, 2.33176966e-08],
         ...,
         [1.01137994e-02, 1.38291644e-04, 1.38291644e-04, ...,
          1.38291644e-04, 1.38291644e-04, 1.38291644e-04],
         [9.99106013e-11, 2.33176966e-08, 2.33176966e-08, ...,
          2.33176966e-08, 2.33176966e-08, 2.33176966e-08],
         [3.83978991e-05, 1.38291644e-04, 1.38291644e-04, ...,
          1.38291644e-04, 1.38291644e-04, 1.38291644e-04]],
 
        [[3.52851207e-09, 1.53525004e-09, 1.53525004e-09, ...,
          1.53525004e-09, 1.53525004e-09, 1.53525004e-09],
         [3.93901571e-09, 1.53525004e-09, 1.53525004e-09, ...,
          1.53525004e-09, 1.53525004e-09, 1.53525004e-09],
       

In [28]:
h=np.random.randn(300,200)
o=np.dot(params['U'].T,x[:,:,0])+np.dot(params['W'].T,h)

In [36]:
o+params['bx']

array([[-11.92039102, -12.39168958,  15.09550363, ...,  -0.21508797,
         24.7400666 ,   1.86474612],
       [-11.00529197, -18.5583804 ,  22.07860765, ...,  -1.02500236,
         15.49248824, -20.84580541],
       [ -5.36429579, -22.87657713,   0.38521942, ..., -10.30832679,
         26.39869104,   1.15052224],
       ...,
       [ -7.2290048 ,  -5.4191744 ,   3.42809769, ...,  -8.95517186,
          6.29433727, -11.62126546],
       [ 12.4575044 ,  -0.48298026,  17.41731892, ...,  -7.46256275,
         19.25746471,  -3.25491462],
       [  2.10025367,  -3.38168645,   4.69849188, ..., -22.71797381,
         21.39383213, -18.29763778]])

LSTM implementation

In [52]:
import numpy as np

In [103]:
# we will assume that the entire sequence of input as a tuple (n,m,T), where
# n is the size of the vocabulary, m is the size of the mini batch, and T is the
# total time sequence. 

# we will create a function lstm fwd prop

def LSTMfwdProp(X, h0,c0, params):
    
    '''
    X= the entire input sequence of the shape (n, m, T).
    h0= the initial hidden state.
    params= the list of parameters that consist of Wxf, Whf, Wxi, Whi, Wxc, Whc, Wco,Who,
    bf,bi,bc,bo.
    
    '''
    # get the dimensions of X
    n, m, T=X.shape
    
    # get the shape of h
    h, h = params['Whf'].shape
    
    # get the shape of y
    h,y=params['Wy'].shape
    
    # initialize tensors h, c, and  y
    
    c=np.zeros((h,m,T))
    h=np.zeros((h,m,T))    
    y=np.zeros((y,m,T))
    
    
    # read the parameters
    Wxf=params['Wxf']
    Wxi=params['Wxi']
    Wxc=params['Wxc']
    Wxo=params['Wxo']
    Whf=params['Whf']
    Whi=params['Whi']
    Whc=params['Whc']
    Who=params['Who']
    Wy=params['Wy']
    bf=params['bf']
    bi=params['bi']
    bc=params['bc']
    bo=params['bo']
    by=params['by']
    
    
    hprev=h0
    cprev=c0
    # for each timestep calculate the forget gate, input gate, output gate
    for t in range(T):
        xt=X[:,:,t]  
        # calculate the forget gate
        ft= sigmoid(np.dot(Wxf.T, xt) + np.dot(Whf.T, hprev)+bf)
        # calculate the input gate
        it=sigmoid(np.dot(Wxi.T, xt) + np.dot(Whi.T, hprev) + bi)
        # calculate the input modulation gate
        c1t=tanh(np.dot(Wxc.T, xt)+ np.dot(Whc.T,hprev) + bc)
        # calculate the output gate
        ot=sigmoid(np.dot(Wxo.T,xt)+np.dot(Who.T,hprev)+bo)
        # calculate the next cell state
        ct=cprev * ft + it * c1t
        # calculate the next hidden state
        ht=ot * tanh(ct)
        # calculate the predicted value
        ytpred=sftmax(np.dot(Wy.T,ht)+by)
        
        # add the computations of timestamp t into respective variables
        h[:,:,t]=ht
        c[:,:,t]=ct
        y[:,:,t]=ytpred
        
        # set the present hidden state as the new hprev
        hprev=ht
    return h, c, y
        
        


In [104]:
# Test the forward propagation function
X=np.random.randn(8,4,10)
h0=np.random.randn(5,4)
c0=np.random.randn(5,4)
params['Wxf']=np.random.randn(8,5)
params['Wxi']=np.random.randn(8,5)
params['Wxc']=np.random.randn(8,5)
params['Wxo']=np.random.randn(8,5)
params['Whf']=np.random.randn(5,5)
params['Whi']=np.random.randn(5,5)
params['Whc']=np.random.randn(5,5)
params['Who']=np.random.randn(5,5)
params['Wy']=np.random.randn(5,4)
params['bf']=np.random.randn(5,1)
params['bi']=np.random.randn(5,1)
params['bc']=np.random.randn(5,1)
params['bo']=np.random.randn(5,1)
params['by']=np.random.randn(4,1)

In [105]:
LSTMfwdProp(X,h0,c0,params)

(array([[[-2.10806226e-01, -3.64964313e-01, -4.86750086e-03,
          -8.11851303e-01, -4.03623339e-01, -7.74151738e-01,
          -5.50628597e-01, -5.62802217e-01, -9.56196572e-01,
          -3.38368474e-01],
         [-8.22848621e-02, -4.44646797e-01, -7.50229068e-01,
          -3.64328351e-01, -3.20294173e-01, -3.59020878e-02,
          -5.67594721e-01, -2.37628338e-01, -2.03493960e-01,
          -3.94434180e-02],
         [-4.64309998e-01, -2.36485147e-01,  1.17028197e-01,
          -6.56452400e-03,  1.62156172e-03,  2.44577279e-01,
          -4.28678760e-02,  2.41968386e-01,  1.47295865e-01,
           9.50787571e-03],
         [-9.04086644e-01, -2.97172916e-01, -7.62079163e-01,
          -9.31241691e-01, -3.25616464e-02, -5.89451162e-02,
          -7.64780501e-01, -2.52579721e-01, -1.58146474e-01,
          -5.93323884e-02]],
 
        [[-5.38725201e-01, -1.34207313e-01, -4.53930518e-01,
           2.03465939e-02, -2.82079287e-01, -1.37479236e-01,
          -2.82527882e-02,  1.6