In [1]:
import numpy as np

In [2]:
def sigmoid(x):
    """
    Element-wise logistic function 1 / (1 + exp(-x)).

    x: scalar or numpy array

    Returns:
        the logistic of x, same shape as x, with values between 0 and 1
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [3]:
def softmax(x):
    """
    Numerically stable softmax taken along axis 0 (one distribution per column).

    x: array of scores; for a 2-D input of shape (n, m) each of the m columns
       is normalised independently

    Returns:
        array of the same shape as x whose entries along axis 0 sum to 1
    """
    x = np.asarray(x)
    # Shift by the maximum of *each column*, not the global maximum: with a
    # global shift, a column whose scores are all far below the global maximum
    # underflows to exp(.) == 0 everywhere and the division yields NaN (0/0).
    # Softmax is invariant to a per-column shift, so the result is unchanged
    # wherever the old version was finite.
    e_x = np.exp(x - np.max(x, axis=0, keepdims=True))
    return e_x / e_x.sum(axis=0)

![LSTM cell forward pass](lstm_cell_forward.png)

In [29]:
def lstm_cell_forward(xt, a_prev, c_prev, parameters):
    """
    Implement a single forward step of the LSTM-cell

    xt: Input data at timestep 't', shape=(n_x,m)
    a_prev: Hidden state at timestep 't-1', shape=(n_a,m)
    c_prev: Memory state at timestep 't-1', shape=(n_a,m)
    parameters: python dictionary containing
        Wf: Weight matrix of the forget gate. shape=(n_a,n_a+n_x)
        bf: Bias of the forget gate. shape=(n_a,1)
        Wi: Weight matrix of the update gate. shape=(n_a,n_a+n_x)
        bi: Bias of the update gate. shape=(n_a,1)
        Wc: Weight matrix of the first "tanh". shape=(n_a,n_a+n_x)
        bc: Bias of the first "tanh". shape=(n_a,1)
        Wo: Weight matrix of the output gate. shape=(n_a,n_a+n_x)
        bo: Bias of the output gate. shape=(n_a,1)
        Wy: Weight matrix relating the hidden state to the output. shape=(n_y,n_a)
        by: Bias relating the hidden state to the output. shape=(n_y,1)

    Returns:
       a_next: next hidden state. shape=(n_a,m)
       c_next: next memory state. shape=(n_a,m)
       yt_pred: prediction at timestep 't'. shape=(n_y,m)
       cache: tuple of values needed for the backward pass, contains
              (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)
    """
    # Retrieve parameters from 'parameters'
    Wf, bf = parameters['Wf'], parameters['bf']
    Wi, bi = parameters['Wi'], parameters['bi']
    Wc, bc = parameters['Wc'], parameters['bc']
    Wo, bo = parameters['Wo'], parameters['bo']
    Wy, by = parameters['Wy'], parameters['by']

    # Stack a_prev on top of xt so each gate needs a single matrix product;
    # the first n_a columns of every gate matrix therefore act on a_prev and
    # the remaining n_x columns act on xt.
    concat = np.concatenate((a_prev, xt), axis=0)

    ft = sigmoid(np.dot(Wf, concat) + bf)    # forget gate
    it = sigmoid(np.dot(Wi, concat) + bi)    # update gate
    cct = np.tanh(np.dot(Wc, concat) + bc)   # candidate memory value
    c_next = ft * c_prev + it * cct          # new memory state
    ot = sigmoid(np.dot(Wo, concat) + bo)    # output gate
    a_next = ot * np.tanh(c_next)            # new hidden state

    # Prediction for this time-step (softmax over the n_y rows)
    yt_pred = softmax(np.dot(Wy, a_next) + by)

    # The order of this tuple is relied upon by the backward pass
    cache = (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)

    return a_next, c_next, yt_pred, cache

![LSTM forward pass over all time-steps](lstm_forward.png)

In [31]:
def lstm_forward(x, a0, parameters):
    """
    Run the LSTM cell over every time-step of the input sequence.

    x: Input data for every time-step. shape=(n_x,m,T_x)
    a0: Initial hidden state. shape=(n_a,m)
    parameters: dictionary of weights and biases handed unchanged to
                lstm_cell_forward; 'Wy' (shape=(n_y,n_a)) is also read here
                to size the outputs

    Returns:
    a: Hidden states for every time-step. shape=(n_a,m,T_x)
    y: Predictions for every time-step. shape=(n_y,m,T_x)
    c: Memory states for every time-step. shape=(n_a,m,T_x)
    caches: tuple (list with one lstm_cell_forward cache per time-step, x)
    """
    _, m, T_x = x.shape
    n_y, n_a = parameters['Wy'].shape

    # Output buffers, filled one time-slice per iteration
    a = np.zeros((n_a, m, T_x))
    c = np.zeros((n_a, m, T_x))
    y = np.zeros((n_y, m, T_x))

    step_caches = []
    a_t = a0
    c_t = np.zeros((n_a, m))  # the memory state starts at zero

    for t in range(T_x):
        a_t, c_t, y_t, step_cache = lstm_cell_forward(x[:, :, t], a_t, c_t, parameters)
        a[:, :, t], y[:, :, t], c[:, :, t] = a_t, y_t, c_t
        step_caches.append(step_cache)

    return a, y, c, (step_caches, x)

In [32]:
# Smoke test for lstm_forward on random data.
# Fix the RNG seed so the printed values are reproducible.
np.random.seed(1)
# Sizes used below: n_x=3 input features, m=10 examples, T_x=7 time-steps,
# n_a=5 hidden units, n_y=2 outputs.
# NOTE: the values drawn depend on the order of these randn calls; reordering
# them changes every number printed at the end of the cell.
x = np.random.randn(3,10,7)
a0 = np.random.randn(5,10)
# Gate matrices act on the stacked [a_prev; xt], hence n_a + n_x = 5+3 columns.
Wf = np.random.randn(5, 5+3)
bf = np.random.randn(5,1)
Wi = np.random.randn(5, 5+3)
bi = np.random.randn(5,1)
Wo = np.random.randn(5, 5+3)
bo = np.random.randn(5,1)
Wc = np.random.randn(5, 5+3)
bc = np.random.randn(5,1)
Wy = np.random.randn(2,5)
by = np.random.randn(2,1)

parameters = {"Wf": Wf, "Wi": Wi, "Wo": Wo, "Wc": Wc, "Wy": Wy, "bf": bf, "bi": bi, "bo": bo, "bc": bc, "by": by}

a, y, c, caches = lstm_forward(x, a0, parameters)
print("a[4][3][6] = ", a[4][3][6])
print("a.shape = ", a.shape)
print("y[1][4][3] =", y[1][4][3])
print("y.shape = ", y.shape)
# lstm_forward returns caches as (list of per-step caches, x), so caches[1] is
# x and the value printed here is x[1, 1, :] (one entry per time-step).
# NOTE: the label string has a misplaced bracket; the expression evaluated is
# caches[1][1][1].
print("caches[1][1[1]] =", caches[1][1][1])
print("c[1][2][1]", c[1][2][1])
# Length of the (per-step caches, x) tuple, i.e. always 2 -- not the number of
# time-steps.
print("len(caches) = ", len(caches))

a[4][3][6] =  0.17211776753291672
a.shape =  (5, 10, 7)
y[1][4][3] = 0.9508734618501101
y.shape =  (2, 10, 7)
caches[1][1[1]] = [ 0.82797464  0.23009474  0.76201118 -0.22232814 -0.20075807  0.18656139
  0.41005165]
c[1][2][1] -0.8555449167181982
len(caches) =  2


In [33]:
def lstm_cell_backward(da_next, dc_next, cache):
    """
    Implement the backward pass for the LSTM-cell (single time-step)

    da_next: Gradients of next hidden state. shape=(n_a,m)
    dc_next: Gradients of next cell state. shape=(n_a,m)
    cache: cache storing information from the forward pass, the tuple
           (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)

    Returns:
        gradients: python dictionary containing
            dxt: Gradient of input data at time-step t. shape=(n_x,m)
            da_prev: Gradient of the previous hidden state. shape=(n_a,m)
            dc_prev: Gradient of the previous memory state. shape=(n_a,m)
            dWf: Gradient of the weight matrix of the forget gate. shape=(n_a,n_a+n_x)
            dWi: Gradient of the weight matrix of the update gate. shape=(n_a,n_a+n_x)
            dWc: Gradient of the weight matrix of the memory gate. shape=(n_a,n_a+n_x)
            dWo: Gradient of the weight matrix of the output gate. shape=(n_a,n_a+n_x)
            dbf: Gradient of biases of the forget gate. shape=(n_a,1)
            dbi: Gradient of biases of the update gate. shape=(n_a,1)
            dbc: Gradient of biases of the memory gate. shape=(n_a,1)
            dbo: Gradient of biases of the output gate. shape=(n_a,1)

    Note: the gate gradients (dot, dcct, dit, dft) are taken with respect to
    the gate pre-activations, i.e. they already include the sigmoid/tanh
    derivative.
    """
    # Retrieve information
    (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters) = cache

    # Number of hidden units; splits the gate matrices into their a_prev / xt parts
    n_a = a_next.shape[0]

    # Quantities shared by several gradients, computed once
    concat_T = np.concatenate((a_prev, xt), axis=0).T
    tanh_c = np.tanh(c_next)
    # Factor d a_next / d c_next (output gate times the tanh derivative)
    da_dc = ot * (1 - np.square(tanh_c))

    # Gate gradients. The memory state receives gradient both directly
    # (dc_next) and through the hidden state (da_dc * da_next).
    dot = da_next * tanh_c * ot * (1 - ot)
    dcct = (dc_next * it + da_dc * it * da_next) * (1 - np.square(cct))
    dit = (dc_next * cct + da_dc * cct * da_next) * it * (1 - it)
    dft = (dc_next * c_prev + da_dc * c_prev * da_next) * ft * (1 - ft)

    # Parameter gradients
    dWf = np.dot(dft, concat_T)
    dWi = np.dot(dit, concat_T)
    dWc = np.dot(dcct, concat_T)
    dWo = np.dot(dot, concat_T)
    dbf = np.sum(dft, axis=1, keepdims=True)
    dbi = np.sum(dit, axis=1, keepdims=True)
    dbc = np.sum(dcct, axis=1, keepdims=True)
    dbo = np.sum(dot, axis=1, keepdims=True)

    Wf, Wi, Wc, Wo = parameters['Wf'], parameters['Wi'], parameters['Wc'], parameters['Wo']

    # Gradients w.r.t. the previous hidden state (first n_a columns of each
    # gate matrix), previous memory state, and the input (remaining columns)
    da_prev = np.dot(Wf[:, :n_a].T, dft) + np.dot(Wi[:, :n_a].T, dit) \
        + np.dot(Wc[:, :n_a].T, dcct) + np.dot(Wo[:, :n_a].T, dot)
    dc_prev = dc_next * ft + da_dc * ft * da_next
    dxt = np.dot(Wf[:, n_a:].T, dft) + np.dot(Wi[:, n_a:].T, dit) \
        + np.dot(Wc[:, n_a:].T, dcct) + np.dot(Wo[:, n_a:].T, dot)

    gradients = {"dxt": dxt, "da_prev": da_prev, "dc_prev": dc_prev, "dWf": dWf, "dbf": dbf, "dWi": dWi, "dbi": dbi,
                 "dWc": dWc, "dbc": dbc, "dWo": dWo, "dbo": dbo}

    return gradients

In [None]:
def lstm_backward(da, caches):
    """
    Implement the backward pass for the RNN with LSTM-cell (back-propagation
    through time over the whole sequence).

    da: Gradients w.r.t the hidden states. shape (n_a, m, T_x)
    caches: cache storing information from the forward pass (lstm_forward),
            the tuple (list of per-time-step caches, x)

    The gradient flowing into the memory state at the last time-step is taken
    to be zero; there is no separate dc argument.

    Returns:
        gradients: python dictionary containing
          dx: Gradient of inputs. shape (n_x, m, T_x)
          da0: Gradient w.r.t. the initial hidden state. shape (n_a, m)
          dWf: Gradient w.r.t. the weight matrix of the forget gate. shape (n_a, n_a + n_x)
          dWi: Gradient w.r.t. the weight matrix of the update gate. shape (n_a, n_a + n_x)
          dWc: Gradient w.r.t. the weight matrix of the memory gate. shape (n_a, n_a + n_x)
          dWo: Gradient w.r.t. the weight matrix of the output gate. shape (n_a, n_a + n_x)
          dbf: Gradient w.r.t. biases of the forget gate. shape (n_a, 1)
          dbi: Gradient w.r.t. biases of the update gate. shape (n_a, 1)
          dbc: Gradient w.r.t. biases of the memory gate. shape (n_a, 1)
          dbo: Gradient w.r.t. biases of the output gate. shape (n_a, 1)
    """

    # Split the forward cache; only the input slice of the first step is
    # needed here, to read n_x.
    (step_caches, x) = caches
    *_, x1, _ = step_caches[0]

    # Retrieve dimensions from da's and x1's shapes
    n_a, m, T_x = da.shape
    n_x, m = x1.shape

    # Initialize the gradients with the right sizes
    dx = np.zeros((n_x, m, T_x))
    da_prevt = np.zeros((n_a, m))   # gradient arriving from step t+1 via the hidden state
    dc_prevt = np.zeros((n_a, m))   # gradient arriving from step t+1 via the memory state
    dWf = np.zeros((n_a, n_a + n_x))
    dWi = np.zeros((n_a, n_a + n_x))
    dWc = np.zeros((n_a, n_a + n_x))
    dWo = np.zeros((n_a, n_a + n_x))
    dbf = np.zeros((n_a, 1))
    dbi = np.zeros((n_a, 1))
    dbc = np.zeros((n_a, 1))
    dbo = np.zeros((n_a, 1))

    # Loop back over the whole sequence
    for t in reversed(range(T_x)):
        # The hidden state at step t receives gradient from the loss at step t
        # (da[:,:,t]) and from step t+1 (da_prevt).
        step = lstm_cell_backward(da[:, :, t] + da_prevt, dc_prevt, step_caches[t])

        # Carry the state gradients to the previous time-step. Without this
        # no gradient would flow back through time.
        da_prevt = step['da_prev']
        dc_prevt = step['dc_prev']

        dx[:, :, t] = step['dxt']

        # The parameters are shared by all time-steps, so their gradients are
        # the SUM of the per-step contributions (not the last one computed).
        dWf += step['dWf']
        dWi += step['dWi']
        dWc += step['dWc']
        dWo += step['dWo']
        dbf += step['dbf']
        dbi += step['dbi']
        dbc += step['dbc']
        dbo += step['dbo']

    # The gradient w.r.t. the initial hidden state is what was propagated
    # back out of the first time-step.
    da0 = da_prevt

    # Store the gradients in a python dictionary
    gradients = {"dx": dx, "da0": da0, "dWf": dWf, "dbf": dbf, "dWi": dWi, "dbi": dbi,
                 "dWc": dWc, "dbc": dbc, "dWo": dWo, "dbo": dbo}

    return gradients