In [1]:
import numpy as np

In [2]:
def softmax(x):
    """Softmax of `x`, normalised along axis 0.

    x: array-like of scores. Every slice along axis 0 (each column of a
       2-D input, or the whole vector for a 1-D input) is turned into a
       probability distribution.

    Returns:
    array of the same shape as x whose entries along axis 0 sum to 1.
    """
    x = np.asarray(x)
    # Shift by each column's own maximum rather than the global maximum.
    # The result is mathematically unchanged, but a column that lies far
    # below the global maximum no longer underflows to all zeros, which
    # previously produced 0/0 = NaN for that column.
    e_x = np.exp(x - np.max(x, axis=0, keepdims=True))
    return e_x / e_x.sum(axis=0)

In [6]:
def sigmoid(x):
    """Element-wise logistic function: 1 / (1 + e^(-x))."""
    decay = np.exp(-x)
    return 1 / (1 + decay)

![RNN forward pass diagram](Rnn_forward.png)

In [7]:
def rnn_cell_forward(xt, a_prev, parameters):
    """Run one forward time step of the RNN cell.

    xt (x^<t>): input data at time step 't'. shape=(n_x,m)
    a_prev (a^<t-1>): hidden state at time step 't-1'. shape=(n_a,m)
    parameters: dictionary holding
        Wax: weight matrix multiplying the input. shape=(n_a,n_x)
        Waa: weight matrix multiplying the hidden state. shape=(n_a,n_a)
        Wya: weight matrix relating the hidden state to the output. shape=(n_y,n_a)
        ba: hidden-state bias. shape=(n_a,1)
        by: output bias. shape=(n_y,1)

    Returns:
    a_next: hidden state at time step 't'
    yt_pred: softmax prediction at time step 't'
    cache: tuple (a_next, a_prev, xt, parameters)
    """
    Wax, Waa, Wya, ba, by = (
        parameters[key] for key in ('Wax', 'Waa', 'Wya', 'ba', 'by')
    )

    # a^<t> = tanh([Waa | Wax] . [a_prev ; xt] + ba)
    stacked_weights = np.concatenate((Waa, Wax), axis=1)
    stacked_inputs = np.concatenate((a_prev, xt), axis=0)
    a_next = np.tanh(np.dot(stacked_weights, stacked_inputs) + ba)

    # y^<t> = softmax(Wya . a^<t> + by)
    logits = np.dot(Wya, a_next) + by
    yt_pred = softmax(logits)

    return a_next, yt_pred, (a_next, a_prev, xt, parameters)

![RNN forward pass over the full sequence](Rnn_forward2.png)

In [13]:
def rnn_forward(x, a0, parameters):
    """
    Run the RNN cell over every time step of the input sequence.

    x: Input data for every time-step. shape=(n_x,m,T_x)
    a0: Initial hidden state. shape=(n_a,m)
    parameters: dictionary of weights and biases passed on to rnn_cell_forward();
                parameters['Wya'] is also read here to size the outputs

    Returns:
    a: Hidden states for every time-step. shape=(n_a,m,T_x)
    y_pred: Predictions for every time-step. shape=(n_y,m,T_x)
    caches: tuple (list of per-step caches, x) needed for the backward pass
    """
    # Sizes come from the input tensor and from the output weight matrix
    _, m, T_x = x.shape
    n_y, n_a = parameters['Wya'].shape

    # Pre-allocate the per-step outputs
    a = np.zeros((n_a, m, T_x))
    y_pred = np.zeros((n_y, m, T_x))
    step_caches = []

    # Thread the hidden state through the time steps, starting from a0
    hidden = a0
    for t in range(T_x):
        hidden, yt_pred, step_cache = rnn_cell_forward(x[:, :, t], hidden, parameters)
        a[:, :, t] = hidden
        y_pred[:, :, t] = yt_pred
        step_caches.append(step_cache)

    return a, y_pred, (step_caches, x)

In [14]:
# Smoke test for rnn_forward on seeded random data (n_x=3, m=10, T_x=4, n_a=5, n_y=2).
# The randn calls must stay in this order: each one consumes the seeded stream.
np.random.seed(1)
x = np.random.randn(3, 10, 4)
a0 = np.random.randn(5, 10)
Waa = np.random.randn(5, 5)
Wax = np.random.randn(5, 3)
Wya = np.random.randn(2, 5)
ba = np.random.randn(5, 1)
by = np.random.randn(2, 1)
parameters = dict(Waa=Waa, Wax=Wax, Wya=Wya, ba=ba, by=by)

a, y_pred, caches = rnn_forward(x, a0, parameters)

# caches is the pair (per-step caches, x), so caches[1] is the input tensor
print('a[4][1] = ', a[4, 1])
print('a.shape = ', a.shape)
print('y_pred[1][3] =', y_pred[1, 3])
print('y_pred.shape = ', y_pred.shape)
print('caches[1][1][3] =', caches[1][1, 3])
print('len(caches) = ', len(caches))

a[4][1] =  [-0.99999375  0.77911235 -0.99861469 -0.99833267]
a.shape =  (5, 10, 4)
y_pred[1][3] = [0.79560373 0.86224861 0.11118257 0.81515947]
y_pred.shape =  (2, 10, 4)
caches[1][1][3] = [-1.1425182  -0.34934272 -0.20889423  0.58662319]
len(caches) =  2


![RNN backward pass diagram](Rnn_backward.png)

In [16]:
def rnn_cell_backward(da_next, cache):
    """
    Implements the backward pass for the RNN-cell (single time-step)

    da_next: Gradient of the loss with respect to this step's hidden state
             a_next, of shape (n_a,m)
    cache: tuple (a_next, a_prev, xt, parameters) as returned by
           rnn_cell_forward(); only parameters['Wax'] and parameters['Waa']
           are read here

    Returns:
    gradients: python dictionary containing:
                      dxt: Gradient of the input data at this step, of shape (n_x,m)
                      da_prev: Gradient of the previous hidden state, of shape (n_a,m)
                      dWax: Gradient of the input-to-hidden weights, of shape (n_a,n_x)
                      dWaa: Gradient of the hidden-to-hidden weights, of shape (n_a,n_a)
                      dba: Gradient of the bias vector, of shape (n_a,1)
    """
    # Unpack the values stored by the forward step
    (a_next, a_prev, xt, parameters) = cache

    # Only the weights feeding the hidden state take part in these gradients;
    # the output-layer parameters (Wya, by) and ba itself are not needed.
    Wax = parameters['Wax']
    Waa = parameters['Waa']

    # a_next = tanh(z), so dz = (1 - a_next^2) * da_next
    dtanh = (1 - a_next**2) * da_next

    # z = Wax.xt + Waa.a_prev + ba
    dxt = np.dot(Wax.T, dtanh)
    dWax = np.dot(dtanh, xt.T)
    da_prev = np.dot(Waa.T, dtanh)
    dWaa = np.dot(dtanh, a_prev.T)

    # ba is broadcast across the last axis in the forward step, so its
    # gradient is the sum of dz over that axis
    dba = np.sum(dtanh, keepdims=True, axis=-1)

    gradients = {'dxt': dxt, 'da_prev': da_prev, 'dWax': dWax, 'dWaa': dWaa, 'dba': dba}
    return gradients

In [26]:
def rnn_backward(da, caches):
    """
    Implement the backward pass for a RNN over the entire sequence of input data.

    da: Upstream gradients of all hidden states, of shape (n_a,m,T_x)
    caches: tuple (list of per-step caches, x) as returned by rnn_forward(),
            where x is the input data of shape (n_x,m,T_x)

    Returns:
    gradients: python dictionary containing:
              dx: Gradient of the input data, numpy-array of shape (n_x, m, T_x)
              da0: Gradient of the initial hidden state, numpy-array of shape (n_a, m)
              dWax: Gradient of the input's weight matrix, numpy-array of shape (n_a, n_x)
              dWaa: Gradient of the hidden state's weight matrix, numpy-array of shape (n_a, n_a)
              dba: Gradient of the bias, of shape (n_a, 1)
    """
    (step_caches, x) = caches

    # Read n_x from the stored input rather than from the first step cache,
    # so a zero-length sequence (empty cache list) does not raise IndexError.
    n_a, m, T_x = da.shape
    n_x = x.shape[0]

    # Initialize the gradients with the right size
    dx = np.zeros((n_x, m, T_x))
    dWax = np.zeros((n_a, n_x))
    dWaa = np.zeros((n_a, n_a))
    dba = np.zeros((n_a, 1))
    # Gradient flowing back from step t+1 into a^<t>; zero beyond the last step
    da_prevt = np.zeros((n_a, m))

    # Walk the time steps from last to first
    for t in reversed(range(T_x)):
        # a^<t> receives gradient both directly (da[:,:,t]) and through a^<t+1>
        step = rnn_cell_backward(da[:, :, t] + da_prevt, step_caches[t])
        da_prevt = step['da_prev']
        dx[:, :, t] = step['dxt']
        # The weights are shared across time, so their gradients accumulate
        dWax += step['dWax']
        dWaa += step['dWaa']
        dba += step['dba']

    # After the first step has been processed, da_prevt is the gradient w.r.t. a0
    da0 = da_prevt

    gradients = {'dx': dx, 'da0': da0, 'dWax': dWax, 'dWaa': dWaa, 'dba': dba}
    return gradients

In [28]:
# Smoke test for rnn_backward on seeded random data (n_x=3, m=10, T_x=4, n_a=5, n_y=2).
# The randn calls must stay in this order: each one consumes the seeded stream.
np.random.seed(1)
x = np.random.randn(3, 10, 4)
a0 = np.random.randn(5, 10)
Wax = np.random.randn(5, 3)
Waa = np.random.randn(5, 5)
Wya = np.random.randn(2, 5)
ba = np.random.randn(5, 1)
by = np.random.randn(2, 1)
parameters = dict(Wax=Wax, Waa=Waa, Wya=Wya, ba=ba, by=by)

a, y, caches = rnn_forward(x, a0, parameters)
# Random upstream gradient for every hidden state
da = np.random.randn(5, 10, 4)
gradients = rnn_backward(da, caches)

print('gradients["dx"][1][2] =', gradients["dx"][1, 2])
print('gradients["dx"].shape =', gradients["dx"].shape)
print('gradients["da0"][2][3] =', gradients["da0"][2, 3])
print('gradients["da0"].shape =', gradients["da0"].shape)
print('gradients["dWax"][3][1] =', gradients["dWax"][3, 1])
print('gradients["dWax"].shape =', gradients["dWax"].shape)
print('gradients["dWaa"][1][2] =', gradients["dWaa"][1, 2])
print('gradients["dWaa"].shape =', gradients["dWaa"].shape)
print('gradients["dba"][4] =', gradients["dba"][4])
print('gradients["dba"].shape =', gradients["dba"].shape)


gradients["dx"][1][2] = [-2.07101689 -0.59255627  0.02466855  0.01483317]
gradients["dx"].shape = (3, 10, 4)
gradients["da0"][2][3] = -0.31494237512664985
gradients["da0"].shape = (5, 10)
gradients["dWax"][3][1] = 11.264104496527777
gradients["dWax"].shape = (5, 3)
gradients["dWaa"][1][2] = 2.3033331265798935
gradients["dWaa"].shape = (5, 5)
gradients["dba"][4] = [-0.74747722]
gradients["dba"].shape = (5, 1)
