In [0]:
import numpy as np

RNN Utils
-----------


In [0]:

def softmax(x):
  e_x = np.exp(x - np.max(x))
  return e_x / e_x.sum(axis=0)
def sigmoid(x):
  return 1 / (1 + np.exp(-x))
"""
Initializes v and s as two python dictionaries with:
- keys: "dW1", "db1", ..., "dWL", "dbL"
- values: numpy arrays of zeros of the same shape
as the corresponding gradients/parameters.
Arguments:
parameters -- python dictionary containing your parameters.
parameters["W" + str(l)] = Wl
parameters["b" + str(l)] = bl
Returns:
v -- python dictionary that will contain the exponentially
weighted average of the gradient.
v["dW" + str(l)] = ...
v["db" + str(l)] = ...
s -- python dictionary that will contain the exponentially
weighted average of the squared gradient.
s["dW" + str(l)] = ...
s["db" + str(l)] = ...
"""
def initialize_adam(parameters) :

  L = len(parameters) // 2 # number of layers in the neural networks
  v = {}
  s = {}
# Initialize v, s. Input: "parameters". Outputs: "v, s".
  for l in range(L):
### START CODE HERE ### (approx. 4 lines)
    v["dW" + str(l+1)] = np.zeros(parameters["W" + str(l+1)].shape)
    v["db" + str(l+1)] = np.zeros(parameters["b" + str(l+1)].shape)
    s["dW" + str(l+1)] = np.zeros(parameters["W" + str(l+1)].shape)
    s["db" + str(l+1)] = np.zeros(parameters["b" + str(l+1)].shape)
### END CODE HERE ###
  return v, s
"""
Update parameters using Adam
Arguments:
parameters -- python dictionary containing your parameters:
parameters['W' + str(l)] = Wl
parameters['b' + str(l)] = bl
grads -- python dictionary containing your gradients for each
parameters:
grads['dW' + str(l)] = dWl
grads['db' + str(l)] = dbl
v -- Adam variable, moving average of the first gradient,
python dictionary
s -- Adam variable, moving average of the squared gradient,
python dictionary
learning_rate -- the learning rate, scalar.
beta1 -- Exponential decay hyperparameter for the first moment
estimates
beta2 -- Exponential decay hyperparameter for the second
moment estimates
epsilon -- hyperparameter preventing division by zero in Adam
updates
Returns:
parameters -- python dictionary containing your updated
parameters
v -- Adam variable, moving average of the first gradient,
python dictionary
s -- Adam variable, moving average of the squared gradient,
python dictionary
"""
def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate = 0.01, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8):
  L = len(parameters) // 2 # number of layers in the neural networks
  v_corrected = {} # Initializing first moment estimate, python dictionary
  s_corrected = {} # Initializing second moment estimate, python dictionary
# Perform Adam update on all parameters
  for l in range(L):
# Moving average of the gradients. Inputs: "v, grads, beta1". Output: "v".
### START CODE HERE ### (approx. 2 lines)
    v["dW" + str(l+1)] = beta1 * v["dW" + str(l+1)] + (1 - beta1) * grads["dW" + str(l+1)]
    v["db" + str(l+1)] = beta1 * v["db" + str(l+1)] + (1 - beta1) * grads["db" + str(l+1)]
### END CODE HERE ###
# Compute bias-corrected first moment estimate. Inputs: "v, beta1, t". Output: "v_corrected".
### START CODE HERE ### (approx. 2 lines)
    v_corrected["dW" + str(l+1)] = v["dW" + str(l+1)] / (1 - beta1**t)
    v_corrected["db" + str(l+1)] = v["db" + str(l+1)] / (1 - beta1**t)
### END CODE HERE ###
# Moving average of the squared gradients. Inputs: "s, grads, beta2". Output: "s".
### START CODE HERE ### (approx. 2 lines)
    s["dW" + str(l+1)] = beta2 * s["dW" + str(l+1)] + (1 - beta2) * (grads["dW" + str(l+1)] ** 2)
    s["db" + str(l+1)] = beta2 * s["db" + str(l+1)] + (1 - beta2) * (grads["db" + str(l+1)] ** 2)
### END CODE HERE ###
# Compute bias-corrected second raw moment estimate. Inputs: "s, beta2, t". Output: "s_corrected".
### START CODE HERE ### (approx. 2 lines)
    s_corrected["dW" + str(l+1)] = s["dW" + str(l+1)] / (1 - beta2 ** t)
    s_corrected["db" + str(l+1)] = s["db" + str(l+1)] / (1 - beta2 ** t)
### END CODE HERE ###
# Update parameters. Inputs: "parameters, learning_rate, v_corrected, s_corrected, epsilon". Output: "parameters".
### START CODE HERE ### (approx. 2 lines)
    parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * v_corrected["dW" + str(l+1)]  /np.sqrt(s_corrected["dW" + str(l+1)] + epsilon)
    parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * v_corrected["db" + str(l+1)] /np.sqrt(s_corrected["db" + str(l+1)] + epsilon)
### END CODE HERE ###
  return parameters, v, s

RNN Forward
----------------


In [0]:
"""
Implements a single forward step of the RNN-cell as described
in Figure (2)
Arguments:
xt -- your input data at timestep "t", numpy array of shape
(n_x, m).
a_prev -- Hidden state at timestep "t-1", numpy array of shape
(n_a, m)
parameters -- python dictionary containing:
Wax -- Weight matrix multiplying the
input, numpy array of shape (n_a, n_x)
Waa -- Weight matrix multiplying the
hidden state, numpy array of shape (n_a, n_a)
Wya -- Weight matrix relating the hiddenstate
to the output, numpy array of shape (n_y, n_a)
ba -- Bias, numpy array of shape (n_a, 1)
by -- Bias relating the hidden-state to
the output, numpy array of shape (n_y, 1)
Returns:
a_next -- next hidden state, of shape (n_a, m)
yt_pred -- prediction at timestep "t", numpy array of shape
(n_y, m)
cache -- tuple of values needed for the backward pass,
contains (a_next, a_prev, xt, parameters)
"""
def rnn_cell_forward(xt, a_prev, parameters):

# Retrieve parameters from "parameters"
  Wax = parameters["Wax"]
  Waa = parameters["Waa"]
  Wya = parameters["Wya"]
  ba = parameters["ba"]
  by = parameters["by"]
### START CODE HERE ###
# compute next activation state using the formula given above
# compute output of the current cell using the formula given above
  a_next = np.tanh(np.add(np.add(np.dot(Wax,xt),np.dot(Waa, a_prev)), ba))
  yt_pred = softmax(np.dot(Wya, a_next) + by )
### END CODE HERE ###
# store values you need for backward propagation in cache
  cache = (a_next, a_prev, xt, parameters)
  return a_next, yt_pred, cache

In [0]:
"""
Implement the forward propagation of the recurrent neural
network described in Figure (3).
Arguments:
x -- Input data for every time-step, of shape (n_x, m, T_x).
a0 -- Initial hidden state, of shape (n_a, m)
parameters -- python dictionary containing:
Waa -- Weight matrix multiplying the
hidden state, numpy array of shape (n_a, n_a)
Wax -- Weight matrix multiplying the
input, numpy array of shape (n_a, n_x)
Wya -- Weight matrix relating the hiddenstate
to the output, numpy array of shape (n_y, n_a)
ba -- Bias numpy array of shape (n_a, 1)
by -- Bias relating the hidden-state to
the output, numpy array of shape (n_y, 1)
Returns:
a -- Hidden states for every time-step, numpy array of shape
(n_a, m, T_x)
y_pred -- Predictions for every time-step, numpy array of
shape (n_y, m, T_x)
caches -- tuple of values needed for the backward pass,
contains (list of caches, x)
"""
def rnn_forward(x, a0, parameters):

  # Initialize "caches" which will contain the list of all caches
  caches = []
  # Retrieve dimensions from shapes of x and Wy
  n_x, m, T_x = x.shape
  n_y, n_a = parameters["Wya"].shape
  ### START CODE HERE ###
  # initialize "a" and "y" with zeros
  a = np.zeros([n_a, m, T_x])
  y_pred = np.zeros([n_y, m, T_x])
  # Initialize a_next
  a_next = a0
  # loop over all time-steps
  for i in range(T_x):
    a_next, yt_pred, cache = rnn_cell_forward(x[:,:,i], a_next, parameters)
    y_pred[:, :, i] = yt_pred
    a[:, :, i] = a_next
    caches.append(cache)
  # Update next hidden state, compute the prediction, get the cache
  # Save the value of the new "next" hidden state in a
  # Save the value of the prediction in y
  # Append "cache" to "caches"
  ### END CODE HERE ###
  # store values needed for backward propagation in cache
  caches = (caches, x)
  return a, y_pred, caches

In [0]:
np.random.seed(1)
xt = np.random.randn(3,10)
a_prev = np.random.randn(5,10)
Waa = np.random.randn(5,5)
Wax = np.random.randn(5,3)
Wya = np.random.randn(2,5)
ba = np.random.randn(5,1)
by = np.random.randn(2,1)
parameters = {"Waa": Waa, "Wax": Wax, "Wya": Wya, "ba": ba, "by": by}
a_next, yt_pred, cache = rnn_cell_forward(xt, a_prev, parameters)
print("a_next[4] = ", a_next[4])
print("a_next.shape = ", a_next.shape)
print("yt_pred[1] =", yt_pred[1])
print("yt_pred.shape = ", yt_pred.shape)

a_next[4] =  [ 0.59584544  0.18141802  0.61311866  0.99808218  0.85016201  0.99980978
 -0.18887155  0.99815551  0.6531151   0.82872037]
a_next.shape =  (5, 10)
yt_pred[1] = [0.9888161  0.01682021 0.21140899 0.36817467 0.98988387 0.88945212
 0.36920224 0.9966312  0.9982559  0.17746526]
yt_pred.shape =  (2, 10)


In [0]:
np.random.seed(1)
x = np.random.randn(3,10,4)
a0 = np.random.randn(5,10)
Waa = np.random.randn(5,5)
Wax = np.random.randn(5,3)
Wya = np.random.randn(2,5)
ba = np.random.randn(5,1)
by = np.random.randn(2,1)
parameters = {"Waa": Waa, "Wax": Wax, "Wya": Wya, "ba": ba, "by": by}
a, y_pred, caches = rnn_forward(x, a0, parameters)
print("a[4][1] = ", a[4][1])
print("a.shape = ", a.shape)
print("y_pred[1][3] =", y_pred[1][3])
print("y_pred.shape = ", y_pred.shape)
print("caches[1][1][3] =", caches[1][1][3])
print("len(caches) = ", len(caches))

a[4][1] =  [-0.99999375  0.77911235 -0.99861469 -0.99833267]
a.shape =  (5, 10, 4)
y_pred[1][3] = [0.79560373 0.86224861 0.11118257 0.81515947]
y_pred.shape =  (2, 10, 4)
caches[1][1][3] = [-1.1425182  -0.34934272 -0.20889423  0.58662319]
len(caches) =  2


LSTM Forward
-----------------


In [0]:
"""
Implement a single forward step of the LSTM-cell as described
in Figure (4)
Arguments:
xt -- your input data at timestep "t", numpy array of shape (n_x, m).
a_prev -- Hidden state at timestep "t-1", numpy array of shape (n_a, m)
c_prev -- Memory state at timestep "t-1", numpy array of shape (n_a, m)
parameters -- python dictionary containing:
Wf -- Weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
bf -- Bias of the forget gate, numpy array of shape (n_a, 1)
Wi -- Weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
bi -- Bias of the update gate, numpy array of shape (n_a, 1)
Wc -- Weight matrix of the first "tanh", numpy array of shape (n_a, n_a + n_x)
bc -- Bias of the first "tanh", numpy array of shape (n_a, 1)
Wo -- Weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x)
bo -- Bias of the output gate, numpy array of shape (n_a, 1)
Wy -- Weight matrix relating the hiddenstate to the output, numpy array of shape (n_y, n_a)
by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
Returns:
a_next -- next hidden state, of shape (n_a, m)
c_next -- next memory state, of shape (n_a, m)
yt_pred -- prediction at timestep "t", numpy array of shape
(n_y, m)
cache -- tuple of values needed for the backward pass,
contains (a_next, c_next, a_prev, c_prev, xt, parameters)
Note: ft/it/ot stand for the forget/update/output gates, cct
stands for the candidate value (c tilde),
c stands for the memory value
"""
# GRADED FUNCTION: lstm_cell_forward

def lstm_cell_forward(xt, a_prev, c_prev, parameters):

  # Retrieve parameters from "parameters"
  Wf = parameters["Wf"]
  bf = parameters["bf"]
  Wi = parameters["Wi"]
  bi = parameters["bi"]
  Wc = parameters["Wc"]
  bc = parameters["bc"]
  Wo = parameters["Wo"]
  bo = parameters["bo"]
  Wy = parameters["Wy"]
  by = parameters["by"]
  # Retrieve dimensions from shapes of xt and Wy
  n_x, m = xt.shape
  n_y, n_a = Wy.shape
  ### START CODE HERE ###
  # Concatenate a_prev and xt
  concat_ax = np.concatenate((a_prev, xt))
  ft = sigmoid(np.add(np.dot(Wf, concat_ax), bf))
  it = sigmoid(np.add(np.dot(Wi, concat_ax), bi))
  cct = np.tanh(np.add(np.dot(Wc, concat_ax), bc))
  c_next = np.add(np.multiply(ft, c_prev), np.multiply(it, cct))
  ot = sigmoid(np.add(np.dot(Wo, concat_ax), bo))
  a_next = np.multiply(ot, np.tanh(c_next))
  yt_pred = softmax(np.add(np.dot(Wy, a_next), by))
  # Compute values for ft, it, cct, c_next, ot, a_next using the formulas given figure (4)
  # Compute prediction of the LSTM cell
  ### END CODE HERE ###
  # store values needed for backward propagation in cache
  cache = (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)
  return a_next, c_next, yt_pred, cache

In [0]:
# GRADED FUNCTION: lstm_forward
"""
Implement the forward propagation of the recurrent neural
network using an LSTM-cell described in Figure (3).
Arguments:
x -- Input data for every time-step, of shape (n_x, m, T_x).
a0 -- Initial hidden state, of shape (n_a, m)
parameters -- python dictionary containing:
Wf -- Weight matrix of the forget gate,
numpy array of shape (n_a, n_a + n_x)
bf -- Bias of the forget gate, numpy array
of shape (n_a, 1)
Wi -- Weight matrix of the update gate,
numpy array of shape (n_a, n_a + n_x)
bi -- Bias of the update gate, numpy array
of shape (n_a, 1)
Wc -- Weight matrix of the first "tanh",
numpy array of shape (n_a, n_a + n_x)
bc -- Bias of the first "tanh", numpy
array of shape (n_a, 1)
Wo -- Weight matrix of the output gate,
numpy array of shape (n_a, n_a + n_x)
bo -- Bias of the output gate, numpy array
of shape (n_a, 1)
Wy -- Weight matrix relating the hiddenstate
to the output, numpy array of shape (n_y, n_a)
by -- Bias relating the hidden-state to
the output, numpy array of shape (n_y, 1)
Returns:
a -- Hidden states for every time-step, numpy array of shape
(n_a, m, T_x)
y -- Predictions for every time-step, numpy array of shape
(n_y, m, T_x)
caches -- tuple of values needed for the backward pass,
contains (list of all the caches, x)
"""
def lstm_forward(x, a0, parameters):
# Initialize "caches", which will track the list of all the caches
  caches = []
  ### START CODE HERE ###
  # Retrieve dimensions from shapes of x and Wy
  n_x, m, T_x = x.shape
  n_y, n_a = parameters["Wy"].shape  
  # initialize "a", "c" and "y" with zeros
  a = np.zeros([n_a, m, T_x])
  y = np.zeros([n_y, m, T_x])
  c = np.zeros([n_a, m, T_x])
  # Initialize a_next and c_next
  a_next = a0
  c_next = np.zeros([n_a, m])
  # loop over all time-steps
  for i in range(T_x):
    a_next, c_next, yt_pred, cache = lstm_cell_forward(x[:,:,i], a_next, c_next, parameters)
    y[:, :, i] = yt_pred
    a[:, :, i] = a_next
    c[:, :, i] = c_next
    caches.append(cache)
  # Update next hidden state, next memory state, compute the prediction, get the cache
  # Save the value of the new "next" hidden state in a
  # Save the value of the prediction in y
  # Save the value of the next cell state
  # Append the cache into caches
  ### END CODE HERE ###
  # store values needed for backward propagation in cache
  caches = (caches, x)
  return a, y, c, caches

In [0]:
np.random.seed(1)
xt = np.random.randn(3,10)
a_prev = np.random.randn(5,10)
c_prev = np.random.randn(5,10)
Wf = np.random.randn(5, 5+3)
bf = np.random.randn(5,1)
Wi = np.random.randn(5, 5+3)
bi = np.random.randn(5,1)
Wo = np.random.randn(5, 5+3)
bo = np.random.randn(5,1)
Wc = np.random.randn(5, 5+3)
bc = np.random.randn(5,1)
Wy = np.random.randn(2,5)
by = np.random.randn(2,1)
parameters = {"Wf": Wf, "Wi": Wi, "Wo": Wo, "Wc": Wc, "Wy": Wy,
              "bf": bf, "bi": bi, "bo": bo, "bc": bc, "by": by}
a_next, c_next, yt, cache = lstm_cell_forward(xt, a_prev, c_prev, parameters)
print("a_next[4] = ", a_next[4])
print("a_next.shape = ", c_next.shape)
print("c_next[2] = ", c_next[2])
print("c_next.shape = ", c_next.shape)
print("yt[1] =", yt[1])
print("yt.shape = ", yt.shape)
print("cache[1][3] =", cache[1][3])
print("len(cache) = ", len(cache))

a_next[4] =  [-0.66408471  0.0036921   0.02088357  0.22834167 -0.85575339  0.00138482
  0.76566531  0.34631421 -0.00215674  0.43827275]
a_next.shape =  (5, 10)
c_next[2] =  [ 0.63267805  1.00570849  0.35504474  0.20690913 -1.64566718  0.11832942
  0.76449811 -0.0981561  -0.74348425 -0.26810932]
c_next.shape =  (5, 10)
yt[1] = [0.79913913 0.15986619 0.22412122 0.15606108 0.97057211 0.31146381
 0.00943007 0.12666353 0.39380172 0.07828381]
yt.shape =  (2, 10)
cache[1][3] = [-0.16263996  1.03729328  0.72938082 -0.54101719  0.02752074 -0.30821874
  0.07651101 -1.03752894  1.41219977 -0.37647422]
len(cache) =  10


In [0]:
np.random.seed(1)
x = np.random.randn(3,10,7)
a0 = np.random.randn(5,10)
Wf = np.random.randn(5, 5+3)
bf = np.random.randn(5,1)
Wi = np.random.randn(5, 5+3)
bi = np.random.randn(5,1)
Wo = np.random.randn(5, 5+3)
bo = np.random.randn(5,1)
Wc = np.random.randn(5, 5+3)
bc = np.random.randn(5,1)
Wy = np.random.randn(2,5)
by = np.random.randn(2,1)
parameters = {"Wf": Wf, "Wi": Wi, "Wo": Wo, "Wc": Wc, "Wy": Wy, 
              "bf": bf, "bi": bi, "bo": bo, "bc": bc, "by": by}
a, y, c, caches = lstm_forward(x, a0, parameters)
print("a[4][3][6] = ", a[4][3][6])
print("a.shape = ", a.shape)
print("y[1][4][3] =", y[1][4][3])
print("y.shape = ", y.shape)
print("caches[1][1][1] =", caches[1][1][1])
print("c[1][2][1]", c[1][2][1])
print("len(caches) = ", len(caches))

a[4][3][6] =  0.17211776753291672
a.shape =  (5, 10, 7)
y[1][4][3] = 0.9508734618501101
y.shape =  (2, 10, 7)
caches[1][1][1] = [ 0.82797464  0.23009474  0.76201118 -0.22232814 -0.20075807  0.18656139
  0.41005165]
c[1][2][1] -0.8555449167181981
len(caches) =  2


GRU Forward
---------------


In [0]:
"""
Implement a single forward step of the GRU-cell as described in lectures
Arguments:
xt -- your input data at timestep "t", numpy array of shape (n_x, m).
a_prev -- Hidden state at timestep "t-1", numpy array of shape (n_a, m)
c_prev -- Memory state at timestep "t-1", numpy array of shape (n_a, m)
parameters -- python dictionary containing:
Wi -- Weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
bi -- Bias of the update gate, numpy array of shape (n_a, 1)
Wc -- Weight matrix of the first "tanh", numpy array of shape (n_a, n_a + n_x)
bc -- Bias of the first "tanh", numpy array of shape (n_a, 1)
Wr -- Weight matrix of the reset gate, numpy array of shape (n_a, n_a + n_x)
br -- Bias of the reset gate, numpy array of shape (n_a, 1)
Wy -- Weight matrix relating the hiddenstate to the output, numpy array of shape (n_y, n_a)
by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
Returns:
a_next -- next hidden state, of shape (n_a, m)
c_next -- next memory state, of shape (n_a, m)
yt_pred -- prediction at timestep "t", numpy array of shape (n_y, m)
cache -- tuple of values needed for the backward pass,
contains (a_next, c_next, a_prev, c_prev, xt, parameters)
Note: it/rt stand for the update/reset gates, cct
stands for the candidate value (c tilde),
c stands for the memory value
"""
# GRADED FUNCTION: gru_cell_forward

def gru_cell_forward(xt, c_prev, parameters):

  # Retrieve parameters from "parameters"
  Wi = parameters["Wi"]
  bi = parameters["bi"]
  Wc = parameters["Wc"]
  bc = parameters["bc"]
  Wr = parameters["Wr"]
  br = parameters["br"]
  Wy = parameters["Wy"]
  by = parameters["by"]
  # Retrieve dimensions from shapes of xt and Wy
  n_x, m = xt.shape
  n_y, n_a = Wy.shape
  ### START CODE HERE ###
  # Concatenate c_prev and xt
  concat_cx = np.concatenate((c_prev, xt), axis = 0)
  
  it = sigmoid(np.add(np.dot(Wi, concat_cx), bi))
  
  rt = sigmoid(np.add(np.dot(Wr, concat_cx), br))
  
  concat_rcx = np.concatenate((np.multiply(rt, c_prev), xt), axis = 0)
  
  cct = np.tanh(np.add(np.dot(Wc, concat_rcx), bc))
  
  c_next = np.add(np.multiply(it, cct), np.multiply((1 - it) , c_prev))
  
  a_next = c_next
  yt_pred = softmax(np.add(np.dot(Wy, a_next), by))
  
  # Compute values for rt, it, cct, c_next, a_next using the formulas in lectures
  # Compute prediction of the LSTM cell
  ### END CODE HERE ###
  # store values needed for backward propagation in cache
  cache = (a_next, c_next, c_prev, it, rt, cct, xt, parameters)
  return c_next, yt_pred, cache

In [0]:
# GRADED FUNCTION: gru_forward
"""
Implement the forward propagation of the recurrent neural
network using an gru-cell described in Lecture.
Arguments:
x -- Input data for every time-step, of shape (n_x, m, T_x).
c0 -- Initial hidden state, of shape (n_a, m)
parameters -- python dictionary containing:
Wi -- Weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
bi -- Bias of the update gate, numpy array of shape (n_a, 1)
Wc -- Weight matrix of the first "tanh", numpy array of shape (n_a, n_a + n_x)
bc -- Bias of the first "tanh", numpy array of shape (n_a, 1)
Wr -- Weight matrix of the reset gate, numpy array of shape (n_a, n_a + n_x)
br -- Bias of the reset gate, numpy array of shape (n_a, 1)
Wy -- Weight matrix relating the hiddenstate to the output, numpy array of shape (n_y, n_a)
by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
Returns:
a -- Hidden states for every time-step, numpy array of shape
(n_a, m, T_x)
y -- Predictions for every time-step, numpy array of shape
(n_y, m, T_x)
caches -- tuple of values needed for the backward pass,
contains (list of all the caches, x)
"""
def gru_forward(x, c0, parameters):
# Initialize "caches", which will track the list of all the caches
  caches = []
  ### START CODE HERE ###
  # Retrieve dimensions from shapes of x and Wy
  n_x, m, T_x = x.shape
  n_y, n_a = parameters["Wy"].shape  
  # initialize "a", "c" and "y" with zeros
  a = np.zeros([n_a, m, T_x])
  y = np.zeros([n_y, m, T_x])
  c = np.zeros([n_a, m, T_x])
  c_next = c0
  # loop over all time-steps
  for i in range(T_x):
    c_next, yt_pred, cache = gru_cell_forward(x[:,:,i], c_next, parameters)
    y[:, :, i] = yt_pred
    c[:, :, i] = c_next
    caches.append(cache)
  # Update next hidden state, next memory state, compute the prediction, get the cache
  # Save the value of the new "next" hidden state in a
  # Save the value of the prediction in y
  # Save the value of the next cell state
  # Append the cache into caches
  ### END CODE HERE ###
  # store values needed for backward propagation in cache
  caches = (caches, x)
  return y, c, caches

In [0]:
np.random.seed(1)
xt = np.random.randn(3,10)
c_prev = np.random.randn(5,10)
Wi = np.random.randn(5, 5+3)
bi = np.random.randn(5,1)
Wr = np.random.randn(5, 5+3)
br = np.random.randn(5,1)
Wc = np.random.randn(5, 5+3)
bc = np.random.randn(5,1)
Wy = np.random.randn(2,5)
by = np.random.randn(2,1)
parameters = {"Wi": Wi, "Wr": Wr, "Wc": Wc, "Wy": Wy,
              "bi": bi, "br": br, "bc": bc, "by": by}
c_next, yt, cache = gru_cell_forward(xt, c_prev, parameters)
print("c_next[2] = ", c_next[2])
print("c_next.shape = ", c_next.shape)
print("yt[1] =", yt[1])
print("yt.shape = ", yt.shape)
print("cache[1][3] =", cache[1][3])
print("len(cache) = ", len(cache))

AttributeError: ignored

In [0]:
np.random.seed(1)
x = np.random.randn(3,10,7)
c0 = np.random.randn(5,10)
Wi = np.random.randn(5, 5+3)
bi = np.random.randn(5,1)
Wr = np.random.randn(5, 5+3)
br = np.random.randn(5,1)
Wc = np.random.randn(5, 5+3)
bc = np.random.randn(5,1)
Wy = np.random.randn(2,5)
by = np.random.randn(2,1)
parameters = {"Wi": Wi, "Wr": Wr, "Wc": Wc, "Wy": Wy, 
              "bi": bi, "br": br, "bc": bc, "by": by}
y, c, caches = gru_forward(x, c0, parameters)

print("y[1][4][3] =", y[1][4][3])
print("y.shape = ", y.shape)
print("caches[1][1][1] =", caches[1][1][1])
print("c[1][2][1]", c[1][2][1])
print("len(caches) = ", len(caches))

y[1][4][3] = 0.41095408633521757
y.shape =  (2, 10, 7)
caches[1][1][1] = [ 0.82797464  0.23009474  0.76201118 -0.22232814 -0.20075807  0.18656139
  0.41005165]
c[1][2][1] 1.5179668926616001
len(caches) =  2


RNN Backward Propagation
--------------------------------


In [0]:
"""
Implements the backward pass for the RNN-cell (single timestep).
Arguments:
da_next -- Gradient of loss with respect to next hidden state
cache -- python dictionary containing useful values (output of rnn_cell_forward())
Returns:
gradients -- python dictionary containing:
dx -- Gradients of input data, of shape (n_x, m)
da_prev -- Gradients of previous hidden state, of shape (n_a, m)
dWax -- Gradients of input-to-hidden weights, of shape (n_a, n_x)
dWaa -- Gradients of hidden-to-hidden weights, of shape (n_a, n_a)
dba -- Gradients of bias vector, of shape (n_a, 1)
"""
def rnn_cell_backward(da_next, cache):

# Retrieve values from cache
  (a_next, a_prev, xt, parameters) = cache
# Retrieve values from parameters
  Wax = parameters["Wax"]
  Waa = parameters["Waa"]
  Wya = parameters["Wya"]
  ba = parameters["ba"]
  by = parameters["by"]
### START CODE HERE ###
# compute the gradient of tanh with respect to a_next
  dtanh = np.multiply((1 - np.multiply(a_next, a_next)), da_next)
# compute the gradient of the loss with respect to Wax
  dWax = np.dot(dtanh, xt.T)
# compute the gradient with respect to Waa
  dWaa = np.dot(dtanh, a_prev.T)
# compute the gradient with respect to b
  dba = dtanh.sum(axis=1, keepdims= True)
  da_prev = np.dot(Waa.T, dtanh)
  dxt = np.dot(Wax.T, dtanh)
### END CODE HERE ###
# Store the gradients in a python dictionary
  gradients = {"dxt": dxt, "da_prev": da_prev,
               "dWax": dWax, "dWaa": dWaa, "dba": dba}
  return gradients

In [0]:
"""
Implement the backward pass for a RNN over an entire sequence
of input data.
Arguments:
da -- Upstream gradients of all hidden states, of shape (n_a, m, T_x)
caches -- tuple containing information from the forward pass (rnn_forward)
Returns:
gradients -- python dictionary containing:
dx -- Gradient w.r.t. the input data, numpy-array of shape (n_x, m, T_x)
da0 -- Gradient w.r.t the initial hidden state, numpy-array of shape (n_a, m) 
dWax -- Gradient w.r.t the input's weight matrix, numpy-array of shape (n_a, n_x)
dWaa -- Gradient w.r.t the hidden state's weight matrix, numpy-arrayof shape (n_a, n_a)
dba -- Gradient w.r.t the bias, of shape (n_a, 1)
"""
def rnn_backward(da, caches):

### START CODE HERE ###
# Retrieve values from the first cache (t=1) of caches
  (caches, x) = caches
  (a1, a0, x1, parameters) = caches[0]
# Retrieve dimensions from da's and x1's shapes
  (n_a, m, T_x) = da.shape
  (n_x, m) = x1.shape
# initialize the gradients with the right sizes
  dx = np.zeros([n_x, m, T_x])
  da0 = np.zeros([n_a, m])
  dWax = np.zeros([n_a, n_x])
  dWaa = np.zeros([n_a, n_a])
  dba = np.zeros([n_a, 1])
  dat_prev = np.zeros([n_a, m])
# Loop through all the time steps
  for i in reversed(range(T_x)):    
# Compute gradients at time step t. Choose wisely the "da_next" and the "cache" to use in the backward propagation step.
    gradients = rnn_cell_backward(da[:,:,i] + dat_prev, caches[i])
    dx[:,:, i] = gradients["dxt"]
    dat_prev = gradients["da_prev"]
    dWax = np.add(dWax, gradients["dWax"])
    dWaa = np.add(dWaa, gradients["dWaa"])
    dba = np.add(dba, gradients["dba"])
# Retrieve derivatives from gradients
# Increment global derivatives w.r.t parameters by adding their derivative at time-step t
  da0 = dat_prev
# Set da0 to the gradient of a which has been backpropagated through all time-steps
### END CODE HERE ###
# Store the gradients in a python dictionary
  gradients = {"dx": dx, "da0": da0, 
               "dWax": dWax, "dWaa": dWaa,"dba": dba}
  return gradients


In [0]:
np.random.seed(1)
xt = np.random.randn(3,10)
a_prev = np.random.randn(5,10)
Wax = np.random.randn(5,3)
Waa = np.random.randn(5,5)
Wya = np.random.randn(2,5)
b = np.random.randn(5,1)
by = np.random.randn(2,1)
parameters = {"Wax": Wax, "Waa": Waa, "Wya": Wya, "ba": ba, "by": by}
a_next, yt, cache = rnn_cell_forward(xt, a_prev, parameters)
da_next = np.random.randn(5,10)
gradients = rnn_cell_backward(da_next, cache)
print("gradients[\"dxt\"][1][2] =", gradients["dxt"][1][2])
print("gradients[\"dxt\"].shape =", gradients["dxt"].shape)
print("gradients[\"da_prev\"][2][3] =", gradients["da_prev"][2][3])
print("gradients[\"da_prev\"].shape =", gradients["da_prev"].shape)
print("gradients[\"dWax\"][3][1] =", gradients["dWax"][3][1])
print("gradients[\"dWax\"].shape =", gradients["dWax"].shape)
print("gradients[\"dWaa\"][1][2] =", gradients["dWaa"][1][2])
print("gradients[\"dWaa\"].shape =", gradients["dWaa"].shape)
print("gradients[\"dba\"][4] =", gradients["dba"][4])
print("gradients[\"dba\"].shape =", gradients["dba"].shape)

gradients["dxt"][1][2] = -0.4605641030588796
gradients["dxt"].shape = (3, 10)
gradients["da_prev"][2][3] = 0.08429686538067724
gradients["da_prev"].shape = (5, 10)
gradients["dWax"][3][1] = 0.39308187392193034
gradients["dWax"].shape = (5, 3)
gradients["dWaa"][1][2] = -0.28483955786960663
gradients["dWaa"].shape = (5, 5)
gradients["dba"][4] = [0.80517166]
gradients["dba"].shape = (5, 1)


In [0]:
np.random.seed(1)
x = np.random.randn(3,10,4)
a0 = np.random.randn(5,10)
Wax = np.random.randn(5,3)
Waa = np.random.randn(5,5)
Wya = np.random.randn(2,5)
ba = np.random.randn(5,1)
by = np.random.randn(2,1)
parameters = {"Wax": Wax, "Waa": Waa, "Wya": Wya, "ba": ba, "by": by}
a, y, caches = rnn_forward(x, a0, parameters)
da = np.random.randn(5, 10, 4)
gradients = rnn_backward(da, caches)
print("gradients[\"dx\"][1][2] =", gradients["dx"][1][2])
print("gradients[\"dx\"].shape =", gradients["dx"].shape)
print("gradients[\"da0\"][2][3] =", gradients["da0"][2][3])
print("gradients[\"da0\"].shape =", gradients["da0"].shape)
print("gradients[\"dWax\"][3][1] =", gradients["dWax"][3][1])
print("gradients[\"dWax\"].shape =", gradients["dWax"].shape)
print("gradients[\"dWaa\"][1][2] =", gradients["dWaa"][1][2])
print("gradients[\"dWaa\"].shape =", gradients["dWaa"].shape)
print("gradients[\"dba\"][4] =", gradients["dba"][4])
print("gradients[\"dba\"].shape =", gradients["dba"].shape)

gradients["dx"][1][2] = [-2.07101689 -0.59255627  0.02466855  0.01483317]
gradients["dx"].shape = (3, 10, 4)
gradients["da0"][2][3] = -0.31494237512664996
gradients["da0"].shape = (5, 10)
gradients["dWax"][3][1] = 11.264104496527777
gradients["dWax"].shape = (5, 3)
gradients["dWaa"][1][2] = 2.303333126579893
gradients["dWaa"].shape = (5, 5)
gradients["dba"][4] = [-0.74747722]
gradients["dba"].shape = (5, 1)


GRU Backward Propagation
---------------------------------



In [0]:
"""
Implements the backward pass for the GRU-cell (single timestep).
Arguments:
da_next -- Gradient of loss with respect to next hidden state
dc_prev -- Gradient w.r.t. the previous memory state, of shape (n_a, m, T_x)
cache -- python dictionary containing useful values (output of GRU_cell_forward())
Returns:
gradients -- python dictionary containing:
dx -- Gradients of input data, of shape (n_x, m)
da_prev -- Gradients of previous hidden state, of shape (n_a, m)
dWax -- Gradients of input-to-hidden weights, of shape (n_a, n_x)
dWaa -- Gradients of hidden-to-hidden weights, of shape (n_a, n_a)
dba -- Gradients of bias vector, of shape (n_a, 1)
"""
def gru_cell_backward(da_next, cache):

# Retrieve values from cache
  (a_next, c_next, c_prev, it, rt, cct, xt, parameters) = cache
# Retrieve values from parameters
  Wi = parameters["Wi"]
  bi = parameters["bi"]
  Wc = parameters["Wc"]
  bc = parameters["bc"]
  Wr = parameters["Wr"]
  br = parameters["br"]
  Wy = parameters["Wy"]
  by = parameters["by"]
### START CODE HERE ###
  c_diff = (cct - c_prev)
  concat_cx = np.concatenate((c_prev, xt), axis = 0)
  concat_c0 = np.concatenate((c_prev, np.zeros(xt.shape)), axis = 0)
  concat_10 = np.concatenate((np.ones(c_prev.shape), np.zeros(xt.shape)), axis = 0)
  concat_01 = np.concatenate((np.zeros(c_prev.shape), np.ones(xt.shape)), axis = 0)
  concat_r0 = np.concatenate((rt, np.zeros(xt.shape)), axis = 0)
  concat_rcx = np.concatenate((np.multiply(c_prev, rt), xt), axis = 0)
  
  dWi = np.matmul(np.multiply(np.multiply(da_next, c_diff), np.multiply(it, (1 - it))),concat_cx.T)
  
  dbi = np.sum(np.multiply(np.multiply(da_next, c_diff), np.multiply(it, (1 - it))), axis = 1).reshape(-1, 1)
  
  dWr = np.matmul(np.multiply(np.multiply(np.multiply(np.multiply(da_next,
                                                                  it),
                                                      (1 - np.multiply(cct, cct))),
                                np.multiply(rt,(1- rt))),
                              Wc.dot(concat_c0)), concat_cx.T)
  
  dbr = np.sum(np.multiply(np.multiply(np.multiply(np.multiply(da_next, it), (1 - np.multiply(cct, cct))),
                                np.multiply(rt, (1- rt))), Wc.dot(concat_c0)), axis = 1).reshape(-1, 1)
  
  da_prev = np.multiply(np.multiply(da_next,
                                    np.add(np.add(1 - it,
                                                  np.multiply(np.multiply(c_diff, np.multiply(it, 1 - it)),
                                                              Wi.dot(concat_10))),
                                           np.multiply(it, 1 - np.multiply(cct, cct)))),
                       np.add(Wc.dot(concat_r0), 
                              np.multiply(np.multiply(Wc.dot(concat_c0),
                                                      np.multiply(rt, 1 - rt)), Wr.dot(concat_10)))) 
                        
  dxt = np.multiply(np.multiply(da_next,
                                    np.add(np.multiply(it, 1 - np.multiply(cct, cct)),
                                                  np.multiply(np.multiply(c_diff, np.multiply(it, 1 - it)),
                                                              Wi.dot(concat_01)))),
                       np.add(Wc.dot(concat_01), 
                              np.multiply(np.multiply(Wc.dot(concat_c0),
                                                      np.multiply(rt, 1 - rt)),
                                          Wr.dot(concat_01))))
  dc_prev = da_prev
  dWc = np.matmul(np.multiply(da_next,
                                np.multiply(it,  (1 - np.multiply(cct, cct)))) ,
                                            concat_rcx.T)
  
  dbc = np.sum(np.multiply(da_next, np.multiply(it,  (1 - np.multiply(cct, cct)))), axis = 1).reshape((-1, 1))
### END CODE HERE ###
# Store the gradients in a python dictionary
  gradients = {"dxt": dxt, "dc_prev": dc_prev, "da_prev": da_prev,
               "dWi": dWi, "dWr": dWr, "dWc": dWc, "dbi": dbi, "dbr": dbr, "dbc": dbc}
  return gradients

In [0]:
np.random.seed(1)
xt = np.random.randn(3,10)
c_prev = np.random.randn(5,10)
Wi = np.random.randn(5, 5+3)
bi = np.random.randn(5,1)
Wr = np.random.randn(5, 5+3)
br = np.random.randn(5,1)
Wc = np.random.randn(5, 5+3)
bc = np.random.randn(5,1)
Wy = np.random.randn(2,5)
by = np.random.randn(2,1)

parameters = {"Wi": Wi, "Wr": Wr, "Wc": Wc, "Wy": Wy, "bi": bi, "br": br, "bc": bc, "by": by}

c_next, yt, cache = gru_cell_forward(xt, c_prev, parameters)

da_next = np.random.randn(5,10)
gradients = gru_cell_backward(da_next, cache)
print("gradients[\"dxt\"][1][2] =", gradients["dxt"][1][2])
print("gradients[\"dxt\"].shape =", gradients["dxt"].shape)
print("gradients[\"da_prev\"][2][3] =", gradients["da_prev"][2][3])
print("gradients[\"da_prev\"].shape =", gradients["da_prev"].shape)
print("gradients[\"dWi\"][3][1] =", gradients["dWi"][3][1])
print("gradients[\"dWr\"].shape =", gradients["dWr"].shape)
print("gradients[\"dWc\"][1][2] =", gradients["dWc"][1][2])
print("gradients[\"dWc\"].shape =", gradients["dWc"].shape)
print("gradients[\"dbr\"][4] =", gradients["dbr"][4])
print("gradients[\"dbr\"].shape =", gradients["dbr"].shape)

gradients["dxt"][1][2] = 0.0048620781818170715
gradients["dxt"].shape = (5, 10)
gradients["da_prev"][2][3] = -0.3499913522344784
gradients["da_prev"].shape = (5, 10)
gradients["dWi"][3][1] = -0.09076245489943323
gradients["dWr"].shape = (5, 8)
gradients["dWc"][1][2] = -0.2826884118186385
gradients["dWc"].shape = (5, 8)
gradients["dbr"][4] = [0.08794573]
gradients["dbr"].shape = (5, 1)


Sentiment Analysis
----------------------

In [0]:
from keras.datasets import imdb
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras import Sequential
from keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense, Dropout
 


Using TensorFlow backend.


In [0]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz")

In [0]:
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.4, random_state=1)
lengths = []
for i in range(len(x_train)):
  lengths.append(len(x_train[i]))
for i in range(len(x_test)):
  lengths.append(len(x_test[i]))
for i in range(len(x_val)):
  lengths.append(len(x_val[i]))  
maximum = np.max(lengths)
x_train = sequence.pad_sequences(x_train, maxlen=maximum)
x_test = sequence.pad_sequences(x_test, maxlen=maximum)
x_val = sequence.pad_sequences(x_val, maxlen=maximum)  


x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
x_val = np.reshape(x_val, (x_val.shape[0], x_val.shape[1], 1))
print(x_train.shape)
print(x_test.shape)
print(x_val.shape)

(25000, 2494, 1)
(15000, 2494, 1)
(10000, 2494, 1)


In [0]:
def build_rnn_model(input_shape):
  model=Sequential()
  model.add(SimpleRNN(150, input_shape= input_shape, activation = 'softmax'))
  model.add(Dropout(0.2))
  model.add(Dense(1, activation='sigmoid'))
  return model

In [0]:
def build_lstm_model(input_shape):
  model=Sequential()
  model.add(LSTM(50, input_shape= input_shape, activation = 'softmax'))
  model.add(Dropout(0.2))
  model.add(Dense(1, activation='sigmoid'))
  return model
  
  

In [0]:
def build_gru_model(input_shape):
  model=Sequential()
  model.add(GRU(50, input_shape= input_shape, activation = 'softmax'))
  model.add(Dropout(0.2))
  model.add(Dense(1, activation='sigmoid'))
  return model

In [0]:
model = build_rnn_model(x_train.shape[1:])
print(model.summary())

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size= 2048, epochs=4)

test_acc = model.evaluate(x_test, y_test, batch_size=32)
print('Test accuracy:', test_acc)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, 150)               22800     
_________________________________________________________________
dropout_1 (Dropout)          (None, 150)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 151       
Total params: 22,951
Trainable params: 22,951
Non-trainable params: 0
_________________________________________________________________
None
Instructions for updating:
Use tf.cast instead.
Train on 25000 samples, validate on 10000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Test accuracy: [0.6931396690686544, 0.5053333333174388]


In [0]:
model = build_lstm_model(x_train.shape[1:])
print(model.summary())

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size= 512, epochs=4)

test_acc = model.evaluate(x_test, y_test, batch_size=32)
print('Test accuracy:', test_acc)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 50)                10400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 51        
Total params: 10,451
Trainable params: 10,451
Non-trainable params: 0
_________________________________________________________________
None
Train on 25000 samples, validate on 10000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Test accuracy: [0.6896654514630636, 0.5408]


In [0]:
model = build_gru_model(x_train.shape[1:])
print(model.summary())

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size= 512, epochs=3)

test_acc = model.evaluate(x_test, y_test, batch_size=32)
print('Test accuracy:', test_acc)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, 50)                7800      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
Total params: 7,851
Trainable params: 7,851
Non-trainable params: 0
_________________________________________________________________
None
Instructions for updating:
Use tf.cast instead.
Train on 25000 samples, validate on 10000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test accuracy: [0.6928202206611633, 0.5124]


Accuracies of models
------------------------
- Simple RNN :  50.5%
- LSTM :  54.08%
- GRU :  51.24%