In [1]:
import numpy as np

# Utility functions

In [4]:
def softmax(x):
  """
  Computes a numerically stable softmax along axis 0

  Args:
    x: (array-like) scores; every slice along axis 0 (e.g. each column
      of a 2-D array) is normalized independently

  Returns:
    (numpy array) array with the same shape as x whose entries along
      axis 0 are non-negative and sum to 1
  """

  x = np.asarray(x)

  # Shift by the maximum of each column instead of the global maximum.
  # The result is mathematically unchanged, but a column whose values are
  # all far below the global maximum no longer underflows to 0 / 0 = nan
  shifted = x - np.max(x, axis=0, keepdims=True)
  e_x = np.exp(shifted)

  return e_x / e_x.sum(axis=0)

In [7]:
def rnn_cell_forward(timestep_t, a_prev, parameters):
  """
  Runs one RNN cell for a single timestep

  Args:
    timestep_t: (numpy array) input data at timestep "t"
    a_prev: (numpy array) hidden state at timestep "t-1"
    parameters: (dict) with the keys:
      Wax: (numpy array) weight matrix applied to the input
      Waa: (numpy array) weight matrix applied to the hidden state
      Wya: (numpy array) weight matrix mapping the hidden state
        to the output
      ba: (numpy array) bias of the hidden state update
      by: (numpy array) bias of the output

  Returns:
    a_next: (numpy array) next hidden state
    yt_pred: (numpy array) prediction at timestep "t"
    cache: (tuple) values needed for the backward pass, in the order
      (a_next, a_prev, timestep_t, parameters)
  """

  # Fetch every parameter up front so a missing key fails before any math
  Wax, Waa, Wya, ba, by = (
    parameters[key] for key in ('Wax', 'Waa', 'Wya', 'ba', 'by')
  )

  # Hidden state update: tanh(Waa . a_prev + Wax . x_t + ba)
  recurrent_term = np.dot(Waa, a_prev)
  input_term = np.dot(Wax, timestep_t)
  a_next = np.tanh(recurrent_term + input_term + ba)

  # Output of the cell: softmax(Wya . a_next + by)
  scores = np.dot(Wya, a_next) + by
  yt_pred = softmax(scores)

  return a_next, yt_pred, (a_next, a_prev, timestep_t, parameters)

# RNN forward pass

In [11]:
def rnn_forward(x, act0, parameters):
  """
  Runs the forward propagation of a recurrent neural network over
  every timestep of the input

  Args:
    x: (numpy array) input data for every timestep; 3-D, with the
      timestep on the last axis
    act0: (numpy array) initial hidden state
    parameters: (dict) with the keys:
      Wax: (numpy array) weight matrix applied to the input
      Waa: (numpy array) weight matrix applied to the hidden state
      Wya: (numpy array) weight matrix mapping the hidden state
        to the output
      ba: (numpy array) bias of the hidden state update
      by: (numpy array) bias of the output

  Returns:
    activations: (numpy array) hidden states for every timestep
    y_pred: (numpy array) predictions for every timestep
    caches: (tuple) values needed for the backward pass, in the order
      (list of per-timestep caches, x)
  """

  # sizes come from the input tensor and the output weight matrix
  _, m, T_x = x.shape
  n_y, n_a = parameters['Wya'].shape

  # output buffers, one slice per timestep on the last axis
  activations = np.zeros((n_a, m, T_x))
  y_pred = np.zeros((n_y, m, T_x))

  step_caches = []
  hidden = act0

  for t in range(T_x):
    # advance the hidden state and get the prediction for this timestep
    hidden, prediction, step_cache = rnn_cell_forward(x[:, :, t], hidden, parameters)

    # record the results of this timestep
    activations[:, :, t] = hidden
    y_pred[:, :, t] = prediction
    step_caches.append(step_cache)

  return activations, y_pred, (step_caches, x)

# Backprop in an RNN


In [13]:
def rnn_cell_backward(da_next, cache):
  """
  Implements the backward pass for a single step in the RNN-cell

  Args:
    da_next: (numpy array) gradient of the loss with respect to the
      next hidden state
    cache: (tuple) (a_next, a_prev, timestep_t, parameters) as returned
      by rnn_cell_forward(); only the 'Wax' and 'Waa' entries of
      parameters are read

  Returns:
    gradients: (dict) containing:
      dxt: (numpy array) gradient with respect to the input of this
        timestep
      dWax: (numpy array) gradient of the input to hidden weights
      da_prev: (numpy array) gradient of the previous hidden state
      dWaa: (numpy array) gradient of the hidden to hidden weights
      dba: (numpy array) gradient of the hidden state bias
  """

  a_next, a_prev, timestep_t, parameters = cache

  # Only the weights that feed the hidden state are needed here; the
  # output weights and the biases do not appear in these gradients
  Wax = parameters['Wax']
  Waa = parameters['Waa']

  # backpropagate through tanh: d/dz tanh(z) = 1 - tanh(z)^2 = 1 - a_next^2
  dtanh = (1 - a_next**2) * da_next

  # gradients with respect to the input and to Wax
  dxt = np.dot(Wax.T, dtanh)
  dWax = np.dot(dtanh, timestep_t.T)

  # gradients with respect to the previous hidden state and to Waa
  da_prev = np.dot(Waa.T, dtanh)
  dWaa = np.dot(dtanh, a_prev.T)

  # gradient with respect to the bias: sum over axis 1, kept as a column
  dba = np.sum(dtanh, axis=1, keepdims=True)

  # store gradients
  gradients = {'dxt': dxt,
               'dWax': dWax,
               'da_prev': da_prev,
               'dWaa': dWaa,
               'dba': dba}

  return gradients

In [15]:
def rnn_backward(da, caches):
  """
  Implements the backward pass for a RNN over an entire sequence
  of input data

  Args:
    da: (numpy array) upstream gradient of all hidden states; 3-D, with
      the timestep on the last axis
    caches: (tuple) (list of per-timestep caches, x) as returned by
      rnn_forward(), where x is the 3-D input with the timestep on the
      last axis

  Returns:
    gradients: (dict) containing:
      dx: gradients of the input data
      da0: gradients of the initial hidden state
      dWax: gradients of the input's weights matrix
      dWaa: gradients of the hidden state's weights matrix
      dba: gradients of the bias
  """

  step_caches, x = caches

  # get dimensions; n_x and m are read from the cached input rather than
  # from the first per-timestep cache so that an empty sequence
  # (T_x == 0, no per-timestep caches) yields zero gradients instead of
  # raising an IndexError
  n_a, _, T_x = da.shape
  n_x, m = x.shape[0], x.shape[1]

  # initialize the gradients with the right sizes
  dx = np.zeros((n_x, m, T_x))
  dWax = np.zeros((n_a, n_x))
  dWaa = np.zeros((n_a, n_a))
  dba = np.zeros((n_a, 1))

  # gradient flowing into the hidden state from the following timestep;
  # nothing follows the last timestep, so it starts at zero
  da_prevt = np.zeros((n_a, m))

  for timestep in reversed(range(T_x)):
    # the hidden state at this timestep receives gradient both from the
    # upstream loss and from the timestep after it
    step_grads = rnn_cell_backward(da[:, :, timestep] + da_prevt,
                                   step_caches[timestep])

    # carry the hidden state gradient back to the previous timestep
    da_prevt = step_grads['da_prev']

    # the input gradient is per-timestep; the parameters are shared
    # across timesteps, so their gradients accumulate
    dx[:, :, timestep] = step_grads['dxt']
    dWax += step_grads['dWax']
    dWaa += step_grads['dWaa']
    dba += step_grads['dba']

  # da0 is the gradient of "a" which has been backpropagated through
  # all the timesteps
  da0 = da_prevt

  gradients = {'dx': dx,
               'da0': da0,
               'dWax': dWax,
               'dWaa': dWaa,
               'dba': dba}

  return gradients
