In [1]:
import numpy as np

### softmax

In [2]:
def softmax(x):
    """
    Column-wise softmax.

    Every column of x is treated as one vector of scores and is mapped to a
    probability distribution along axis 0.

    Args:
        x -- numpy array of shape (n_y, m)

    Returns:
        numpy array with the same shape as x whose columns each sum to 1
    """
    # Subtracting the per-column maximum leaves the result unchanged but keeps
    # the exponentials from overflowing.
    col_peak = np.max(x, axis=0, keepdims=True)
    exps = np.exp(x - col_peak)
    col_total = np.sum(exps, axis=0, keepdims=True)
    return exps / col_total

### Long Short-Term Memory (LSTM) Network

### lstm_cell_forward

In [5]:
import numpy as np

def sigmoid(x):
    """
    Compute the sigmoid of x in a numerically stable way.

    Arguments:
    x -- A scalar or numpy array.

    Returns:
    s -- sigmoid(x), i.e. 1 / (1 + exp(-x)), element-wise
    """
    # sigmoid(x) == exp(-log(1 + exp(-x))). np.logaddexp(0, -x) evaluates
    # log(exp(0) + exp(-x)) without ever forming exp(-x) directly, so large
    # negative inputs no longer overflow the way 1 / (1 + np.exp(-x)) does.
    s = np.exp(-np.logaddexp(0, -x))
    return s


In [6]:
def lstm_cell_forward(xt, a_prev, c_prev, parameters):
    """
    Implement a single forward step of the LSTM cell.

    Arguments:
    xt -- input data at time step "t", numpy array of shape (n_x, m)
    a_prev -- hidden state at time step "t-1", numpy array of shape (n_a, m)
    c_prev -- memory (cell) state at time step "t-1", numpy array of shape (n_a, m)
    parameters -- python dictionary containing:
                        Wf, Wi, Wc, Wo -- gate weight matrices, applied to the
                                          row-wise stack [a_prev; xt]
                        bf, bi, bc, bo -- matching gate biases
                        Wy, by -- weight and bias mapping the hidden state to the prediction

    Returns:
    a_next -- next hidden state, same shape as a_prev
    c_next -- next memory state, same shape as c_prev
    yt_pred -- softmax prediction at time step "t"
    cache -- tuple (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)
             holding the values needed for the backward pass
    """
    # Retrieve parameters from "parameters"
    Wf, bf = parameters["Wf"], parameters["bf"]  # forget gate
    Wi, bi = parameters["Wi"], parameters["bi"]  # update (input) gate
    Wc, bc = parameters["Wc"], parameters["bc"]  # candidate value
    Wo, bo = parameters["Wo"], parameters["bo"]  # output gate
    Wy, by = parameters["Wy"], parameters["by"]  # prediction

    # Stack a_prev on top of xt so each gate needs a single matrix product
    concat = np.concatenate((a_prev, xt), axis=0)

    # Gates, candidate, new memory state and new hidden state
    ft = sigmoid(Wf @ concat + bf)
    it = sigmoid(Wi @ concat + bi)
    cct = np.tanh(Wc @ concat + bc)
    c_next = ft * c_prev + it * cct
    ot = sigmoid(Wo @ concat + bo)
    a_next = ot * np.tanh(c_next)

    # Prediction
    yt_pred = softmax(Wy @ a_next + by)

    # Store values needed for backward propagation in cache
    cache = (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)

    return a_next, c_next, yt_pred, cache

### LSTM Forward

In [7]:
def lstm_forward(x, a0, parameters):
    """
    Run the LSTM forward over every time step of a sequence.

    Arguments:
    x -- input data, numpy array of shape (n_x, m, T_x)
    a0 -- initial hidden state, numpy array of shape (n_a, m)
    parameters -- python dictionary passed unchanged to lstm_cell_forward;
                  parameters['Wy'] of shape (n_y, n_a) is also read here to
                  size the outputs

    Returns:
    a -- hidden states for every time step, shape (n_a, m, T_x)
    y -- predictions for every time step, shape (n_y, m, T_x)
    c -- memory states for every time step, shape (n_a, m, T_x)
    caches -- tuple (list of per-step caches, x) for the backward pass
    """
    # Sizes come from the input tensor and the prediction weights
    _, m, T_x = x.shape
    n_y, n_a = parameters['Wy'].shape

    # Output buffers, filled one time slice per iteration
    a = np.zeros((n_a, m, T_x))
    c = np.zeros((n_a, m, T_x))
    y = np.zeros((n_y, m, T_x))

    # The hidden state starts from a0; the memory state starts from zeros
    a_t = a0
    c_t = np.zeros((n_a, m))

    step_caches = []
    for t in range(T_x):
        a_t, c_t, y_t, step_cache = lstm_cell_forward(x[:, :, t], a_t, c_t, parameters)
        a[:, :, t] = a_t
        c[:, :, t] = c_t
        y[:, :, t] = y_t
        step_caches.append(step_cache)

    # Bundle the per-step caches with the raw input for backpropagation
    return a, y, c, (step_caches, x)

### lstm_cell_backward

In [8]:
def lstm_cell_backward(da_next, dc_next, cache):
    """
    Implement the backward pass for a single LSTM cell (one time step).

    Arguments:
    da_next -- gradient w.r.t. the next hidden state, shape (n_a, m)
    dc_next -- gradient w.r.t. the next memory state, shape (n_a, m)
    cache -- tuple (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)
             as produced by the forward step; only Wf, Wi, Wc and Wo are read
             from parameters

    Returns:
    gradients -- python dictionary containing:
                        dxt -- gradient of the input at this time step, shape (n_x, m)
                        da_prev -- gradient w.r.t. the previous hidden state, shape (n_a, m)
                        dc_prev -- gradient w.r.t. the previous memory state, shape (n_a, m)
                        dWf, dWi, dWc, dWo -- gradients w.r.t. the gate weight matrices
                        dbf, dbi, dbc, dbo -- gradients w.r.t. the gate biases, shape (n_a, 1)
    """
    # Retrieve information from cache
    (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters) = cache

    # Retrieve parameters
    Wf = parameters["Wf"]
    Wi = parameters["Wi"]
    Wc = parameters["Wc"]
    Wo = parameters["Wo"]

    # Number of hidden units: used to split the gradient of [a_prev; xt]
    n_a = a_next.shape[0]

    tanh_c_next = np.tanh(c_next)

    # Total gradient reaching c_next: the direct dc_next plus the path through
    # a_next = ot * tanh(c_next). Every gradient below that flows through the
    # memory state shares this factor, so compute it once.
    dc_total = dc_next + ot * (1 - tanh_c_next ** 2) * da_next

    # Gradients w.r.t. the pre-activations of each gate / the candidate
    d_ot = da_next * tanh_c_next * ot * (1 - ot)
    dcct = dc_total * it * (1 - cct ** 2)
    dit = dc_total * cct * it * (1 - it)
    dft = dc_total * c_prev * ft * (1 - ft)

    # Gradient w.r.t. the previous memory state
    dc_prev = dc_total * ft

    # Concatenate a_prev and xt exactly as in the forward step
    concat = np.concatenate((a_prev, xt), axis=0)  # shape: (n_a + n_x, m)

    # Parameter gradients
    dWf = dft @ concat.T
    dWi = dit @ concat.T
    dWc = dcct @ concat.T
    dWo = d_ot @ concat.T
    dbf = np.sum(dft, axis=1, keepdims=True)
    dbi = np.sum(dit, axis=1, keepdims=True)
    dbc = np.sum(dcct, axis=1, keepdims=True)
    dbo = np.sum(d_ot, axis=1, keepdims=True)

    # Gradient w.r.t. the stacked [a_prev; xt], then split back into its parts
    d_concat = Wf.T @ dft + Wi.T @ dit + Wc.T @ dcct + Wo.T @ d_ot
    da_prev = d_concat[:n_a, :]
    dxt = d_concat[n_a:, :]

    # Store gradients in dictionary
    gradients = {
        "dxt": dxt, "da_prev": da_prev, "dc_prev": dc_prev,
        "dWf": dWf, "dbf": dbf,
        "dWi": dWi, "dbi": dbi,
        "dWc": dWc, "dbc": dbc,
        "dWo": dWo, "dbo": dbo
    }

    return gradients


### LSTM Backward


In [13]:
# UNGRADED FUNCTION: lstm_backward

def lstm_backward(da, caches):
    """
    Backpropagate through an LSTM over a whole sequence.

    Walks the time steps from last to first, calling lstm_cell_backward at each
    step, summing the parameter gradients and carrying the hidden/memory state
    gradients back to the previous step.

    Arguments:
    da -- Gradients w.r.t the hidden states, numpy-array of shape (n_a, m, T_x)
    caches -- cache storing information from the forward pass (lstm_forward):
              a tuple (list of per-step caches, x)

    Returns:
    gradients -- python dictionary containing:
                        dx -- Gradient of inputs, of shape (n_x, m, T_x)
                        da0 -- Gradient w.r.t. the previous hidden state, numpy array of shape (n_a, m)
                        dWf, dWi, dWc, dWo -- Gradients w.r.t. gate weight matrices
                        dbf, dbi, dbc, dbo -- Gradients w.r.t. gate biases
    """
    step_caches, x = caches
    # Only the first step's input slice is needed here, to read off n_x and m
    (_a1, _c1, _a0, _c0, _f1, _i1, _cc1, _o1, xt_first, _params) = step_caches[0]

    n_a, m, T_x = da.shape
    n_x, m = xt_first.shape

    dx = np.zeros((n_x, m, T_x))

    # Running sums of the parameter gradients over all time steps
    totals = {}
    for key in ("dWf", "dWi", "dWc", "dWo"):
        totals[key] = np.zeros((n_a, n_a + n_x))
    for key in ("dbf", "dbi", "dbc", "dbo"):
        totals[key] = np.zeros((n_a, 1))

    # Gradients flowing back from step t+1 into step t (zero after the last step)
    da_carry = np.zeros((n_a, m))
    dc_carry = np.zeros((n_a, m))

    for t in range(T_x - 1, -1, -1):
        step = lstm_cell_backward(da[:, :, t] + da_carry, dc_carry, step_caches[t])

        dx[:, :, t] = step["dxt"]
        for key in totals:
            totals[key] += step[key]

        da_carry = step["da_prev"]
        dc_carry = step["dc_prev"]

    # Whatever is carried past the first step is the gradient w.r.t. a0
    gradients = {
        "dx": dx,
        "da0": da_carry,
        "dWf": totals["dWf"], "dbf": totals["dbf"],
        "dWi": totals["dWi"], "dbi": totals["dbi"],
        "dWc": totals["dWc"], "dbc": totals["dbc"],
        "dWo": totals["dWo"], "dbo": totals["dbo"]
    }

    return gradients

### Basic Use

In [14]:
import numpy as np


# Dummy data. Dimensions:
#   n_x -- input features, n_a -- hidden units, n_y -- output classes,
#   m -- examples in the batch, T_x -- time steps.
np.random.seed(1)
n_x, n_a, n_y, m, T_x = 3, 5, 3, 10, 7

x = np.random.randn(n_x, m, T_x)
a0 = np.random.randn(n_a, m)

# Gate weights act on the stacked [a_prev; xt], hence n_a + n_x columns.
# Wy/by map the hidden state to the prediction, so they are sized by n_y
# (which only happens to equal n_x in this example).
parameters = {
    "Wf": np.random.randn(n_a, n_a + n_x),
    "bf": np.random.randn(n_a, 1),
    "Wi": np.random.randn(n_a, n_a + n_x),
    "bi": np.random.randn(n_a, 1),
    "Wc": np.random.randn(n_a, n_a + n_x),
    "bc": np.random.randn(n_a, 1),
    "Wo": np.random.randn(n_a, n_a + n_x),
    "bo": np.random.randn(n_a, 1),
    "Wy": np.random.randn(n_y, n_a),
    "by": np.random.randn(n_y, 1),
}

# Forward pass
a, y, c, caches = lstm_forward(x, a0, parameters)
print("a.shape:", a.shape)
print("y.shape:", y.shape)
print("c.shape:", c.shape)

# Sanity check: each prediction is a softmax output, so it sums to 1 over classes
assert np.allclose(y.sum(axis=0), 1.0)

# Backward pass, driven by a random upstream gradient on the hidden states
da = np.random.randn(*a.shape)
gradients = lstm_backward(da, caches)
print("dx.shape:", gradients["dx"].shape)
print("da0.shape:", gradients["da0"].shape)

a.shape: (5, 10, 7)
y.shape: (3, 10, 7)
c.shape: (5, 10, 7)
dx.shape: (3, 10, 7)
da0.shape: (5, 10)
