In [1]:
# __future__ imports must come before every other statement.
from __future__ import print_function

import os

import IPython.display
import lasagne
import matplotlib.pyplot as plt
import numpy as np
import sklearn.datasets
import theano
import theano.tensor as T

from recurrent import gen_data

%matplotlib inline

In [2]:
# Sequence input. The first two dimensions are left as None so that the
# minibatch size and the sequence length may vary from call to call; only the
# feature dimension is fixed, at 2.
input_sequence_shape = (None, None, 2)
l_in = lasagne.layers.InputLayer(shape=input_sequence_shape)
# Mask input. Masks are matrices of shape (n_batch, n_time_steps), and both of
# those dimensions vary here, hence (None, None).
input_mask_shape = (None, None)
l_mask = lasagne.layers.InputLayer(shape=input_mask_shape)

In [3]:
# A Gate bundles the initializers for one LSTM gate: the input-to-gate and
# hidden state-to-gate weight matrices, the cell-to-gate weight vector, the
# bias vector, and the nonlinearity. The nonlinearity is not passed here, so
# the Gate default (the standard sigmoid) applies.
gate_parameters = lasagne.layers.recurrent.Gate(
    W_in=lasagne.init.Orthogonal(),
    W_hid=lasagne.init.Orthogonal(),
    b=lasagne.init.Constant(0.),
)
# The cell differs from the gates in two respects: W_cell=None denotes that no
# cell connection is used, and the nonlinearity is tanh, which is the
# convention for the cell in an LSTM.
cell_parameters = lasagne.layers.recurrent.Gate(
    W_in=lasagne.init.Orthogonal(),
    W_hid=lasagne.init.Orthogonal(),
    W_cell=None,
    b=lasagne.init.Constant(0.),
    nonlinearity=lasagne.nonlinearities.tanh,
)

In [4]:
# Number of hidden/cell units in the LSTM.
N_HIDDEN = 10
# Every gate shares the same gate settings; the cell has its own.
forward_gate_config = dict(
    ingate=gate_parameters,
    forgetgate=gate_parameters,
    cell=cell_parameters,
    outgate=gate_parameters)
l_lstm = lasagne.layers.recurrent.LSTMLayer(
    l_in, N_HIDDEN,
    # Masks arrive through their own input layer.
    mask_input=l_mask,
    # Learn the initialization and use gradient clipping.
    learn_init=True,
    grad_clipping=100.,
    **forward_gate_config)

In [5]:
# The "backwards" layer takes the same arguments as the forward LSTM layer,
# with the single addition of backwards=True.
l_lstm_back = lasagne.layers.recurrent.LSTMLayer(
    l_in,
    N_HIDDEN,
    ingate=gate_parameters,
    mask_input=l_mask,
    forgetgate=gate_parameters,
    cell=cell_parameters,
    outgate=gate_parameters,
    learn_init=True,
    grad_clipping=100.,
    backwards=True,
)
# The forward and backward outputs are combined by summing them.
# Merge layers take the list of layers to merge as their input.
both_directions = [l_lstm, l_lstm_back]
l_sum = lasagne.layers.ElemwiseSumLayer(both_directions)

In [6]:
# Symbolic sizes of the input, read off the input variable's shape.
n_batch, n_time_steps, n_features = l_in.input_var.shape
# Squash the n_batch and n_time_steps dimensions into one, so that the
# feed-forward layer below can be applied as usual.
flattened_shape = (-1, N_HIDDEN)
l_reshape = lasagne.layers.ReshapeLayer(l_sum, flattened_shape)
# The network predicts a single value, the sum, so one output unit suffices.
l_dense = lasagne.layers.DenseLayer(
    l_reshape,
    num_units=1,
    nonlinearity=lasagne.nonlinearities.tanh,
)
# The dense output has shape (n_batch*n_time_steps, 1). Reshaping it to
# (n_batch, n_time_steps) gives one value per time step of each sequence.
per_step_shape = (n_batch, n_time_steps)
l_out = lasagne.layers.ReshapeLayer(l_dense, per_step_shape)

In [10]:
# Symbolic variable for the target network output.
# It is a vector of length n_batch, because there is one target value per sequence.
target_values = T.vector('target_output')
# The mask tells the network the length of each sequence. It has to be the
# mask layer's own input variable: a freshly created T.matrix would not be
# part of the network's graph and so could not be fed through
# theano.function. The actual values are supplied by the gen_data function.
mask = l_mask.input_var
# lasagne.layers.get_output produces an expression for the output of the net.
network_output = lasagne.layers.get_output(l_out)
# The value we care about is the final value produced for each sequence,
# so we simply slice it out.
predicted_values = network_output[:, -1]
# Our cost is the mean-squared error.
cost = T.mean((predicted_values - target_values)**2)
# Retrieve the trainable parameters of the network; only these should be
# handed to the optimiser.
all_params = lasagne.layers.get_all_params(l_out, trainable=True)
# Compute adam updates for training.
updates = lasagne.updates.adam(cost, all_params)
# Both compiled functions take the same inputs, in the order in which
# gen_data's return values are passed below.
network_inputs = [l_in.input_var, target_values, mask]
# `train` applies one update step and returns the cost; `compute_cost` only
# evaluates the cost.
train = theano.function(network_inputs, cost, updates=updates)
compute_cost = theano.function(network_inputs, cost)
# We'll use this "validation set" to periodically check progress.
X_val, y_val, mask_val = gen_data()
# We'll train the network with 10 epochs of 100 minibatches each.
NUM_EPOCHS = 10
EPOCH_SIZE = 100
for epoch in range(NUM_EPOCHS):
    # Named loop variable: `_` is IPython's "last output" name.
    for minibatch in range(EPOCH_SIZE):
        X, y, m = gen_data()
        train(X, y, m)
    cost_val = compute_cost(X_val, y_val, mask_val)
    print("Epoch {} validation cost = {}".format(epoch + 1, cost_val))

Epoch 1 validation cost = 0.046245187521
Epoch 2 validation cost = 0.0390758290887
Epoch 3 validation cost = 0.0487103126943
Epoch 4 validation cost = 0.0274621322751
Epoch 5 validation cost = 0.0209543332458
Epoch 6 validation cost = 0.0191643983126
Epoch 7 validation cost = 0.0116968825459
Epoch 8 validation cost = 0.0152641050518
Epoch 9 validation cost = 0.00905874744058
Epoch 10 validation cost = 0.00728162936866
