In [29]:
import gzip
import cPickle as pickle
from urllib import urlretrieve

# numpy is used as `np` by the training and evaluation cells; import it
# explicitly rather than relying on it arriving through a star import.
import numpy as np
import theano
from theano import tensor as T
import lasagne
from lasagne.layers import *
from lasagne.nonlinearities import *
from lasagne.objectives import *
from lasagne.regularization import *
from lasagne.random import get_rng
from lasagne.updates import *
from lasagne.init import *
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

In [59]:
"""
   Binomial dropout layer

   Samples a binomial mask on the first axis (i.e. batch size)
   and multiplies it with the input. This has the effect of
   zeroing the output for some examples in the batch (according
   to the survival probability p)
   
   Parameters
   ----------
   
   incoming : a :class:`Layer` instance
   p : float
       The survival probability for an example in the batch

"""
class BinomialDropLayer(Layer):
    """
    Per-example binomial drop layer.

    During training a 0/1 mask with one entry per example (first axis) is
    sampled with success probability `p` and multiplied into the input, so
    every value belonging to a dropped example becomes zero. In
    deterministic mode the input is scaled by `p` instead.

    Parameters
    ----------
    incoming : a :class:`Layer` instance
        The layer feeding into this one.
    p : float
        The survival probability for an example in the batch.
    """
    def __init__(self, incoming, p=0.5, **kwargs):
        super(BinomialDropLayer, self).__init__(incoming, **kwargs)
        # Dedicated random stream, seeded from Lasagne's shared RNG.
        self._srng = RandomStreams(get_rng().randint(1, 2147462579))
        self.p = p

    def get_output_for(self, input, deterministic=False, **kwargs):
        """
        Return `p * input` when `deterministic` is true; otherwise return
        `input` multiplied by a freshly sampled per-example 0/1 mask.
        """
        if deterministic:
            return self.p*input
        # One Bernoulli draw per example along the first axis.
        mask = self._srng.binomial(n=1, p=(self.p), size=(input.shape[0],),
            dtype=input.dtype)
        # Make the mask broadcastable over every remaining axis. The pattern
        # is derived from the input rank instead of being fixed to 4-D, so
        # the layer also works after e.g. dense (2-D) layers.
        pattern = (0,) + ('x',) * (input.ndim - 1)
        return mask.dimshuffle(*pattern)*input

In [60]:
"""
   http://arxiv.org/abs/1603.09382
   
   "...we replace the identity connections in these blocks
   by an averaging pooling layer followed by zero paddings
   to match the dimensions."
   
   To explain this method, let us consider two consecutive
   convolution layers, `conv1` and `conv2`. Let us assume
   they have different output shapes. To create the
   identity connection between `conv1` and `conv2`, do 2x2
   average pooling on `conv1`. Then, pad the result so that
   it has the same dimensions as `conv2`. Afterwards, we have
   to see if the final result has the same number of feature
   maps as `conv2`; if not, we have to add all-zero feature
   maps to either side of the result. Then, we construct a 
   binomial drop layer so that we can compute the final 
   equation:
   
   binomial_mask*conv2 + id(conv1)
   
   If we pass this through a nonlinearity layer, we can then
   do: g( binomial_mask*conv2 + id(conv1) )
   
   Parameters
   ----------
   
   incoming : a :class:`Layer` instance
   p : float
       The survival probability for the binomial mask

"""
def stochastic_depth_block(incoming, p, nonlinearity=linear):
    """
    Build `nonlinearity(binomial_mask * incoming + id(previous))`.

    `previous` is the closest layer before `incoming` that is neither a
    :class:`NonlinearityLayer` nor a helper layer created by this function
    (helper layers carry "ignore" in their name).

    When the two output shapes differ, the identity branch is 2x2
    average-pooled, zero-padded spatially and, if it has fewer feature maps
    than `incoming`, zero-padded along the channel axis.

    Parameters
    ----------
    incoming : a :class:`Layer` instance
        The layer whose output is stochastically dropped.
    p : float
        The survival probability for the binomial mask.
    nonlinearity : callable
        Nonlinearity applied to the sum of both branches.

    Returns
    -------
    The :class:`NonlinearityLayer` that ends the block.

    Raises
    ------
    Exception
        If no suitable layer is found before `incoming`.
    """
    layer_before_incoming = None
    for prev_layer in get_all_layers(incoming)[::-1][1:]:
        # Unnamed layers have name=None; treat that as an empty name so the
        # substring test cannot raise a TypeError.
        prev_name = prev_layer.name or ""
        if "ignore" not in prev_name and not isinstance(prev_layer, NonlinearityLayer):
            layer_before_incoming = prev_layer
            break
    if layer_before_incoming is None:
        raise Exception("Cannot find an appropriate layer before layer: %s" % incoming.name)

    # Identity branch; used unchanged when the shapes already agree.
    l_identity = layer_before_incoming
    if layer_before_incoming.output_shape != incoming.output_shape:
        l_identity = Pool2DLayer(layer_before_incoming, pool_size=(2,2), mode="average_inc_pad", name="ignore_pool")
        # If pooled and target sizes differ in parity, add one row/column
        # first so the remaining gap can be split evenly on both sides.
        # NOTE(review): only axis 2 is inspected, so square feature maps
        # are assumed -- confirm before using non-square inputs.
        if l_identity.output_shape[2] % 2 != incoming.output_shape[2] % 2:
            l_identity = pad( l_identity, width=((0,1),(0,1)), name="ignore_prelim_pad" )
        # Floor division keeps the pad width an integer regardless of the
        # division semantics in effect.
        nd1 = (incoming.output_shape[2]-l_identity.output_shape[2]) // 2
        if nd1 > 0:
            l_identity = pad(l_identity, width=(nd1,nd1), name="ignore_pad")
        # The identity branch may have fewer feature maps than `incoming`:
        # add all-zero maps on both sides of the channel axis. For an odd
        # difference the extra map goes in front.
        if layer_before_incoming.output_shape[1] < incoming.output_shape[1]:
            diff_in_fms = incoming.output_shape[1]-layer_before_incoming.output_shape[1]
            fms_after = diff_in_fms // 2
            fms_before = diff_in_fms - fms_after
            l_identity = pad(
                l_identity,
                batch_ndim=1,
                width=((fms_before, fms_after),),
                name="ignore_fm_pad"
            )

    l_binom_drop = BinomialDropLayer(incoming, p=p, name="ignore_binom")
    l_sum = ElemwiseSumLayer([l_binom_drop, l_identity], name="ignore_elemsum")
    l_nonlinearity = NonlinearityLayer(l_sum, nonlinearity=nonlinearity, name="ignore_nonlinearity")
    return l_nonlinearity

Let us create a simple convolutional network in which every convolution and pooling layer is wrapped in a stochastic depth block.

In [61]:
l_in = InputLayer( (None, 1, 28, 28), name="input" )

l_conv1 = Conv2DLayer(l_in, num_filters=8, filter_size=3, name="l_conv1", nonlinearity=None)
l_sd1 = stochastic_depth_block(l_conv1, p=0.5, nonlinearity=rectify)

l_mp1 = MaxPool2DLayer(l_sd1, pool_size=(2,2), name="l_mp1")
l_sd2 = stochastic_depth_block(l_mp1, p=0.5)

l_conv2 = Conv2DLayer(l_sd2, num_filters=8, filter_size=3, name="l_conv2", nonlinearity=None)
l_sd3 = stochastic_depth_block(l_conv2, p=0.5, nonlinearity=rectify)

l_mp2 = MaxPool2DLayer(l_sd3, pool_size=(2,2), name="l_mp2")
l_sd4 = stochastic_depth_block(l_mp2, p=0.5)

l_conv3 = Conv2DLayer(l_sd4, num_filters=16, filter_size=3, name="l_conv3", nonlinearity=None)
l_sd5 = stochastic_depth_block(l_conv3, p=0.5, nonlinearity=rectify)

l_fc = DenseLayer(l_sd5, num_units=10, nonlinearity=softmax, name="l_fc")

l_out = l_fc

for layer in get_all_layers(l_out):
    print layer.name, layer.output_shape
print "num of params: %i" % count_params(l_out)

input (None, 1, 28, 28)
l_conv1 (None, 8, 26, 26)
ignore_binom (None, 8, 26, 26)
ignore_pool (None, 1, 14, 14)
ignore_pad (None, 1, 26, 26)
ignore_fm_pad (None, 8, 26, 26)
ignore_elemsum (None, 8, 26, 26)
ignore_nonlinearity (None, 8, 26, 26)
l_mp1 (None, 8, 13, 13)
ignore_binom (None, 8, 13, 13)
ignore_pool (None, 8, 13, 13)
ignore_elemsum (None, 8, 13, 13)
ignore_nonlinearity (None, 8, 13, 13)
l_conv2 (None, 8, 11, 11)
ignore_binom (None, 8, 11, 11)
ignore_pool (None, 8, 6, 6)
ignore_prelim_pad (None, 8, 7, 7)
ignore_pad (None, 8, 11, 11)
ignore_elemsum (None, 8, 11, 11)
ignore_nonlinearity (None, 8, 11, 11)
l_mp2 (None, 8, 5, 5)
ignore_binom (None, 8, 5, 5)
ignore_pool (None, 8, 5, 5)
ignore_elemsum (None, 8, 5, 5)
ignore_nonlinearity (None, 8, 5, 5)
l_conv3 (None, 16, 3, 3)
ignore_binom (None, 16, 3, 3)
ignore_pool (None, 8, 2, 2)
ignore_prelim_pad (None, 8, 3, 3)
ignore_fm_pad (None, 16, 3, 3)
ignore_elemsum (None, 16, 3, 3)
ignore_nonlinearity (None, 16, 3, 3)
l_fc (None, 10)
num

Ok, let's download the MNIST dataset

In [27]:
# Download the gzipped MNIST pickle to /tmp/mnist.pkl.gz.
# NOTE(review): the file is fetched over plain HTTP and re-downloaded on every
# run of this cell; consider caching it and verifying a checksum.
url_ret = urlretrieve("http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz", "/tmp/mnist.pkl.gz")

In [32]:
# Load the downloaded archive and keep only the training split.
# NOTE(review): pickle.load can execute arbitrary code; this file was fetched
# over plain HTTP, so only run this against a source you trust.
with gzip.open("/tmp/mnist.pkl.gz") as f:
    dat = pickle.load(f)
# The pickle unpacks into three parts; only the first is used here.
train_data, _, _ = dat
X_train, y_train = train_data
# Reshape the images to (N, 1, 28, 28) to match the network's InputLayer, and
# cast to Theano's configured float type.
X_train = X_train.reshape( (X_train.shape[0], 1, 28, 28) ).astype( theano.config.floatX )
# Cast labels to int32 to match the T.ivector target used for the loss.
y_train = y_train.astype("int32")

In [33]:
# Sanity check: display the shapes of the image and label arrays.
X_train.shape, y_train.shape

((50000, 1, 28, 28), (50000,))

Define the symbolic inputs, the loss, the parameter updates, and the compiled Theano functions we need to train and evaluate the network.

In [62]:
# Symbolic inputs: a 4-D image batch and an int32 label vector.
X = T.tensor4('X')
y = T.ivector('y')

# Two views of the same network: the stochastic output (used for the training
# loss) and the deterministic output (used for prediction).
net_out = get_output(l_out, X)
net_out_det = get_output(l_out, X, deterministic=True)
# Mean categorical cross-entropy over the batch.
loss = categorical_crossentropy(net_out, y).mean()
params = get_all_params(l_out, trainable=True)
grads = T.grad(loss, params)
# SGD with Nesterov momentum.
updates = nesterov_momentum(grads, params, learning_rate=0.01, momentum=0.9)
# train_fn applies one update step and returns the batch loss;
# out_fn returns the deterministic network output for a batch of inputs.
train_fn = theano.function(inputs=[X, y], outputs=loss, updates=updates)
out_fn = theano.function(inputs=[X], outputs=net_out_det)

In [63]:
bs = 32 
n_batches = X_train.shape[0] // bs
num_epochs = 10
for epoch in range(0, num_epochs):
    train_losses = []
    for b in range(0, n_batches):
        train_losses.append( train_fn(X_train[b*bs : (b+1)*bs], y_train[b*bs : (b+1)*bs]) )
    print (epoch+1), np.mean(train_losses)

1 1.04170554822
2 0.838029498493
3 0.794951380409
4 0.759657711097
5 0.733128651309
6 0.715255246341
7 0.718663093263
8 0.710821818392
9 0.721151590125
10 0.707448014221


What accuracy do we get on the training set? (Between 0 and 1)

In [64]:
# Fraction of training examples whose highest-scoring class (deterministic
# forward pass) matches the label.
np.mean( out_fn(X_train).argmax(axis=1) == y_train )

0.92969999999999997