In [None]:
%env THEANO_FLAGS='device=gpu0','floatX=float32'

import os
import os.path as osp

import numpy as np

import theano
import theano.tensor as T
from lasagne import *

%matplotlib nbagg
import matplotlib.pyplot as plt

from mldm import NNWatcher, Net

In [None]:
%%sh

# Fetch the amitgroup MNIST reader (saved as mnist.py in the working directory)
# so that the following `import mnist` cell can load it.
# -q: quiet; -nc: do not download again if mnist.py is already present.
wget -q -nc https://raw.githubusercontent.com/amitgroup/amitgroup/master/amitgroup/io/mnist.py

In [None]:
### http://g.sweyla.com/blog/2012/mnist-numpy/
import mnist

In [None]:
%%sh

# Fetch the four MNIST IDX files into ./mnist and unpack them.
#
# A file that is already unpacked is skipped. Previously a re-run of this cell
# downloaded every archive again (gunzip had removed the .gz, so `wget -nc`
# could not tell it was done) and then failed because gunzip refuses to
# overwrite the existing unpacked files.
mkdir -p mnist && cd mnist || exit 1

for name in train-images-idx3-ubyte train-labels-idx1-ubyte \
            t10k-images-idx3-ubyte t10k-labels-idx1-ubyte; do
    # Already unpacked by an earlier run: nothing to do for this file.
    [ -f "$name" ] && continue

    # Stop at the first failure, like the original && chain did.
    wget -q -nc "http://yann.lecun.com/exdb/mnist/$name.gz" || exit 1
    gunzip "$name.gz" || exit 1
done

In [None]:
# Load the training split from ./mnist and reshape the images to
# (N, 1, 28, 28) float32 -- an explicit single-channel axis, with the dtype
# matching the floatX=float32 Theano setting from the first cell.
X, y = mnist.load_mnist(dataset='training', path='mnist/')
X = X.reshape(-1, 1, 28, 28).astype('float32')

# Same layout for the held-out test split.
X_test, y_test = mnist.load_mnist(dataset='testing', path='mnist/')
X_test = X_test.reshape(-1, 1, 28, 28).astype('float32')

In [None]:
def one_hot(y, n_classes=10):
    """Encode integer class labels as one-hot rows.

    y: 1-D integer array of labels; each value is used directly as the
       column index of the 1.0 in its row.
    n_classes: number of columns in the encoding.

    Returns a float32 array of shape (y.shape[0], n_classes).
    """
    n_samples = y.shape[0]
    encoded = np.zeros((n_samples, n_classes), dtype='float32')

    # Pair every row index with its label to set one cell per row.
    row_index = np.arange(n_samples)
    encoded[row_index, y] = 1.0

    return encoded

In [None]:
# Replace the integer label vectors with their one-hot float32 encodings
# (the network's targets are an fmatrix of per-class indicators).
# NOTE(review): y / y_test are rebound in place, so re-running this cell would
# feed the one-hot matrices back into one_hot -- reload the data first.
y = one_hot(y)
y_test = one_hot(y_test)

In [None]:
# Number of input features per image once flattened (product of all non-batch axes).
np.prod(X.shape[1:])

## Going deep
but this time not exponentially

In [None]:
def sparseness(W, c):
    """Soft penalty for rows of W whose Hoyer sparseness falls below c.

    For every row w of W (a vector with n = W.shape[1] entries) the
    sparseness is

        sp(w) = (sqrt(n) - ||w||_1 / ||w||_2) / (sqrt(n) - 1)

    which is 0 when all entries have equal magnitude and 1 when exactly one
    entry is non-zero.

    W: Theano shared variable holding a 2-D weight matrix (get_value() is
       used only to read its static shape).
    c: target sparseness; rows with sp < c are penalised.

    Returns a symbolic scalar: sum over rows of softplus(c - sp(row)).
    """
    n_units = W.get_value().shape[1]

    l1_units = T.sum(abs(W), axis=1)
    # Euclidean norm is the root of the sum of *squares*; the previous
    # sqrt(sum(|W|)) was not a norm and made the ratio below meaningless.
    l2_units = T.sqrt(T.sum(W ** 2, axis=1))

    sqrt_n = np.sqrt(n_units)
    sp = (sqrt_n - l1_units / l2_units) / (sqrt_n - 1)

    # softplus is a smooth hinge: ~0 once sp exceeds c, ~linear below it.
    constraints = T.nnet.softplus(c - sp)
    penalty = T.sum(constraints)
    return penalty

class SparseLayer(layers.Layer):
    """Fully connected layer that can report a sparseness penalty on its weights.

    The forward pass is nonlinearity(dot(input, W) + b), with W of shape
    (num_inputs, num_units) and b of shape (num_units,). The incoming layer
    must produce 2-D output, since num_inputs is read from axis 1 of its shape.
    """

    def __init__(self, incoming, num_units,
                 W=init.GlorotUniform(),
                 b=init.Constant(0.),
                 nonlinearity=nonlinearities.rectify,
                 **kwargs):
        """Register the weight matrix W and bias b for num_units output units."""
        super(SparseLayer, self).__init__(incoming, **kwargs)

        self.num_units = num_units
        self.nonlinearity = nonlinearity

        n_in = self.input_shape[1]
        weight_shape = (n_in, num_units)
        bias_shape = (num_units, )

        # W is registered first, so it is the first entry of get_params().
        self.W = self.add_param(W, weight_shape, name='W')
        # The bias is excluded from weight-decay style regularization.
        self.b = self.add_param(b, bias_shape, name='b', regularizable=False)

    def get_output_for(self, input, **kwargs):
        """Return the symbolic activations for a (batch, num_inputs) input."""
        pre_activation = T.dot(input, self.W) + self.b
        return self.nonlinearity(pre_activation)

    def get_output_shape_for(self, input_shape):
        """Keep the leading (batch) axis and replace the rest with num_units."""
        batch_axis = input_shape[:1]
        return batch_axis + (self.num_units,)

    def get_sparseness_penalty(self, sparsness_c=0.85):
        """Return the symbolic sparseness penalty of W for target sparsness_c."""
        return sparseness(self.W, sparsness_c)

In [None]:
class DeepSparseMNISTNet(Net):
    """MNIST classifier built from three sigmoid SparseLayers and a softmax output.

    Layer stack (kept in self.layers, in this order):
    input -> flatten -> SparseLayer(128) -> SparseLayer(64) -> SparseLayer(16)
    -> DenseLayer(10, softmax).

    The training loss is the mean categorical cross-entropy plus
    sparsness_penalty_coef * (sum of the SparseLayers' sparseness penalties) and
    regularization_coef * (L2 penalty over the regularizable parameters).
    """

    def __init__(self, sparseness_c=0.85, input_shape=None):
        """Build the symbolic graph and compile the train / get_loss functions.

        sparseness_c: sparseness target handed to each SparseLayer's
            get_sparseness_penalty.
        input_shape: per-sample input shape (three axes, because X_batch is a
            4-D tensor). When None, falls back to X.shape[1:] of the
            notebook-level array X, which is what this class always used.
        """
        if input_shape is None:
            input_shape = X.shape[1:]

        self.X_batch = T.ftensor4(name='X_batch')
        self.y_batch = T.fmatrix(name='y_batch')

        input_l = layers.InputLayer(
            shape=(None, ) + tuple(input_shape),
            input_var=self.X_batch,
            name='Input'
        )
        reshape_l = layers.FlattenLayer(input_l)
        self.layers = [input_l, reshape_l]

        for i, n_units in enumerate([128, 64, 16]):
            dense = SparseLayer(
                self.layers[-1],
                num_units=n_units,
                nonlinearity=nonlinearities.sigmoid,
                name='SparseLayer %d' % i
            )
            self.layers.append(dense)

        output_l = layers.DenseLayer(
            self.layers[-1],
            num_units=10,
            nonlinearity=nonlinearities.softmax,
            name= 'Softmax'
        )
        self.layers.append(output_l)
        self.net = output_l

        self.predictions = layers.get_output(self.net)
        self.pure_loss = T.mean(
            objectives.categorical_crossentropy(self.predictions, self.y_batch)
        )

        # Both coefficients are symbolic inputs so they can change between
        # fit() calls without recompiling the training function.
        self.sparsness_penalty_coef = T.fscalar('sparseness_penalty_coef')
        self.regularization_coef = T.fscalar('regularization_coef')

        # Only layers exposing get_sparseness_penalty (the SparseLayers) contribute.
        sparseness_penalties = [
            layer.get_sparseness_penalty(sparseness_c)
            for layer in self.layers
            if hasattr(layer, 'get_sparseness_penalty')
        ]
        l2_penalty = regularization.regularize_network_params(
            self.net,
            penalty=regularization.l2
        )

        self.regularization = (
            self.sparsness_penalty_coef * sum(sparseness_penalties) +
            self.regularization_coef * l2_penalty
        )

        self.loss = self.pure_loss + self.regularization

        self.learning_rate = T.fscalar('learning rate')
        params = layers.get_all_params(self.net)

        upd = updates.adadelta(self.loss, params, learning_rate=self.learning_rate)

        # Argument order here is the order fit() passes values in.
        self.train = theano.function(
            [
                self.X_batch, self.y_batch,
                self.regularization_coef, self.sparsness_penalty_coef,
                self.learning_rate
            ],
            [self.pure_loss, self.regularization],
            updates=upd
        )

        self.get_loss = theano.function([self.X_batch, self.y_batch], self.pure_loss)

        # The base-class constructor runs last, after self.net / self.layers exist.
        super(DeepSparseMNISTNet, self).__init__()

    @staticmethod
    def batch_stream(n, batch_size=32):
        """Yield n // batch_size index arrays of length batch_size.

        Every batch is drawn independently from range(n) with replacement
        (np.random.choice default), so an "epoch" is a fixed number of random
        batches rather than one exact pass over the data.
        """
        # Floor division keeps the count an int even under true division.
        n_batches = n // batch_size

        for _ in range(n_batches):
            yield np.random.choice(n, size=batch_size)

    def fit(self, X, y, n_epoches = 1, batch_size=32,
            regularization_coef=1.0e-3, sparsness_penalty_coef=1.0e-3,
            learning_rate = 1.0):
        """Train the network; a generator that yields the history after each epoch.

        X, y: training inputs and one-hot targets, indexed by the sampled batches.
        n_epoches: number of epochs to run.
        batch_size: samples per batch.
        regularization_coef: weight of the L2 term.
        sparsness_penalty_coef: weight of the sparseness term.
        learning_rate: value fed to the adadelta update.

        Yields (losses, regs): float32 arrays of shape (epochs_done, n_batches)
        with the per-batch cross-entropy and regularization term. Both are
        views into buffers that later epochs keep filling.
        """
        # The compiled function expects float32 scalars.
        regularization_coef = np.float32(regularization_coef)
        learning_rate = np.float32(learning_rate)
        sparsness_penalty_coef = np.float32(sparsness_penalty_coef)

        n_samples = X.shape[0]
        n_batches = n_samples // batch_size
        losses = np.zeros(shape=(n_epoches, n_batches), dtype='float32')
        regs = np.zeros(shape=(n_epoches, n_batches), dtype='float32')

        for epoch in range(n_epoches):
            for i, indx in enumerate(self.batch_stream(n_samples, batch_size=batch_size)):
                losses[epoch, i], regs[epoch, i] = self.train(
                    X[indx], y[indx],
                    regularization_coef, sparsness_penalty_coef,
                    learning_rate
                )

            yield losses[:(epoch + 1)], regs[:(epoch + 1)]

In [None]:
# Build the network and compile its Theano functions, using the default
# sparseness target of 0.85.
deep_net = DeepSparseMNISTNet()

In [None]:
# Checkpoint the network before any training has run.
# NOTE(review): save() is inherited from mldm.Net, which is not visible here;
# presumably it writes the parameters to this .npz file -- confirm.
deep_net.save('deep-sparse-net-0.npz')

In [None]:
# Stage 1: 64 epochs, batches of 64, both penalty coefficients at 1e-5.
# fit() is a generator that yields the full history after every epoch, so the
# plot is refreshed once per epoch.
# NOTE(review): `reg` is the whole regularization term (sparseness penalty plus
# L2), although the watcher labels that curve 'sparseness penalty'.
watcher = NNWatcher(labels=('loss', 'sparseness penalty'), colors=('blue', 'red'))

for loss, reg in deep_net.fit(
    X, y, n_epoches=64, batch_size=64,
    learning_rate=1.0, 
    regularization_coef=1.0e-5, sparsness_penalty_coef=1.0e-5
):
    watcher.draw(loss, reg)

In [None]:
# Compile a forward pass and report accuracy on the test split; argmax turns
# both the one-hot targets and the predicted probabilities back into class ids.
predict = theano.function([deep_net.X_batch], deep_net.predictions)

y_proba = predict(X_test)
print 'accuracy:', np.mean(np.argmax(y_test, axis=1) == np.argmax(y_proba, axis=1))

In [None]:
# Stage 2: keep training the same network for 64 more epochs with a smaller
# learning rate (0.1) and an L2 coefficient of 1e-3.
# sparsness_penalty_coef is not passed here, so fit()'s default of 1.0e-3 applies.
watcher = NNWatcher(labels=('loss', 'sparseness penalty'), colors=('blue', 'red'))

for loss, reg in deep_net.fit(X, y, n_epoches=64, learning_rate=1.0e-1, batch_size=64, regularization_coef=1.0e-3):
    watcher.draw(loss, reg)

In [None]:
# Stage 3: 64 more epochs with larger batches (128), learning rate back at 1.0
# and an L2 coefficient of 2.5e-3.
# sparsness_penalty_coef is again left at fit()'s default of 1.0e-3.
watcher = NNWatcher(labels=('loss', 'sparseness penalty'), colors=('blue', 'red'))

for loss, reg in deep_net.fit(X, y, n_epoches=64, learning_rate=1.0, batch_size=128, regularization_coef=2.5e-3):
    watcher.draw(loss, reg)

In [None]:
# Checkpoint the network after the training stages above.
# NOTE(review): the earlier checkpoint is named 'deep-sparse-net-0.npz'; the
# 'low-parameter' name here looks carried over from another notebook -- confirm
# the intended filename.
deep_net.save('deep-low-parameter-net-1.npz')

In [None]:
# Compile the forward pass (inputs -> class probabilities) for evaluation.
predict = theano.function([deep_net.X_batch], deep_net.predictions)

In [None]:
# Test-set accuracy: fraction of samples whose arg-max predicted class equals
# the arg-max of the one-hot target.
y_proba = predict(X_test)
print 'accuracy:', np.mean(np.argmax(y_test, axis=1) == np.argmax(y_proba, axis=1))

In [None]:
# layers[0] is the input layer and layers[1] the flatten layer, so index 2 is
# the first SparseLayer (128 units).
l = deep_net.layers[2]

In [None]:
# Pull the layer's weight matrix out as a numpy array. SparseLayer registers W
# before b, so index 0 is presumably W -- shape (num_inputs, num_units),
# i.e. (784, 128) for this layer.
W = l.get_params()[0].get_value()

In [None]:
# Heat map of the whole weight matrix: one row per input pixel, one column per
# unit. aspect=0.1 squashes the tall matrix so it fits the wide figure.
plt.figure(figsize=(12, 6))
plt.imshow(W, interpolation='None', aspect = 0.1, cmap=plt.cm.viridis)
plt.colorbar()

In [None]:
import itertools

# Show the first n * n columns of W, each reshaped to a 28x28 image, on a
# shared colour scale.
n = 5
fig, ax = plt.subplots(n, n, figsize=(12, 12))

plt.suptitle('Filters of the first dense layer')

# The colour limits are taken from |W|, so the images must show |W| as well.
# Plotting the signed weights against these limits clipped every negative
# weight to the bottom colour.
# NOTE(review): if signed filters were intended instead, keep W[:, k] below and
# use symmetric limits (vmin=-vmax).
W_abs = np.abs(W)
vmin = np.min(W_abs)
vmax = np.max(W_abs)

for i, j in itertools.product(range(n), range(n)):
    k = i * n + j
    ax[i, j].imshow(
        W_abs[:, k].reshape(28, 28), interpolation='None',
        cmap=plt.cm.viridis, vmin=vmin, vmax=vmax
    )