# MNIST (Hand-written Digit) Classification

First, import necessary libraries

In [1]:
import tensorflow as tf
import numpy as np
import scipy.misc
import os
import shutil

from tensorflow.contrib.tensorboard.plugins import projector
from tensorflow.examples.tutorials.mnist import input_data

Download the MNIST data.

In [2]:
# Load the MNIST splits with one-hot encoded labels.
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
# Length of one flattened image vector (number of pixels).
mnist_dim = mnist.train.images.shape[1]
# Side length of one image, assuming square images. The builtin int() replaces
# np.int, which was only an alias for it and has been removed from NumPy.
mnist_size = int(np.sqrt(mnist_dim))
# Number of classes, i.e. the width of the one-hot label matrix.
mnist_classes = mnist.train.labels.shape[1]

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


## Log Directory
TensorFlow needs a log directory to write summaries and save checkpoints. We can log the results of different runs into different subdirectories (e.g. `./log/run0001`), and we can view the results from all runs by launching TensorBoard with `--logdir=PARENT_LOG_DIR` (e.g. `--logdir=./log`).

In [3]:
def logdir(mkdir=False):
    """Picks the first unused run directory under ./log.

    Candidate paths have the form ./log/run#### (zero-padded, starting at
    run0000) and are probed in increasing order; the first one that does not
    exist on disk is selected.

    Args:
        mkdir (bool): When true, the selected directory is created before
            returning. When false, only the path is returned.

    Returns:
        string: Absolute path of the selected run directory, or None when all
            candidate paths already exist.

    """
    run_limit = 10000
    root = os.path.abspath('./log')
    candidates = (
        os.path.join(root, 'run{:04d}'.format(index))
        for index in range(run_limit)
    )
    # next() stops at the first free slot, so no later candidates are probed.
    free_path = next((path for path in candidates if not os.path.exists(path)), None)
    if free_path is not None and mkdir:
        os.makedirs(free_path)
    return free_path

We will have two summary writers: one for the train dataset and one for the validation dataset. 

In [4]:
# Reserve a fresh run directory. logdir() returns None when it cannot find a
# free slot; fail loudly here instead of letting os.path.join() choke on None.
LOGDIR = logdir(mkdir=True)
if LOGDIR is None:
    raise RuntimeError('could not allocate a new log directory under ./log')
# One writer per dataset split, each writing into its own subdirectory of LOGDIR.
train_writer = tf.summary.FileWriter(os.path.join(LOGDIR, 'train'))
test_writer = tf.summary.FileWriter(os.path.join(LOGDIR, 'test'))
print('log dir is: {0}'.format(LOGDIR))

## Defining Layers
We will build a feed-forward neural network for MNIST classification. Our network architecture will be composed of a convolution-max-pooling layer, a fully-connected layer, and a readout layer (predicting the label). It is more organized to define a function for creating each kind of layer than to put all the graph construction code in one place. Below, we define:
- readout layer
- fully-connected (fc) layer
- convolution-max-pooling (cp) layer

Note that in each function, we use the `variable_scope` so the computation nodes and the variables in the graph will be organized by the layer. Also, we create a histogram summary of the layer weights, biases and activations.

<br>
<font color=red>__EXERCISE: Implement 'add_cp_layer'__</font>

In [5]:
def add_readout_layer(input_tensor, name=None):
    """Builds the final classification (readout) layer.

    The layer applies an affine map to its input and exposes both the raw
    logits and their softmax:

            logits = input * W + b
            output = softmax(logits)

    Histogram summaries are recorded for W, b and the logits.

    Args:
        input_tensor (tensor): 2D tensor of shape [batch_size, input_dim].
        name (string): Variable scope name. Falls back to 'readout_layer'.

    Returns:
        (logits, softmax): a pair of 2D tensors, each of shape
            [batch_size, num_classes].

    """
    n_inputs = input_tensor.shape[1]
    with tf.variable_scope(name, 'readout_layer', [input_tensor]):
        weights = tf.get_variable(
            'W',
            shape=[n_inputs, mnist_classes],
            initializer=tf.truncated_normal_initializer())
        bias = tf.get_variable(
            'b',
            shape=[1, mnist_classes],
            initializer=tf.zeros_initializer())
        # bias has shape [1, num_classes] and broadcasts over the batch axis.
        logits = tf.add(tf.matmul(input_tensor, weights), bias, name='logits')
        for tag, tensor in (('W', weights), ('b', bias), ('logits', logits)):
            tf.summary.histogram(tag, tensor)
        probabilities = tf.nn.softmax(logits, name='softmax')
        return logits, probabilities

In [6]:
def add_fc_layer(input_tensor, output_dim, activation_fn=tf.nn.relu, name=None):
    """Builds a fully-connected (dense) layer.

    The layer maps a [batch_size, input_dim] tensor to a
    [batch_size, output_dim] tensor via

            output = activation_fn(input * W + b)

    Histogram summaries are recorded for W, b and the activations.

    Args:
        input_tensor (tensor): 2D tensor of shape [batch_size, input_dim].
        output_dim (int): Number of output units.
        activation_fn (element-wise function): TensorFlow element-wise function
            applied to the affine output; it is called with a `name` keyword.
            Defaults to ReLU.
        name (string): Variable scope name. Falls back to 'fc_layer'.

    Returns:
        Tensor: of shape [batch_size, output_dim]

    """
    n_inputs = input_tensor.shape[1]
    with tf.variable_scope(name, 'fc_layer', [input_tensor]):
        weights = tf.get_variable(
            'W',
            shape=[n_inputs, output_dim],
            initializer=tf.truncated_normal_initializer())
        bias = tf.get_variable(
            'b',
            shape=[1, output_dim],
            initializer=tf.zeros_initializer())
        # bias has shape [1, output_dim] and broadcasts over the batch axis.
        pre_activation = tf.matmul(input_tensor, weights) + bias
        activations = activation_fn(pre_activation, name='activations')
        for tag, tensor in (('W', weights), ('b', bias), ('activations', activations)):
            tf.summary.histogram(tag, tensor)
        return activations

In [7]:
def add_cp_layer(input_tensor, out_channels, pool_size, filter_size=3, activation_fn=tf.nn.relu, name=None):
    """Constructs a convolution-max-pooling layer (exercise stub).

    Intended behavior: convolve the input with filters and then perform max-pooling. The input shape
    should be [batch_size, height, width, channels]. The output tensor shape will be [batch_size,
    out_height, out_width, out_channels] where out_height and out_width depend on the pool_size and
    filter_size.

    NOTE: the body is left as an exercise. As written, the function only opens the variable scope and
    returns None.

    Args:
        input_tensor (tensor): 4D tensor of shape [batch_size, height, width, channels].
        out_channels (int): An integer specifying the number of output channels.
        pool_size (int): The max pooling window is pool_size x pool_size.
        filter_size (int): The filter shape is filter_size x filter_size.
        activation_fn (element-wise function): A TensorFlow element-wise function for the
            layer activation function. Default to ReLU.
        name (string): Name for the layer variable scope. Default to 'cp_layer'.

    Returns:
        Tensor: of shape [batch_size, out_height, out_width, out_channels] once implemented;
            None in the current stub.

    """
    # Number of input channels (last axis of the input); not yet used by the stub below.
    in_channels = input_tensor.shape[3]
    with tf.variable_scope(name, 'cp_layer', [input_tensor]):
        #
        # Implement add_cp_layer here.
        #
        
        output_tensor = None
        
        #
        # Also add histogram summaries of the layer parameters and activations
        #
        
        return output_tensor

## Defining Metrics and Training
In the TensorFlow graph, we also need to define metrics and a training op.

Metrics are used to measure how good our model is. Here we define two metrics: __cross-entropy loss__ and __accuracy__. Cross-entropy loss measures the difference in distribution between the classification predicted by the model and the ground-truth labels. This loss is optimized during training. Accuracy measures the ratio of correct predictions to the number of samples.

The training op is created from an __Adam__ optimizer. This optimizer usually gives a fast convergence. We can experiment with different optimizers and see which one gives the best result.



In [8]:
def add_metrics(onehot_labels, logits, name=None):
    """Builds the loss and accuracy metrics and their scalar summaries.

    The loss is the softmax cross-entropy between labels and logits, computed
    with a label smoothing factor of 0.01. The accuracy is the mean of the
    per-sample matches between the argmax of the labels and of the logits.

    Args:
        onehot_labels (tensor): 2D tensor of shape [batch_size, num_classes].
            Ground-truth labels in one-hot format.
        logits (tensor): 2D tensor of shape [batch_size, num_classes]. Logits of
            the softmax prediction.
        name (string): Name for the scope. Falls back to 'metrics'.

    Returns:
        Dictionary: maps 'loss' to the cross-entropy loss and 'accuracy' to the
            accuracy. Both metrics are scalar tensors.

    """
    with tf.name_scope(name, 'metrics', [onehot_labels, logits]):
        # Cross-entropy loss (the quantity optimized during training).
        with tf.name_scope('loss', values=[onehot_labels, logits]):
            loss = tf.losses.softmax_cross_entropy(
                onehot_labels, logits, label_smoothing=0.01)
            tf.summary.scalar('loss', loss)

        # Accuracy: fraction of samples whose predicted class matches the label.
        with tf.name_scope('accuracy', values=[onehot_labels, logits]):
            true_classes = tf.argmax(onehot_labels, axis=-1, name='labels')
            predicted_classes = tf.argmax(logits, axis=-1, name='predictions')
            hits = tf.cast(tf.equal(true_classes, predicted_classes), tf.float32)
            accuracy = tf.reduce_mean(hits)
            tf.summary.scalar('accuracy', accuracy)

        return {'loss': loss, 'accuracy': accuracy}

In [9]:
def add_training(loss, global_step, name=None):
    """Creates the training op that minimizes `loss` with an Adam optimizer.

    Args:
        loss (tensor): Scalar tensor to minimize.
        global_step (variable): Step counter handed to `minimize`.
        name (string): Name for the scope. Falls back to 'training'.

    Returns:
        The op returned by `AdamOptimizer.minimize`, built with the optimizer's
            default hyper-parameters.

    """
    with tf.name_scope(name, 'training', [loss, global_step]):
        adam = tf.train.AdamOptimizer()
        train_op = adam.minimize(loss, global_step)
    return train_op

## Embedding Visualization
It's difficult to understand how exactly a neural network can achieve its objective (in this case classifying MNIST digits). One way to help us understand a neural network is to visualize its hidden states. Here we will visualize its hidden states, which have high dimensionality, by projecting them onto a lower dimension space (2D or 3D). A TensorBoard plugin called __Projector__ allows us to project a set of vectors (in this case hidden states) onto their PCA space, or alternatively embeds them in a low dimensional space using tSNE technique.

We can think of our neural network as follows. The layers up to and including the last hidden layer transform the MNIST images into a space where different digits are easily separable. The readout layer works in this space, so it can accurately label the digits.

The two functions defined below are for embedding visualization. The first function, `create_projector_meta`, creates metadata for the embeddings (the corresponding images and labels). The second function, `add_projector`, creates a config for embedding visualization for the given variables containing the embeddings.

Later, in the graph construction code and the model training code, we will compute the embeddings (last hidden states) of the validation dataset and visualize them. Also, we will compare the separability of the embeddings to that of the raw MNIST images.

In [10]:
def create_projector_meta(directory, mnist):
    """Writes the embedding metadata (labels and sprite image) for a dataset.

    Two files are created inside `directory`:
      * metadata.tsv - one class index per line, in dataset order.
      * sprite.png   - a square grid holding every image, filled row by row,
                       with intensities inverted (1.0 - pixel).

    Args:
        directory (string): Existing directory to write the two files into.
        mnist: Dataset split exposing `labels` (one-hot, one row per sample) and
            `images` (one flattened mnist_size x mnist_size image per row;
            values presumably in [0, 1] - confirm against the data loader).

    """
    labels = np.argmax(mnist.labels, axis=1)
    count = labels.size

    with open(os.path.join(directory, 'metadata.tsv'), 'w') as f:
        f.writelines(str(label) + '\n' for label in labels)

    # Smallest square grid that can hold all images. The builtin int() replaces
    # np.int, which was only an alias for it and has been removed from NumPy.
    sprite_size = int(np.ceil(np.sqrt(count)))
    # Start from an all-white canvas (1.0) so that grid cells beyond `count`
    # are blank instead of the uninitialized memory np.empty would leave.
    sprite_im = np.ones([sprite_size * mnist_size] * 2, dtype=np.float32)
    for i in range(count):
        grid_row, grid_col = divmod(i, sprite_size)
        row = grid_row * mnist_size
        col = grid_col * mnist_size
        im = 1.0 - np.reshape(mnist.images[i, :], [mnist_size, mnist_size])
        sprite_im[row:row+mnist_size, col:col+mnist_size] = im
    # NOTE(review): scipy.misc.toimage was removed in SciPy 1.2; this line needs
    # an older SciPy (with Pillow installed) or a port to PIL.Image.
    scipy.misc.toimage(sprite_im, cmin=0.0, cmax=1.0).save(os.path.join(directory, 'sprite.png'))


# Metadata lives next to the validation summaries written by test_writer.
create_projector_meta(os.path.join(LOGDIR, 'test'), mnist.validation)

In [11]:
def add_projector(embedding_vars, writer):
    """Registers embedding variables with the TensorBoard Projector plugin.

    One projector entry is added per variable. Every entry points at the same
    'metadata.tsv' and 'sprite.png' files and declares each sprite thumbnail
    to be mnist_size x mnist_size.

    Args:
        embedding_vars (iterable): Variables holding the embeddings; each one's
            `name` is used as the entry's tensor name.
        writer: Summary writer passed to `projector.visualize_embeddings`
            together with the assembled config.

    """
    projector_config = projector.ProjectorConfig()
    for var in embedding_vars:
        entry = projector_config.embeddings.add()
        entry.tensor_name = var.name
        entry.metadata_path = 'metadata.tsv'
        sprite = entry.sprite
        sprite.image_path = 'sprite.png'
        sprite.single_image_dim.extend([mnist_size, mnist_size])
    projector.visualize_embeddings(writer, projector_config)

## Building TensorFlow Graph
Here the TensorFlow graph is constructed. The network architecture here is 1 convolution-max-pooling layer followed by 1 fully-connected layer, and finally a readout layer. Feel free to experiment with different architectures for the best result.

Note that at the end we create a saver object for saving model checkpoints.

<br>
<font color=red>__EXERCISE: Modify the graph construction code to use a convolution-max-pooling layer to improve the model accuracy.__</font>

In [12]:
# NOTE: img_dim, img_size and num_labels are not used by the graph below, which
# relies on mnist_dim / mnist_size / mnist_classes. They are kept in case other
# cells reference them.
img_dim = 784
img_size = np.sqrt(img_dim)
num_labels = 10
# Width of the fully-connected hidden layer (also the embedding size for fc1).
fc1_dim = 64

g = tf.Graph()

with g.as_default():
    global_step = tf.Variable(0, trainable=False, name='global_step')
    images = tf.placeholder(dtype=tf.float32, shape=[None, mnist_dim], name='images')
    labels = tf.placeholder(dtype=tf.float32, shape=[None, mnist_classes], name='labels')
    # Image summary of the input batch, reshaped back to 2D single-channel images.
    tf.summary.image('MNIST', tf.reshape(images, [-1, mnist_size, mnist_size, 1]))

    # Forward computations
    with tf.variable_scope('forward_computation', values=[images, labels]):
        fc1 = add_fc_layer(images, fc1_dim)
        logits, output_softmax = add_readout_layer(fc1)

    # Metrics and train_op
    metrics = add_metrics(labels, logits)
    train_op = add_training(metrics['loss'], global_step)

    # Projector visualization. The embedding variables are sized for the whole
    # validation set, so emb_op must be run with exactly that set fed to `images`.
    # An initializer *instance* is passed, as documented for variable scopes,
    # rather than the initializer class itself.
    with tf.variable_scope('projector', values=[fc1], initializer=tf.zeros_initializer()):
        num_embs = mnist.validation.images.shape[0]
        img_embs = tf.get_variable(dtype=tf.float32, name='img_embs',
                                   shape=[num_embs, mnist_dim])
        fc1_embs = tf.get_variable(dtype=tf.float32, name='fc1_embs',
                                   shape=[num_embs, fc1_dim])
        # Snapshot both the raw images and the hidden activations in one op.
        emb_op = tf.group(img_embs.assign(images),
                          fc1_embs.assign(fc1))
        add_projector([img_embs, fc1_embs], test_writer)

    init_op = tf.global_variables_initializer()
    summary = tf.summary.merge_all()
    # Saver for model checkpoints; created after every variable above exists.
    saver = tf.train.Saver()

train_writer.add_graph(g)

## Model Training
Before the training loop, notice that we restore our model from the latest checkpoint. Hence, this code block can be interrupted to stop the training, and rerun to continue training.

In the training loop, the test summary (metrics on validation dataset) is written every 200 steps. The embedding visualization is updated every 200 steps as well. Notice that the loop breaks with the condition on the global step instead of a local step. This keeps the step counting accurate even if we stop and continue training. This is also beneficial when we train on a cluster with parameter servers and several trainers.

In [15]:
MAX_STEPS = 50000
BATCH_SIZE = 200
SAVE_NAME = 'model.ckpt'
SAVE_PATH = os.path.join(LOGDIR, SAVE_NAME)
sess = tf.Session(graph=g)
sess.run(init_op)

# Resume from the most recent checkpoint in LOGDIR, if any. This overrides the
# freshly initialized values above, including global_step.
last_ckpt_path = tf.train.latest_checkpoint(LOGDIR)
if last_ckpt_path:
    saver.restore(sess, last_ckpt_path)

# The writers are closed at the end of this cell; reopen them so the cell can
# be run again.
train_writer.reopen()
test_writer.reopen()

try:
    for _ in range(MAX_STEPS):
        # Drive the loop by the persisted global step, not the local counter,
        # so the step count stays correct across interrupt-and-resume cycles.
        i = sess.run(global_step)
        if i > MAX_STEPS:
            break
        train_images, train_labels = mnist.train.next_batch(batch_size=BATCH_SIZE, shuffle=True)
        feed_dict = {images:train_images, labels:train_labels}
        _, train_summary = sess.run([train_op, summary], feed_dict=feed_dict)
        train_writer.add_summary(train_summary, i)
        if i % 200 == 0 or i == MAX_STEPS:
            print(i)
            # Evaluate on the full validation set and refresh the projector
            # embeddings (emb_op) in the same run.
            val_images, val_labels = mnist.validation.images, mnist.validation.labels
            feed_dict = {images:val_images, labels:val_labels}
            _, test_summary = sess.run([emb_op, summary], feed_dict=feed_dict)
            test_writer.add_summary(test_summary, i)
            train_writer.flush()
            test_writer.flush()
            saver.save(sess, SAVE_PATH, global_step=i)
finally:
    # Close the writers even if training is interrupted (e.g. by stopping the
    # cell), so they are not left open.
    train_writer.close()
    test_writer.close()

INFO:tensorflow:Restoring parameters from D:\ownCloud\Techtalk\tensorflow\exercise2\log\run0001\model.ckpt-16000
16200
16400
16600
16800
17000
17200
17400
17600
17800
18000
18200
18400
18600
18800
19000
19200
19400
19600
19800
20000
20200
20400
20600
20800
21000
21200
21400
21600
21800
22000
22200
22400
22600
22800
23000
23200
23400
23600
23800
24000
24200
24400
24600
24800
25000
25200
25400
25600
25800
26000
26200
26400
26600
26800
27000
27200
27400
27600
27800
28000
28200
28400
28600
28800
29000
29200
29400
29600
29800
30000
30200
30400
30600
30800
31000
31200
31400
31600
31800
32000
32200
32400
32600
32800
33000
33200
33400
33600
33800
34000
34200
34400
34600
34800
35000
35200
35400
35600
35800
36000
36200
36400
36600
36800
37000
37200
37400
37600
37800
38000
38200
38400
38600
38800
39000
39200
39400
39600
39800
40000
40200
40400
40600
40800
41000
41200
41400
41600
41800
42000
42200
42400
42600
42800
43000
43200
43400
43600
43800
44000
44200
44400
44600
44800
45000
45200
45400
45600