# CNN with MNIST dataset

ref: deep learning with tensorflow

Dataset for MNIST will be downloaded from Yann LeCunn's Webiste, using a helper function.
The downloaded is stored so we dont have to redownload it anytime we run the script.
The raw downloaded data is stored as a raw string of bytes so we use np.frombuffer to convert it into a numpy array.
The data is further rescaled from 0 to 255 , to be between -0.5 and 0.5 to make it easy for the gradient descent algorithm.

The data is extracted and returned as \[tensor, height, width, channels\]

5000 of the images will be subset for validation.



In [1]:
import gzip
import os, sys, time, numpy

from six.moves import urllib, xrange
import tensorflow as tf

In [2]:
#Global variables

SOURCE_URL = "http://yann.lecun.com/exdb/mnist/"
WORK_DIR = "data"
IMAGE_SIZE = 28
NUM_CHANNELS = 1
PIXEL_DEPTH = 255
NUM_LABELS = 10
VALIDATION_SIZE = 5000
SEED = 66478
BATCH_SIZE = 64
NUM_EPOCHS = 10
EVAL_BATCH_SIZE = 64
EVAL_FREQUENCY = 100 # number of steps between evaluations

In [3]:
# Helper functions

def download(filename):
    # Function for downloading dataset and performing housecleaning before
    # if required
    if not os.path.exists(WORK_DIR):
        os.makedirs(WORK_DIR)
    filepath = os.path.join(WORK_DIR, filename)
    if not os.path.exists(filepath):
        filepath, _ = urllib.request.urlretrieve(SOURCE_URL + filename,
                                                filepath)
        size = os.stat(filepath).st_size
        print("Successfully downloaded", filename, size, 'bytes.')
    return filepath

def extract_data(filename, num_images):
    """
    Extract the images into a 4D tensor [image index, y, x, channels]
    
    values are rescaled from [0, 255] down to [-0.5, 0.5]
    """
    print("Extracting %s" %filename)
    with gzip.open(filename) as bytestream:
        #remove header
        bytestream.read(16)
        # Read bytes for labels
        buf = bytestream.read(IMAGE_SIZE * IMAGE_SIZE * num_images * NUM_CHANNELS)
        data  = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.float32)
        # Center data to have mean zero and unit range
        data = (data - (255/2.0))/255
        data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)
        return data
    
def extract_labels(filename, num_images):
    """Extract the labels into a vector of int64 label IDs"""
    print("Extracting %s" %filename)
    with gzip.open(filename) as bytestream:
        # remove header
        bytestream.read(8)
        # Read bytes for lalels
        buf = bytestream.read(num_images)
        labels = numpy.frombuffer(buf, dtype=numpy.uint8).astype(
        numpy.int64)
    return labels


def error_rate(predictions, labels):
    """
    Return  the error rate based on dense predictions and sparse labels
    """
    return 100.0 - (100.0 * 
                   numpy.sum(numpy.argmax(predictions, 1) == labels) / 
                   predictions.shape[0])

def model(data, train=False):
    """ Model Definition"""
    # 2D convolution, with 'SAME' padding so that output feature map
    # ha the same size as the input. Strides has the same array dimensions
    # as the input [image index, y, x, depth]
    
    conv = tf.nn.conv2d(data,
                        conv1_weights,
                        strides=[1,1,1,1],
                        padding='SAME')
    # Bias and rectified linear non-linearity
    relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases))
    # Max pooling. The kernel size spec(ksize) also follows the layout of the 
    # data. We use a pooling window of 2, and a stride of 2
    
    pool = tf.nn.max_pool(relu,
                         ksize=[1,2,2,1],
                         strides=[1,2,2,1],
                         padding='SAME')
    
    conv = tf.nn.conv2d(pool,
                        conv2_weights,
                        strides=[1,1,1,1],
                        padding='SAME')

    relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases))
    pool = tf.nn.max_pool(relu,
                         ksize=[1,2,2,1],
                         strides=[1,2,2,1],
                         padding='SAME')
    #Reshape the feature map cuboid into a 2D matrix to feed to the fully connected
    # layers
    pool_shape = pool.get_shape().as_list()
    reshape = tf.reshape(pool,
                        [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]])
    # Fully connected layer. Note that '+' operation automatically broadcasts
    # the biases
    hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases)
    # Add a 50% dropout during training only. Dropout also scales activations
    # such that no rescaling is needed at evaluation time
    if train:
        hidden = tf.nn.dropout(hidden, 0.5, seed=SEED)
    return tf.matmul(hidden, fc2_weights) + fc2_biases


# utility function to evaluate a dataset by feeding batches of data to 
# {eval_data} and pulling the results from {eval_predictions}.
# saves memory and enables running on GPUs
def eval_in_batches(data, sess):
    """
    Get predictions for a dataset by running it in small batches.
    """
    size = data.shape[0]
    if size < EVAL_BATCH_SIZE:
        raise ValueError("batch size for evals larger than dataset: %d"
                        % size)
    predictions = numpy.ndarray(shape=(size, NUM_LABELS),
                               dtype=numpy.float32)
    for begin in xrange(0, size, EVAL_BATCH_SIZE):
        end = begin + EVAL_BATCH_SIZE
        if end <= size:
            predictions[begin:end,:] = sess.run(eval_prediction,
                                                feed_dict={eval_data : data[begin:end, ...]})
        else:
            batch_predictions = sess.run(eval_prediction,
                                         feed_dict={eval_data: data[-EVAL_BATCH_SIZE:,...]})
            predictions[begin:,:] = batch_predictions[begin - size:, :]
    return predictions
    

The architecture for the CNN will use 2 convolutional layers interspersed with 2 pooling layers and then end with two fully connected layers.

We create weights only for the convolutional and fully connected layers and not for the pooling layers as they dont learn any weights.

Separate tensors are created to hold the weights for each of the relevant layers. Values are also created for the biases, both for the convolution and fully connected layers.

The input images are of size=28. During the each of the pooling stages, the current input size is reduced by 2. So the shape of the first fully connected layer is created accordingly.
The first fully connected layer converts the output of the convolutional layer to a vector of length 512.


The shape of the second fully connected label is created knowing we want a 10-way classification ouput. (512,10).

when defining the architecture, a dropout layer is added after the final fully connected(FCN) layer, and a check if performed to ensure this included only for the training and not for the predictions

In [4]:
# Get the data
train_data_filename = download('train-images-idx3-ubyte.gz')
train_labels_filename = download('train-labels-idx1-ubyte.gz')
test_data_filename = download('t10k-images-idx3-ubyte.gz')
test_labels_filename = download('t10k-labels-idx1-ubyte.gz')

# Extract data into numpy arrats
train_data = extract_data(train_data_filename, 60000)
train_labels = extract_labels(train_labels_filename, 60000)
test_data = extract_data(test_data_filename, 10000)
test_labels = extract_labels(test_labels_filename, 10000)

# Generate a validation set
validation_data = train_data[:VALIDATION_SIZE, ...]
validation_labels = train_labels[:VALIDATION_SIZE]
train_data = train_data[VALIDATION_SIZE:, ...]
train_labels = train_labels[VALIDATION_SIZE:]


Extracting data/train-images-idx3-ubyte.gz
Extracting data/train-labels-idx1-ubyte.gz
Extracting data/t10k-images-idx3-ubyte.gz
Extracting data/t10k-labels-idx1-ubyte.gz


We create placeholders for inputting the training images and corresponding labels.

A separate placeholder is created also for the evaluation, to allow to input larger batches during the evaluation.

In [7]:
num_epochs = NUM_EPOCHS
train_size = train_labels.shape[0]

# This is where training samples and labels are fed to the graph.
# These placeholder nodes will be fed a batch of training data at each
# training step using the {feed_dict} argument to the Run() call

train_data_node = tf.placeholder(tf.float32,
                                shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE,))
eval_data = tf.placeholder(tf.float32, 
                           shape=(EVAL_BATCH_SIZE,IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))

# The variables below hold all the trainable weights. They are passed an initial
# value which will be assigned when we call  run on the initializer method

# 5x5 filter, depth 32
conv1_weights = tf.Variable(tf.truncated_normal([5,5,NUM_CHANNELS,32],
                                                stddev=0.1,
                                                seed=SEED, dtype=tf.float32))
conv1_biases = tf.Variable(tf.zeros([32], dtype=tf.float32))
conv2_weights = tf.Variable(tf.truncated_normal([5,5,32,64], 
                                                stddev=0.1,
                                                seed=SEED, 
                                                dtype=tf.float32))
conv2_biases = tf.Variable(tf.constant(0.1, shape=[64],
                                       dtype=tf.float32))
# fully connected, depth 512
fc1_weights = tf.Variable(tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512],
                         stddev=0.1,
                         seed=SEED,
                         dtype=tf.float32))
fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=tf.float32))
fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_LABELS],
                                              stddev=0.1,
                                              seed=SEED,
                                              dtype=tf.float32))
fc2_biases = tf.Variable(tf.constant(0.1, shape=[NUM_LABELS], dtype=tf.float32))



In [8]:
# Training computation: logits + cross-entropy loss
logits = model(train_data_node, True)
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=train_labels_node,
                                                                    logits=logits))

# L2 regularization for the fully connected parameters
regularizers = (tf.nn.l2_loss(fc1_weights) 
               + tf.nn.l2_loss(fc1_biases)
               + tf.nn.l2_loss(fc2_weights)
               + tf.nn.l2_loss(fc2_biases))

# Add the regularization term to the loss
loss += 5e-4 * regularizers

# optimizer: set up a variable that's incrememted once per batch and controls
# the learning rate decay
batch = tf.Variable(0, dtype=tf.float32)
# Decay once per epoch, using an exponential schedule starting at 0.01
learning_rate = tf.train.exponential_decay(0.01, 
                                           batch * BATCH_SIZE,
                                           train_size,
                                          0.95,
                                          staircase=True)

# use simple momentum for the optimization
optimizer = tf.train.MomentumOptimizer(learning_rate,
                                       0.9).minimize(loss,
                                                    global_step=batch)

# Predictions for the current training minibatch
train_prediction = tf.nn.softmax(logits)

# predictions for the test and validation, which we will compute less often
eval_prediction = tf.nn.softmax(model(eval_data))


For the training we start by defining a session and initializing the variables. We then proceed with the training in batches of the input, calculating the offset and incrementing as we train.
We subset the data and corresponding to be supplied for the current training using the offset and the batch size. 

A dictionary is created to hold these data as a feed_dict and supplied to the optimizer. This is the process of minibatching.

An evaluation is done at each of the evaluation frequency defined earlier. The evaluation is done on the mini batches as our network is crafted this way.

Note that the learning rate is reduced exponentially as the training progresses.

We add a little instrumentation to track the performance of our model as we train based on the validation set as well.

In [12]:
# create a local session to run the training
start_time = time.time()
with tf.Session() as sess:
    # run all the initializers to prepare the trainable parameters
    tf.global_variables_initializer().run()
    # Loop through training steps
    for step in xrange(int(num_epochs * train_size) // BATCH_SIZE):
        # compute the offset of the current minibatch in the data
        offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE)
        batch_data = train_data[offset:(offset + BATCH_SIZE), ...]
        batch_labels = train_labels[offset:(offset + BATCH_SIZE)]
        # This dictionary maps the batch data (as a numpy array) to the node 
        # in the graph it should be fed to
        feed_dict = {train_data_node: batch_data,
                     train_labels_node : batch_labels}
        # Run the optimizer to update weights
        sess.run(optimizer, feed_dict=feed_dict)
        # print some extra information once we reach the evaluation frequency
        if step % EVAL_FREQUENCY == 0:
            # fetch some extra nodes' data
            l, lr, predictions = sess.run([loss, learning_rate,
                                           train_prediction],
                                         feed_dict=feed_dict)
            elapsed_time = time.time() - start_time
            start_time = time.time()
            print("Step %d (epoch %.2f), %.1f ms" % (step, float(step) * BATCH_SIZE / train_size,
                                                    1000 * elapsed_time / EVAL_FREQUENCY))
            print("Minibatch loss: %.3f, learning rate: %.6f" % (l, lr))
            print("Minibatch error: %.1f%%" %error_rate(predictions, batch_labels))
            print("Validation error: %.1f%%" % error_rate(eval_in_batches(validation_data,
                                                                         sess),
                                                         validation_labels))
            sys.stdout.flush()
        #Finally print the result
        test_error = error_rate(eval_in_batches(test_data, sess), test_labels)
        print('Test error: %.1f%%' %test_error)

Step 0 (epoch 0.00), 4.9 ms
Minibatch loss: 8.334, learning rate: 0.010000
Minibatch error: 85.9%
Validation error: 84.6%
Test error: 84.0%
Test error: 75.1%
Test error: 78.4%
Test error: 69.1%
Test error: 66.0%
Test error: 66.0%
Test error: 67.8%
Test error: 64.4%
Test error: 55.9%
Test error: 58.4%
Test error: 55.2%
Test error: 51.1%
Test error: 48.0%
Test error: 44.6%
Test error: 41.4%
Test error: 38.1%
Test error: 35.7%
Test error: 32.7%
Test error: 30.3%
Test error: 27.8%
Test error: 25.6%
Test error: 24.4%
Test error: 24.4%
Test error: 24.2%
Test error: 23.7%
Test error: 22.7%
Test error: 22.3%
Test error: 20.6%
Test error: 18.8%
Test error: 18.1%
Test error: 17.2%
Test error: 16.1%
Test error: 15.7%
Test error: 15.7%
Test error: 15.7%
Test error: 15.5%
Test error: 14.6%
Test error: 13.7%
Test error: 13.7%
Test error: 14.3%
Test error: 15.2%
Test error: 15.2%
Test error: 14.0%
Test error: 12.4%
Test error: 11.8%
Test error: 11.6%
Test error: 12.2%
Test error: 12.6%
Test error: 12

Test error: 2.3%
Test error: 2.3%
Test error: 2.3%
Test error: 2.4%
Test error: 2.4%
Test error: 2.6%
Test error: 2.8%
Test error: 3.0%
Test error: 3.0%
Test error: 3.0%
Test error: 3.0%
Test error: 2.9%
Test error: 2.9%
Test error: 2.9%
Test error: 3.0%
Test error: 2.9%
Test error: 2.8%
Test error: 2.9%
Test error: 2.8%
Test error: 2.8%
Test error: 2.7%
Test error: 2.6%
Test error: 2.5%
Test error: 2.5%
Test error: 2.5%
Test error: 2.4%
Test error: 2.5%
Test error: 2.4%
Test error: 2.5%
Test error: 2.5%
Test error: 2.5%
Test error: 2.5%
Test error: 2.5%
Test error: 2.5%
Test error: 2.4%
Test error: 2.4%
Test error: 2.3%
Test error: 2.4%
Test error: 2.4%
Test error: 2.3%
Test error: 2.3%
Test error: 2.2%
Test error: 2.2%
Test error: 2.1%
Test error: 2.1%
Test error: 2.1%
Test error: 2.1%
Test error: 2.2%
Test error: 2.2%
Test error: 2.1%
Test error: 2.1%
Test error: 2.2%
Test error: 2.2%
Test error: 2.2%
Test error: 2.3%
Test error: 2.3%
Test error: 2.4%
Test error: 2.5%
Test error: 2.

Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Step 900 (epoch 1.05), 3408.8 ms
Minibatch loss: 2.908, learning rate: 0.009500
Minibatch error: 1.6%
Validation error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.6%
Test error: 1.6%
Test error: 1.6%
Test error: 1.7%
Test error: 1.8%
Test error: 1.7%
Test error: 1.8%
Test error: 1.8%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.8%
Test error: 1.8%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.7%
Test error: 1.8%
Test error: 1.7%
Test error: 1.7%
Test error: 1.8%
Test error: 1.8%
Test error: 1.9%
Test error: 1.8%
Test error: 1.8%
Test error: 1.7%
Test error: 1.6%
Test error: 1.6%
Test error: 1.7%
Test err

Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.3%
Test error: 1.3%
Test error: 1.3%
Test error: 1.3%
Test error: 1.3%
Test error: 1.3%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.5%
Test error: 1.5%
Test error: 1.5%
Test error: 1.5%
Test error: 1.5%
Test error: 1.5%
Test error: 1.5%
Test error: 1.5%
Test error: 1.5%
Test error: 1.5%
Test error: 1.5%
Test error: 1.5%
Test error: 1.5%
Test error: 1.5%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.5%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.3%
Test error: 1.3%
Test error: 1.4%
Test error: 1.4%
Test error: 1.4%
Test error: 1.

KeyboardInterrupt: 