Training a convolutional neural network to classify glyphs

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import os
import tensorflow as tf
import matplotlib.pyplot as plt
from six.moves import cPickle as pickle
from six.moves import range

In [2]:
# Location of the pickled notMNIST splits.
data_root = os.path.join('.', 'notMNIST_data') # Change me to store data elsewhere
pickle_file = os.path.join(data_root, 'notMNIST.pickle')

# NOTE: unpickling can execute arbitrary code -- only load a pickle file
# that you produced yourself or otherwise trust.
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)

# Pull the three splits out of the loaded dictionary.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # hint to help gc free up memory

# Report the shape of every split as a quick sanity check.
for split_name, split_data, split_labels in (
        ('Training set', train_dataset, train_labels),
        ('Validation set', valid_dataset, valid_labels),
        ('Test set', test_dataset, test_labels)):
    print(split_name, split_data.shape, split_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat the data into a TensorFlow-friendly shape:
- convolutions need each image formatted as a cube (height by width by #channels), with the images stacked along a leading batch axis
- labels are converted to float one-hot encodings.

In [3]:
image_size = 28
num_labels = 10
num_channels = 1 # grayscale

def reformat(dataset, labels):
    """Reshape images for convolution and one-hot encode the labels.

    Args:
        dataset: array whose elements can be reshaped to
            (-1, image_size, image_size, num_channels).
        labels: either a 1-D array of integer class ids, or a 2-D array of
            shape (N, num_labels) that is already one-hot encoded.

    Returns:
        A (dataset, labels) tuple of float32 arrays with shapes
        (N, image_size, image_size, num_channels) and (N, num_labels).

    Raises:
        ValueError: if `labels` is neither 1-D nor of shape (N, num_labels).
    """
    dataset = dataset.reshape(
        (-1, image_size, image_size, num_channels)).astype(np.float32)
    labels = np.asarray(labels)
    if labels.ndim == 1:
        # Broadcasting compares every label against each class id, giving one
        # row per example with a single 1.0 in the column of its class.
        labels = (np.arange(num_labels) == labels[:, None]).astype(np.float32)
    elif labels.ndim == 2 and labels.shape[1] == num_labels:
        # Already one-hot (e.g. the cell was run a second time on its own
        # output): pass through instead of silently building a 3-D array.
        labels = labels.astype(np.float32)
    else:
        raise ValueError(
            'labels must have shape (N,) or (N, %d), got %s'
            % (num_labels, labels.shape))
    return dataset, labels
# Convert every split in turn; the original names are rebound to the
# reshaped images and one-hot labels.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

# Show the new shapes so the conversion can be checked at a glance.
for split_name, split_data, split_labels in (
        ('Training set', train_dataset, train_labels),
        ('Validation set', valid_dataset, valid_labels),
        ('Test set', test_dataset, test_labels)):
    print(split_name, split_data.shape, split_labels.shape)

Training set (200000, 28, 28, 1) (200000, 10)
Validation set (10000, 28, 28, 1) (10000, 10)
Test set (10000, 28, 28, 1) (10000, 10)


In [4]:
def accuracy(predictions, labels):
    """Return the percentage of rows whose arg-max class matches the label.

    Both arguments are 2-D arrays with one row per example; the class of a
    row is the index of its largest entry.
    """
    predicted_classes = np.argmax(predictions, 1)
    true_classes = np.argmax(labels, 1)
    num_correct = np.sum(predicted_classes == true_classes)
    num_examples = predictions.shape[0]
    return 100.0 * num_correct / num_examples

In [5]:
# Hyperparameters.
batch_size = 16       # examples per training step
patch_size = 5        # side length of the square convolution kernels
depth = 16            # output channels of each convolution layer
num_hidden = 64       # units in each fully connected hidden layer
learning_rate = 0.001

# Output locations: summaries go to log_dir, checkpoints go to save_dir.
log_dir = os.path.join(data_root, 'logs_v7')
save_dir = os.path.join(data_root, 'save_v7')

# Create whichever output directory is missing, announcing each creation.
for directory in (log_dir, save_dir):
    if os.path.exists(directory):
        continue
    print('Directory %s does not exist. Creating it...' % directory)
    os.makedirs(directory)
    print('Created directory %s.' % directory)

graph = tf.Graph()

with graph.as_default():

    # Input data: both placeholders leave the batch dimension open (None) so
    # the same graph can be fed batches of any size.
    with tf.name_scope('input'):
        image_batch_shape = (None, image_size, image_size, num_channels)
        label_batch_shape = (None, num_labels)
        input_data = tf.placeholder(
            tf.float32, shape=image_batch_shape, name='input_data')
        input_labels = tf.placeholder(
            tf.float32, shape=label_batch_shape, name='input_labels')
        # Log up to 10 of the fed images with each summary.
        tf.summary.image('input', input_data, 10)

    # Variables.
    def weight_variable(shape):
        """Return a new weight Variable of `shape`, drawn from a truncated normal (stddev 0.1)."""
        return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

    def bias_variable(shape):
        """Return a new bias Variable of `shape` with every entry set to 0.1."""
        return tf.Variable(tf.constant(0.1, shape=shape))
    
    def variable_summaries(var):
        """Register mean, stddev, max, min and histogram summaries for `var`.

        All summary ops are created under a 'summaries' name scope so they
        can be viewed together in TensorBoard.
        """
        with tf.name_scope('summaries'):
            var_mean = tf.reduce_mean(var)
            tf.summary.scalar('mean', var_mean)
            with tf.name_scope('stddev'):
                # Root of the mean squared deviation from the mean.
                deviation = var - var_mean
                var_stddev = tf.sqrt(tf.reduce_mean(tf.square(deviation)))
            tf.summary.scalar('stddev', var_stddev)
            var_max = tf.reduce_max(var)
            tf.summary.scalar('max', var_max)
            var_min = tf.reduce_min(var)
            tf.summary.scalar('min', var_min)
            tf.summary.histogram('histogram', var)
            
    def conv_layer(input_tensor, patch_size, input_channels, output_channels, layer_name, act=tf.nn.relu):
        """Build a convolution layer: 2-D convolution, bias add, then `act`.

        The kernel is patch_size x patch_size, maps `input_channels` to
        `output_channels`, and is applied with stride 1 and 'SAME' padding.
        Everything is created under the `layer_name` name scope, with summaries
        attached to the weights, biases, pre-activations and activations.
        Returns the activated output tensor.
        """
        kernel_shape = [patch_size, patch_size, input_channels, output_channels]
        unit_strides = [1, 1, 1, 1]
        # A per-layer name scope keeps the graph view grouped by layer.
        with tf.name_scope(layer_name):
            with tf.name_scope('weights'):
                kernel = weight_variable(kernel_shape)
                variable_summaries(kernel)
            with tf.name_scope('biases'):
                offsets = bias_variable([output_channels])
                variable_summaries(offsets)
            with tf.name_scope('conv_plus_b'):
                convolved = tf.nn.conv2d(input_tensor, kernel, unit_strides, padding='SAME')
                pre_activation = convolved + offsets
                tf.summary.histogram('pre_activations', pre_activation)
            layer_output = act(pre_activation, name='activation')
            tf.summary.histogram('activations', layer_output)
        return layer_output
        
    def pool_layer(input_tensor, layer_name):
        """Build a 2x2, stride-2, 'SAME'-padded max pooling op under `layer_name`."""
        window = [1, 2, 2, 1]  # used for both the pooling window and the stride
        # A per-layer name scope keeps the graph view grouped by layer.
        with tf.name_scope(layer_name):
            return tf.nn.max_pool(input_tensor, ksize=window, strides=window, padding='SAME')
    
    def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu):
        """Build a fully connected layer: matrix multiply, bias add, then `act`.

        The weight matrix has shape [input_dim, output_dim]. Everything is
        created under the `layer_name` name scope, with summaries attached to
        the weights, biases, pre-activations and activations.
        Returns the activated output tensor.
        """
        # A per-layer name scope keeps the graph view grouped by layer.
        with tf.name_scope(layer_name):
            with tf.name_scope('weights'):
                weight_matrix = weight_variable([input_dim, output_dim])
                variable_summaries(weight_matrix)
            with tf.name_scope('biases'):
                offsets = bias_variable([output_dim])
                variable_summaries(offsets)
            with tf.name_scope('Wx_plus_b'):
                projected = tf.matmul(input_tensor, weight_matrix)
                pre_activation = projected + offsets
                tf.summary.histogram('pre_activations', pre_activation)
            layer_output = act(pre_activation, name='activation')
            tf.summary.histogram('activations', layer_output)
        return layer_output
    
    # Model: two conv + max-pool stages, two fully connected hidden layers,
    # dropout, then a linear layer producing the class logits.
    conv1 = conv_layer(input_data, patch_size, num_channels, depth, 'conv1')
    pool1 = pool_layer(conv1, 'pool1')
    conv2 = conv_layer(pool1, patch_size, depth, depth, 'conv2')
    pool2 = pool_layer(conv2, 'pool2')

    with tf.name_scope('image_reshape'):
        # Flatten each pooled feature map into a single row per example.
        shape = pool2.get_shape().as_list()
        flat_dim = shape[1] * shape[2] * shape[3]
        reshape = tf.reshape(pool2, [-1, flat_dim])

    # Take the input width from the static shape of pool2 rather than
    # recomputing it. The earlier expression
    # `image_size // 4 * image_size // 4 * depth` evaluates left to right as
    # ((image_size // 4) * image_size) // 4 * depth, which only coincides with
    # the flattened size for some image sizes (28 among them).
    hidden1 = nn_layer(reshape, flat_dim, num_hidden, 'hidden1')
    hidden2 = nn_layer(hidden1, num_hidden, num_hidden, 'hidden2')

    with tf.name_scope('dropout'):
        # Fed at run time so dropout can be disabled (1.0) for evaluation.
        keep_prob = tf.placeholder(tf.float32)
        tf.summary.scalar('dropout_keep_probability', keep_prob)
        dropped = tf.nn.dropout(hidden2, keep_prob)

    # Identity activation: the output layer yields raw logits; the softmax is
    # applied inside the loss below.
    logits = nn_layer(dropped, num_hidden, num_labels, 'hidden3', act=tf.identity)

    # Training computation.
    with tf.name_scope('cross_entropy'):
        diff = tf.nn.softmax_cross_entropy_with_logits(labels=input_labels, logits=logits)
        with tf.name_scope('total'):
            cross_entropy = tf.reduce_mean(diff)
    tf.summary.scalar('cross_entropy', cross_entropy)

    with tf.name_scope('train'):
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(
            cross_entropy)

    with tf.name_scope('accuracy'):
        with tf.name_scope('correct_prediction'):
            correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(input_labels, 1))
        with tf.name_scope('accuracy'):
            # NOTE: this rebinds the notebook-level name `accuracy`, replacing
            # the NumPy helper function of the same name defined in an earlier
            # cell. The training cell fetches this tensor by that name, so the
            # name is kept here; the value is a fraction in [0, 1], not a
            # percentage.
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    # Merge all the summaries
    merged = tf.summary.merge_all()

Directory ./notMNIST_data/logs_v7 does not exist. Creating it...
Created directory ./notMNIST_data/logs_v7.
Directory ./notMNIST_data/save_v7 does not exist. Creating it...
Created directory ./notMNIST_data/save_v7.


In [6]:
num_steps = 12501 # 12,501 steps takes an hour or two on my laptop
k = 0.5 # keep probability

with tf.Session(graph=graph) as sess:
    saver = tf.train.Saver()
    train_writer = tf.summary.FileWriter(log_dir + '/train', sess.graph)
    valid_writer = tf.summary.FileWriter(log_dir + '/valid')
    # try/finally guarantees both writers are closed even if the long run is
    # interrupted or a step raises.
    try:
        tf.global_variables_initializer().run()
        print('Initialized')

        # The validation feed never changes, so build it once. Dropout is
        # disabled (keep_prob = 1.0) for evaluation.
        valid_feed = {input_data : valid_dataset, input_labels : valid_labels, keep_prob : 1.0}

        for step in range(num_steps):
            if step % 100 == 0:
                # Record summaries and valid-set accuracy, then checkpoint.
                summary, acc = sess.run([merged, accuracy], feed_dict=valid_feed)
                valid_writer.add_summary(summary, step)
                print('Accuracy at step %s: %s' % (step, acc))
                saver.save(sess, save_dir + '/my-model-new', global_step=step)

            # Pick the next minibatch, wrapping around the training set.
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
            batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
            batch_labels = train_labels[offset:(offset + batch_size), :]
            feed_dict = {input_data : batch_data, input_labels : batch_labels, keep_prob : k}

            if step % 100 == 99:
                # Train with full tracing so execution stats are recorded.
                run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                summary, _ = sess.run([merged, train_step],
                                      feed_dict=feed_dict,
                                      options=run_options,
                                      run_metadata=run_metadata)
                train_writer.add_run_metadata(run_metadata, 'step%03d' % step)
                train_writer.add_summary(summary, step)
                print('Adding run metadata for', step)
            else:
                # Ordinary training step: train and record a summary.
                summary, _ = sess.run([merged, train_step], feed_dict=feed_dict)
                train_writer.add_summary(summary, step)
    finally:
        train_writer.close()
        valid_writer.close()

Initialized
Accuracy at step 0: 0.0845
Adding run metadata for 99
Accuracy at step 100: 0.692
Adding run metadata for 199
Accuracy at step 200: 0.7766
Adding run metadata for 299
Accuracy at step 300: 0.8135
Adding run metadata for 399
Accuracy at step 400: 0.8165
Adding run metadata for 499
Accuracy at step 500: 0.819
