In [1]:
import tensorflow as tf # Import library
from tensorflow.examples.tutorials.mnist import input_data # Import dataset

# Load MNIST from the MNIST_data directory with one-hot encoded labels
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
# An InteractiveSession lets later cells call .eval()/.run() without passing a session object
sess = tf.InteractiveSession()

# Graph inputs that are fed at run time; the leading None leaves the batch size open
x = tf.placeholder(dtype=tf.float32, shape=[None, 784])  # flattened images (784 values per sample)
y_ = tf.placeholder(dtype=tf.float32, shape=[None, 10])  # one-hot labels, 10 classes

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [2]:
# Weight initialization
# If all weights were the same, the cost gradient w.r.t. each of them would also be the same,
# resulting in them all updating in the same way
def weight_variable(shape, stddev=0.25):
    """Create a weight variable with random initial values.

    The initial values are drawn from a truncated normal distribution so that
    the weights start out different from one another (this is what breaks
    symmetry: identical weights would receive identical gradient updates).

    Args:
        shape: Shape of the variable to create.
        stddev: Standard deviation of the truncated normal distribution.
            Defaults to 0.25, the value that used to be hard-coded.

    Returns:
        A tf.Variable initialised with tf.truncated_normal(shape, stddev=stddev).
    """
    initial = tf.truncated_normal(shape, stddev=stddev)
    return tf.Variable(initial)

# Bias variable can be initialised at zero, but better results can be gotten this way
def bias_variable(shape, stddev=0.25):
    """Create a bias variable with random initial values.

    Args:
        shape: Shape of the variable to create (one entry per filter/unit).
        stddev: Standard deviation of the truncated normal distribution.
            Defaults to 0.25, the value that used to be hard-coded.

    Returns:
        A tf.Variable initialised with tf.truncated_normal(shape=shape, stddev=stddev).
    """
    initial = tf.truncated_normal(shape=shape, stddev=stddev)
    return tf.Variable(initial)

In [3]:
# Convolution and Pooling
def conv2d(x, W):
    """Convolve x with the kernel W using a stride of 1 in every dimension.

    'SAME' padding is requested, so the output is meant to keep the
    spatial dimensions of the input.
    """
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')

def max_pool_2x2(x):
    """Max-pool x over 2x2 windows with stride 2, downsampling it by a factor of 2."""
    window = [1, 2, 2, 1]  # the pooling window and the stride use the same values
    return tf.nn.max_pool(x, ksize=window, strides=window, padding='SAME')

In [4]:
# First convolutional layer
# 5x5 kernels over 1 input channel, producing 48 filters
W_conv1 = weight_variable([5, 5, 1, 48])
# One bias per filter
b_conv1 = bias_variable([48])

# conv2d works on 4D tensors, so x is reshaped to [samples, 28, 28, channels]:
# -1 leaves the sample dimension untouched and the trailing 1 is the single colour channel
x_image = tf.reshape(x, [-1, 28, 28, 1])
conv1_biased = conv2d(x_image, W_conv1) + b_conv1
# Activation on the biased convolution result
h_conv1 = tf.nn.relu(conv1_biased)
# 2x2 pooling brings the 28x28 feature maps down to 14x14
h_pool1 = max_pool_2x2(h_conv1)

In [5]:
# Second convolutional layer
# 5x5 kernels over the 48 channels from the first layer, producing 96 filters
W_conv2 = weight_variable([5, 5, 48, 96])
# One bias per filter
b_conv2 = bias_variable([96])

conv2_biased = conv2d(h_pool1, W_conv2) + b_conv2
# Activation on the biased convolution result
h_conv2 = tf.nn.relu(conv2_biased)
# 2x2 pooling produces the 7x7x96 volume that the dense layer flattens
h_pool2 = max_pool_2x2(h_conv2)

In [6]:
# Densely connected layers
# Fully connected layer: every one of the 7*7*96 pooled activations feeds each of 1024 units
W_fc1 = weight_variable([7 * 7 * 96, 1024])
# One bias per unit
b_fc1 = bias_variable([1024])

# Flatten the pooled volume to one row per sample so it can be matrix-multiplied
h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 96])
fc1_biased = tf.matmul(h_pool2_flat, W_fc1) + b_fc1
h_fc1 = tf.nn.relu(fc1_biased)

In [7]:
# Dropout
# - to reduce overfitting
# Create a placeholder for the probability that a neuron's output is kept during dropout
# Fed at run time: 0.5 while training, 1.0 (keep everything) when evaluating
keep_prob = tf.placeholder(dtype=tf.float32)
# Dropout is applied to the dense layer's activations
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob=keep_prob)

In [8]:
# Readout layer
# Maps the 1024 dense activations to one score per digit class
W_fc2 = weight_variable([1024, 10])
b_fc2 = bias_variable([10])

# Unnormalised class scores (logits); the softmax normalisation happens inside the loss
class_scores = tf.matmul(h_fc1_drop, W_fc2)
y_conv = class_scores + b_fc2

In [9]:
# Saving
# Checkpoint writer; the training loop calls saver.save(sess, "model.ckpt") on it.
# NOTE(review): this is constructed before the optimizer is created in the next cell,
# so presumably only the variables defined up to this point are checkpointed - confirm
# against the tf.train.Saver documentation if optimizer state needs to be restored.
saver = tf.train.Saver()

In [10]:
# Mean softmax cross-entropy between the network's logits and the one-hot labels.
# The arguments are passed by keyword: the positional order of this function is not
# the same across TensorFlow releases, and newer releases reject positional arguments.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=y_conv, labels=y_))
# Adam optimizer with a learning rate of 1e-4 minimising the loss
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
# Per-sample boolean: does the highest-scoring class match the label?
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
# Mean of the booleans cast to float = fraction of correct predictions
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Initialise every variable in the graph before training starts
sess.run(tf.initialize_all_variables())

In [11]:
# Mini-batch training: 10000 steps on batches of 20 samples
for i in range(10000):
    batch = mnist.train.next_batch(20) # (images, labels) for this step
    if i % 100 == 0:
        # Progress report on the current batch, with dropout disabled (keep_prob 1.0)
        train_accuracy = accuracy.eval(feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0})
        print('Step %d, training accuracy %g'%(i, train_accuracy))
    if i % 1000 == 0:
        # Periodic checkpoint, taken before this step's update is applied
        saver.save(sess, "model.ckpt")
    # One optimisation step with dropout active
    train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})

# The last periodic checkpoint is written at step 9000, so save once more after the
# loop; otherwise the final 1000 updates would never reach disk.
saver.save(sess, "model.ckpt")

Step 0, training accuracy 0.05
Step 100, training accuracy 0.6
Step 200, training accuracy 0.9
Step 300, training accuracy 0.95
Step 400, training accuracy 0.8
Step 500, training accuracy 0.9
Step 600, training accuracy 0.85
Step 700, training accuracy 0.9
Step 800, training accuracy 0.9
Step 900, training accuracy 0.95
Step 1000, training accuracy 0.9
Step 1100, training accuracy 1
Step 1200, training accuracy 1
Step 1300, training accuracy 0.9
Step 1400, training accuracy 0.9
Step 1500, training accuracy 1
Step 1600, training accuracy 1
Step 1700, training accuracy 1
Step 1800, training accuracy 1
Step 1900, training accuracy 0.95
Step 2000, training accuracy 0.9
Step 2100, training accuracy 0.9
Step 2200, training accuracy 0.85
Step 2300, training accuracy 0.95
Step 2400, training accuracy 0.8
Step 2500, training accuracy 1
Step 2600, training accuracy 0.95
Step 2700, training accuracy 1
Step 2800, training accuracy 1
Step 2900, training accuracy 1
Step 3000, training accuracy 0.95


In [13]:
# Evaluate on the first 3000 test samples with dropout disabled
test_feed = {
    x: mnist.test.images[0:3000],
    y_: mnist.test.labels[0:3000],
    keep_prob: 1.0,
}
test_accuracy = accuracy.eval(feed_dict=test_feed)
print('test accuracy %g'%test_accuracy)

test accuracy 0.959667


In [33]:
# Import Kaggle mnist test data
import pandas as pd
import numpy as np

# Read the Kaggle test set (one row per sample)
kaggle_test_frame = pd.read_csv('./data/test.csv')
# .values replaces DataFrame.as_matrix(), which is deprecated and absent from newer
# pandas releases; a single reshape turns every row into a 28x28 image without a Python loop.
test_data = kaggle_test_frame.values.reshape(-1, 28, 28)
# NOTE(review): the x placeholder is declared with shape [None, 784], so these 28x28
# images have to be flattened again before they can be fed to the network.

In [None]:
# Class scores for the whole MNIST test set, with dropout disabled
readout_feed = {x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}
y_conv.eval(feed_dict=readout_feed)