In [22]:
import tensorflow
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.optimizers import RMSprop

from tensorflow.keras import backend as K

# Fetch the MNIST train/test splits (images plus integer digit labels).
(mnist_train_images, mnist_train_labels), (mnist_test_images, mnist_test_labels) = mnist.load_data()

# Every sample is a 28x28 image with a single colour channel; the backend's
# image data format decides whether the channel axis comes first or last.
input_shape = (1, 28, 28) if K.image_data_format() == 'channels_first' else (28, 28, 1)

# Give each split an explicit channel axis (sample count stays on axis 0),
# then cast to float32 and divide by 255 to rescale the pixel values.
train_images = mnist_train_images.reshape((mnist_train_images.shape[0],) + input_shape)
test_images = mnist_test_images.reshape((mnist_test_images.shape[0],) + input_shape)
train_images = train_images.astype('float32') / 255
test_images = test_images.astype('float32') / 255

# Convert the integer labels to one-hot vectors with 10 classes.
train_labels = tensorflow.keras.utils.to_categorical(mnist_train_labels, num_classes=10)
test_labels = tensorflow.keras.utils.to_categorical(mnist_test_labels, num_classes=10)

def display_sample(num):
    """Print the one-hot label of training sample ``num`` and plot its image.

    Reads the notebook-level ``train_labels`` and ``train_images`` arrays and
    draws with ``plt``, which is not imported in this cell (presumably
    ``matplotlib.pyplot`` from an earlier cell -- confirm before running on a
    fresh kernel).

    Args:
        num: Index of the training sample to show.
    """
    one_hot = train_labels[num]
    # Echo the raw one-hot vector, e.g. [1. 0. 0. 0. ...]
    print(one_hot)
    # The position of the largest entry is the label as a plain integer.
    digit = one_hot.argmax(axis=0)
    # Lay the sample's 784 values (28 * 28) out as a 2-D grid for plotting.
    pixels = train_images[num].reshape([28, 28])
    # Title carries both the sample index and its decoded label.
    plt.title('Sample: %d  Label: %d' % (num, digit))
    inverted_gray = plt.get_cmap('gray_r')
    plt.imshow(pixels, cmap=inverted_gray)
    plt.show()

In [23]:
# Build the convolutional network as a single ordered stack of layers.
model = Sequential([
    # First convolution: 32 filters, each 3x3, applied to the input image.
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
    # Second convolution: 64 filters, each 3x3.
    Conv2D(64, (3, 3), activation='relu'),
    # Keep only the maximum of every 2x2 block, shrinking what later layers process.
    MaxPooling2D(pool_size=(2, 2)),
    # Dropout at rate 0.25, intended to limit overfitting.
    Dropout(0.25),
    # Collapse the feature maps to one dimension for the dense layers.
    Flatten(),
    # Hidden fully connected layer with 128 units.
    Dense(128, activation='relu'),
    # Second dropout, at rate 0.5.
    Dropout(0.5),
    # Output layer: softmax over the 10 digit classes (0-9).
    Dense(10, activation='softmax'),
])

# Categorical cross-entropy matches the one-hot, multi-class labels.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train for 10 epochs in batches of 32; the test split is passed as validation data.
# This can take a long time to run, so consider that before executing the cell.
history = model.fit(
    train_images,
    train_labels,
    validation_data=(test_images, test_labels),
    epochs=10,
    batch_size=32,
    verbose=2,
)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Train on 60000 samples, validate on 10000 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
 - 123s - loss: 0.1964 - acc: 0.9407 - val_loss: 0.0462 - val_acc: 0.9851
Epoch 2/10
 - 126s - loss: 0.0854 - acc: 0.9751 - val_loss: 0.0460 - val_acc: 0.9854
Epoch 3/10
 - 131s - loss: 0.0660 - acc: 0.9802 - val_loss: 0.0345 - val_acc: 0.9883
Epoch 4/10
 - 129s - loss: 0.0532 - acc: 0.9840 - val_loss: 0.0315 - val_acc: 0.9900
Epoch 5/10
 - 120s - loss: 0.0431 - acc: 0.9868 - val_loss: 0.0322 - val_acc: 0.9894
Epoch 6/10
 - 125s - loss: 0.0398 - acc: 0.9876 - val_loss: 0.0274 - val_acc: 0.9926
Epoch 7/10
 - 125s - loss: 0.0359 - acc: 0.9886 - val_loss: 0.0325 - val_acc: 0.9904
Epoch 8/10
 - 141s - loss: 0.0306 - acc: 0.9906 - val_loss: 0.0295 - val_acc: 0.9918
Epoch 9/10
 - 140s - loss: 0.0280 - acc: 0.9909 - val_loss: 0.0317 - val_acc: 0.9916
Epoch 10/10
 - 117s - 

As demonstrated above, this model reaches roughly 99% validation accuracy, beating the ~98% accuracy of the ANN version, but it takes substantially longer to train!
On my i5 6500k, this model took ~25 minutes to train.