## Import Modules

In [1]:
# Theano backend only: ask for GPU execution (comment out the update below to
# stay on the CPU). Tensorflow uses the GPU by default.
import os
os.environ.update({"THEANO_FLAGS": "mode=FAST_RUN, device=gpu, floatX=float32"})

In [2]:
# The features are reshaped to channels-first (samples, 1, rows, cols) and every
# model declares input_shape=(1, rows, cols), so force the channels-first layout
# for whichever backend is active instead of relying on the keras.json default.
from keras import backend as K
if hasattr(K, "set_image_data_format"):
    # Keras 2 API
    K.set_image_data_format("channels_first")
else:
    # Keras 1 fallback: "th" is the channels-first ordering
    K.set_image_dim_ordering("th")

Using TensorFlow backend.


In [3]:
import time
import matplotlib.pyplot as plt
import numpy as np
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.layers import Activation, Flatten, Dense, Dropout
from keras.optimizers import SGD
from keras.layers.normalization import BatchNormalization
% matplotlib inline
np.random.seed(1234)

## Load MNIST Dataset

In [4]:
from keras.datasets import mnist

# Load the MNIST train/test splits as (features, labels) pairs.
(train_features, train_labels), (test_features, test_labels) = mnist.load_data()

# Derive the dataset dimensions from the arrays themselves.
_, img_rows, img_cols = train_features.shape
num_classes = np.unique(train_labels).size
num_input_nodes = img_rows * img_cols  # For the input layer

# Summary of what was loaded
dataset_summary = (
    ("Number of training samples: %d", train_features.shape[0]),
    ("Number of test samples: %d", test_features.shape[0]),
    ("Image rows: %d", test_features.shape[1]),
    ("Image cols: %d", test_features.shape[2]),
    ("Number of Classes: %d", num_classes),
)
for template, value in dataset_summary:
    print(template % value)


Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz
Number of training samples: 60000
Number of test samples: 10000
Image rows: 28
Image cols: 28
Number of Classes: 10


## Pre-processing

In [7]:
# Add an explicit channel axis (channels-first: samples, 1, rows, cols), cast to
# float32 and divide by 255 (assumes 8-bit pixel values).
# The ndim guards make this cell safe to re-run: features that already carry the
# channel axis are not rescaled a second time.
if train_features.ndim == 3:
    train_features = train_features.reshape(train_features.shape[0], 1, img_rows, img_cols).astype('float32') / 255
if test_features.ndim == 3:
    test_features = test_features.reshape(test_features.shape[0], 1, img_rows, img_cols).astype('float32') / 255

# Convert class labels to categorical/binary (one-hot) vectors, only while they
# are still a 1-D vector of integer labels.
if train_labels.ndim == 1:
    train_labels = np_utils.to_categorical(train_labels, num_classes)
if test_labels.ndim == 1:
    test_labels = np_utils.to_categorical(test_labels, num_classes)


## Function to plot model accuracy and loss

In [10]:
def plotModelHistory(model_history):
    """Plot training/validation accuracy and loss curves side by side.

    Args:
        model_history: object exposing a ``history`` dict that maps metric
            names to per-epoch value lists (e.g. the return value of
            ``model.fit``). Accuracy is read from ``'acc'``/``'val_acc'``;
            when ``'acc'`` is absent, ``'accuracy'``/``'val_accuracy'`` are
            used instead. Loss is read from ``'loss'``/``'val_loss'``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    history = model_history.history
    # Keras releases differ in the name of the accuracy entry; accept either.
    acc_key = 'acc' if 'acc' in history else 'accuracy'
    panels = (
        ('Model Accuracy', 'Accuracy', acc_key),
        ('Model Loss', 'Loss', 'loss'),
    )

    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    for ax, (title, ylabel, key) in zip(axs, panels):
        train_curve = history[key]
        val_curve = history['val_' + key]
        # Epochs are numbered from 1 on the x axis.
        ax.plot(range(1, len(train_curve) + 1), train_curve)
        ax.plot(range(1, len(val_curve) + 1), val_curve)
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.set_xlabel('Epoch')
        # Roughly ten ticks. The step belongs to np.arange; passing it as a
        # second argument to set_xticks would be read as tick labels/minor flag.
        num_epochs = len(train_curve)
        tick_step = max(1, num_epochs // 10)
        ax.set_xticks(np.arange(1, num_epochs + 1, tick_step))
        ax.legend(['train', 'val'], loc='best')
    plt.show()
    

## Function to compute test accuracy

In [12]:
def computeAccuracy(test_x, test_y, model):
    """Return the percentage of samples whose predicted class matches the label.

    Args:
        test_x: inputs passed unchanged to ``model.predict``.
        test_y: 2-D array of per-class targets; the true class of each row is
            the index of its largest entry.
        model: object with a ``predict`` method returning a 2-D array of
            per-class scores, one row per sample.

    Returns:
        Accuracy as a float percentage (0-100).
    """
    scores = model.predict(test_x)
    # Compare the arg-max class of each prediction with that of its target row.
    hits = np.argmax(scores, axis=1) == np.argmax(test_y, axis=1)
    fraction_correct = float(np.sum(hits)) / scores.shape[0]
    return fraction_correct * 100

## 1 - Simple CNN
We are going to build a simple CNN with 32 convolutional filters of size 5x5 applied to 28x28 images, followed by 2x2 max pooling.

In [15]:
# Set the model: 32 filters of 5x5 -> ReLU -> 2x2 max-pool -> softmax classifier.
# Keras 2 call signature: the kernel size is a tuple and `padding` replaces
# `border_mode`; the old positional form only works through a deprecation shim
# that emits a warning.
model = Sequential()
model.add(Convolution2D(32, (5, 5), padding='valid', input_shape=(1, img_rows, img_cols)))
model.add(Activation("relu"))
model.add(MaxPooling2D(pool_size=(2, 2)))  # Max pool 2x2
model.add(Flatten())  # Collapse the output to a vector
model.add(Dense(num_classes))  # One output per class, learned during backpropagation
model.add(Activation("softmax"))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model, holding out 20% of the training data for validation,
# and time the training run.
start_time = time.time()
model_info = model.fit(train_features, train_labels, batch_size=128,
                       epochs=20, validation_split=0.2, verbose=1)
end_time = time.time()

# Plot model history
plotModelHistory(model_info)
print("Model took %0.2f seconds to train"%(end_time - start_time))

# Compute accuracy on the held-out test set
print("Accuracy on test data is: %0.2f"%computeAccuracy(test_features, test_labels, model))

  app.launch_new_instance()


KeyboardInterrupt: 

## Let's increase the depth of the network by adding a second convolutional layer and a dense hidden layer

In [21]:
# Define the model: two stacked 3x3 convolutions (32 filters each) -> 2x2
# max-pool -> 128-unit dense hidden layer -> softmax classifier.
# Keras 2 call signature: the kernel size is a tuple and `padding` replaces
# `border_mode`; the old positional form only works through a deprecation shim
# that emits a warning.
model = Sequential()
model.add(Convolution2D(32, (3, 3), padding='valid', input_shape=(1, img_rows, img_cols)))
model.add(Activation("relu"))
model.add(Convolution2D(32, (3, 3), padding='valid'))
model.add(Activation("relu"))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation("softmax"))

# Compile the model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['accuracy'])

# Train the model, holding out 20% of the training data for validation,
# and time the training run.
start_time = time.time()
model_info = model.fit(train_features, train_labels, batch_size=128,
                       epochs=20, validation_split=0.2, verbose=1)
end_time = time.time()

# Plot model history
plotModelHistory(model_info)
print("Model took %0.2f seconds to train"%(end_time - start_time))

# Compute accuracy on the held-out test set
print("Accuracy on test data is: %0.2f"%computeAccuracy(test_features, test_labels, model))


  app.launch_new_instance()


Train on 48000 samples, validate on 12000 samples
Epoch 1/20
 1664/48000 [>.............................] - ETA: 988s - loss: 1.4904 - acc: 0.5763 

KeyboardInterrupt: 

## Observation:

Increasing the network depth increased the test accuracy but also increased the training time.

## Let's add a Dropout Layer

In [24]:
# Define the model: two 5x5 convolutions (32 filters each) -> 2x2 max-pool ->
# dropout -> 128-unit dense hidden layer with dropout -> softmax classifier.
# Keras 2 call signature: the kernel size is a tuple and `padding` replaces
# `border_mode`; the old positional form only works through a deprecation shim
# that emits a warning.
model = Sequential()
model.add(Convolution2D(32, (5, 5), padding='valid', input_shape=(1, img_rows, img_cols)))
model.add(Activation("relu"))
model.add(Convolution2D(32, (5, 5), padding='valid'))
model.add(Activation("relu"))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))  # Randomly drop 25% of the activations during training to reduce overfitting
model.add(Flatten())
model.add(Dense(128))
model.add(Dropout(0.5))  # Randomly drop 50% of the hidden units during training
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation("softmax"))

# Compile the model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['accuracy'])

# Train the model, holding out 20% of the training data for validation,
# and time the training run.
start_time = time.time()
model_info = model.fit(train_features, train_labels, batch_size=128,
                       epochs=20, validation_split=0.2, verbose=1)
end_time = time.time()

# Plot model history
plotModelHistory(model_info)
print("Model took %0.2f seconds to train"%(end_time - start_time))

# Compute accuracy on the held-out test set
print("Accuracy on test data is: %0.2f"%computeAccuracy(test_features, test_labels, model))


  app.launch_new_instance()


Train on 48000 samples, validate on 12000 samples
Epoch 1/20


KeyboardInterrupt: 

## Observation:

Adding the dropout layer increases the test accuracy while increasing the training time.
The dropout layer adds regularization to the network by preventing units from co-adapting, i.e. from relying too heavily on one another.
During training, nodes are turned off randomly on each forward pass, while all nodes are kept on at test
time.

Reference: [Hinton](https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf)


## Add Batch Normalization

In [25]:
# Define the model: two 5x5 convolutions (32 filters each), each followed by
# batch normalization and ReLU -> 2x2 max-pool -> 128-unit dense layer ->
# softmax classifier, with batch normalization before every activation.
# Keras 2 call signature: the kernel size is a tuple and `padding` replaces
# `border_mode`.
model = Sequential()
model.add(Convolution2D(32, (5, 5), padding='valid', input_shape=(1, img_rows, img_cols)))
# The feature maps are channels-first (channels on axis 1), so normalize per
# channel with axis=1; the default axis=-1 would normalize along the width axis.
model.add(BatchNormalization(axis=1))
model.add(Activation("relu"))
model.add(Convolution2D(32, (5, 5), padding='valid'))
model.add(BatchNormalization(axis=1))
model.add(Activation("relu"))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128))
model.add(BatchNormalization())  # Dense output: features are on the last axis (default)
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(BatchNormalization())
model.add(Activation("softmax"))

# Compile the model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['accuracy'])

# Train the model, holding out 20% of the training data for validation,
# and time the training run.
start_time = time.time()
model_info = model.fit(train_features, train_labels, batch_size=128,
                       epochs=20, validation_split=0.2, verbose=1)
end_time = time.time()

# Plot model history
plotModelHistory(model_info)
print("Model took %0.2f seconds to train"%(end_time - start_time))

# Compute accuracy on the held-out test set
print("Accuracy on test data is: %0.2f"%computeAccuracy(test_features, test_labels, model))


  app.launch_new_instance()


Train on 48000 samples, validate on 12000 samples
Epoch 1/20


KeyboardInterrupt: 

## Observation:

Adding batch normalization increases the test accuracy while increasing the training time.
It normalizes the inputs of each layer (the activations, not the weights) to zero mean and unit variance over each mini-batch, and then applies a learned scale and shift.
This allows using higher learning rates when using SGD and, for some datasets, eliminates the need for a dropout layer.

Reference: [Ioffe](http://jmlr.org/proceedings/papers/v37/ioffe15.pdf) 