In [3]:
#Test 1: Original from Deep Learning with Keras Book.

from __future__ import print_function
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD
from keras.utils import np_utils
# Test 1 body: train a 784-128-128-10 dense network on MNIST with plain SGD and
# report the loss and accuracy on the held-out test set.

# Seed NumPy's global RNG for reproducibility.
# NOTE(review): this seeds NumPy only; the Keras backend may keep its own RNG
# state, so runs may still differ slightly -- confirm if exact repeatability matters.
np.random.seed(1671)

# --- network and training hyper-parameters ---
NB_EPOCH = 20            # number of passes over the training data
BATCH_SIZE = 128         # number of training instances observed before optimizer performs a weight update
VERBOSE = 1
NB_CLASSES = 10          # number of outputs = number of digits
OPTIMIZER = SGD()        # SGD optimizer, constructed with its default arguments
N_HIDDEN = 128           # units in each of the two hidden layers
VALIDATION_SPLIT = 0.2   # how much TRAIN is reserved for VALIDATION
RESHAPED = 784           # 28 x 28 image flattened into a single row

# --- data: split between train and test sets ---
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten every image to RESHAPED values, cast to float32 and normalize by 255.
# -1 lets NumPy infer the number of samples instead of hard-coding 60000 / 10000.
X_train = X_train.reshape(-1, RESHAPED).astype('float32') / 255
X_test = X_test.reshape(-1, RESHAPED).astype('float32') / 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# Convert class vectors to binary class matrices.
Y_train = np_utils.to_categorical(y_train, NB_CLASSES)
Y_test = np_utils.to_categorical(y_test, NB_CLASSES)

# --- model: two ReLU hidden layers, NB_CLASSES outputs, final stage is softmax ---
model = Sequential()
model.add(Dense(N_HIDDEN, input_shape=(RESHAPED,)))
model.add(Activation('relu'))
model.add(Dense(N_HIDDEN))
model.add(Activation('relu'))
model.add(Dense(NB_CLASSES))
model.add(Activation('softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer=OPTIMIZER, metrics=['accuracy'])

# --- training: VALIDATION_SPLIT of the training data is held out for validation ---
history = model.fit(
    X_train, Y_train,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCH,
    verbose=VERBOSE,
    validation_split=VALIDATION_SPLIT,
)

# --- evaluation: score[0] is the loss, score[1] the accuracy metric requested in compile() ---
score = model.evaluate(X_test, Y_test, verbose=VERBOSE)
print("Test score:", score[0])
print('Test accuracy:', score[1])


60000 train samples
10000 test samples
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 128)               100480    
_________________________________________________________________
activation_7 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 128)               16512     
_________________________________________________________________
activation_8 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 10)                1290      
_________________________________________________________________
activation_9 (Activation)    (None, 10)                0         
Total params: 118,282
Trainable params: 118,282
Non-trainable params: 0
_________

### Original Test 
Starting from the example on pages 22-24 of the *Deep Learning with Keras* book, I added two hidden layers and reduced the number of training epochs from 200 to 20. The optimizer tries to adjust the weights so that the objective function is minimised. The resulting accuracies are listed below:
##### Training Set: 94.49%
##### Validation: 94.69%
##### Test Accuracy: 94.22%

In [1]:
# Test 2: Modified BATCH_SIZE = 256.

from __future__ import print_function
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD
from keras.utils import np_utils
# Test 2 body: same 784-128-128-10 dense network and SGD training as the original
# test, with BATCH_SIZE raised to 256; reports loss and accuracy on the test set.

# Seed NumPy's global RNG for reproducibility.
# NOTE(review): this seeds NumPy only; the Keras backend may keep its own RNG
# state, so runs may still differ slightly -- confirm if exact repeatability matters.
np.random.seed(1671)

# --- network and training hyper-parameters ---
NB_EPOCH = 20            # number of passes over the training data
BATCH_SIZE = 256         # number of training instances observed before optimizer performs a weight update
VERBOSE = 1
NB_CLASSES = 10          # number of outputs = number of digits
OPTIMIZER = SGD()        # SGD optimizer, constructed with its default arguments
N_HIDDEN = 128           # units in each of the two hidden layers
VALIDATION_SPLIT = 0.2   # how much TRAIN is reserved for VALIDATION
RESHAPED = 784           # 28 x 28 image flattened into a single row

# --- data: split between train and test sets ---
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten every image to RESHAPED values, cast to float32 and normalize by 255.
# -1 lets NumPy infer the number of samples instead of hard-coding 60000 / 10000.
X_train = X_train.reshape(-1, RESHAPED).astype('float32') / 255
X_test = X_test.reshape(-1, RESHAPED).astype('float32') / 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# Convert class vectors to binary class matrices.
Y_train = np_utils.to_categorical(y_train, NB_CLASSES)
Y_test = np_utils.to_categorical(y_test, NB_CLASSES)

# --- model: two ReLU hidden layers, NB_CLASSES outputs, final stage is softmax ---
model = Sequential()
model.add(Dense(N_HIDDEN, input_shape=(RESHAPED,)))
model.add(Activation('relu'))
model.add(Dense(N_HIDDEN))
model.add(Activation('relu'))
model.add(Dense(NB_CLASSES))
model.add(Activation('softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer=OPTIMIZER, metrics=['accuracy'])

# --- training: VALIDATION_SPLIT of the training data is held out for validation ---
history = model.fit(
    X_train, Y_train,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCH,
    verbose=VERBOSE,
    validation_split=VALIDATION_SPLIT,
)

# --- evaluation: score[0] is the loss, score[1] the accuracy metric requested in compile() ---
score = model.evaluate(X_test, Y_test, verbose=VERBOSE)
print("Test score:", score[0])
print('Test accuracy:', score[1])

Using TensorFlow backend.


60000 train samples
10000 test samples
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               100480    
_________________________________________________________________
activation_1 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
activation_2 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                1290      
_________________________________________________________________
activation_3 (Activation)    (None, 10)                0         
Total params: 118,282
Trainable params: 118,282
Non-trainable params: 0
_________

### Test 2: Modified BATCH_SIZE = 256
Here the batch size is increased from 128 to 256 while the number of epochs stays at 20. Compared with the original test (batch size 128), the test accuracy decreases. Test results below:
##### Training Set: 92.68%
##### Validation: 93.08%
##### Test Accuracy: 93.18%
##### Relative Test Accuracy Decrease vs. Batch Size 128: 1.1%

In [2]:
# Test 3: Modified BATCH_SIZE = 512

from __future__ import print_function
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD
from keras.utils import np_utils
# Test 3 body: same 784-128-128-10 dense network and SGD training as the original
# test, with BATCH_SIZE raised to 512; reports loss and accuracy on the test set.

# Seed NumPy's global RNG for reproducibility.
# NOTE(review): this seeds NumPy only; the Keras backend may keep its own RNG
# state, so runs may still differ slightly -- confirm if exact repeatability matters.
np.random.seed(1671)

# --- network and training hyper-parameters ---
NB_EPOCH = 20            # number of passes over the training data
BATCH_SIZE = 512         # number of training instances observed before optimizer performs a weight update
VERBOSE = 1
NB_CLASSES = 10          # number of outputs = number of digits
OPTIMIZER = SGD()        # SGD optimizer, constructed with its default arguments
N_HIDDEN = 128           # units in each of the two hidden layers
VALIDATION_SPLIT = 0.2   # how much TRAIN is reserved for VALIDATION
RESHAPED = 784           # 28 x 28 image flattened into a single row

# --- data: split between train and test sets ---
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten every image to RESHAPED values, cast to float32 and normalize by 255.
# -1 lets NumPy infer the number of samples instead of hard-coding 60000 / 10000.
X_train = X_train.reshape(-1, RESHAPED).astype('float32') / 255
X_test = X_test.reshape(-1, RESHAPED).astype('float32') / 255
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# Convert class vectors to binary class matrices.
Y_train = np_utils.to_categorical(y_train, NB_CLASSES)
Y_test = np_utils.to_categorical(y_test, NB_CLASSES)

# --- model: two ReLU hidden layers, NB_CLASSES outputs, final stage is softmax ---
model = Sequential()
model.add(Dense(N_HIDDEN, input_shape=(RESHAPED,)))
model.add(Activation('relu'))
model.add(Dense(N_HIDDEN))
model.add(Activation('relu'))
model.add(Dense(NB_CLASSES))
model.add(Activation('softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer=OPTIMIZER, metrics=['accuracy'])

# --- training: VALIDATION_SPLIT of the training data is held out for validation ---
history = model.fit(
    X_train, Y_train,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCH,
    verbose=VERBOSE,
    validation_split=VALIDATION_SPLIT,
)

# --- evaluation: score[0] is the loss, score[1] the accuracy metric requested in compile() ---
score = model.evaluate(X_test, Y_test, verbose=VERBOSE)
print("Test score:", score[0])
print('Test accuracy:', score[1])

60000 train samples
10000 test samples
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 128)               100480    
_________________________________________________________________
activation_4 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 128)               16512     
_________________________________________________________________
activation_5 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 10)                1290      
_________________________________________________________________
activation_6 (Activation)    (None, 10)                0         
Total params: 118,282
Trainable params: 118,282
Non-trainable params: 0
_________

### Test 3: Modified BATCH_SIZE = 512
Here the batch size is increased again, to 512, while the number of epochs stays at 20. The test accuracy decreases further: it is lower than in both the original test (batch size 128) and the previous test (batch size 256). Test results below:

##### Training Set: 90.89%
##### Validation: 91.59%
##### Test Accuracy: 91.32%
##### Relative Test Accuracy Decrease vs. Batch Size 128: 3.1%
##### Relative Test Accuracy Decrease vs. Batch Size 256: 2.0%

In the above tests, three batch sizes were tried: 128, then 256, and lastly 512. As the batch size increases, the test accuracy decreases. Because the number of epochs and the optimizer settings were held constant, a larger batch also means fewer weight updates per epoch, which is a likely reason the accuracy reached after 20 epochs is lower. The batch size can also have a significant impact on your model’s performance and the training time. A small batch size ensures that each training iteration is very fast, and a larger batch size will give a more precise estimate of the gradients.