# Tensorflow with Keras MNIST using Optimized Densely Connected Neural Network

In [1]:
import warnings
warnings.filterwarnings('ignore')

# Import Relevant Libraries

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

# Data preprocessing

In [3]:
# Loading entire mnist dataset
# 16.7% of the entire dataset is test data
# training data: 60,000 images
# testing data: 10,000 images
(X_train, y_train), (X_test, y_test) = mnist.load_data()



# Splitting the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15)

# Old code:
# The lines where the .reshape() method is used reshapes the data to fit the model.
# These lines are not neccessary because "tf.keras.layers.Flatten(input_shape=(28, 28, 1)"...
# reshapes the data in the "TF_KERAS_SEQUENTIAL_MODEL_WRAP function...
# further downstream to this code.
# These lines of code are included here as a reference for an alternative to reshaping data...
# for the model that will be used. 

#X_train = X_train.reshape((X_train.shape[0], 28 * 28))
#X_val = X_val.reshape((X_val.shape[0], 28 * 28))
#X_test = X_test.reshape((X_test.shape[0], 28 * 28))



# Scale the pixel values to be between 0 and 1
X_train = X_train.astype('float32') / 255
X_val = X_val.astype('float32') / 255
X_test = X_test.astype('float32') / 255

# Model created (using helper function)

In [4]:
#Notes regarding "TF_KERAS_SEQUENTIAL_MODEL_WRAP" function:

# 784 is the number of input features for the model...
# since each image is 28 by 28 pixels which equals 784

#the hidden layers can be of different size

#Regarding "Flatten" layer:
# each sample is 28x28x1 pixels making them each a rank no.3 tensor
# flattening tensor inputs into vectors to feed neural network
# this flattening is not needed when working with CNNs.

# tf.keras.layers.Dense: output = activation(dot(input, weight) + bias)

# softmax activation is used for final "Dense" layer because for classification the activation function must transform the values propagated to this layer into probabilities

# Regarding specification of optimizer and cost function:
# using a loss specifically used for classifiers is best practice
# the loss/cost function sparse_categorical_crossentropy appies one-hot encoding to the data
# the model and optimizer expect the output shape to match the target shape in a one-hot encoded format



def TF_KERAS_SEQUENTIAL_MODEL_WRAP(input_size=784, 
                                   output_size=10, 
                                   hidden_layer_size=275,
                                   dense_layer_1_activation='relu',
                                   dense_layer_2_activation='relu',
                                   dense_layer_3_activation='softmax',
                                   optimizer='ADAM',
                                   loss='categorical_crossentropy'
                                   ):
    model = tf.keras.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28, 1)), 
            tf.keras.layers.Dense(hidden_layer_size, activation=dense_layer_1_activation), 
            tf.keras.layers.Dense(hidden_layer_size, activation=dense_layer_2_activation), 
            tf.keras.layers.Dense(output_size, activation=dense_layer_3_activation) 
            ])
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    return model


#Important note:
#The hyperparameters specified in the creation of this helper function are
#here for function testing. This hardcoding will change for the allowance of the use of GridSearchCV later on

# Defining grid search parameter dictionary

In [5]:
# Testing GridSearchCV() fucntionality with 'optimizer' parameter options.
param_grid = {'optimizer': ['RMSprop', 'ADAM', 'Nadam']}

# Setting data to categorical

In [6]:
# Set data "to_categorical" 
y_train = to_categorical(y_train)
y_val = to_categorical(y_val)
y_test = to_categorical(y_test)

# Wrapping model with KerasClassifier from SciKeras

In [7]:
# Notes regarding "KerasClassifier" hyperparameters:
# Number of epochs set is arbitrary
# batch_size is set for mini-batch gradient (which will be used to train the model)
# these hyperparameter can be fined-tuned to attempt to improve the model


# Set early stopping: monitors loss and stop training proccess the first time loss starts to increase
early_stopping = EarlyStopping(monitor='loss', patience=2)

# wrapping model with KerasClassifer using helper function: "TF_KERAS_SEQUENTIAL_MODEL_WRAP"
kc = KerasClassifier(build_fn=TF_KERAS_SEQUENTIAL_MODEL_WRAP, epochs=6, batch_size=100, verbose=2, callbacks=[early_stopping])

# Training

## Running Gridsearch and Cross-validation

In [8]:
grid = GridSearchCV(estimator=kc, param_grid=param_grid, cv=5)
grid_result = grid.fit(X_train, y_train)



# The inside info on each epoch:
# 1. at the start of each epoch, training loss set to 0
# 2. algo iterates over the preset num of batches, from train_data
# 3. weights and bias update as many times as there are batches
# 4. user recieves value for loss function, which indicates how the training is going
# 5. user additionally recieves training accuracy 
# 6. at the end of the epoch, algo will forward propagate entire validation dataset
# fin. training ends when max number of epochs reached

Epoch 1/6
408/408 - 2s - loss: 0.2818 - accuracy: 0.9180 - 2s/epoch - 6ms/step
Epoch 2/6
408/408 - 2s - loss: 0.1075 - accuracy: 0.9680 - 2s/epoch - 4ms/step
Epoch 3/6
408/408 - 2s - loss: 0.0706 - accuracy: 0.9781 - 2s/epoch - 4ms/step
Epoch 4/6
408/408 - 2s - loss: 0.0496 - accuracy: 0.9847 - 2s/epoch - 4ms/step
Epoch 5/6
408/408 - 2s - loss: 0.0351 - accuracy: 0.9887 - 2s/epoch - 4ms/step
Epoch 6/6
408/408 - 2s - loss: 0.0259 - accuracy: 0.9919 - 2s/epoch - 4ms/step
102/102 - 0s - 314ms/epoch - 3ms/step
Epoch 1/6
408/408 - 2s - loss: 0.2962 - accuracy: 0.9143 - 2s/epoch - 6ms/step
Epoch 2/6
408/408 - 2s - loss: 0.1123 - accuracy: 0.9662 - 2s/epoch - 4ms/step
Epoch 3/6
408/408 - 2s - loss: 0.0725 - accuracy: 0.9782 - 2s/epoch - 4ms/step
Epoch 4/6
408/408 - 2s - loss: 0.0492 - accuracy: 0.9844 - 2s/epoch - 4ms/step
Epoch 5/6
408/408 - 2s - loss: 0.0334 - accuracy: 0.9899 - 2s/epoch - 4ms/step
Epoch 6/6
408/408 - 2s - loss: 0.0262 - accuracy: 0.9916 - 2s/epoch - 4ms/step
102/102 - 0s -

In [9]:
# The inside info on each epoch:
# 1. at the start of each epoch, training loss set to 0
# 2. algo iterates over the preset num of batches, from train_data
# 3. weights and bias update as many times as there are batches
# 4. user recieves value for loss function, which indicates how the training is going
# 5. user additionally recieves training accuracy 
# 6. at the end of the epoch, algo will forward propagate entire validation dataset
# fin. training ends when max number of epochs reached

# Note regarding model assessment from training results

In [10]:
# Assessing our model:
# we look at the validation accuracy to see if model is overfitting
# validation accuracy is the true accuracy of the model
# to assess the overall accuracy of the model we lok at the validation accuracy for the last epoch

# Test

In [11]:
# by testing the model accuracy on the test data we have a sanity check which tells us if we tuned the hyperparameters to overfit the validation dataset

# Note regarding use of the "score method":
# Since a SciKeras wrapper was used in model training, the "score" method is used to evaluate the model...
# after passing in test data because SciKeras wrapper objects have no evaluate method.
# In the case of using GridSearchCV(), the score fucntion is called on what was found to be the best estimator.

# forward propagates test data through the net
test_score = grid_result.best_estimator_.score(X_test, y_test)
print("1. Test score for best estimator:", test_score)
print("\n")
print("2. Best estimator found: ", grid_result.best_estimator_)
print("\n")
print("3. Best parameters found: ", grid_result.best_params_)


#old code (without SciKeras wrapper used in this project):
#test_loss, test_accuracy = model.evaluate(test_data)

100/100 - 0s - 296ms/epoch - 3ms/step
1. Test score for best estimator: 0.9783


2. Best estimator found:  KerasClassifier(
	model=None
	build_fn=<function TF_KERAS_SEQUENTIAL_MODEL_WRAP at 0x0000022A02B36830>
	warm_start=False
	random_state=None
	optimizer=RMSprop
	loss=None
	metrics=None
	batch_size=100
	validation_batch_size=None
	verbose=2
	callbacks=[<keras.src.callbacks.EarlyStopping object at 0x0000022A02B3BA00>]
	validation_split=0.0
	shuffle=True
	run_eagerly=False
	epochs=6
	class_weight=None
)


3. Best parameters found:  {'optimizer': 'RMSprop'}


In [12]:
#//prints the results with formatting applied, in case user wants to do so//
# print('Test loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))

#Important Note:
# once the model has been tested it is best practice to NOT change it any further than it already is

# Final Model Training

## Building the model plainly

In [13]:
# Best optimizer found through GridSearchCV():
optimizer_best = grid_result.best_params_['optimizer']

In [14]:
print(optimizer_best)

RMSprop


In [15]:
input_size = 784 # 784 is the number of input features for the model...
                 # since each image is 28 by 28 pixels which equals 784
    
output_size = 10 # there are 10 classes each sample can be classified as 

#the hidden layers can be of different size
hidden_layer_size = 275 # _optimal hyperpara val found_ (through manual search)


# Building Final Model
final_model = tf.keras.Sequential([
    # each sample is 28x28x1 pixels making them each a rank no.3 tensor
    # flattening tensor inputs into vectors to feed neural network
    # this flattening is not needed when working with CNNs.
    tf.keras.layers.Flatten(input_shape=(28, 28, 1)), # input layer
    
    # tf.keras.layers.Dense: output = activation(dot(input, weight) + bias)
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # hidden layer no.1
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # hidden layer no.2
    
    # softmax activation is used here because for classification the activation function must transform the values propagated to this layer into probabilities
    tf.keras.layers.Dense(output_size, activation='softmax') # output layer
])



# using a loss specifically used for classifiers is best practice
# the loss/cost function sparse_categorical_crossentropy appies one-hot encoding to the data
# the model and optimizer expect the output shape to match the target shape in a one-hot encoded format
final_model.compile(optimizer=optimizer_best, loss='categorical_crossentropy', metrics=['accuracy'])



## Final model training using plain model

### Final data preprocessing

In [16]:
# Concatenation of X and Y datasets and shuffling to randomly in replicatable fashion 
np.random.seed(69)
X = np.concatenate((X_train, X_val, X_test), axis=0)
y = np.concatenate((y_train, y_val, y_test), axis=0)



# Generating a random permutation of indices using the permutation function
indices = np.random.permutation(X.shape[0])



# Useing the permutation to index X and y arrays
X = X[indices]
y = y[indices]

# This ensures that both arrays are shuffled in the same order, preserving the correspondence between the features and labels.
# After running this code, both the X and y arrays will be correctly shuffled and ready for final model training.

### Setting variable that will be used for the fitting of the final model

In [17]:
#stores the number of epochs set for training
# number is arbitrary
NUM_EPOCHS = 6 # _optimized hyper para val_



#monitors validation loss and stop training proccess the first time the validation loss starts increasing
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

### Final model training

In [18]:
final_model.fit(X, y, epochs=NUM_EPOCHS, callbacks=[early_stopping], validation_split=0.2, verbose =2)


# The inside info on each epoch:
# 1. at the start of each epoch, training loss set to 0
# 2. algo iterates over the preset num of batches, from train_data
# 3. weights and bias update as many times as there are batches
# 4. user recieves value for loss function, which indicates how the training is going
# 5. user additionally recieves training accuracy 
# 6. at the end of the epoch, algo will forward propagate entire validation dataset
# fin. training ends when max number of epochs reached

Epoch 1/6
1750/1750 - 8s - loss: 0.2048 - accuracy: 0.9377 - val_loss: 0.1227 - val_accuracy: 0.9620 - 8s/epoch - 4ms/step
Epoch 2/6
1750/1750 - 7s - loss: 0.0902 - accuracy: 0.9734 - val_loss: 0.0865 - val_accuracy: 0.9740 - 7s/epoch - 4ms/step
Epoch 3/6
1750/1750 - 6s - loss: 0.0644 - accuracy: 0.9811 - val_loss: 0.0941 - val_accuracy: 0.9750 - 6s/epoch - 4ms/step
Epoch 4/6
1750/1750 - 6s - loss: 0.0482 - accuracy: 0.9860 - val_loss: 0.1206 - val_accuracy: 0.9713 - 6s/epoch - 4ms/step


<keras.src.callbacks.History at 0x22a054f54e0>

# Saving model

In [19]:
final_model.save('MNIST_DNN_FINAL.keras')

# Final Notes

It is best to save a keras-based model as shown. It is not best to use joblib. Joblib should be used to save machine learning models only. Keras models are too complex and have mechanisms in place to handle this complexity which joblib does not have.

Using GridSearchCV() for DNN hyperparameter search and cross-validation is sucessfully demonstrated here.
The next step would be to expand the hyperparameters dictonary to search through other hyperparameters.
Since this the more hyperparameters are searched for the more computational resources will need to be used, I suggest expanding the use of GridSearchCV() using GPU through Google Colab or Amazon SageMaker Studio Lab