# Tensorflow with Keras MNIST using Optimized Densely Connected Neural Network and with no GridsearchCV

# Import Relevant Libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import train_test_split

  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "


# Data preprocessing

In [2]:
# Loading entire mnist dataset
# 16.7% of the entire dataset is test data
# training data: 60,000 images
# testing data: 10,000 images
(X_train, y_train), (X_test, y_test) = mnist.load_data()



# Splitting the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15)

# Old code:
# The lines where the .reshape() method is used reshapes the data to fit the model.
# These lines are not neccessary because "tf.keras.layers.Flatten(input_shape=(28, 28, 1)"...
# reshapes the data in the "TF_KERAS_SEQUENTIAL_MODEL_WRAP function...
# further downstream to this code.
# These lines of code are included here as a reference for an alternative to reshaping data...
# for the model that will be used. 

#X_train = X_train.reshape((X_train.shape[0], 28 * 28))
#X_val = X_val.reshape((X_val.shape[0], 28 * 28))
#X_test = X_test.reshape((X_test.shape[0], 28 * 28))



# Scale the pixel values to be between 0 and 1
X_train = X_train.astype('float32') / 255
X_val = X_val.astype('float32') / 255
X_test = X_test.astype('float32') / 255

# Model created (using helper function) for use in SKLEARN Pipeline through wrapping

In [3]:
#Notes regarding "TF_KERAS_SEQUENTIAL_MODEL_WRAP" function:

# 784 is the number of input features for the model...
# since each image is 28 by 28 pixels which equals 784

#the hidden layers can be of different size

#Regarding "Flatten" layer:
# each sample is 28x28x1 pixels making them each a rank no.3 tensor
# flattening tensor inputs into vectors to feed neural network
# this flattening is not needed when working with CNNs.

# tf.keras.layers.Dense: output = activation(dot(input, weight) + bias)

# softmax activation is used for final "Dense" layer because for classification the activation function must transform the values propagated to this layer into probabilities

# Regarding specification of optimizer and cost function:
# using a loss specifically used for classifiers is best practice
# the loss/cost function sparse_categorical_crossentropy appies one-hot encoding to the data
# the model and optimizer expect the output shape to match the target shape in a one-hot encoded format



def TF_KERAS_SEQUENTIAL_MODEL_WRAP(input_size=784, 
                                   output_size=10, 
                                   hidden_layer_size=275,
                                   dense_layer_1_activation='relu',
                                   dense_layer_2_activation='relu',
                                   dense_layer_3_activation='softmax',
                                   optimizer='ADAM',
                                   loss='categorical_crossentropy'
                                   ):
    model = tf.keras.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28, 1)), 
            tf.keras.layers.Dense(hidden_layer_size, activation=dense_layer_1_activation), 
            tf.keras.layers.Dense(hidden_layer_size, activation=dense_layer_2_activation), 
            tf.keras.layers.Dense(output_size, activation=dense_layer_3_activation) 
            ])
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    return model


#Important note:
#The hyperparameters specified in the creation of this helper function are
#here for function testing. This hardcoding will change for the allowance of the use of GridSearchCV later on

# Model Wrapping 

In [4]:
# Notes regarding "KerasClassifier" hyperparameters:
# Number of epochs set is arbitrary
# batch_size is set for mini-batch gradient (which will be used to train the model)
# these hyperparameter can be fined-tuned to attempt to improve the model


# wrapping model using "TF_KERAS_SEQUENTIAL_MODEL_WRAP"
kc = KerasClassifier(build_fn=TF_KERAS_SEQUENTIAL_MODEL_WRAP, epochs=6, batch_size=100, verbose=2)

# Training

In [5]:
# Set data "to_categorical" 
y_train = to_categorical(y_train)
y_val = to_categorical(y_val)
y_test = to_categorical(y_test)



#monitors validation loss and stop training proccess the first time the validation loss starts increasing
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)



kc.fit(X_train, y_train, callbacks=[early_stopping], validation_data=(X_val, y_val), verbose=2)



# The inside info on each epoch:
# 1. at the start of each epoch, training loss set to 0
# 2. algo iterates over the preset num of batches, from train_data
# 3. weights and bias update as many times as there are batches
# 4. user recieves value for loss function, which indicates how the training is going
# 5. user additionally recieves training accuracy 
# 6. at the end of the epoch, algo will forward propagate entire validation dataset
# fin. training ends when max number of epochs reached

  X, y = self._initialize(X, y)


Epoch 1/6
510/510 - 12s - loss: 0.2639 - accuracy: 0.9217 - val_loss: 0.1435 - val_accuracy: 0.9561 - 12s/epoch - 23ms/step
Epoch 2/6
510/510 - 6s - loss: 0.0969 - accuracy: 0.9704 - val_loss: 0.1145 - val_accuracy: 0.9660 - 6s/epoch - 12ms/step
Epoch 3/6
510/510 - 6s - loss: 0.0639 - accuracy: 0.9799 - val_loss: 0.0916 - val_accuracy: 0.9726 - 6s/epoch - 12ms/step
Epoch 4/6
510/510 - 6s - loss: 0.0446 - accuracy: 0.9858 - val_loss: 0.0844 - val_accuracy: 0.9754 - 6s/epoch - 12ms/step
Epoch 5/6
510/510 - 7s - loss: 0.0327 - accuracy: 0.9894 - val_loss: 0.0925 - val_accuracy: 0.9734 - 7s/epoch - 13ms/step
Epoch 6/6
510/510 - 6s - loss: 0.0236 - accuracy: 0.9924 - val_loss: 0.0993 - val_accuracy: 0.9732 - 6s/epoch - 12ms/step


# Note regarding model assessment from training results

In [6]:
# Assessing our model:
# we look at the validation accuracy to see if model is overfitting
# validation accuracy is the true accuracy of the model
# to assess the overall accuracy of the model we lok at the validation accuracy for the last epoch

# Test

In [7]:
# by testing the model accuracy on the test data we have a sanity check which tells us if we tuned the hyperparameters to overfit the validation dataset

#Note regarding use of the "score method":
# Since a scikit-learn pipeline was used in model training, the "score" method is used to evaluate the model...
# after passing in test data because scikit-learn pipeline objects have no evaluate method.

# forward propagates test data through the net
test_score = kc.score(X_test, y_test)
print(test_score)




#old code (without scikit-learn pipeline used in this project):
#test_loss, test_accuracy = model.evaluate(test_data)

100/100 - 1s - 1s/epoch - 11ms/step
0.9759


In [8]:
#prints the results with formatting applied in case user wants to do so
# print('Test loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))

#Important Note:
# once the model has been tested it is best practice to NOT change it any further than it already is

# Final Model Training

## Building the model plainly

In [9]:
input_size = 784 # 784 is the number of input features for the model...
                 # since each image is 28 by 28 pixels which equals 784
output_size = 10 # there are 10 classes each sample can be classified as 
#the hidden layers can be of different size
hidden_layer_size = 275 # _optimal hyperpara val found_





final_model = tf.keras.Sequential([
    # each sample is 28x28x1 pixels making them each a rank no.3 tensor
    # flattening tensor inputs into vectors to feed neural network
    # this flattening is not needed when working with CNNs.
    tf.keras.layers.Flatten(input_shape=(28, 28, 1)), # input layer
    
    # tf.keras.layers.Dense: output = activation(dot(input, weight) + bias)
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # hidden layer no.1
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # hidden layer no.2
    
    # softmax activation is used here because for classification the activation function must transform the values propagated to this layer into probabilities
    tf.keras.layers.Dense(output_size, activation='softmax') # output layer
])





# using a loss specifically used for classifiers is best practice
# the loss/cost function sparse_categorical_crossentropy appies one-hot encoding to the data
# the model and optimizer expect the output shape to match the target shape in a one-hot encoded format
final_model.compile(optimizer='ADAM', loss='categorical_crossentropy', metrics=['accuracy'])






## Final model training using plain model

### Final data preprocessing

In [10]:
# Concatenation of X and Y datasets and shuffling to randomly in replicatable fashion 
np.random.seed(69)
X = np.concatenate((X_train, X_val, X_test), axis=0)
y = np.concatenate((y_train, y_val, y_test), axis=0)



# Generating a random permutation of indices using the permutation function
indices = np.random.permutation(X.shape[0])



# Useing the permutation to index X and y arrays
X = X[indices]
y = y[indices]

# This ensures that both arrays are shuffled in the same order, preserving the correspondence between the features and labels.
# After running this code, both the X and y arrays will be correctly shuffled and ready for final model training.

### Setting variable that will be used for the fitting of the final model

In [11]:
#stores the number of epochs set for training
# number is arbitrary
NUM_EPOCHS = 6 # _optimized hyper para val_



#monitors validation loss and stop training proccess the first time the validation loss starts increasing
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

### Final model training

In [12]:
final_model.fit(X, y, epochs=NUM_EPOCHS, callbacks=[early_stopping], validation_split=0.2, verbose =2)


# The inside info on each epoch:
# 1. at the start of each epoch, training loss set to 0
# 2. algo iterates over the preset num of batches, from train_data
# 3. weights and bias update as many times as there are batches
# 4. user recieves value for loss function, which indicates how the training is going
# 5. user additionally recieves training accuracy 
# 6. at the end of the epoch, algo will forward propagate entire validation dataset
# fin. training ends when max number of epochs reached

Epoch 1/6
1750/1750 - 22s - loss: 0.2005 - accuracy: 0.9392 - val_loss: 0.1147 - val_accuracy: 0.9634 - 22s/epoch - 12ms/step
Epoch 2/6
1750/1750 - 20s - loss: 0.0832 - accuracy: 0.9740 - val_loss: 0.0836 - val_accuracy: 0.9743 - 20s/epoch - 11ms/step
Epoch 3/6
1750/1750 - 20s - loss: 0.0569 - accuracy: 0.9824 - val_loss: 0.0942 - val_accuracy: 0.9719 - 20s/epoch - 12ms/step
Epoch 4/6
1750/1750 - 16s - loss: 0.0411 - accuracy: 0.9864 - val_loss: 0.0795 - val_accuracy: 0.9778 - 16s/epoch - 9ms/step
Epoch 5/6
1750/1750 - 19s - loss: 0.0339 - accuracy: 0.9879 - val_loss: 0.0927 - val_accuracy: 0.9767 - 19s/epoch - 11ms/step
Epoch 6/6
1750/1750 - 19s - loss: 0.0300 - accuracy: 0.9904 - val_loss: 0.0890 - val_accuracy: 0.9767 - 19s/epoch - 11ms/step


<keras.src.callbacks.History at 0x1a33839b610>

In [13]:
final_model.save('MNIST_DNN_FINAL.keras')

# Saving Model

It is best to save a keras-based model as shown. It is not best to use joblib. Joblib should be used to save machine learning models only. Keras models are too complex and have mechanisms in place to handle this complexity which joblib does not have.

However, in our case here, saving the model as a keras model is not working. It may have something to do with the keras wrapper beining used. One potential solution is to do final training on the model with a regularly built DNN. Meaning training the final model on a plain DNN  with the parameters that we are happy with. This should work. The only reason we are using the keras wrapper here is for proof of concept. I just want to know I can set it up so that I can work on a project that requires it. Training a DNN model on MNIST data with grid search and cross-validation. In this project the final model can be built plainly with the best parameters found during gridsearch and cross-validation on the testing model. Same should be done here. For now however, I will unideally save the final model created with joblib

Update: The final model was build plainly as discussed above. I was able to save it as a keras file no problem. This makes this project a good foundation for testing the use of scikit-learn's GridSearchCV on a keras DNN model