# Build a DNN with 5 hidden layers of 100 neurons each, He initialization, and Elu activation

In [1]:
import tensorflow as tf

#now specify he initialization

he_init = tf.variance_scaling_initializer()

def dnn(inputs, n_hidden_layers=5, n_neurons=100, name=None, activation=tf.nn.elu, initializer=he_init):
    #now specify a name scope
    with tf.variable_scope(name, "dnn"):
        for layer in range(n_hidden_layers):
            inputs = tf.layers.dense(inputs,n_neurons, activation=activation, kernel_initializer=initializer,
                                    name="hidden%d" %(layer +1))
        return inputs
    
#now specify the inputs, for MNIST data

n_inputs = 28*28
n_outputs = 5

tf.reset_default_graph()

X = tf.placeholder(tf.float32, shape = (None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")

# make a final output layer using the function above
logits = tf.layers.dense(dnn(X), n_outputs, kernel_initializer=he_init, name="logits")
Y_proba = tf.nn.softmax(logits, name="Y_proba")

  from ._conv import register_converters as _register_converters


Using Adam optimization AND early stopping, train the MNIST but only on digits 0 to 4, as we will use transfer learning to train digits 5 to 9. You will need a softmax output layer with 5 neurons.
Make sure to save checkpoints at regular intervals and save the final model for later use.

In [3]:
#specify the learning rate
learning_rate = 0.01

#specify the loss function setup (Cross entropy plus reducing the mean)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_mean(xentropy, name="loss")

#specify the optimizer that learns at the learning rate
optimizer = tf.train.AdamOptimizer(learning_rate)
#specify what that optimizer acts on (the loss function)
training_op = optimizer.minimize(loss, name="training_op")

#identify when the function is correct
correct = tf.nn.in_top_k(logits,y,1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

#specify the initializer
init = tf.global_variables_initializer()

#specify the saver
saver = tf.train.Saver()

In [5]:
import numpy as np
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train = X_train.astype(np.float32).reshape(-1, 28*28) / 255.0
X_test = X_test.astype(np.float32).reshape(-1, 28*28) / 255.0
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)
X_valid, X_train = X_train[:5000], X_train[5000:]
y_valid, y_train = y_train[:5000], y_train[5000:]

X_train1 = X_train[y_train < 5]
y_train1 = y_train[y_train < 5]
X_valid1 = X_valid[y_valid < 5]
y_valid1 = y_valid[y_valid < 5]
X_test1 = X_test[y_test < 5]
y_test1 = y_test[y_test < 5]


X_train2 = X_train[y_train>=5]
y_train2 = y_train[y_train>=5]
X_test2 = X_test[y_test>=5]
y_test2 = y_test[y_test>=5]
X_valid2 = X_valid[y_valid>=5]
y_valid2 = y_valid[y_valid>=5]

In [10]:
rnd_idx = np.random.permutation(len(X_train1))
for rnd_indices in np.array_split(rnd_idx, len(X_train1)//20)[:1]:
    print('0')
    print(rnd_indices[0])
    print('1')
    print(rnd_indices[1])
    print('\n')
    print(rnd_indices)

0
21041
1
12587


[21041 12587  5510 26325 10110  5152 10993  8870  7952 21996 18137 14908
  6890 26632  7038  1438  1148 16506 20984 11532  3446]


In [13]:
#specify the number of epochs and the batch size

n_epochs = 1000
batch_size = 20

#specify the number of max checks without progress
max_checks_without_progress = 20
#initialize the checks without progress and best loss function value
checks_without_progress = 0
best_loss = np.infty

with tf.Session() as sess:
    init.run()
    
    for epoch in range(n_epochs):
        rnd_idx = np.random.permutation(len(X_train1))
        for rnd_indices in np.array_split(rnd_idx, len(X_train1)//batch_size):
            X_batch, y_batch = X_train1[rnd_indices], y_train1[rnd_indices]
            sess.run(training_op, feed_dict={X:X_batch,y:y_batch})
        loss_val, acc_val = sess.run([loss, accuracy],feed_dict={X:X_valid1, y: y_valid1})
        
        if loss_val < best_loss:
            save_path = saver.save(sess, "./my_mnist_model_0_to_4.ckpt")
            best_loss = loss_val
            checks_without_progress = 0
        else:
            checks_without_progress += 1
            
            if checks_without_progress > max_checks_without_progress:
                print("Early stopping!")
                break
    print("{}\tValidation loss:{:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(epoch, loss_val, best_loss, acc_val*100))

Early stopping!
23	Validation loss:1.768096	Best loss: 0.111777	Accuracy: 19.08%


In [15]:
with tf.Session() as sess:
    saver.restore(sess, "./my_mnist_model_0_to_4.ckpt")
    acc_test = accuracy.eval(feed_dict={X:X_test1, y: y_test1})
    print("Final test accuracy: {:.2f}%".format(acc_test*100))

INFO:tensorflow:Restoring parameters from ./my_mnist_model_0_to_4.ckpt
Final test accuracy: 98.02%


Next exercise: tune the hyper parameters using cross validation and see what precision you can achieve.

Let's create a DNNClassifier class, compatible with scikitlearns RandomizedSearchCV class to perform hyperparameter tuning. Here are the key points of this implementation

* The init(), `__init__()` method (constructor) does nothing more than create instance variables for each of the hyperparameters.

* The `fit()` method creates the graphg, starts a session and trains the model:

    * It calls the `_build_graph()` method, to build the graph (much like the graph we defined earlier). Once this method is done creating the graph, it saves all the important operations as instance variables for easy access by other methods.
    
    * The `_dnn()` method builds the hidden layers, just like the `dnn()` function above, but also with support for *batch normalization* and drop out (for the next exercises).
    
    * If the `.fit()` method is given a validation set; `X_valid, y_valid` then it implements early stopping. This implementation does not save the best model to disk, but rather to memory: it uses the `_get_model_params()` method to get all the graph's variables and their values, and the `_restore_model_params()` method to restore the variable values (of the best model found). This trick helps speed up training.
    
    * After the `.fit()` method has finished training the model, it keeps the session open so that predictions can be made quickly without having to save a model to disk and restore for every prediction. You can close the session by calling the `.close_session()` method.
    
 * The `predict_proba()` method uses the trained model to predict the class probabilities. 
 
 * The `predict()` method calls the `predict_proba()` and returns the class with the highest probability, for each instance.

In [23]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import NotFittedError
he_init = tf.variance_scaling_initializer()

class DNNClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self,n_hidden_layers=5, n_neurons=100, optimizer_class = tf.train.AdamOptimizer, 
                 learning_rate=0.01, batch_size=20, activation=tf.nn.elu, initializer=he_init,
                batch_norm_momentum=None, dropout_rate=None, random_state=None):
        '''Initialize the DNNClassifier by simply storing all the hyper-parameters'''
        self.n_hidden_layers = n_hidden_layers
        self.n_neurons = n_neurons
        self.optimizer_class = optimizer_class
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.activation = activation
        self.initializer = initializer
        self.batch_norm_momentum = batch_norm_momentum
        self.dropout_rate = dropout_rate
        self.random_state = random_state
        self._session = None
    
    def _dnn(self, inputs):
        '''Build the hidden layers, with support for batch normalization and dropout.'''
        for layer in range(self.n_hidden_layers):
            if self.dropout_rate:
                inputs = tf.layers.dropout(inputs, self.dropout_rate, training=self._training)
            if self.batch_norm_momentum:
                inputs = tf.layers.batch_normalization(inputs, momentum=self.batch_norm_momentum)
        inputs = self.activation(inputs, name="hidden%d_out"%(layer+1))
        return inputs
    
    def _build_graph(self, n_inputs, n_outputs):
        '''Build the same model as earlier.'''
        if self.random_state is not None:
            tf.set_random_seed(self.random_state)
            np.random.seed(self.random_state)
        
        X = tf.placeholder(tf.float32, shape=(None, n_inputs),name="X")
        y = tf.placeholder(tf.int32, shape=(None), name="y")
        
        if self.batch_norm_momentum or self.dropout_rate:
            self._training = tf.placeholder_with_default(False, shape=(), name='training')
        
        else:
            self._training = None
        
        dnn_outputs = self._dnn(X)
        
        logits = tf.layers.dense(dnn_outputs, n_outputs, kernel_initializer=he_init, name="logits")
        
        y_proba = tf.nn.softmax(logits, name="y_proba")
        
        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
        
        loss = tf.reduce_mean(xentropy, name="loss")
        
        optimizer = self.optimizer_class(learning_rate=self.learning_rate)
        
        training_op = optimizer.minimize(loss)
        
        correct = tf.nn.in_top_k(logits,y,1)
        accuracy = tf.reduce_mean(tf.cast(correct,tf.float32, name="accuracy"))
        
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        
        #make the important operations available easily through instance variables
        
        self._X, self._y = X, y
        self._Y_proba, self._loss = Y_proba, loss
        self._training_op, self._accuracy = training_op, accuracy
        self._init, self._saver = init, saver
    
    def close_session(self):
        if self._session:
            self._session.close()
    
    def _get_model_params(self):
        """Get all variable values (used for early stopping, faster than saving to disk)"""
        with self._graph.as_default():
            gvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        return {gvar.op.name: value for gvar, value in zip(gvars, self._session.run(gvars))}
    
    def _restore_model_params(self, model_params):
        """Set all variables to the given values for early stopping, faster than loading from disk"""
        gvar_names = list(model_params.keys())
        
        assign_ops = {gvar_name: self._graph.get_operation_by_name(gvar_name +"/Assign") for gvar_name in gvar_names}
 
        init_values = {gvar_name: assign_op.inputs[1] for gvar_name, assign_op in assign_ops.items()}
    
        feed_dict = {init_values[gvar_name]:model_params[gvar_name] for gvar_name in gvar_names}
        
        self._session.run(assign_ops, feed_dict=feed_dict)
        
    def fit(self, X,y,n_epochs=100, X_valid=None, y_valid=None):
        """Fit the model to the training set. If X_valid and y_valid are provided, use early stopping."""
        self.close_session()
        
        #infer n_inputs and n_outputs from the training set
        n_inputs = X.shape[1]
        self.classes_ = np.unique(y)
        n_outputs = len(self.classes_)
        
        #translate the labels vector to a vector of sorted class indices, containing 
        # intefers from 0 to n_outputs - 1
        self.classes_to_index_ = {label:index for index, label in enumerate(self.classes_)}
        
        y = np.array([self.classes_to_index_[label] for label in y], dtype=np.int32)
        
        self._graph = tf.Graph()
        
        with self._graph.as_default():
            self._build_graph(n_inputs, n_outputs)
            #EXTRA OPS FOR BATCH NORMALIZATION
            extra_update_ops = tf.get_collection(tf.GraphKeys().UPDATE_OPS)
        
        #mneeded in case of early stopping
        max_checks_without_progress = 20
        
        #initialize checks without progress
        checks_without_progress = 0
        
        #initialize the best loss and best params
        best_loss = np.infty
        best_params = None
        
        #Now train the model!
        self._session = tf.Session(graph=self._graph)
        with self._session.as_default() as sess:
            self._init.run()
            for epoch in range(n_epochs):
                rnd_idx = np.random.permutation(len(X))
                for rnd_indices in np.array_split(rnd_idx, len(X)//self.batch_size):
                    X_batch, y_batch = X[rnd_indices], y[rnd_indices]
                    feed_dict = {self._X: X_batch, self._y:y_batch}
                    
                    if self._training is not None:
                        feed_dict[self._training] = True
                    sess.run(extra_update_ops, feed_dict=feed_dict)
                
                if (X_valid is not None) and (y_valid is not None):
                    loss_val, acc_val = sess.run([self._loss, self._accuracy],
                                                 feed_dict={self._X:X_valid, self._y:y_valid})
                    
                    if loss_val < best_loss:
                        best_params = self._get_model_params()
                        best_loss = loss_val
                        checks_without_progress = 0
                    else:
                        checks_without_progress+=1

                    message123 = "{}\tValidation loss:{:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(epoch, loss_val, 
                                                                                                           best_loss, acc_val*100)
                    print(message123)
                    
                    if checks_without_progress>max_checks_without_progress:
                        print("Early Stopping!")
                        break
                else:
                    loss_train, acc_train = sess.run([self._loss, self._accuracy], feed_dict={self._X:X_batch,
                                                                                              self._y:y_batch})
                    print("{}\tLast training batch loss: {:.6f}\t Accuracy: {:.2f}%".format(epoch, loss_train,
                                                                                           acc_train*100))
            if best_params:
                self._restore_model_params(best_params)
            return self
        
        def predict_proba(self, X):
            if not self._session:
                raise NotFittedError("This %s instance is not fitted yet" %self.__class__.__name__)
            with self._session.as_default() as sess:
                return self._Y_proba.eval(feed_dict={self._X: X})
            
        
        def predict(self,X):
            class_indices = np.argmax(self.predict_proba(X),axis=1)
            return np.array([[self.classes_[class_index]] for class_index in class_indices], np.int32)
        
        def save(self, path):
            self._saver.save(self._session, path)

In [24]:
dnn_clf = DNNClassifier(random_state=42)
dnn_clf.fit(X_train1,y_train1, n_epochs=1000, X_valid=X_valid1,y_valid=y_valid1)

0	Validation loss:1.672759	Best loss: 1.672759	Accuracy: 14.39%
1	Validation loss:1.672759	Best loss: 1.672759	Accuracy: 14.39%
2	Validation loss:1.672759	Best loss: 1.672759	Accuracy: 14.39%
3	Validation loss:1.672759	Best loss: 1.672759	Accuracy: 14.39%
4	Validation loss:1.672759	Best loss: 1.672759	Accuracy: 14.39%
5	Validation loss:1.672759	Best loss: 1.672759	Accuracy: 14.39%
6	Validation loss:1.672759	Best loss: 1.672759	Accuracy: 14.39%
7	Validation loss:1.672759	Best loss: 1.672759	Accuracy: 14.39%
8	Validation loss:1.672759	Best loss: 1.672759	Accuracy: 14.39%
9	Validation loss:1.672759	Best loss: 1.672759	Accuracy: 14.39%
10	Validation loss:1.672759	Best loss: 1.672759	Accuracy: 14.39%
11	Validation loss:1.672759	Best loss: 1.672759	Accuracy: 14.39%
12	Validation loss:1.672759	Best loss: 1.672759	Accuracy: 14.39%
13	Validation loss:1.672759	Best loss: 1.672759	Accuracy: 14.39%
14	Validation loss:1.672759	Best loss: 1.672759	Accuracy: 14.39%
15	Validation loss:1.672759	Best lo

DNNClassifier(activation=<function elu at 0x126668f28>,
       batch_norm_momentum=None, batch_size=20, dropout_rate=None,
       initializer=<tensorflow.python.ops.init_ops.VarianceScaling object at 0x183fec6198>,
       learning_rate=0.01, n_hidden_layers=5, n_neurons=100,
       optimizer_class=<class 'tensorflow.python.training.adam.AdamOptimizer'>,
       random_state=42)