In [1]:
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [2]:
!unzip MNIST_data.zip


Archive:  MNIST_data.zip
   creating: MNIST_data/
  inflating: __MACOSX/._MNIST_data   
  inflating: MNIST_data/t10k-images-idx3-ubyte.gz  
  inflating: __MACOSX/MNIST_data/._t10k-images-idx3-ubyte.gz  
  inflating: MNIST_data/train-images-idx3-ubyte.gz  
  inflating: __MACOSX/MNIST_data/._train-images-idx3-ubyte.gz  
  inflating: MNIST_data/train-labels-idx1-ubyte.gz  
  inflating: __MACOSX/MNIST_data/._train-labels-idx1-ubyte.gz  
  inflating: MNIST_data/t10k-labels-idx1-ubyte.gz  
  inflating: __MACOSX/MNIST_data/._t10k-labels-idx1-ubyte.gz  


#Dense Neural Network

**Problem 1**. Consider the following neural network:
    $$h_1 =W_1 X+b_1$$
    $$a_1 =sigmoid(h_1)$$
    $$h_2 =W_2 a_1+b_2$$
    $$a_2 =tanh(h_2)$$
    $$o=W_3a_2+b_3$$
    $$p=softmax(o)$$
    $$softmax(o_i)=\frac{e^{o_i}}{\sum_{j=0}^{K}e^{o_j}} \qquad i=0,1,...,K$$
where, $$h_n$$ denote the hidden layers, $a_n$ denotes the activation layers, $W_n$ are the weights, X being the input to the neural network, o denotes the output layer and p denotes the predicted probabilities. The cross-entropy loss is used as the loss function and is given by:  
$$L=- \sum_{\text{for all class c}}y_c log(p_c)$$ 

where y is the target labels (one-hot vector, i.e., the $y_c$ = 1 if the label of the instance is c).
Compute the derivative of the cross-entropy loss L w.r.t o, i.e compute $\frac{\partial L }{\partial o}$. (25 marks)

**Problem 2**. Draw the computational graph of the network described in Problem 1. Using
the derivative calculated in the previous question, perform Backpropagation to compute the
gradients of loss L w.r.t all the weights and biases, i.e compute $\frac{\partial L }{\partial W_1},\frac{\partial L }{\partial W_2},\frac{\partial L }{\partial W_3},\frac{\partial L }{\partial b_1},\frac{\partial L }{\partial b_2},\frac{\partial L }{\partial b_3}$
. (25 marks)

In [None]:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True, reshape=False)

def fully_connected(prev_layer, num_units, batch_norm, is_training=False):
    layer = tf.layers.dense(prev_layer, num_units, use_bias=False, activation=None)
    if batch_norm:
    	layer = tf.layers.batch_normalization(layer, training=is_training)
    layer = tf.nn.relu(layer)
    return layer

num_batches = 3000
batch_size = 128
learning_rate = 0.002
batch_norm = True
layer_num = 5

inputs = tf.placeholder(tf.float32, [None, 28, 28, 1])
labels = tf.placeholder(tf.float32, [None, 10])
is_training = tf.placeholder(tf.bool)

layer = inputs
orig_shape = layer.get_shape().as_list()
layer = tf.reshape(layer, shape=[-1, orig_shape[1] * orig_shape[2] * orig_shape[3]])
for layer_i in range(1, layer_num + 1):
    layer = fully_connected(layer, 2**(9-layer_i), batch_norm, is_training)

logits = tf.layers.dense(layer, 10)
    
model_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels))
tf.summary.scalar('fc_loss',model_loss)

if batch_norm:  
    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
	    #train_opt = tf.train.GradientDescentOptimizer(learning_rate).minimize(model_loss)
        train_opt = tf.train.AdamOptimizer(learning_rate).minimize(model_loss)
        #train_opt = tf.train.RMSPropOptimizer(learning_rate).minimize(model_loss)
else:
	#train_opt = tf.train.GradientDescentOptimizer(learning_rate).minimize(model_loss)
    #train_opt = tf.train.RMSPropOptimize(learning_rate).minimize(model_loss)
	train_opt = tf.train.AdamOptimizer(learning_rate).minimize(model_loss)


correct_prediction = tf.equal(tf.argmax(logits,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    
with tf.Session() as sess:
    merged = tf.summary.merge_all()
    if batch_norm: 
        logdir = "mnist/fc/SGD_batchnorm"
    else:
        logdir = "mnist/fc/SGD_no_batchnorm"
    writer = tf.summary.FileWriter(logdir, sess.graph)

    sess.run(tf.global_variables_initializer())
    for batch_i in range(num_batches):
        batch_xs, batch_ys = mnist.train.next_batch(batch_size)

        _,summary = sess.run([train_opt,merged], {inputs: batch_xs, labels: batch_ys, is_training: True})
        
        writer.add_summary(summary, batch_i)

        if batch_i % 500 == 0:
            loss, acc = sess.run([model_loss, accuracy], {inputs: mnist.validation.images,
                labels: mnist.validation.labels,
                is_training: False})
            print('Batch: {:>2}: Validation loss: {:>3.5f}, Validation accuracy: {:>3.5f}'.format(batch_i, loss, acc))
        elif batch_i % 100 == 0:
            loss, acc = sess.run([model_loss, accuracy], {inputs: batch_xs, labels: batch_ys, is_training: False})
            print('Batch: {:>2}: Training loss: {:>3.5f}, Training accuracy: {:>3.5f}'.format(batch_i, loss, acc))

            # At the end, score the final accuracy for both the validation and test sets
    acc = sess.run(accuracy, {inputs: mnist.validation.images,
        labels: mnist.validation.labels,is_training: False})
    print('Final validation accuracy: {:>3.5f}'.format(acc))
    acc = sess.run(accuracy, {inputs: mnist.test.images,
        labels: mnist.test.labels,is_training: False})
    print('Final test accuracy: {:>3.5f}'.format(acc))


**Problem 3**. Batch Normalization is a technique that forces the input to any layer to be
zero mean and unit standard deviation. Below is the algorithm from the paper (Refer: https://arxiv.org/pdf/1502.03167.pdf), Using the algorithm, draw the computational graph of the batchnorm layer. Consider a function $F(y_i)$ and compute the derivatives of
$F(y_i)$ w.r.t x, γ and β, i.e compute
$\frac{\partial F }{\partial \gamma},\frac{\partial F }{\partial \beta},\frac{\partial F }{\partial x}$,Assume  $\frac{\partial F }{\partial y_i}$
is given. (25 marks)

**Problem 4**. Consider the MNIST dataset. It consists of 10 class labels (0-9) and has 60,000
training images and 10,000 test images.
1. Construct a model using fully connected layers (at least 3 layers or more!) and ReLu layers to solve this classification problem using Tensorflow. Report the accuracy obtained on the test set. Plot a graph demonstrating how the loss function decreases
over the number of iterations. (5 marks)

2. Add batch normalization layers in the model. Report the accuracy obtained and plot
a graph showing how loss decreases. Elaborate briefly on how and why batch normalization helped. (5 marks)

3. For the same dataset, train a Convolutional Neural Network (with and without batchnorm). Try experimenting with different architectures (different optimizers, number of
convolutional layers, etc) and report the accuracy that you obtained with both using
and without using batchnorm. Plot the loss vs iterations graph and explain why and
how batch normalization helped. (15 marks)

In [6]:
import tensorflow as tf
tf.keras.backend.clear_session()
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True, reshape=False)

def fully_connected(prev_layer, num_units, batch_norm, is_training=False):
    layer = tf.layers.dense(prev_layer, num_units, use_bias=False, activation=None)
    if batch_norm:
        layer = tf.layers.batch_normalization(layer, training=is_training)
    layer = tf.nn.relu(layer)
    return layer

def conv_layer(prev_layer, layer_depth, batch_norm, is_training=False):
	if layer_depth % 3 == 0:
	    strides = 2
	else:
		strides = 1
	conv_layer = tf.layers.conv2d(prev_layer, layer_depth*4, 3, strides, 'same', use_bias=False, activation=None)
	if batch_norm:
		conv_layer = tf.layers.batch_normalization(conv_layer, training=is_training)
	conv_layer = tf.nn.relu(conv_layer)
	return conv_layer


num_batches = 3000
batch_size = 128
learning_rate = 0.002
layer_num = 5
batch_norm = True

inputs = tf.placeholder(tf.float32, [None, 28, 28, 1])
labels = tf.placeholder(tf.float32, [None, 10])
is_training = tf.placeholder(tf.bool)

layer = inputs
for layer_i in range(1, 1+layer_num):
    layer = conv_layer(layer, layer_i, batch_norm, is_training)

orig_shape = layer.get_shape().as_list()

layer = tf.reshape(layer, shape=[-1, orig_shape[1] * orig_shape[2] * orig_shape[3]])
layer = fully_connected(layer, 100, batch_norm, is_training)

logits = tf.layers.dense(layer, 10)
model_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels))
tf.summary.scalar('conv_loss',model_loss)

if batch_norm:  
    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        #train_opt = tf.train.GradientDescentOptimizer(learning_rate).minimize(model_loss)
		#train_opt = tf.train.RMSPropOptimize(learning_rate).minimize(model_loss)
        train_opt = tf.train.AdamOptimizer(learning_rate).minimize(model_loss)
else:
    train_opt = tf.train.GradientDescentOptimizer(learning_rate).minimize(model_loss)
	#train_opt = tf.train.RMSPropOptimize(learning_rate).minimize(model_loss)
	#train_opt = tf.train.AdamOptimizer(learning_rate).minimize(model_loss)

correct_prediction = tf.equal(tf.argmax(logits,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    

with tf.Session() as sess:
	merged = tf.summary.merge_all()
	if batch_norm: 
		logdir = "mnist/conv/SGD_batchnorm"
	else:
		logdir = "mnist/conv/SGD_no_batchnorm"
	writer = tf.summary.FileWriter(logdir, sess.graph)

	sess.run(tf.global_variables_initializer())
	for batch_i in range(num_batches):
		batch_xs, batch_ys = mnist.train.next_batch(batch_size)

		_,summary = sess.run([train_opt,merged], {inputs: batch_xs, labels: batch_ys, is_training: True})
		
		writer.add_summary(summary, batch_i)

		if batch_i % 500 == 0:
			loss, acc = sess.run([model_loss, accuracy], {inputs: mnist.validation.images, labels: mnist.validation.labels, is_training: False})
			print('Batch: {:>2}: Validation loss: {:>3.5f}, Validation accuracy: {:>3.5f}'.format(batch_i, loss, acc))
		elif batch_i % 100 == 0:
			loss, acc = sess.run([model_loss, accuracy], {inputs: batch_xs, labels: batch_ys, is_training: False})
			print('Batch: {:>2}: Training loss: {:>3.5f}, Training accuracy: {:>3.5f}'.format(batch_i, loss, acc))

	acc = sess.run(accuracy, {inputs: mnist.validation.images, labels: mnist.validation.labels,is_training: False})
	print('Final validation accuracy: {:>3.5f}'.format(acc))
	acc = sess.run(accuracy, {inputs: mnist.test.images, labels: mnist.test.labels,is_training: False})
	print('Final test accuracy: {:>3.5f}'.format(acc))

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
Batch:  0: Validation loss: 0.69016, Validation accuracy: 0.31800
Batch: 100: Training loss: 0.39585, Training accuracy: 0.09375
Batch: 200: Training loss: 0.27462, Training accuracy: 0.40625
Batch: 300: Training loss: 0.13369, Training accuracy: 0.78125
Batch: 400: Training loss: 0.03592, Training accuracy: 0.96094
Batch: 500: Validation loss: 0.01450, Validation accuracy: 0.98200
Batch: 600: Training loss: 0.00820, Training accuracy: 0.99219
Batch: 700: Training loss: 0.00824, Training accuracy: 0.98438
Batch: 800: Training loss: 0.00861, Training accuracy: 0.99219
Batch: 900: Training loss: 0.00919, Training accuracy: 0.98438
Batch: 1000: Validation loss: 0.00868, Validation accuracy: 0.98780
Batch: 1100: Training loss: 0.00185, Training accuracy: 1.00000
Batch: 1200: Training loss: 0.01392, 