## Deep Learning
Assignment 3
Previously in 2_fullyconnected.ipynb, you trained a logistic regression and a neural network model.
The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from time import time

# Restore the notMNIST train / validation / test splits that an earlier
# notebook pickled to disk.
pickle_file = 'proj1/notMNIST_original.pickle'

with open(pickle_file, 'rb') as f:
    save = pickle.load(f)

# Unpack the dictionary into one (dataset, labels) pair per split.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # drop the dict reference; hint to help gc free up memory

# Report the raw shapes of every split.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

def accuracy(predictions, labels):
    """Return the percentage of rows whose predicted class matches the label.

    Both arguments are 2-D, one row per example; the class of a row is the
    index of its largest entry (argmax along axis 1).
    """
    predicted_classes = np.argmax(predictions, 1)
    true_classes = np.argmax(labels, 1)
    num_correct = np.sum(predicted_classes == true_classes)
    return 100.0 * num_correct / predictions.shape[0]

# Bring the data into the layout the models below expect:
# - every image flattened into one row of a float32 matrix
# - every label turned into a float32 1-hot row
image_size = 28
num_labels = 10


def reformat(dataset, labels):
    """Flatten the images and one-hot encode the integer labels.

    Returns a tuple (dataset, labels): dataset reshaped to
    (-1, image_size * image_size) and labels expanded to num_labels columns,
    both cast to float32.
    """
    flat_images = dataset.reshape((-1, image_size * image_size))
    # Comparing a row of class ids against a column of labels broadcasts to a
    # boolean matrix with a single True per row.
    class_ids = np.arange(num_labels)
    one_hot = class_ids == labels[:, np.newaxis]
    return (flat_images.astype(np.float32), one_hot.astype(np.float32))
# Reformat every split. The names are rebound in place, so this is meant to
# run once per freshly loaded dataset.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

# Report the shapes after flattening / one-hot encoding.
for split_name, split_data, split_labels in (
        ('Training set', train_dataset, train_labels),
        ('Validation set', valid_dataset, valid_labels),
        ('Test set', test_dataset, test_labels)):
    print(split_name, split_data.shape, split_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)
Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


## Problem 1
Introduce and tune L2 regularization for both logistic and neural network models. 
- Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. 
- In TensorFlow, you can compute the L2 loss for a tensor t using nn.l2_loss(t). The right amount of regularization should improve your validation / test accuracy.


LR:
- beta = 1  ;  60.6%; 18.13 s # too big, underfitting
- beta = 0.1;  87.7%; 13.43 s # good
- beta = 0.005;88.4%; 18.18 s
- beta = 0.01; 88.9%; 18.76 s
- beta = 0.001;84.1%; 18.3 s
- beta = 0;    82.1%; 18.3 s

In [14]:
# LR first: logistic regression trained with full-batch gradient descent.
# With gradient descent training, even this much data is prohibitive.
# Subset the training data for faster turnaround.
train_subset = 10000
beta = 0  # weight of the L2 penalty in total_loss (regularization); 0 disables it
LR = 0.5  # learning rate handed to the gradient descent optimizer
graph = tf.Graph()

with graph.as_default():

    # Input data.
    # Load the training, validation and test data into constants that are
    # attached to the graph.
    tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
    tf_train_labels = tf.constant(train_labels[:train_subset])
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    # These are the parameters that we are going to be training. The weight
    # matrix will be initialized using random values following a (truncated)
    # normal distribution. The biases get initialized to zero.
    weights = tf.Variable(
        tf.truncated_normal([image_size * image_size, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    # We multiply the inputs with the weight matrix, and add biases. We compute
    # the softmax and cross-entropy (it's one operation in TensorFlow, because
    # it's very common, and it can be optimized). We take the average of this
    # cross-entropy across all training examples: that's the data loss.
    logits = tf.matmul(tf_train_dataset, weights) + biases  # (10000, 784) x (784, 10)
    # Keyword arguments are used on purpose: newer TensorFlow releases reject
    # the positional form, while the keyword form works on old and new versions.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=tf_train_labels))
    # L2 penalty on the weights only (biases are not regularized).
    regloss = tf.nn.l2_loss(weights)
    total_loss = loss + beta * regloss

    # Optimizer.
    # We are going to find the minimum of the regularized loss using gradient
    # descent with a constant learning rate. (An exponentially decaying rate
    # via tf.train.exponential_decay was tried here and left disabled.)
    Optimizer = tf.train.GradientDescentOptimizer(LR).minimize(total_loss)

    # Predictions for training, validation, and test data.
    # These are not part of training, but merely here so that we can report
    # accuracy figures as we train.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(
        tf.matmul(tf_valid_dataset, weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)
    

In [15]:
# Let's run this computation and iterate
num_steps = 801
t0 = time()
with tf.Session(graph=graph) as session:
    # This is a one-time operation which ensures the parameters get initialized
    # as we described in the graph: random weights for the matrix, zeros for the
    # biases.
    tf.initialize_all_variables().run()
    # One slot per step, filled on every iteration so the complete loss curve
    # is available after training.
    loss_step = np.zeros(num_steps)
    print('Initialized')
    for step in range(num_steps):
        if step == 0:
            # Note: this prints the symbolic Tensor (the graph node), not the
            # prediction values themselves.
            print('initial',train_prediction)
        # Run the computations. We tell .run() that we want to run the optimization
        # and get the loss value and the training predictions returned as numpy arrays.
        _, l, predictions = session.run([Optimizer, total_loss, train_prediction])
        loss_step[step] = l
        if step % 100 == 0:
            print('Loss at step %d: %f' % (step, l))
            print('Training accuracy: %.1f%%' % accuracy(predictions, train_labels[:train_subset,:]))
            # Calling .eval() on valid_prediction is basically like calling run(), but
            # just to get that one numpy array. Note that it recomputes all its graph
            # dependencies.
            print('Validation accuracy: %.1f%%' % accuracy(valid_prediction.eval(), valid_labels))

    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))
    print('In',time() - t0,'seconds')



Initialized
initial Tensor("Softmax:0", shape=(10000, 10), dtype=float32)
Loss at step 0: 17.776443
Training accuracy: 14.1%
Validation accuracy: 14.8%
Loss at step 100: 2.267347
Training accuracy: 71.7%
Validation accuracy: 69.8%
Loss at step 200: 1.827666
Training accuracy: 75.1%
Validation accuracy: 72.5%
Loss at step 300: 1.591444
Training accuracy: 76.7%
Validation accuracy: 73.1%
Loss at step 400: 1.430357
Training accuracy: 77.5%
Validation accuracy: 73.7%
Loss at step 500: 1.309553
Training accuracy: 78.1%
Validation accuracy: 73.9%
Loss at step 600: 1.214620
Training accuracy: 78.8%
Validation accuracy: 74.1%
Loss at step 700: 1.137688
Training accuracy: 79.2%
Validation accuracy: 74.2%
Loss at step 800: 1.073795
Training accuracy: 79.6%
Validation accuracy: 74.2%
Test accuracy: 82.1%
In 18.3461520672 seconds


# LR with SGD
- beta = 1    ; 55.2%; 3.6 s;4batch; 48.6%;3.6s
- beta = 0.5  ; 66.8%; 3.4 s;4batch; 71 %; 3.6s
- beta = 0.1  ; 85.1%; 3.4 s;4batch; 71 %; 3.6s
- beta = 0.05 ; 86.9%; 3.4 s;4batch; 71 %; 3.6s
- beta = 0.01 ; 88.6%; 3.4 s;4batch; 76 %; 3.6s
- beta = 0.005 ; 89%; 3.4 s;4batch; 76 %; 3.6s
- *beta = 0.002; 89.2%; 3.6 s;4batch; 75 %; 3.6s
- beta = 0.001; 89%; 3.6 s;4batch; 75 %; 3.6s
- beta = 0.0001;86.9%; 3.5 s;4batch; 70 %; 3.6s

In [80]:
# LR with SGD second: logistic regression trained on minibatches.
batch_size = 128
beta = 0  # weight of the L2 penalty in total_loss; 0 disables regularization
LR = 0.5  # learning rate handed to the gradient descent optimizer

graph = tf.Graph()

with graph.as_default():
    # Input data. For the training data, we use a placeholder that
    # will be fed at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    weights = tf.Variable(
        tf.truncated_normal([image_size * image_size, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    logits = tf.matmul(tf_train_dataset, weights) + biases
    # Keyword arguments are used on purpose: newer TensorFlow releases reject
    # the positional form, while the keyword form works on old and new versions.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=tf_train_labels))
    # L2 penalty on the weights only (biases are not regularized).
    regloss = tf.nn.l2_loss(weights)
    total_loss = loss + beta * regloss

    # Optimizer: minimizes the regularized loss.
    optimizer = tf.train.GradientDescentOptimizer(LR).minimize(total_loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(
        tf.matmul(tf_valid_dataset, weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

In [81]:
# Train the minibatch logistic regression and report progress every 500 steps.
num_steps = 3001
t0 = time()
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    # Offsets are taken modulo this bound so every slice holds a full batch.
    max_offset = train_labels.shape[0] - batch_size
    for step in range(num_steps):
        # Walk through the training data (which has been randomized) one
        # batch at a time, wrapping around at the end.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % max_offset
        batch_rows = slice(offset, offset + batch_size)
        batch_data = train_dataset[batch_rows, :]
        batch_labels = train_labels[batch_rows, :]

        # Map each placeholder node of the graph to the numpy array fed into it.
        feed_dict = {tf_train_dataset: batch_data,
                     tf_train_labels: batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)

        if step % 500 != 0:
            continue
        print("Minibatch loss at step %d: %f" % (step, l))
        print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
        valid_accuracy = accuracy(valid_prediction.eval(), valid_labels)
        print("Validation accuracy: %.1f%%" % valid_accuracy)
    test_accuracy = accuracy(test_prediction.eval(), test_labels)
    print("Test accuracy: %.1f%%" % test_accuracy)
    print('In',time() - t0,'seconds')


        

Initialized
Minibatch loss at step 0: 16.217316
Minibatch accuracy: 6.2%
Validation accuracy: 9.6%
Minibatch loss at step 500: 0.969233
Minibatch accuracy: 80.5%
Validation accuracy: 75.1%
Minibatch loss at step 1000: 1.462774
Minibatch accuracy: 75.0%
Validation accuracy: 76.0%
Minibatch loss at step 1500: 0.830923
Minibatch accuracy: 82.8%
Validation accuracy: 76.6%
Minibatch loss at step 2000: 0.794860
Minibatch accuracy: 82.8%
Validation accuracy: 77.2%
Minibatch loss at step 2500: 1.052609
Minibatch accuracy: 76.6%
Validation accuracy: 77.8%
Minibatch loss at step 3000: 1.080875
Minibatch accuracy: 75.8%
Validation accuracy: 78.5%
Test accuracy: 86.6%
In 3.4141600132 seconds


### NN with SGD
- beta = 1    ; 10.7%; 33 s; 4batch; 10%; 33 s;
- beta = 0.5  ; 38.5%; 33 s; 4batch; 10%; 33 s;
- beta = 0.1  ; 84.8%; 33 s; 4batch; 74%; 33 s;
- beta = 0.05 ; 85.7%; 34 s; 4batch; 79%; 33 s;
- beta = 0.01 ; 86.7%; 34 s; 4batch; 77%; 33 s;
- beta = 0.005; 86.6%; 34 s; 4batch; 74%; 33 s;
- beta = 0.001; 83.4%; 34 s; 4batch; 75%; 48 s;
- the best setting is : 
 - batch size = 1024 (unchanged)
 - beta = 0.0007 (small for more sophisticated model)
 - LR = 0.1 (learn slow but won't miss the optimum)
 - num_steps = 20k (many times of training)
 - init weights with stddev = 0.1
 - result = 95.5% in 238 sec

In [9]:
# NN with SGD and with regularization : third
# One hidden ReLU layer of num_nodes units, L2 penalty on both weight matrices.
batch_size = 128
num_nodes = 1024  # width of the hidden layer
graph = tf.Graph()
beta = 0.0007  # weight of the L2 penalty in total_loss
LR = 0.1  # learning rate handed to the gradient descent optimizer

with graph.as_default():
    # Input data. For the training data, we use a placeholder that
    # will be fed at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables: input -> hidden and hidden -> output parameters.
    weights1 = tf.Variable(
        tf.truncated_normal([image_size * image_size, num_nodes], stddev=0.1))
    biases1 = tf.Variable(tf.zeros([num_nodes]))
    weights2 = tf.Variable(
        tf.truncated_normal([num_nodes, num_labels], stddev=0.1))
    biases2 = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    logits1 = tf.matmul(tf_train_dataset, weights1) + biases1
    tf_hidden_dataset = tf.nn.relu(logits1)
    logits = tf.matmul(tf_hidden_dataset, weights2) + biases2

    # Keyword arguments are used on purpose: newer TensorFlow releases reject
    # the positional form, while the keyword form works on old and new versions.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=tf_train_labels))
    # L2 penalty on both weight matrices (biases are not regularized).
    regloss = tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2)
    total_loss = loss + beta * regloss

    # Optimizer: minimizes the regularized loss.
    optimizer = tf.train.GradientDescentOptimizer(LR).minimize(total_loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)

    # The validation and test passes reuse the same variables as training.
    first_valid = tf.matmul(tf_valid_dataset, weights1) + biases1
    logits_valid = tf.matmul(tf.nn.relu(first_valid), weights2) + biases2
    valid_prediction = tf.nn.softmax(logits_valid)

    first_test = tf.matmul(tf_test_dataset, weights1) + biases1
    logits_test = tf.matmul(tf.nn.relu(first_test), weights2) + biases2
    test_prediction = tf.nn.softmax(logits_test)

In [11]:
# Train the one-hidden-layer network and report progress every 500 steps.
num_steps = 20001
# Number of distinct minibatches to cycle through. With 4, training only ever
# sees the first 4 batches (the extreme-overfitting experiment of Problem 2).
# Set to None to sweep through the whole training set instead.
overfit_num_batches = 4
t0 = time()

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        if overfit_num_batches is None:
            batch_index = step
        else:
            batch_index = step % overfit_num_batches
        offset = (batch_index * batch_size) % (train_labels.shape[0] - batch_size)

        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]

        # Prepare a dictionary telling the session where to feed the
        # minibatch. The key of the dictionary is the placeholder node of the
        # graph to be fed and the value is the numpy array.
        feed_dict = {tf_train_dataset: batch_data,
                     tf_train_labels: batch_labels}
        # The reported value is the data loss only (without the L2 penalty).
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)

        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
    print('In',time() - t0,'seconds')



Initialized
Minibatch loss at step 0: 3.868145
Minibatch accuracy: 9.4%
Validation accuracy: 25.0%
Minibatch loss at step 500: 0.013318
Minibatch accuracy: 100.0%
Validation accuracy: 77.7%
Minibatch loss at step 1000: 0.007880
Minibatch accuracy: 100.0%
Validation accuracy: 77.9%
Minibatch loss at step 1500: 0.006327
Minibatch accuracy: 100.0%
Validation accuracy: 78.1%
Minibatch loss at step 2000: 0.005699
Minibatch accuracy: 100.0%
Validation accuracy: 78.4%
Minibatch loss at step 2500: 0.005430
Minibatch accuracy: 100.0%
Validation accuracy: 78.6%
Minibatch loss at step 3000: 0.005337
Minibatch accuracy: 100.0%
Validation accuracy: 78.7%
Minibatch loss at step 3500: 0.005335
Minibatch accuracy: 100.0%
Validation accuracy: 78.8%
Minibatch loss at step 4000: 0.005380
Minibatch accuracy: 100.0%
Validation accuracy: 79.0%
Minibatch loss at step 4500: 0.005454
Minibatch accuracy: 100.0%
Validation accuracy: 79.1%
Minibatch loss at step 5000: 0.005542
Minibatch accuracy: 100.0%
Validatio

## Problem 2
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?


### Ans: I restricted training to only 4 batches (4 × 128 = 512 training points), and test accuracy dropped from 95.5% to 86.5% (which still seems fairly high to me).

## Problem 3
Introduce Dropout on the hidden layer of the NN. 

Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides nn.dropout() for that, but you have to make sure it's only inserted during training.
What happens to our extreme overfitting case?


### NN with SGD, with dropout and no L2 regularization:
- the best setting is : 
 - batch size = 1024 (unchanged)
 - beta = 0 (no L2 regularization)
 - LR = 0.1 (learn slow but won't miss the optimum)
 - num_steps = 20k (many times of training)
 - init weights with stddev = 0.1
 - with dropout
 - result = 94.6% in 238 sec

In [21]:
# NN with SGD and with dropout : fourth
# One hidden ReLU layer of num_nodes units, dropout on the training path only.
batch_size = 128
num_nodes = 1024  # width of the hidden layer
graph = tf.Graph()
beta = 0  # weight of the L2 penalty in total_loss; 0 disables it
LR = 0.1  # learning rate handed to the gradient descent optimizer
keep_prob = 0.7  # probability of keeping a unit in each dropout call below

with graph.as_default():
    # Input data. For the training data, we use a placeholder that
    # will be fed at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables: input -> hidden and hidden -> output parameters.
    weights1 = tf.Variable(
        tf.truncated_normal([image_size * image_size, num_nodes], stddev=0.1))
    biases1 = tf.Variable(tf.zeros([num_nodes]))
    weights2 = tf.Variable(
        tf.truncated_normal([num_nodes, num_labels], stddev=0.1))
    biases2 = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    # Dropout is applied twice: on the pre-activations and again after the
    # ReLU. Because dropout only zeroes or positively rescales a unit, the
    # two calls compound: a hidden unit survives with probability
    # keep_prob * keep_prob (0.49 for keep_prob = 0.7).
    logits1 = tf.matmul(tf_train_dataset, weights1) + biases1
    logits1 = tf.nn.dropout(logits1, keep_prob)
    tf_hidden_dataset = tf.nn.relu(logits1)
    tf_hidden_dataset = tf.nn.dropout(tf_hidden_dataset, keep_prob)
    logits = tf.matmul(tf_hidden_dataset, weights2) + biases2

    # Keyword arguments are used on purpose: newer TensorFlow releases reject
    # the positional form, while the keyword form works on old and new versions.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=tf_train_labels))
    regloss = tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2)
    total_loss = loss + beta * regloss

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(LR).minimize(total_loss)

    # Predictions for the training, validation, and test data.
    # train_prediction is computed from the dropout path above; the validation
    # and test paths below use the same variables without dropout, so their
    # outputs are deterministic.
    train_prediction = tf.nn.softmax(logits)

    first_valid = tf.matmul(tf_valid_dataset, weights1) + biases1
    logits_valid = tf.matmul(tf.nn.relu(first_valid), weights2) + biases2
    valid_prediction = tf.nn.softmax(logits_valid)

    first_test = tf.matmul(tf_test_dataset, weights1) + biases1
    logits_test = tf.matmul(tf.nn.relu(first_test), weights2) + biases2
    test_prediction = tf.nn.softmax(logits_test)

In [22]:
# Train the dropout network and report progress every 2000 steps.
num_steps = 20001

t0 = time()

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    # Offsets are taken modulo this bound so every slice holds a full batch.
    max_offset = train_labels.shape[0] - batch_size
    for step in range(num_steps):
        # Step through the training data (which has been randomized) batch by
        # batch, wrapping around at the end.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % max_offset
        batch_end = offset + batch_size

        # Cut out the current minibatch.
        batch_data = train_dataset[offset:batch_end, :]
        batch_labels = train_labels[offset:batch_end, :]

        # Placeholder node -> numpy array to feed into it.
        feed_dict = {tf_train_dataset: batch_data,
                     tf_train_labels: batch_labels}
        fetches = [optimizer, loss, train_prediction]
        _, l, predictions = session.run(fetches, feed_dict=feed_dict)

        if step % 2000 != 0:
            continue
        print("Minibatch loss at step %d: %f" % (step, l))
        print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
        valid_accuracy = accuracy(valid_prediction.eval(), valid_labels)
        print("Validation accuracy: %.1f%%" % valid_accuracy)
    test_accuracy = accuracy(test_prediction.eval(), test_labels)
    print("Test accuracy: %.1f%%" % test_accuracy)
    print('In',time() - t0,'seconds')



Initialized
Minibatch loss at step 0: 4.978772
Minibatch accuracy: 10.2%
Validation accuracy: 23.9%
Minibatch loss at step 2000: 0.403135
Minibatch accuracy: 90.6%
Validation accuracy: 86.0%
Minibatch loss at step 4000: 0.460382
Minibatch accuracy: 85.9%
Validation accuracy: 87.1%
Minibatch loss at step 6000: 0.538373
Minibatch accuracy: 83.6%
Validation accuracy: 87.3%
Minibatch loss at step 8000: 0.845599
Minibatch accuracy: 75.8%
Validation accuracy: 88.0%
Minibatch loss at step 10000: 0.444723
Minibatch accuracy: 85.9%
Validation accuracy: 88.0%
Minibatch loss at step 12000: 0.592690
Minibatch accuracy: 83.6%
Validation accuracy: 88.5%
Minibatch loss at step 14000: 0.454947
Minibatch accuracy: 85.9%
Validation accuracy: 88.8%
Minibatch loss at step 16000: 0.316330
Minibatch accuracy: 91.4%
Validation accuracy: 89.0%
Minibatch loss at step 18000: 0.267795
Minibatch accuracy: 92.2%
Validation accuracy: 89.2%
Minibatch loss at step 20000: 0.507402
Minibatch accuracy: 84.4%
Validation 

## Problem 4
Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is 97.1%.
One avenue you can explore is to add multiple layers.
Another one is to use learning rate decay:
global_step = tf.Variable(0)  # count the number of steps taken.
learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

### multi-layer model

In [38]:
# Multi-layer NN (two hidden ReLU layers) with SGD and L2 regularization.
batch_size = 128
num_nodes1 = 128  # width of the first hidden layer
num_nodes2 = 128  # width of the second hidden layer
graph = tf.Graph()
beta = 0.007  # weight of the L2 penalty in total_loss
LR = 0.3  # learning rate handed to the gradient descent optimizer

with graph.as_default():
    # Input data. For the training data, we use a placeholder that
    # will be fed at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables: one weight matrix and bias vector per layer.
    weights1 = tf.Variable(
        tf.truncated_normal([image_size * image_size, num_nodes1], stddev=0.1))
    biases1 = tf.Variable(tf.zeros([num_nodes1]))
    weights2 = tf.Variable(
        tf.truncated_normal([num_nodes1, num_nodes2], stddev=0.1))
    biases2 = tf.Variable(tf.zeros([num_nodes2]))

    weights3 = tf.Variable(
        tf.truncated_normal([num_nodes2, num_labels], stddev=0.1))
    biases3 = tf.Variable(tf.zeros([num_labels]))

    # Model.
    def model(data):
        """Build the forward pass for `data` and return the output logits.

        The same variables are shared by every call, so the training,
        validation and test passes all use one set of parameters. No dropout
        is applied. The static shape of each stage is printed once per call,
        i.e. while the graph is being built.
        """
        logits1 = tf.matmul(data, weights1) + biases1
        tf_layer1 = tf.nn.relu(logits1)

        logits2 = tf.matmul(tf_layer1, weights2) + biases2
        tf_layer2 = tf.nn.relu(logits2)

        logits = tf.matmul(tf_layer2, weights3) + biases3

        print('data input:', data.get_shape().as_list())
        print('logits1 output:', logits1.get_shape().as_list())
        print('tf_layer1:', tf_layer1.get_shape().as_list())
        print('tf_layer2:', tf_layer2.get_shape().as_list())
        print('logits output:', logits.get_shape().as_list())
        return(logits)

    # Training computation.
    logits = model(tf_train_dataset)
    # Keyword arguments are used on purpose: newer TensorFlow releases reject
    # the positional form, while the keyword form works on old and new versions.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=tf_train_labels))
    # L2 penalty on all three weight matrices (biases are not regularized).
    regloss = tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2) + tf.nn.l2_loss(weights3)
    total_loss = loss + beta * regloss

    # Optimizer: minimizes the regularized loss.
    optimizer = tf.train.GradientDescentOptimizer(LR).minimize(total_loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)

    logits_valid = model(tf_valid_dataset)
    valid_prediction = tf.nn.softmax(logits_valid)

    logits_test = model(tf_test_dataset)
    test_prediction = tf.nn.softmax(logits_test)

data input: [128, 784]
logits1 output: [128, 128]
tf_layer1: [128, 128]
tf_layer2: [128, 128]
logits output: [128, 10]
data input: [10000, 784]
logits1 output: [10000, 128]
tf_layer1: [10000, 128]
tf_layer2: [10000, 128]
logits output: [10000, 10]
data input: [10000, 784]
logits1 output: [10000, 128]
tf_layer1: [10000, 128]
tf_layer2: [10000, 128]
logits output: [10000, 10]


In [39]:
# Train the multi-layer network and report progress every 2000 steps.
num_steps = 20001

t0 = time()

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    # Offsets are taken modulo this bound so every slice holds a full batch.
    num_offsets = train_labels.shape[0] - batch_size
    for step in range(num_steps):
        # Advance through the training data (which has been randomized) by one
        # batch per step, wrapping around at the end.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % num_offsets
        batch_rows = slice(offset, offset + batch_size)

        # Extract the minibatch for this step.
        batch_data = train_dataset[batch_rows, :]
        batch_labels = train_labels[batch_rows, :]

        # Tell the session which numpy array feeds which placeholder node.
        feed_dict = {
            tf_train_dataset: batch_data,
            tf_train_labels: batch_labels,
        }
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)

        is_report_step = (step % 2000 == 0)
        if is_report_step:
            print("Minibatch loss at step %d: %f" % (step, l))
            minibatch_accuracy = accuracy(predictions, batch_labels)
            print("Minibatch accuracy: %.1f%%" % minibatch_accuracy)
            valid_accuracy = accuracy(valid_prediction.eval(), valid_labels)
            print("Validation accuracy: %.1f%%" % valid_accuracy)
    test_accuracy = accuracy(test_prediction.eval(), test_labels)
    print("Test accuracy: %.1f%%" % test_accuracy)
    print('In',time() - t0,'seconds')

Initialized
Minibatch loss at step 0: 2.362065
Minibatch accuracy: 18.8%
Validation accuracy: 27.8%
Minibatch loss at step 2000: 0.386487
Minibatch accuracy: 90.6%
Validation accuracy: 84.5%
Minibatch loss at step 4000: 0.460413
Minibatch accuracy: 88.3%
Validation accuracy: 84.9%
Minibatch loss at step 6000: 0.603220
Minibatch accuracy: 81.2%
Validation accuracy: 83.2%
Minibatch loss at step 8000: 0.715961
Minibatch accuracy: 79.7%
Validation accuracy: 83.7%
Minibatch loss at step 10000: 0.595183
Minibatch accuracy: 78.9%
Validation accuracy: 84.7%
Minibatch loss at step 12000: 0.598262
Minibatch accuracy: 81.2%
Validation accuracy: 84.1%
Minibatch loss at step 14000: 0.566148
Minibatch accuracy: 81.2%
Validation accuracy: 84.4%
Minibatch loss at step 16000: 0.409566
Minibatch accuracy: 89.1%
Validation accuracy: 84.5%
Minibatch loss at step 18000: 0.388286
Minibatch accuracy: 90.6%
Validation accuracy: 83.9%
Minibatch loss at step 20000: 0.642207
Minibatch accuracy: 79.7%
Validation 