Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [None]:
import os, pickle
import numpy as np
import tensorflow as tf

In [None]:
# Directory that holds the dataset files, and the full path of the pickle
# file that the next cell opens and unpacks.
data_root = './dataset/'
pickle_file = os.path.join(data_root, 'noMNIST_sanit.pickle')

In [None]:
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
with open(pickle_file, 'rb') as f:
    data = pickle.load(f, encoding='bytes')

print('Data dict\'s key:', data.keys())

# Keep the raw splits under underscore-prefixed names; the reformatted
# versions are produced from them in a later cell.
_train_dataset, _train_labels = data['train_dataset'], data['train_labels']
_valid_dataset, _valid_labels = data['valid_dataset'], data['valid_labels']
_test_dataset, _test_labels = data['test_dataset'], data['test_labels']

# Drop the container dict; the six arrays above are all that is needed.
del data

print('Train set', _train_dataset.shape, 'Labels:', _train_labels.shape)
print('Valid set', _valid_dataset.shape, 'Labels:', _valid_labels.shape)
print('Test set', _test_dataset.shape, 'Labels:', _test_labels.shape)

In [None]:
image_size = 28
num_labels = 10


def reformat(dataset, labels):
    """Flatten the images and one-hot encode the labels.

    Args:
        dataset: array that can be reshaped into rows of
            ``image_size * image_size`` values.
        labels: array of integer class ids, one per sample.

    Returns:
        A ``(dataset, labels)`` tuple of float32 arrays: the flattened
        images with shape ``(-1, image_size * image_size)`` and the one-hot
        labels with ``num_labels`` columns.
    """
    flat_images = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
    # Broadcasting a column of ids against the row 0..num_labels-1 yields a
    # boolean matrix with exactly one True per row for valid ids.
    class_ids = np.arange(num_labels)
    one_hot = (class_ids == labels[:, None]).astype(np.float32)
    return flat_images, one_hot

# Flatten every split and one-hot encode its labels.
train_dataset, train_labels = reformat(_train_dataset, _train_labels)
valid_dataset, valid_labels = reformat(_valid_dataset, _valid_labels)
test_dataset, test_labels = reformat(_test_dataset, _test_labels)

# Report the resulting shapes, one line per split.
for split_name, split_data, split_labels in (
        ('Train set', train_dataset, train_labels),
        ('Valid set', valid_dataset, valid_labels),
        ('Test set', test_dataset, test_labels)):
    print(split_name, split_data.shape, 'Labels:', split_labels.shape)

In [None]:
def accuracy(predictions, labels):
    """Return the percentage of rows whose arg-max class matches the labels.

    Args:
        predictions: 2-D array of per-class scores, one row per sample.
        labels: 2-D array with the same layout (e.g. one-hot labels).

    Returns:
        Accuracy as a percentage of ``predictions.shape[0]``.
    """
    predicted_classes = np.argmax(predictions, 1)
    true_classes = np.argmax(labels, 1)
    num_correct = np.sum(predicted_classes == true_classes)
    return 100.0 * num_correct / predictions.shape[0]

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

In [None]:
batch_size = 128
num_features = image_size * image_size

graph = tf.Graph()
with graph.as_default():
    # Inputs: minibatches are fed at run time, while the validation and test
    # sets are embedded in the graph as constants.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, num_features))
    tf_train_labels = tf.placeholder(tf.float32,
                                     shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    # L2 regularization strength. No shape is given, so a tensor of any
    # shape can be fed; the training loop feeds a scalar.
    beta_regul = tf.placeholder(tf.float32)

    # Variables of the single linear layer.
    weights = tf.get_variable(
        'weights',
        shape=[num_features, num_labels],
        initializer=tf.truncated_normal_initializer(0., 1.))
    biases = tf.get_variable(
        'biases',
        shape=[num_labels],
        initializer=tf.constant_initializer(0.))

    # Training computation: per-example cross-entropy plus the L2 penalty on
    # the weights, averaged over the minibatch.
    _logits = tf.matmul(tf_train_dataset, weights) + biases
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=tf_train_labels, logits=_logits)
    l2_penalty = beta_regul * tf.nn.l2_loss(weights)
    _loss = tf.reduce_mean(cross_entropy + l2_penalty)

    # Optimizer: plain gradient descent with a fixed learning rate of 0.5.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(_loss)

    # Predictions for the training minibatch, validation set and test set.
    train_prediction = tf.nn.softmax(_logits)
    valid_logits = tf.matmul(tf_valid_dataset, weights) + biases
    valid_prediction = tf.nn.softmax(valid_logits)
    test_logits = tf.matmul(tf_test_dataset, weights) + biases
    test_prediction = tf.nn.softmax(test_logits)

In [None]:
steps = 5000
# Candidate L2 strengths: powers of ten whose exponent starts at -4 and grows
# by 0.1 towards -2.
regul_val = [pow(10, i) for i in np.arange(-4, -2, 0.1)]
accuracy_val = []

# NOTE(review): each strength is scored on the *test* set, which is what the
# following plot shows. For genuine hyper-parameter selection the validation
# set should be used instead, keeping the test set untouched.
for i, regul in enumerate(regul_val):
    # A fresh session (and fresh variable initialization) per strength, so
    # every run starts from scratch.
    with tf.Session(graph=graph) as session:
        tf.global_variables_initializer().run()
        for step in range(steps):
            # Walk through the training set, wrapping around before the last
            # incomplete minibatch.
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)

            # Generate a minibatch.
            batch_data = train_dataset[offset:(offset + batch_size), :]
            batch_labels = train_labels[offset:(offset + batch_size), :]

            # Feed it.
            feed_dict = {tf_train_dataset: batch_data,
                         tf_train_labels: batch_labels,
                         beta_regul: regul}

            _, loss, predictions = session.run([optimizer,
                                                _loss,
                                                train_prediction], feed_dict=feed_dict)

            if ((step+1) % 500 == 0):
                print('Minibatch step: %d, loss: %4.5f, acc: %.1f%%, valid_acc: %.1f%%.'
                  % ((i*steps) + (step+1), loss,
                     accuracy(predictions, batch_labels),
                     accuracy(valid_prediction.eval(), valid_labels)))

        # Run the test-set forward pass once and reuse the result for both
        # the log line and the sweep record.
        test_accuracy = accuracy(test_prediction.eval(), test_labels)
        print('Test Accuracy: %.1f%%' % test_accuracy)
        accuracy_val.append(test_accuracy)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# x轴是以对数的底的值log(x).
# plt.plot 绘制折线图.
plt.semilogx(regul_val, accuracy_val)
plt.grid(True)
plt.title('Test accuracy by Regularization (Logistic)')
plt.show()

Let's see whether the same technique also improves the predictions of the 1-hidden-layer neural network.

In [None]:
batch_size = 128
num_hidden_nodes = 1024
num_features = image_size * image_size

graph = tf.Graph()
with graph.as_default():
    # Inputs: minibatches are fed at run time, while the validation and test
    # sets are embedded in the graph as constants.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, num_features))
    tf_train_labels = tf.placeholder(tf.float32,
                                     shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    # L2 regularization strength. No shape is given, so a tensor of any
    # shape can be fed; the training loop feeds a scalar.
    beta_regul = tf.placeholder(tf.float32)

    # Variables: weights are drawn from a truncated normal (mean 0, stddev 1)
    # and biases start at zero.
    normal_init = tf.truncated_normal_initializer(0., 1.)
    zeros_init = tf.constant_initializer(0.)

    weights_1 = tf.get_variable('weights_1',
                                shape=[num_features, num_hidden_nodes],
                                initializer=normal_init)
    biases_1 = tf.get_variable('biases_1',
                               shape=[num_hidden_nodes],
                               initializer=zeros_init)

    weights_2 = tf.get_variable('weights_2',
                                shape=[num_hidden_nodes, num_labels],
                                initializer=normal_init)
    biases_2 = tf.get_variable('biases_2',
                               shape=[num_labels],
                               initializer=zeros_init)

    # Hidden layer with ReLU activation.
    lay1 = tf.nn.relu(tf.matmul(tf_train_dataset, weights_1) + biases_1)

    # Training computation: per-example cross-entropy plus the L2 penalty on
    # both weight matrices, averaged over the minibatch.
    _logits = tf.matmul(lay1, weights_2) + biases_2
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=tf_train_labels, logits=_logits)
    l2_penalty = beta_regul * (tf.nn.l2_loss(weights_1) + tf.nn.l2_loss(weights_2))
    _loss = tf.reduce_mean(cross_entropy + l2_penalty)

    # Optimizer: plain gradient descent with a fixed learning rate of 0.5.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(_loss)

    # Predictions for the training minibatch, validation set and test set.
    train_prediction = tf.nn.softmax(_logits)

    valid_lay1 = tf.nn.relu(tf.matmul(tf_valid_dataset, weights_1) + biases_1)
    valid_logits = tf.matmul(valid_lay1, weights_2) + biases_2
    valid_prediction = tf.nn.softmax(valid_logits)

    test_lay1 = tf.nn.relu(tf.matmul(tf_test_dataset, weights_1) + biases_1)
    test_logits = tf.matmul(test_lay1, weights_2) + biases_2
    test_prediction = tf.nn.softmax(test_logits)

In [None]:
%matplotlib qt5
plt.ion()
plt.title('Loss by Steps')
plt.figure(1)
plt.xlabel('Steps')
plt.ylabel('Loss')
x = []
y = []
line, = plt.plot(x, y)
ax = plt.gca()

steps = 3000
regul_val = [pow(10, i) for i in np.arange(-4, -2, 0.1)]
accuracy_val = []

for i, regul in enumerate(regul_val):
    with tf.Session(graph=graph) as session:
        tf.global_variables_initializer().run()
        for step in range(steps):
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
            
            # Generate a minibatch.
            batch_data = train_dataset[offset:(offset + batch_size), :]
            batch_labels = train_labels[offset:(offset + batch_size), :]
            
            # Feed it.
            feed_dict = {tf_train_dataset: batch_data, 
                         tf_train_labels: batch_labels,
                         beta_regul: regul}
            
            _, loss, predictions = session.run([optimizer, 
                                                _loss, 
                                                train_prediction], feed_dict=feed_dict)
            
            if ((step+1) % 500 == 0):
                print('Minibatch step: %d, loss: %4.5f, acc: %.1f%%, valid_acc: %.1f%%.'
                  % ((i*steps) + (step+1), loss,
                     accuracy(predictions, batch_labels), 
                     accuracy(valid_prediction.eval(), valid_labels)))
                
                # Plot loss by steps:
                x = np.append(x, (i*steps) + (step+1))
                y = np.append(y, loss)
                line.set_xdata(x)
                line.set_ydata(y)
                ax.relim()
                ax.autoscale_view(True, True, True)
                plt.draw()
                plt.pause(1e-17)
                
        print('Test Accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))
        accuracy_val.append(accuracy(test_prediction.eval(), test_labels))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# x轴是以对数的底的值log(x).
# plt.plot 绘制折线图.
plt.semilogx(regul_val, accuracy_val)
plt.grid(True)
plt.title('Test accuracy by Regularization (1-Layer Net)')
plt.show()

---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [None]:
steps = 101
num_batches = 3

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('[Tensorflow]: Initialized!')
    for step in range(steps):
        # Cycle through only the first num_batches minibatches so the model
        # keeps seeing the same few examples.
        offset = ((step % num_batches) * batch_size) % (train_labels.shape[0] - batch_size)

        # Generate minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]

        # Feed it.
        feed_dict = {tf_train_dataset: batch_data,
                     tf_train_labels: batch_labels, beta_regul: 1e-3}

        _, loss, predictions = session.run([optimizer,
                                                _loss,
                                                train_prediction], feed_dict=feed_dict)

        if ((step+1) % 2 == 0):
            print('Minibatch step: %d, loss: %4.5f, acc: %.1f%%, valid_acc: %.1f%%.'
                  % ((step+1), loss,
                     accuracy(predictions, batch_labels),
                     accuracy(valid_prediction.eval(), valid_labels)))

    # Evaluate the test set once. The result is deliberately not appended to
    # accuracy_val: that list holds one entry per regul_val value for the
    # regularization plot, and an extra entry would break that pairing.
    print('Test Accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))

---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

#### Dropout

In [None]:
batch_size = 128
num_hidden_nodes = 1024
num_features = image_size * image_size

graph = tf.Graph()
with graph.as_default():
    # Inputs: minibatches are fed at run time, while the validation and test
    # sets are embedded in the graph as constants.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, num_features))
    tf_train_labels = tf.placeholder(tf.float32,
                                     shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    # L2 regularization strength. No shape is given, so a tensor of any
    # shape can be fed; the training loop feeds a scalar.
    beta_regul = tf.placeholder(tf.float32)

    # Variables: weights are drawn from a truncated normal (mean 0, stddev 1)
    # and biases start at zero.
    normal_init = tf.truncated_normal_initializer(0., 1.)
    zeros_init = tf.constant_initializer(0.)

    weights_1 = tf.get_variable('weights_1',
                                shape=[num_features, num_hidden_nodes],
                                initializer=normal_init)
    biases_1 = tf.get_variable('biases_1',
                               shape=[num_hidden_nodes],
                               initializer=zeros_init)

    weights_2 = tf.get_variable('weights_2',
                                shape=[num_hidden_nodes, num_labels],
                                initializer=normal_init)
    biases_2 = tf.get_variable('biases_2',
                               shape=[num_labels],
                               initializer=zeros_init)

    # Hidden layer with ReLU activation.
    lay1 = tf.nn.relu(tf.matmul(tf_train_dataset, weights_1) + biases_1)

    # Dropout layer: applied on the training path only; the validation and
    # test paths below are built without it.
    dropout_1 = tf.nn.dropout(lay1, rate=0.5)

    # Training computation: per-example cross-entropy plus the L2 penalty on
    # both weight matrices, averaged over the minibatch.
    _logits = tf.matmul(dropout_1, weights_2) + biases_2
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=tf_train_labels, logits=_logits)
    l2_penalty = beta_regul * (tf.nn.l2_loss(weights_1) + tf.nn.l2_loss(weights_2))
    _loss = tf.reduce_mean(cross_entropy + l2_penalty)

    # Optimizer: plain gradient descent with a fixed learning rate of 0.5.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(_loss)

    # Predictions for the training minibatch (computed from the dropout
    # logits), the validation set and the test set.
    train_prediction = tf.nn.softmax(_logits)

    valid_lay1 = tf.nn.relu(tf.matmul(tf_valid_dataset, weights_1) + biases_1)
    valid_logits = tf.matmul(valid_lay1, weights_2) + biases_2
    valid_prediction = tf.nn.softmax(valid_logits)

    test_lay1 = tf.nn.relu(tf.matmul(tf_test_dataset, weights_1) + biases_1)
    test_logits = tf.matmul(test_lay1, weights_2) + biases_2
    test_prediction = tf.nn.softmax(test_logits)

In [None]:
steps = 101
num_batches = 3

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('[Tensorflow]: Initialized!')
    for step in range(steps):
        # Cycle through only the first num_batches minibatches so the model
        # keeps seeing the same few examples.
        offset = ((step % num_batches) * batch_size) % (train_labels.shape[0] - batch_size)

        # Generate minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]

        # Feed it.
        feed_dict = {tf_train_dataset: batch_data,
                     tf_train_labels: batch_labels, beta_regul: 1e-3}

        _, loss, predictions = session.run([optimizer,
                                                _loss,
                                                train_prediction], feed_dict=feed_dict)

        if ((step+1) % 2 == 0):
            print('Minibatch step: %d, loss: %4.5f, acc: %.1f%%, valid_acc: %.1f%%.'
                  % ((step+1), loss,
                     accuracy(predictions, batch_labels),
                     accuracy(valid_prediction.eval(), valid_labels)))

    # Evaluate the test set once. The result is deliberately not appended to
    # accuracy_val: that list holds one entry per regul_val value for the
    # regularization plot, and an extra entry would break that pairing.
    print('Test Accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))

The first conclusion is that 100% accuracy on the minibatches is more difficult to achieve and to maintain. As a result, the test accuracy improves from 78.9% to 82.5%: the final network generalizes better.

---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


#### 2-Layers Network

In [None]:
batch_size = 128
num_hidden_nodes_1 = 1024
num_hidden_nodes_2 = 256
num_features = image_size * image_size

graph = tf.Graph()
with graph.as_default():
    # Inputs: minibatches are fed at run time, while the validation and test
    # sets are embedded in the graph as constants.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, num_features))
    tf_train_labels = tf.placeholder(tf.float32,
                                     shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    # L2 regularization strength, fed at run time. No shape is given, so a
    # tensor of any shape can be fed; the training loop feeds a scalar.
    beta_regul = tf.placeholder(tf.float32)
    # Step counter incremented by the optimizer. It is bookkeeping, not a
    # model parameter, so it is excluded from the trainable variables.
    global_step = tf.Variable(0, trainable=False)

    # Variables. Every weight matrix uses stddev = sqrt(3 / fan_in) so that
    # all layers follow the same fan-in scaling; biases start at zero.
    weights_1 = tf.get_variable('weights_1',
                                shape=[num_features, num_hidden_nodes_1],
                                initializer=tf.truncated_normal_initializer(
                                    0., np.sqrt(3. / num_features)))
    biases_1 = tf.get_variable('biases_1',
                               shape=[num_hidden_nodes_1],
                               initializer=tf.constant_initializer(0.))

    weights_2 = tf.get_variable('weights_2',
                                shape=[num_hidden_nodes_1, num_hidden_nodes_2],
                                initializer=tf.truncated_normal_initializer(
                                    0., np.sqrt(3. / num_hidden_nodes_1)))
    biases_2 = tf.get_variable('biases_2',
                               shape=[num_hidden_nodes_2],
                               initializer=tf.constant_initializer(0.))

    weights_3 = tf.get_variable('weights_3',
                                shape=[num_hidden_nodes_2, num_labels],
                                initializer=tf.truncated_normal_initializer(
                                    0., np.sqrt(3. / num_hidden_nodes_2)))
    biases_3 = tf.get_variable('biases_3',
                               shape=[num_labels],
                               initializer=tf.constant_initializer(0.))

    # Two hidden layers with ReLU activations.
    lay1 = tf.nn.relu(tf.matmul(tf_train_dataset, weights_1) + biases_1)
    lay2 = tf.nn.relu(tf.matmul(lay1, weights_2) + biases_2)

    # Dropout layer: applied after the second hidden layer on the training
    # path only; the validation and test paths below are built without it.
    dropout_1 = tf.nn.dropout(lay2, rate=0.5)

    # Training computation: per-example cross-entropy plus the L2 penalty on
    # all three weight matrices, averaged over the minibatch.
    _logits = tf.matmul(dropout_1, weights_3) + biases_3
    _loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=_logits,
                                                   labels=tf_train_labels)
        + beta_regul * (tf.nn.l2_loss(weights_1) + tf.nn.l2_loss(weights_2) + tf.nn.l2_loss(weights_3)))

    # Optimizer.
    # decayed_learning_rate = learning_rate *
    #                         decay_rate ^ (global_step / decay_steps)
    # With staircase=True the exponent is an integer division, so the rate
    # drops by a factor of 0.65 once every 1000 steps.
    learning_rate = tf.train.exponential_decay(0.5, # learning_rate
                                               global_step, # global_step
                                               1000, # decay_steps
                                               0.65, # decay_rate
                                               staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(
        learning_rate).minimize(_loss, global_step=global_step)

    # Predictions for the training minibatch (computed from the dropout
    # logits), the validation set and the test set.
    train_prediction = tf.nn.softmax(_logits)
    valid_lay1 = tf.nn.relu(tf.matmul(tf_valid_dataset, weights_1) + biases_1)
    valid_lay2 = tf.nn.relu(tf.matmul(valid_lay1, weights_2) + biases_2)
    valid_logits = tf.matmul(valid_lay2, weights_3) + biases_3
    valid_prediction = tf.nn.softmax(valid_logits)
    test_lay1 = tf.nn.relu(tf.matmul(tf_test_dataset, weights_1) + biases_1)
    test_lay2 = tf.nn.relu(tf.matmul(test_lay1, weights_2) + biases_2)
    test_logits = tf.matmul(test_lay2, weights_3) + biases_3
    test_prediction = tf.nn.softmax(test_logits)

In [None]:
steps = 10000
accuracy_val = []

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('[Tensorflow]: Initialized!')
    for step in range(steps):
        # Walk through the training set, wrapping around before the last
        # incomplete minibatch.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)

        # Generate minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]

        # Feed it.
        feed_dict = {tf_train_dataset: batch_data,
                     tf_train_labels: batch_labels, beta_regul: 1e-3}

        _, loss, predictions = session.run([optimizer,
                                                _loss,
                                                train_prediction], feed_dict=feed_dict)

        if ((step+1) % 500 == 0):
            print('Minibatch step: %d, loss: %4.5f, acc: %.1f%%, valid_acc: %.1f%%.'
                  % ((step+1), loss,
                     accuracy(predictions, batch_labels),
                     accuracy(valid_prediction.eval(), valid_labels)))

    # Run the test-set forward pass once and reuse the result for both the
    # log line and the recorded accuracy.
    test_accuracy = accuracy(test_prediction.eval(), test_labels)
    print('Test Accuracy: %.1f%%' % test_accuracy)
    accuracy_val.append(test_accuracy)