<a href="https://colab.research.google.com/github/ICRAR/PHYS5511/blob/master/2019/week04/Keras_FC_network_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
from builtins import range
from math import sqrt, ceil

import keras
#from keras.datasets import mnist
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.utils.vis_utils import model_to_dot


%matplotlib inline
# Notebook-wide matplotlib defaults applied to every figure created below.
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'  # no smoothing between pixels in imshow
plt.rcParams['image.cmap'] = 'gray'  # default colormap for images
from IPython.display import SVG

# Vis utils

In [0]:
#@title visualization utilities [RUN ME]
def visualize_grid(Xs, ubound=255.0, padding=1):
    """
    Tile a 4D batch of images into one roughly square grid image.

    Each image is min-max scaled on its own to [0, ubound] before it is
    placed, so every tile uses the full intensity range independently of
    the others.

    Inputs:
    - Xs: Data of shape (N, H, W, C)
    - ubound: Output grid will have values scaled to the range [0, ubound]
    - padding: The number of blank pixels between elements of the grid

    Returns:
    - grid: float array of shape (grid_height, grid_width, C). Padding
      pixels, unused cells and constant-valued images are left at 0.
      An empty batch (N == 0) yields an empty (0, 0, C) grid.
    """
    (N, H, W, C) = Xs.shape
    if N == 0:
        # Without this guard the size formula below goes negative.
        return np.zeros((0, 0, C))

    grid_size = int(ceil(sqrt(N)))
    grid_height = H * grid_size + padding * (grid_size - 1)
    grid_width = W * grid_size + padding * (grid_size - 1)
    grid = np.zeros((grid_height, grid_width, C))

    for idx in range(N):
        # Images are laid out row-major: idx -> (grid row, grid column).
        row, col = divmod(idx, grid_size)
        y0 = row * (H + padding)
        x0 = col * (W + padding)

        img = Xs[idx]
        low, high = np.min(img), np.max(img)
        span = high - low
        if span == 0:
            # A constant image has no contrast to stretch; dividing by the
            # zero span would fill the tile with NaN, so leave it at 0.
            continue
        grid[y0:y0 + H, x0:x0 + W] = ubound * (img - low) / span
    return grid

def vis_grid(Xs):
    """
    Visualize a batch of images as one square grid normalized to [0, 1].

    Inputs:
    - Xs: Data of shape (N, H, W, C)

    Returns:
    - G: array of shape (A*H + A, A*W + A, C) with A = ceil(sqrt(N)).
      Each tile is followed by a one-pixel gutter; gutters and unused
      cells are filled with the batch minimum, so they map to 0 after
      normalization. If every value in the grid is identical the result
      is all zeros rather than NaN.
    """
    (N, H, W, C) = Xs.shape
    A = int(ceil(sqrt(N)))
    # Start from a canvas filled with the smallest input value.
    G = np.ones((A*H+A, A*W+A, C), Xs.dtype)
    G *= np.min(Xs)
    for n in range(N):
        y, x = divmod(n, A)
        top, left = y * (H + 1), x * (W + 1)
        G[top:top + H, left:left + W, :] = Xs[n, :, :, :]
    # normalize to [0,1]
    maxg = G.max()
    ming = G.min()
    span = maxg - ming
    # A constant grid has zero span; divide by 1 instead so the output is
    # zeros (with the usual output dtype) instead of NaN.
    G = (G - ming) / (span if span != 0 else 1)
    return G

def vis_nn(rows):
    """
    Visualize an array of arrays of images as one grid normalized to [0, 1].

    Inputs:
    - rows: sequence of N rows; the first D = len(rows[0]) images of every
      row are drawn, each of shape (H, W, C) taken from rows[0][0].

    Returns:
    - G: array of shape (N*H + N, D*W + D, C). Each tile is followed by a
      one-pixel gutter that keeps the canvas' initial value of 1 before
      normalization. If every value in the grid is identical the result
      is all zeros rather than NaN.
    """
    N = len(rows)
    D = len(rows[0])
    first = rows[0][0]
    H, W, C = first.shape
    G = np.ones((N*H+N, D*W+D, C), first.dtype)
    for y in range(N):
        for x in range(D):
            top, left = y * (H + 1), x * (W + 1)
            G[top:top + H, left:left + W, :] = rows[y][x]
    # normalize to [0,1]
    maxg = G.max()
    ming = G.min()
    span = maxg - ming
    # Guard the zero-span case (all pixels equal) to avoid NaN output.
    G = (G - ming) / (span if span != 0 else 1)
    return G


# Keras FC network classifier

In [0]:
# Fully-connected classifier: 784 inputs -> 128 -> 64 -> 10 class probabilities.
model = Sequential([
    Dense(128, input_shape=(784,), activation='relu', name='first_hidden'),
    # Dropout(0.5),  # optional: uncomment to regularise after the first hidden layer
    Dense(64, activation='relu', name='second_hidden'),
    # Dropout(0.5),  # optional: uncomment to regularise after the second hidden layer
    Dense(10, activation='softmax'),
])

# Labels are fed one-hot encoded, hence categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [0]:
# Render the layer graph of `model` (including tensor shapes) as an inline SVG.
# The graph is laid out by the Graphviz `dot` program requested via prog='dot'.
SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))

# MNIST example

## Load data

In [0]:
def load_MNIST(train_path='/content/sample_data/mnist_train_small.csv',
               test_path='/content/sample_data/mnist_test.csv'):
    """
    Load the MNIST train / test CSV files into image and label arrays.

    Each CSV row is expected to hold the digit label in the first column
    followed by 784 pixel intensities in the range 0..255.

    Inputs:
    - train_path: path of the training CSV (defaults to the Colab sample file)
    - test_path: path of the test CSV (defaults to the Colab sample file)

    Returns a tuple (xtrain, ytrain, xtest, ytest) where the x arrays are
    float32 of shape (N, 28, 28, 1) scaled to [0, 1] and the y arrays hold
    the labels from the first CSV column.
    """
    def _read_split(csv_path):
        # The Colab sample CSVs carry no header row; without header=None
        # pandas would silently consume the first sample as column names.
        table = pd.read_csv(csv_path, header=None).values
        images = np.reshape(table[:, 1:], [-1, 28, 28, 1]).astype(np.float32)
        labels = table[:, 0]
        return images / 255, labels

    xtrain, ytrain = _read_split(train_path)
    xtest, ytest = _read_split(test_path)
    return xtrain, ytrain, xtest, ytest
  

In [0]:
def get_MNIST_data(num_training=18000, num_validation=1000, num_test=1000):
  """
  Load MNIST via load_MNIST() and prepare train / validation / test splits.

  The first `num_training` training samples form the training set, the
  following `num_validation` samples form the validation set, and the
  first `num_test` test samples form the test set. Every image is
  flattened into a row vector and the mean training image is subtracted
  from all three splits. No bias column is appended.

  Returns X_train, y_train, X_val, y_val, X_test, y_test.
  """
  pool_images, pool_labels, heldout_images, heldout_labels = load_MNIST()

  def pick(images, labels, index):
    # Select the requested samples and flatten each image into one row.
    chosen = images[index]
    return np.reshape(chosen, (chosen.shape[0], -1)), labels[index]

  X_val, y_val = pick(pool_images, pool_labels,
                      range(num_training, num_training + num_validation))
  X_train, y_train = pick(pool_images, pool_labels, range(num_training))
  X_test, y_test = pick(heldout_images, heldout_labels, range(num_test))

  # Centre every split on the mean image computed from the training set only.
  mean_image = np.mean(X_train, axis=0)
  for split in (X_train, X_val, X_test):
    split -= mean_image

  return X_train, y_train, X_val, y_val, X_test, y_test


# Build the splits once and print their shapes as a quick sanity check.
X_train, y_train, X_val, y_val, X_test, y_test = get_MNIST_data()
for split_label, split_array in (
    ('Train data shape: ', X_train),
    ('Train labels shape: ', y_train),
    ('Validation data shape: ', X_val),
    ('Validation labels shape: ', y_val),
    ('Test data shape: ', X_test),
    ('Test labels shape: ', y_test),
):
    print(split_label, split_array.shape)

## Check images

In [0]:
# Print the label of one test sample and draw its pixels. The rows of X_test
# are flattened, mean-subtracted images, so restore the 28x28 layout first.
print(y_test[3])
plt.imshow(np.reshape(X_test[3], (28, 28)))

## Train the Keras FC network

In [0]:
# One-hot encode the integer digit labels (10 classes) for the softmax output.
y_train_K, y_val_K, y_test_K = (
    keras.utils.to_categorical(labels, 10)
    for labels in (y_train, y_val, y_test)
)

In [0]:
# Train for 10 epochs in mini-batches of 200, scoring the validation split
# after every epoch; the returned History object feeds the plots below.
history = model.fit(
    x=X_train,
    y=y_train_K,
    batch_size=200,
    epochs=10,
    validation_data=(X_val, y_val_K),
    callbacks=None,
)

## Debug the training for keras FC network

In [0]:
from matplotlib.ticker import MaxNLocator

# Plot every series recorded by model.fit (training / validation loss and
# accuracy) against the epoch number.
fig, ax = plt.subplots(figsize=(10, 6))
ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # epochs are whole numbers
histories = history.history.items()
for k, v in histories:
    # Derive the x-axis from the series itself so the plot keeps working
    # when the number of training epochs is changed.
    xvals = np.arange(1, len(v) + 1)
    ax.plot(xvals, v, label=k if 'val_' in k else 'train_%s' % k)

ax.legend(loc='best', fontsize=14)
# The curves mix cross-entropy loss and accuracy, so label them generically
# (the model is not trained with MSE).
fig.suptitle('Training history (loss and accuracy)', fontsize=16)
ax.set_ylabel('Metric value', fontsize=14)
ax.set_xlabel('Epoch', fontsize=14)

In [0]:
def show_keras_net_weights(net=None):
    """
    Display the first-layer weights of a Keras model as a grid of images.

    Inputs:
    - net: model whose first layer's kernel is shown; when omitted the
      notebook-level `model` is used. The kernel must have 784 rows so
      that every unit's incoming weights can be viewed as a 28x28 image.

    Prints the kernel shape and the shape of the assembled grid, then
    draws the grid with matplotlib.
    """
    if net is None:
        net = model
    W1 = net.layers[0].get_weights()[0]

    print(W1.shape)
    # (784, units) -> (units, 28, 28, 1): one single-channel image per unit.
    W1 = W1.reshape(28, 28, 1, -1).transpose(3, 0, 1, 2)

    grid = visualize_grid(W1, padding=1)
    print(grid.shape)
    plt.imshow(grid.astype('uint8')[:,:,0], cmap='gray')
    plt.gca().axis('off')
    plt.show()

# Debugging aids:
#type(model.layers)
#for layer in model.layers: print(layer.get_config(), layer.get_weights())
show_keras_net_weights()

# Inference on the test set
When you are done experimenting, you should evaluate your final trained network on the test set; you should get above 95%.

In [0]:
# evaluate() returns the loss first, followed by the compiled metrics.
score = model.evaluate(X_test, y_test_K, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

In [0]:
# Predicted digit for test sample 3: the index of the largest softmax output.
# X_test[3:4] keeps the batch dimension, giving the (1, 784) input predict() expects.
model.predict(X_test[3:4]).argmax()

# Tune hyperparameters (Change me - Homework!)
**What's wrong?** Looking at the visualisations above, we see that the loss is decreasing more or less linearly, which seems to suggest that the learning rate may be too low. Moreover, there is no gap between the training and validation accuracy, suggesting that the model we used has **low capacity**, and that we should increase its size. On the other hand, with a very large model we would expect to see more overfitting, which would manifest itself as a very large gap between the training and validation accuracy.

**Tuning**. Tuning the hyperparameters and developing intuition for how they affect the final performance is a large part of using Neural Networks, so we want you to get a lot of practice. Below, you should experiment with different values of the various hyperparameters, including hidden layer size, learning rate, number of training epochs, and regularization strength. You might also consider tuning the learning rate decay, but you should be able to get good performance using the default value.

**Approximate results**. You should aim to achieve a classification accuracy of greater than 92% on the validation set. Our best network gets over 96.5% on the validation set.

**Experiment**: Your goal in this exercise is to get as good a result on MNIST as you can with a fully-connected Neural Network. Feel free to implement your own techniques (e.g. PCA to reduce dimensionality, or adding dropout, or adding features to the solver, etc.). We will go through these "bells and whistles" in the next lecture.

In [0]:
best_net = None # store the best model into this 
best_val = -1
best_stats = {}

#################################################################################
# TODO: Tune hyperparameters using the validation set. Store your best trained  #
# model in best_net.                                                            #
#                                                                               #
# To help debug your network, it may help to use visualizations similar to the  #
# ones we used above; these visualizations will have significant qualitative    #
# differences from the ones we saw above for the poorly tuned network.          #
#                                                                               #
# Tweaking hyperparameters by hand can be fun, but you might find it useful to  #
# write code to sweep through possible combinations of hyperparameters          #
# automatically like we did on the previous exercises.                          #
#################################################################################
learning_rates = np.logspace(-3, -2.5, 3) 
regularization_strengths = np.logspace(-3, 0, 3)
h_sizes = [50, 80, 100, 120, 150, 180, 200][0:3]
combinatorial_list = [learning_rates, regularization_strengths, h_sizes]

iters = 1000 #100
batch_size = 64
input_size = X_train.shape[1]
num_classes = 10
# Keras trains in whole epochs: convert the mini-batch iteration budget into
# the closest number of passes over the training set (at least one).
num_epochs = max(1, int(round(iters * batch_size / float(X_train.shape[0]))))

from itertools import product
for lr, reg, hidden_size in product(*combinatorial_list):
  # Create a two-layer network (one hidden layer) with L2 weight regularization
  net = Sequential()
  net.add(Dense(hidden_size, input_shape=(input_size,), activation='relu',
                kernel_regularizer=keras.regularizers.l2(reg)))
  net.add(Dense(num_classes, activation='softmax',
                kernel_regularizer=keras.regularizers.l2(reg)))
  # The learning rate is passed positionally because its keyword name differs
  # between Keras releases. The sparse loss consumes the integer labels directly.
  net.compile(optimizer=keras.optimizers.Adam(lr),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
  # Train the network
  fit_result = net.fit(X_train, y_train,
                       validation_data=(X_val, y_val),
                       epochs=num_epochs, batch_size=batch_size,
                       verbose=0)
  # The accuracy series is called 'acc' or 'accuracy' depending on the Keras version
  acc_key = 'acc' if 'acc' in fit_result.history else 'accuracy'
  # One value per epoch, stored under the keys the plotting cell below reads
  stats = {
      'loss_history': fit_result.history['loss'],
      'train_acc_history': fit_result.history[acc_key],
      'val_acc_history': fit_result.history['val_' + acc_key],
  }
  # Predict on the training set
  train_accuracy = (np.argmax(net.predict(X_train, verbose=0), axis=1) == y_train).mean()
  # Predict on the validation set
  val_accuracy = (np.argmax(net.predict(X_val, verbose=0), axis=1) == y_val).mean()
  
  # Save best values
  if val_accuracy > best_val:
    best_val = val_accuracy
    best_net = net
    best_stats = stats
  # Print results
  print('lr %e reg %e hid %d  train accuracy: %f val accuracy: %f' % (
              lr, reg, hidden_size, train_accuracy, val_accuracy))
print('best validation accuracy achieved: %f' % best_val)

In [0]:
# Plot the loss function and train / validation accuracies
# Two stacked panels for the best run: loss on top, accuracies underneath.
fig, (loss_ax, acc_ax) = plt.subplots(2, 1)

loss_ax.plot(best_stats['loss_history'])
loss_ax.set_title('Loss history')
loss_ax.set_xlabel('Iteration')
loss_ax.set_ylabel('Loss')

for series_key, series_label in (('train_acc_history', 'train'),
                                 ('val_acc_history', 'val')):
    acc_ax.plot(best_stats[series_key], label=series_label)
acc_ax.set_title('Classification accuracy history')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Clasification accuracy')
acc_ax.legend()

fig.tight_layout()
plt.show()

In [0]:
# `show_net_weights` is not defined anywhere in this notebook, so the
# first-layer weights of the tuned model are rendered here directly.
# Assumes best_net is a Keras model whose first layer has a 784-row kernel.
if best_net is None:
    print('best_net is not set - run the hyperparameter sweep above first.')
else:
    best_W1 = best_net.layers[0].get_weights()[0]
    # (784, units) -> (units, 28, 28, 1): one single-channel image per unit.
    best_W1 = best_W1.reshape(28, 28, 1, -1).transpose(3, 0, 1, 2)
    best_grid = visualize_grid(best_W1, padding=1)
    plt.imshow(best_grid.astype('uint8')[:, :, 0], cmap='gray')
    plt.gca().axis('off')
    plt.show()

## Final quiz

Now that you have trained a Neural Network classifier, you may find that your testing accuracy is much lower than the training accuracy. In what ways can we decrease this gap? Select all that apply.

1. Train on a larger dataset.
2. Add more hidden units.
3. Increase the regularization strength.
4. None of the above.