In [None]:
%pylab inline
import random
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score

Populating the interactive namespace from numpy and matplotlib


## Afternoon session 2:
### Optimization and Neural Networks

Read the following paper: [An overview of gradient descent optimization
algorithms](https://arxiv.org/pdf/1609.04747.pdf).  
(At least read about stochastic, full-batch and mini-batch gradient descent, SGD, Momentum and Adam.)

Then try to implement the following:
1. Implement Momentum Accelerated gradient-descent for your backprop from scratch example.
    - If you have not implemented your own version this morning, try to finish this or use the provided example code.
    - The provided code also includes a bias term. 
    - All you have to implement are the updates for the bias terms.
    - Compare the results of the 1-hidden-layer network (with bias) trained with momentum (momentum=0.9) and without momentum.
    - Does this improve your results?

### Backprop from Scratch with Momentum

In [None]:
def make_train_test(batch_size, batch_num, test_size, noise=0.05, test_noise=0.1):
    """
    Makes a two-moon train-test dataset with fixed batch size, number and noise level

    Parameters:
        batch_size: number of training examples per batch
        batch_num: number of training batches
        test_size: number of examples in the test set
        noise: noise level passed to make_moons for the training set
        test_noise: noise level passed to make_moons for the test set
                    (defaults to 0.1, the value that used to be hard-coded)

    Returns:
        X_train: array of shape (batch_num, batch_size, 2)
        y_train: array of shape (batch_num, batch_size, 1)
        X_test:  array of shape (test_size, 2)
        y_test:  array of shape (test_size, 1)
    """
    X_train, y_train = make_moons(n_samples=batch_size*batch_num, noise=noise)
    y_train = y_train.reshape(batch_num, batch_size, 1)
    X_train = X_train.reshape(batch_num, batch_size, 2)

    # Ask for exactly test_size points. Without n_samples, make_moons falls back to its
    # default of 100 samples and the reshape below fails for any other test_size.
    X_test, y_test = make_moons(n_samples=test_size, noise=test_noise)
    y_test = y_test.reshape(test_size, 1)
    return X_train, y_train, X_test, y_test

def set_seed(seed):
    """
    Seed every random number generator used here (Python, NumPy, PyTorch on CPU and on all
    CUDA devices) with the same value and switch cuDNN off. Always returns True.
    """
    # Same seeding order as always: Python -> NumPy -> PyTorch (CPU) -> PyTorch (all CUDA devices)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False  # keep the cuDNN auto-tuner switched off
    cudnn.enabled = False    # do not use cuDNN at all

    return True

Step 1: fill in the following values according to the morning session:

In [None]:
set_seed(42)  # Seed all random number generators before anything random happens

epochs =  # TODO: number of loops (passes) through the whole dataset

batch_size =  # TODO: number of examples in a single batch
batch_num =  # TODO: number of batches (full-batch training: the whole training set in one batch)
test_size =  # TODO: number of examples in the test set

lr = 1.  # Learning rate
D, H, M =  # TODO: input size (2), size of hidden layer (4), output size (1)
momentum =  # TODO: momentum coefficient; 0.9 to try momentum, None selects the plain gradient-descent branch of the training loop

Step 2: create a two moon dataset with the parameters defined above and convert them to torch tensors

In [None]:
#Use Sklearn to create two-moons + noise

#Define Train Set in Pytorch

#Define Test Set in Pytorch


Step 3: define activation function and neural network parameters

In [None]:
#Define Activation Functions and Derivatives

#Define Neural Network Parameters


Step 4: define and initialise momentum parameters

In [None]:
#Define the momentum parameters


Step 5: create the training loop and print the loss and accuracy

In [None]:
#Enter training loop: each epoch does a forward pass, a backward pass and one parameter update
for i in range(epochs):
    #N = number of input examples (N and the labels y are read by the loss computation below)
    #Forward Pass Layer 1
        #Affine Layer Transformation z1 = W1*X+b1
        #Apply non-linear activation function a1 = sigmoid(z1)
    
    #Forward Pass Layer 2
        #Affine Layer Transformation z2 = W2*a1+b2
        #Apply non-linear activation function a2 = sigmoid(z2) (a2 is read by the loss and accuracy below)

    #Backward Pass Layer 2
        #Compute Error on Output
        #Compute derivative of activation function (Sigmoid)
    
    #Compute gradient w.r.t. weights in layer 2       
    #Compute gradient w.r.t. bias in layer 2, sums over all N examples
    
    #Backward Pass Layer 1
    #Compute Error on Output of Layer 1
    #Compute derivative of activation function (Sigmoid)
    
    #Compute gradient w.r.t. weights in layer 1
    #Compute gradient w.r.t. bias in layer 1, sums over all N examples

    #Sensitivity w.r.t. Input
    #Compute gradient w.r.t. input X
    
    #Gradient Descent with Momentum (taken whenever momentum is set, i.e. not None)
    if momentum is not None:
        #Momentum step for layer 1 weights
        #Take a step in momentum weighted direction on layer 1 weights
        
        #Momentum step for layer 2 weights
        #Take a step in momentum weighted direction on layer 2 weights
               
        #Momentum step for layer 1 bias
        #Take a step in momentum weighted direction on layer 1 bias     
        
        #Momentum step for layer 2 bias
        #Take a step in momentum weighted direction on layer 2 bias     
        
    else: #Gradient Descent
        #Take a step in gradient direction on layer 1 weights
        #Take a step in gradient direction on layer 1 bias

        #Take a step in gradient direction on layer 2 weights
        #Take a step in gradient direction on layer 2 bias
    
    #Binary cross-entropy summed over the examples (dim 0) and divided by N, i.e. the average loss
    train_loss = -1./N*(y*torch.log(a2)+(1-y)*torch.log(1-a2)).sum(0) # Compute Average Binary-Crossentropy Loss
    #Report loss and accuracy every 100 epochs only
    if i % 100 == 0:
        print("Training Loss in epoch "+str(i)+": %1.2f" % train_loss.item())
        #Outputs above 0.5 are counted as class 1, everything else as class 0
        print("Training accuracy in epoch "+str(i)+": %1.2f" % accuracy_score(y, np.where(a2[:, 0].numpy()>0.5, 1, 0)),"\n")
        
#Do Forward Pass of Test Dataset
#Use separate names for the test activations: the code below reads a_test for the test
#metrics and still reads a2 (the training output) for the training accuracy
#Forward Pass Layer 1
    #Affine Layer Transformation z1_test = W1*X_test+b1
    #Apply non-linear activation function a1_test = sigmoid(z1_test)

#Forward Pass Layer 2
    #Affine Layer Transformation z2_test = W2*a1_test+b2
    #Apply non-linear activation function a_test = sigmoid(z2_test)
test_loss = -(y_test*torch.log(a_test)+(1-y_test)*torch.log(1-a_test)).mean(0) #Compute Binary-Crossentropy Loss (mean over the test examples)

#Final report: i still holds the index of the last training epoch
print("End of Training -> Testing Phase: ")
print("Train Loss: %1.2f" % train_loss.item(), ", Test Loss: %1.2f" % test_loss.item())
#Outputs above 0.5 are counted as class 1, everything else as class 0
print("Training accuracy in epoch "+str(i)+": %1.2f" % accuracy_score(y, np.where(a2[:, 0].numpy()>0.5, 1, 0)))
print("Test accuracy in epoch "+str(i)+": %1.2f" % accuracy_score(y_test, np.where(a_test[:, 0].numpy()>0.5, 1, 0)))    

Plot results (use the same plotting code from the morning)

In [None]:
# plot results