# Lesson 2: Neural Net Computations

In the previous assignment we used Keras to train a neural network. In this assignment you will build your own minimal neural net library. The basic structure is given to you; you will need to fill in details such as weight updating for backpropogation. Then you will test the network on learning the XOR function.

Read through the class definitions below first to understand the basic architecture.

Then you should add code as necessary where marked "TODO" in the code below.

In [1]:
import numpy as np

In [2]:
class NNet():
    """Implements a basic feedforward neural network."""
    
    def __init__(self):
        self._layers = []  # An ordered list of layers. The first layer is the input; the final is the output.
    
    def _add_layer(self, layer):
        if self._layers:
            # Update pointers. We keep a doubly-linked-list of layers for convenience.
            prev_layer = self._layers[-1]
            prev_layer.set_next_layer(layer)
            layer.set_prev_layer(prev_layer)
            
        self._layers.append(layer)
    
    def add_input_layer(self, size, **kwargs):
        assert type(size).__name__ == 'int', ('Input layer requires integer size. Type was %s instead.' 
                                              % type(size).__name__)
        layer = InputLayer(size=size, **kwargs)
        self._add_layer(layer)

    def add_dense_layer(self, size, **kwargs):
        assert type(size).__name__ == 'int', ('Dense layer requires integer size. Type was %s instead.' 
                                              % type(size).__name__)
        # Find the previous layer's size.
        prev_size = self._layers[-1].size()
        layer = DenseLayer(shape=(prev_size, size), **kwargs)
        self._add_layer(layer)
        
    def train_single_example(self, X_data, y_data, learning_rate=0.01):
        # Train on a single example. X_data and y_data must be numpy arrays.
        
        assert isinstance(X_data, np.ndarray)
        assert isinstance(y_data, np.ndarray)

        # Forward propagation.
        outputs = self.predict(X_data)
        
        # Backpropagation.
        deltas = outputs - y_data  # Compute error on this example.        
        for layer in reversed(self._layers):
            deltas = layer.backpropagate(deltas, learning_rate)
    
    def predict(self, x):
        assert isinstance(x, np.ndarray)
        
        prev_output = x
        for layer in self._layers:
            prev_output = layer.feed_forward(prev_output)
        return prev_output

    def summary(self, verbose=False):
        """Prints a description of the model."""
        for i, layer in enumerate(self._layers):
            print('%d: %s' % (i, str(layer)))
            if verbose:
                print('weights:', layer.get_weights())
                if layer._use_bias:
                    print('bias:', layer._bias)
                print()
    
    def train(self, X_data, y_data, learning_rate, num_epochs, verbose=True, print_every_n=100):
        """Both X_data and y_data should be ndarrays. One example per row."""
        assert isinstance(X_data, np.ndarray)
        assert isinstance(y_data, np.ndarray)
        assert X_data.shape[0] == y_data.shape[0]

        idx = list(range(len(X_data)))
        for epoch in range(num_epochs):    
            np.random.shuffle(idx)
            for i in idx:
                nnet.train_single_example(X_data[i], y_data[i], 0.1)
            if verbose and (epoch % print_every_n == 0):
                results = []
                for i in range(len(X_data)):
                    results.append(nnet.predict(X_data[i]))
                mse = self.compute_mean_squared_error(X_data, y_data)
                acc = self.compute_accuracy(X_data, y_data) * 100
                print('%d: MSE: %.5f Acc: %.1f%% -- %s' % (epoch, mse, acc, results))
    
    def compute_mean_squared_error(self, X_data, y_data):
        """Given input X_data and target y_data, compute and return the mean squared error."""
        assert isinstance(X_data, np.ndarray)
        assert isinstance(y_data, np.ndarray)
        assert X_data.shape[0] == y_data.shape[0]
        
        error = 0
        for i in range(len(X_data)):
            outputs = self.predict(X_data[i])
            error += (y_data[i] - outputs) ** 2
        mse = error / len(X_data)
        return mse
    
    def compute_accuracy(self, X_data, y_data):
        """Given input X_data and target y_data, convert outputs to binary using a threshold of 0.5
        and return the accuracy: # examples correct / total # examples."""
        assert isinstance(X_data, np.ndarray)
        assert isinstance(y_data, np.ndarray)
        assert X_data.shape[0] == y_data.shape[0]
        
        correct = 0
        for i in range(len(X_data)):
            outputs = self.predict(X_data[i])
            outputs = outputs > 0.5
            if outputs == y_data[i]:
                correct += 1
        acc = float(correct) / len(X_data)
        return acc

In [3]:
class Activation():
    """Class that represents an activation function and knows how to take its own derivative."""
    def __init__(self, name):
        self.name = name
    
    def activate(x):
        """x is a scalar or a numpy array. Returns the output y, the result of applying the function to input x."""
        raise NotImplementedError()
    
    def derivative_given_y(self, y):
        """y is a scalar or a numpy array. 
        
        Returns the derivative d(f)/dx given the *activation* value y."""
        raise NotImplementedError()

In [4]:
class IdentityActivation(Activation):
    """Activation function that passes input through unchanged."""
    
    def __init__(self):
        super().__init__(name='Identity')
    
    def activate(self, x):
        """x is a scalar or a numpy array. Returns the output y, the result of applying the function to input x."""
        return x
    
    def derivative_given_y(self, y):
        """y is a scalar or a numpy array. 
        
        Returns the derivative d(f)/dx given the *activation* value y."""
        return 1
    
class SigmoidActivation(Activation):
    """Sigmoid activation function."""

    def __init__(self):
        super().__init__(name='Sigmoid')
    
    def activate(self, x):
        """x is a scalar or a numpy array. Returns the output y, the result of applying the function to input x."""
        return 1.0 / (1.0 + np.exp(-x))
    
    def derivative_given_y(self, y):
        """y is a scalar or a numpy array. 
        
        Returns the derivative d(f)/dx given the *activation* value y."""
        return y * (1.0 - y)
    
class ReluActivation(Activation):
    """ReLU activation function."""

    def __init__(self):
        super().__init__(name='ReLU')
    
    def activate(self, x):
        """x is a scalar or a numpy array. Returns the output y, the result of applying the function to input x."""
        if isinstance(x, np.ndarray):
            result = np.zeros(x.shape)
            for i in len(result):
                result[i] = x[i] if x[i] > 0 else 0
        else:
            result = x if x > 0 else 0
        return result
    
    def derivative_given_y(self, y):
        """y is a scalar or a numpy array. 
        
        Returns the derivative d(f)/dx given the *activation* value y."""
        return y > 0

In [5]:
def WeightInitializer():
    """Function to return a random weight from -1 to 1."""
    return np.random.random()*2 - 1

def WeightInitializerPositive():
    """Function to return a random weight from 0 to 1."""
    return np.random.random()

In [6]:
class Layer():
    """Base class for NNet layers.
    
    Conceptually, in this library a Layer consists at a high level of:
      * a collection of weights (a 2D numpy array)
      * the output nodes that come after the weights above
      * the activation function that is applied to the summed signals in these output nodes
      
    So a Layer isn't just nodes -- it's weights as well as nodes.
      
    Specifically, to send signal forward through a 3-layer network, we start with an Input Layer that does
    very little.  The outputs from the Input layer are simply the fed-in input data.  
    
    Then, the next layer will be a Dense layer that holds the weights from the Input layer to the first hidden
    layer and stores the activation function to be used after doing a product of weights and Input-Layer
    outputs.
    
    Finally, another Dense layer will hold the weights from the hidden to the output layer nodes, and stores
    the activation function to be applied to the final output nodes.
    
    For a typical 1-hidden layer network, then, we would have 1 Input layer and 2 Dense layers.
    
    Each Layer also has funcitons to perform the forward-pass and backpropagation steps for the weights/nodes
    associated with the layer.
    
    Finally, each Layer stores pointers to the pervious and next layers, for convenience when implementing
    backprop.
    """
   
    def __init__(self, shape, use_bias, activation_function=IdentityActivation, weight_initializer=None, name=''):
        # These are the weights from the *previous* layer to the current layer.
        self._weights = None
        
        # Tuple of (# inputs, # outputs) for Dense layers or just a scalar for an input layer.
        assert type(shape).__name__ == 'int' or type(shape).__name__ == 'tuple', (
            'shape must be scalar or a 2-element tuple')
        if type(shape).__name__ == 'tuple':
            assert len(shape)==2, 'shape must be 2-dimensional. Was %d instead' % len(shape)
        self._shape = shape 
    
        # True to use a bias node that inputs to each node in this layer; False otherwise.
        self._use_bias = use_bias
        
        if use_bias:
            bias_size = shape[-1] if len(shape) > 1 else shape
            self._bias = np.zeros(bias_size)
            if weight_initializer:
                for i in range(bias_size):
                    self._bias[i] = weight_initializer()
        
        # Activation function to be applied to each dot product of weights with inputs.
        # Instantiate an object of this class.
        self._activation_function = activation_function() if activation_function else None
        
        # Method used to initialize the weights in this Layer at creation time.
        self._weight_initializer = weight_initializer
        
        # Layer name (optional)
        self._name = name
        
        # Calculated output vector from the most recent feed_forward(inputs) call.
        self._outputs = None
        
        # Doubly linked list pointers to neighbor layers.
        self._prev_layer = None  # Previous layer is closer to (or is) the input layer.
        self._next_layer = None  # Next layer is closer to (or is) the output layer.
    
    def set_prev_layer(self, layer):
        """Set pointer to the previous layer."""
        self._prev_layer = layer
    
    def set_next_layer(self, layer):
        """Set pointer to the next layer."""
        self._next_layer = layer
    
    def size(self):
        """Number of nodes in this layer."""
        if type(self._shape).__name__ == 'tuple':
            return self._shape[-1]
        else:
            return self._shape
        
    def get_weights(self):
        """Return a numpy array of the weights for inputs to this layer."""
        return self._weights
    
    def get_bias(self):
        """Return a numpy array of the bias for nodes in this layer."""
        return self._biass
    
    def feed_forward(self, inputs):
        """Feed the given inputs through the input weights and activation function, and set the outputs vector.
        
        Also returns the outputs vector for convenience."""
        raise NotImplementedError()
        
    def backpropagate(self, error, learning_rate):
        """Adjusts the weights coming into this layer based on the given output error vector.
        
        For the output layer, the "error" vector should be a list of output errors, y_k - t_k.
        For a hidden layer, the "error" vector should be a list of the delta values from the following layer, such as delta_z_k
        
        Returns a list of the delta values for each node in this layer. These deltas can be used as the error
        values when calling backpropagate on the previous layer."""
        raise NotimplementedError()
        
    def __str__(self):
        activation_fxn_name = self._activation_function.name if self._activation_function else None
        return '[%s] shape %s, use_bias=%s, activation=%s' % (self._name, self._shape, self._use_bias,
                                                              activation_fxn_name)

In [7]:
class InputLayer(Layer):
    """A neural network 1-dimensional input layer."""
    
    def __init__(self, size, name='Input'):
        assert type(size).__name__ == 'int', 'Input size must be integer. Was %s instead' % type(size).__name__
        super().__init__(shape=size, use_bias=False, name=name, activation_function=None)
    
    def feed_forward(self, inputs):
        assert len(inputs)==self._shape, 'Inputs must be of size %d; was %d instead' % (self._shape, len(inputs))
        self._outputs = inputs
        return self._outputs

    def backpropagate(self, error, learning_rate):
        return None  # Nothing to do.

In [8]:
class DenseLayer(Layer):
    """A neural network layer that is fully connected to the previous layer."""
    
    def __init__(self, shape, use_bias=True, name='Dense', **kwargs):
        super().__init__(shape=shape, use_bias=use_bias, name=name, **kwargs)
        
        self._weights = np.zeros(shape)
        if self._weight_initializer:
            for i in range(shape[0]):
                for j in range(shape[1]):
                    self._weights[i,j] = self._weight_initializer()
    
    def feed_forward(self, inputs):
        self._outputs = np.dot(inputs, self._weights) + self._bias
        for idx, val in np.ndenumerate(self._outputs):
            self._outputs[idx] = self._activation_function.activate(val)
        return self._outputs
        
    def backpropagate(self, error, learning_rate):
        """Adjusts the weights coming into this layer based on the given output error vector.
        
        For the output layer, the "error" vector should be a list of output errors, y_k - t_k.
        For a hidden layer, the "error" vector should be a list of the delta values from the following layer, such as delta_z_k
        
        Returns a list of the delta values for each node in this layer. These deltas can be used as the error
        values when calling backpropagate on the previous layer."""
        assert isinstance(error, np.ndarray)
        assert isinstance(self._prev_layer._outputs, np.ndarray)
        assert isinstance(self._outputs, np.ndarray)  
        
        # Compute deltas. If this is the last layer, use the simpler backprop formula.
        if self._next_layer is None:    
            deltas = self._activation_function.derivative_given_y(self._outputs) * error
        else:
            deltas = np.matmul(self._next_layer._weights, error) * self._activation_function.derivative_given_y(self._outputs)
            
        gradient = np.matmul(np.expand_dims(self._prev_layer._outputs, 0).T, np.expand_dims(deltas, 0))
                             
        # Adjust weights.
        self._weights -= learning_rate * gradient
        
        # Adjust bias weights.
        if self._use_bias:
            self._bias -= learning_rate * deltas
        
        return deltas

In [9]:
X_data = np.array([[0,0],[1,0],[0,1],[1,1]])
y_data = np.array([[0,1,1,0]]).T

In [10]:
nnet = NNet()
nnet.add_input_layer(2)
nnet.add_dense_layer(2, weight_initializer=WeightInitializer, activation_function=SigmoidActivation)
nnet.add_dense_layer(1, weight_initializer=WeightInitializer, activation_function=SigmoidActivation, name='Output')
nnet.summary()

0: [Input] shape 2, use_bias=False, activation=None
1: [Dense] shape (2, 2), use_bias=True, activation=Sigmoid
2: [Output] shape (2, 1), use_bias=True, activation=Sigmoid


In [11]:
nnet.summary(verbose=True)

0: [Input] shape 2, use_bias=False, activation=None
weights: None

1: [Dense] shape (2, 2), use_bias=True, activation=Sigmoid
weights: [[-0.76245984  0.64441248]
 [ 0.60957632 -0.50917349]]
bias: [ 0.21541581 -0.58466165]

2: [Output] shape (2, 1), use_bias=True, activation=Sigmoid
weights: [[-0.56239624]
 [ 0.52416319]]
bias: [-0.92886289]



In [12]:
nnet.train(X_data, y_data, 0.1, 10000)

0: MSE: 0.30270 Acc: 50.0% -- [array([0.26380321]), array([0.30173363]), array([0.23842]), array([0.27132486])]
100: MSE: 0.24987 Acc: 75.0% -- [array([0.48421576]), array([0.52719147]), array([0.45623555]), array([0.49576903])]
200: MSE: 0.24962 Acc: 50.0% -- [array([0.49291044]), array([0.53372361]), array([0.46483322]), array([0.50169645])]
300: MSE: 0.24943 Acc: 50.0% -- [array([0.49468357]), array([0.53393251]), array([0.46619149]), array([0.50083671])]
400: MSE: 0.24923 Acc: 75.0% -- [array([0.49480667]), array([0.5330573]), array([0.46568025]), array([0.49856992])]
500: MSE: 0.24902 Acc: 75.0% -- [array([0.49700415]), array([0.53477791]), array([0.46706776]), array([0.49862123])]
600: MSE: 0.24878 Acc: 75.0% -- [array([0.49837467]), array([0.53619836]), array([0.4674367]), array([0.49800979])]
700: MSE: 0.24850 Acc: 75.0% -- [array([0.49761613]), array([0.53610514]), array([0.46542627]), array([0.49539508])]
800: MSE: 0.24817 Acc: 75.0% -- [array([0.49814854]), array([0.53784282

7400: MSE: 0.00429 Acc: 100.0% -- [array([0.07243183]), array([0.93974362]), array([0.93528716]), array([0.06414039])]
7500: MSE: 0.00418 Acc: 100.0% -- [array([0.07141364]), array([0.94054125]), array([0.93623603]), array([0.06325798])]
7600: MSE: 0.00406 Acc: 100.0% -- [array([0.07043441]), array([0.94131054]), array([0.93714866]), array([0.06240885])]
7700: MSE: 0.00396 Acc: 100.0% -- [array([0.06949263]), array([0.94205412]), array([0.93802793]), array([0.06159178])]
7800: MSE: 0.00385 Acc: 100.0% -- [array([0.06858421]), array([0.94277185]), array([0.93887323]), array([0.06080304])]
7900: MSE: 0.00376 Acc: 100.0% -- [array([0.06770726]), array([0.94346452]), array([0.93968712]), array([0.06004103])]
8000: MSE: 0.00366 Acc: 100.0% -- [array([0.06686012]), array([0.94413425]), array([0.94047098]), array([0.05930447])]
8100: MSE: 0.00357 Acc: 100.0% -- [array([0.06604251]), array([0.94478308]), array([0.94122848]), array([0.05859331])]
8200: MSE: 0.00349 Acc: 100.0% -- [array([0.0652

In [13]:
nnet.summary(verbose=True)

0: [Input] shape 2, use_bias=False, activation=None
weights: None

1: [Dense] shape (2, 2), use_bias=True, activation=Sigmoid
weights: [[-5.03889463  5.49296836]
 [ 4.76858006 -5.5461631 ]]
bias: [-2.67446457 -3.15866281]

2: [Output] shape (2, 1), use_bias=True, activation=Sigmoid
weights: [[7.43102137]
 [7.30890295]]
bias: [-3.63205322]

