# Exercise 2

Authors:
- Tuoxing Liu
- Sima Esmaeili
- Shruti Ghargi

## Exercise 2.3

In [9]:
import numpy as np
from sklearn import datasets

class ReLULayer(object):
    """Element-wise rectified linear activation: max(0, z)."""

    def forward(self, input):
        """Return ReLU of ``input`` and cache ``input`` for the backward pass."""
        self.input = input
        return np.maximum(0, input)

    def backward(self, upstream_gradient):
        """Propagate the gradient through the activation.

        The local derivative is 1 where the cached input was strictly
        positive and 0 elsewhere, so the upstream gradient is passed through
        only at those positions.
        """
        active = self.input > 0
        return active * upstream_gradient

    def update(self, learning_rate):
        """Do nothing: this layer has no trainable parameters."""

class OutputLayer(object):
    """Softmax output layer whose backward pass assumes a cross-entropy loss."""

    def __init__(self, n_classes):
        """Store the number of output classes."""
        self.n_classes = n_classes

    def forward(self, input):
        """Return the softmax of ``input`` taken along the last axis.

        The per-row maximum is subtracted before exponentiating so that
        ``np.exp`` cannot overflow; this does not change the softmax value.
        """
        self.input = input
        exps = np.exp(input - np.max(input, axis=-1, keepdims=True))
        softmax = exps / np.sum(exps, axis=-1, keepdims=True)
        return softmax

    def backward(self, predicted_posteriors, true_labels):
        """Return the gradient of the batch-averaged loss w.r.t. the logits.

        For softmax combined with cross-entropy this is
        ``(posteriors - one_hot(true_labels)) / n`` with ``n`` the batch size.

        ``predicted_posteriors`` is an array of shape (n, n_classes) and
        ``true_labels`` holds one integer class index per row.  The input
        array is left untouched: the subtraction is done on a copy, so the
        caller can keep using the posteriors and calling this method twice
        yields the same result.
        """
        n = predicted_posteriors.shape[0]
        # Work on a copy; subtracting in place would corrupt the caller's
        # posteriors.
        downstream_gradient = predicted_posteriors.copy()
        # Subtract the one-hot target: only the true-class entry of each row.
        downstream_gradient[np.arange(n), true_labels] -= 1
        # Average over the batch.
        return downstream_gradient / n

    def update(self, learning_rate):
        """Do nothing: this layer has no trainable parameters."""

class LinearLayer(object):
    """Affine layer computing ``input · B + b``."""

    def __init__(self, n_inputs, n_outputs):
        """Draw the weights ``B`` and biases ``b`` from a standard normal."""
        self.n_inputs, self.n_outputs = n_inputs, n_outputs
        # Weight matrix is drawn first, then the bias vector.
        self.B = np.random.normal(size=(n_inputs, n_outputs))
        self.b = np.random.normal(size=n_outputs)

    def forward(self, input):
        """Return the pre-activations and cache ``input`` for backward."""
        self.input = input
        return np.dot(input, self.B) + self.b

    def backward(self, upstream_gradient):
        """Store the parameter gradients and return the input gradient.

        ``grad_B`` is ``input^T · upstream`` and ``grad_b`` is the upstream
        gradient summed over axis 0; both are kept on the instance for
        ``update``.  The returned value is ``upstream · B^T``.
        """
        # Gradient handed on to the preceding layer.
        gradient_wrt_input = np.dot(upstream_gradient, self.B.T)
        # Gradients of the loss with respect to this layer's parameters.
        self.grad_B = np.dot(self.input.T, upstream_gradient)
        self.grad_b = np.sum(upstream_gradient, axis=0)
        return gradient_wrt_input

    def update(self, learning_rate):
        """Take one gradient-descent step using the stored gradients."""
        steps = ((self.B, self.grad_B), (self.b, self.grad_b))
        for parameter, gradient in steps:
            # In-place subtraction keeps self.B / self.b the same array objects.
            parameter -= learning_rate * gradient

class MLP(object):
    """Multi-layer perceptron: (Linear -> ReLU) blocks, then Linear -> output."""

    def __init__(self, n_features, layer_sizes):
        """Build the layer stack.

        ``n_features`` is the input width.  Every entry of ``layer_sizes``
        except the last adds a LinearLayer followed by a ReLULayer; the last
        entry is the width of the final LinearLayer, which feeds the
        OutputLayer.
        """
        self.n_layers = len(layer_sizes)
        self.layers   = []
        n_in = n_features
        for n_out in layer_sizes[:-1]:
            self.layers.append(LinearLayer(n_in, n_out))
            self.layers.append(ReLULayer())
            n_in = n_out
        n_out = layer_sizes[-1]
        self.layers.append(LinearLayer(n_in, n_out))
        self.layers.append(OutputLayer(n_out))

    def forward(self, X):
        """Run ``X`` through every layer in order and return the final output.

        All axes of ``X`` after the first are flattened, so each instance is
        presented to the first layer as one feature vector.
        """
        batch_size = X.shape[0]
        result = X.reshape(batch_size, -1)
        for layer in self.layers:
            result = layer.forward(result)
        return result

    def backward(self, predicted_posteriors, true_classes):
        """Back-propagate through the stack, last layer first.

        The output layer is the only one that needs the true classes; every
        other layer just receives the gradient produced by its successor.
        Gradients are stored inside the layers; nothing is returned.
        """
        downstream_gradient = self.layers[-1].backward(predicted_posteriors, true_classes)
        for layer in reversed(self.layers[:-1]):
            downstream_gradient = layer.backward(downstream_gradient)

    def update(self, X, Y, learning_rate):
        """Do one forward/backward pass on (X, Y) and update every layer."""
        posteriors = self.forward(X)
        self.backward(posteriors, Y)
        for layer in self.layers:
            layer.update(learning_rate)

    def train(self, x, y, n_epochs, batch_size, learning_rate):
        """Train with mini-batch gradient descent.

        Each epoch draws a fresh random permutation of the instances and
        walks through it in consecutive slices of ``batch_size``.  The final
        slice of an epoch may be shorter, so every instance is used exactly
        once per epoch even when ``len(x)`` is not a multiple of
        ``batch_size`` (including ``batch_size > len(x)``, which yields one
        batch containing everything).
        """
        N = len(x)
        # Ceiling division: count the trailing partial batch as well.
        n_batches = -(-N // batch_size)
        for _ in range(n_epochs):
            permutation = np.random.permutation(N)
            for batch in range(n_batches):
                start = batch * batch_size
                # Slicing past the end simply truncates the last batch.
                batch_indices = permutation[start:start + batch_size]
                self.update(x[batch_indices], y[batch_indices], learning_rate)

if __name__ == "__main__":
    # Two independent draws of the two-moons toy data set: one to fit the
    # network, one to evaluate it.
    N = 2000
    X_train, Y_train = datasets.make_moons(N, noise=0.05)
    X_test, Y_test = datasets.make_moons(N, noise=0.05)
    n_features, n_classes = 2, 2

    # Map each feature of the training set onto [-1, 1]; the test set is
    # transformed with the same training-set offset and range.
    offset = X_train.min(axis=0)
    scaling = X_train.max(axis=0) - offset
    X_train, X_test = (
        ((data - offset) / scaling - 0.5) * 2.0 for data in (X_train, X_test)
    )

    # Two hidden layers of five units each, then one output per class.
    layer_sizes = [5, 5, n_classes]
    n_epochs, batch_size, learning_rate = 5, 200, 0.05
    network = MLP(n_features, layer_sizes)
    network.train(X_train, Y_train, n_epochs, batch_size, learning_rate)

    # Classify by the largest posterior and report the misclassified fraction.
    predicted_posteriors = network.forward(X_test)
    predicted_classes = predicted_posteriors.argmax(axis=1)
    error_rate = (predicted_classes != Y_test).mean()
    print("error rate:", error_rate)


error rate: 0.148
