In [3]:
import numpy as np
import utils
import typing
# Seed NumPy's global (legacy) random generator with a fixed value so that
# random draws made after this cell are repeatable between runs.
np.random.seed(1)

In [None]:
def pre_process_images(X: np.ndarray,
                       mean: typing.Optional[float] = None,
                       std: typing.Optional[float] = None):
    """
    Normalize a batch of flattened images and append a constant bias column.

    Args:
        X: images of shape [batch size, 784] in the range (0, 255)
        mean: optional value subtracted from every pixel. When None it is
            computed from X itself. Pass the training-set value here to
            normalize a validation/test split with the same statistics.
        std: optional value every pixel is divided by. When None it is
            computed from X itself. Pass the training-set value here to
            normalize a validation/test split with the same statistics.
    Returns:
        X: images of shape [batch size, 785] normalized as described in task2a
    """
    assert X.shape[1] == 784,\
        f"X.shape[1]: {X.shape[1]}, should be 784"
    if std is None:
        # NOTE(review): this is the average of the per-pixel standard
        # deviations, which is not the same number as the standard deviation
        # over all pixels (np.std(X)). Confirm which one task2a asks for.
        std = np.average(np.std(X, axis=0, dtype=float))
    if mean is None:
        mean = np.average(np.mean(X, axis=0, dtype=float))
    print('Standard deviation:', std)
    print('Mean value:', mean)
    X_norm = (X - mean)/std

    # Bias trick: a trailing column of ones lets the first weight matrix
    # carry the bias term.
    bias = np.ones((X.shape[0], 1), dtype=float)
    return np.concatenate((X_norm, bias), axis=1)




In [4]:
def cross_entropy_loss(targets: np.ndarray, outputs: np.ndarray):
    """
    Average cross entropy between targets and model outputs.

    Args:
        targets: labels/targets of each image of shape: [batch size, num_classes]
        outputs: outputs of model of shape: [batch size, num_classes]
    Returns:
        Cross entropy error (float)
    Raises:
        AssertionError: if targets and outputs do not have the same shape.
    """
    assert targets.shape == outputs.shape,\
        f"Targets shape: {targets.shape}, outputs: {outputs.shape}"
    # Only take the log where the target is non-zero. Entries with a zero
    # target contribute nothing to the sum, and evaluating 0 * log(0) for them
    # would turn the whole loss into NaN when an output underflows to 0.
    contributes = targets != 0
    log_outputs = np.zeros(outputs.shape, dtype=float)
    np.log(outputs, out=log_outputs, where=contributes)
    return - np.sum(targets * log_outputs) / targets.shape[0]


In [None]:
def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)), applied element-wise."""
    exp_neg_z = np.exp(-z)
    denominator = 1 + exp_neg_z
    return 1 / denominator

def sigmoid_diff(z):
    """Derivative of `sigmoid` with respect to z: sigmoid(z) * (1 - sigmoid(z))."""
    s = sigmoid(z)
    complement = 1 - s
    return s * complement

def improved_sigmoid(z):
    """Scaled tanh activation 1.7159 * tanh(2/3 * z), applied element-wise."""
    # The former debug print was removed: this runs on every forward pass and
    # would flood the notebook output.
    return 1.7159*np.tanh(2/3*z)

def improved_sigmoid_diff(z):
    """Derivative of 1.7159 * tanh(2/3 * z) with respect to z, element-wise."""
    # The former debug print was removed: this runs on every backward pass and
    # would flood the notebook output.
    return 1.7159 * 2/3 * (1-(np.tanh(2/3*z))**2)



In [None]:
class SoftmaxModel:
    """
    Fully connected feed-forward network with a softmax output layer.

    Every layer except the last applies `sigmoid` or `improved_sigmoid`
    (selected by `use_improved_sigmoid`); the last layer applies softmax.
    `forward` caches the per-layer pre-activations and activations, and
    `backward` uses that cache to fill `self.grads`.
    """

    def __init__(self,
                 # Number of neurons per layer
                 neurons_per_layer: typing.List[int],
                 use_improved_sigmoid: bool,  # Task 3a hyperparameter
                 use_improved_weight_init: bool  # Task 3c hyperparameter
                 ):
        """
        Args:
            neurons_per_layer: number of neurons in each layer, e.g. [64, 10]
                is a hidden layer with 64 neurons and an output layer with 10.
            use_improved_sigmoid: use `improved_sigmoid` instead of `sigmoid`
                in all layers except the last.
            use_improved_weight_init: draw weights from a normal distribution
                with standard deviation 1/sqrt(fan-in) instead of from a
                uniform distribution on [-1, 1).
        """
        # Always reset random seed before weight init to get comparable results.
        np.random.seed(1)
        # Define number of input nodes
        self.I = 785
        self.use_improved_sigmoid = use_improved_sigmoid

        # Define number of output nodes
        # neurons_per_layer = [64, 10] indicates that we will have two layers:
        # A hidden layer with 64 neurons and a output layer with 10 neurons.
        self.neurons_per_layer = neurons_per_layer

        # Initialize the weights, one matrix of shape (fan-in, fan-out) per layer.
        self.ws = []
        prev = self.I
        for size in self.neurons_per_layer:
            w_shape = (prev, size)
            print("Initializing weight to shape:", w_shape)
            if use_improved_weight_init:
                w = np.random.normal(0, 1/np.sqrt(prev), w_shape)
            else:
                w = np.random.uniform(-1, 1, w_shape)
            self.ws.append(w)
            prev = size

        num_layers = len(self.ws)
        # Gradient of the loss w.r.t. each weight matrix (filled by backward).
        self.grads = [None for _ in range(num_layers)]
        # Pre-activation (input @ weights) of each layer (filled by forward).
        self.z_arr = [None for _ in range(num_layers)]
        # Output of each layer (filled by forward).
        self.activations = [None for _ in range(num_layers)]
        # Error term of each layer (filled by backward).
        self.delta = [None for _ in range(num_layers)]

    def _activate(self, z: np.ndarray) -> np.ndarray:
        """Apply the configured activation for the non-output layers."""
        if self.use_improved_sigmoid:
            return improved_sigmoid(z)
        return sigmoid(z)

    def _activate_diff(self, z: np.ndarray) -> np.ndarray:
        """Derivative of the configured non-output-layer activation."""
        if self.use_improved_sigmoid:
            return improved_sigmoid_diff(z)
        return sigmoid_diff(z)

    @staticmethod
    def _softmax(z: np.ndarray) -> np.ndarray:
        """Row-wise softmax of z."""
        # Subtracting the row maximum leaves the result mathematically
        # unchanged but keeps np.exp from overflowing to inf (and the ratio
        # from becoming NaN) for large logits.
        shifted = z - np.max(z, axis=1, keepdims=True)
        exp_z = np.exp(shifted)
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def forward(self, X: np.ndarray) -> np.ndarray:
        """
        Run the network on a batch and cache intermediate values for backward.

        Args:
            X: images of shape [batch size, 785]
        Returns:
            y: output of model with shape [batch size, num_outputs]
        """
        a_i = X
        # All layers but the last use the sigmoid-type activation.
        for i in range(len(self.ws)-1):
            z = np.dot(a_i, self.ws[i])
            a_i = self._activate(z)
            self.z_arr[i] = z
            self.activations[i] = a_i

        # The last layer uses softmax.
        z = np.dot(a_i, self.ws[-1])
        a_i = self._softmax(z)
        self.z_arr[-1] = z
        self.activations[-1] = a_i
        return a_i

    def backward(self, X: np.ndarray, outputs: np.ndarray,
                 targets: np.ndarray) -> None:
        """
        Computes the gradients and saves them to the variable self.grads.

        Uses the values cached by the most recent call to forward, so forward
        must have been called on the same X beforehand.

        Args:
            X: images of shape [batch size, 785]
            outputs: outputs of model of shape: [batch size, num_outputs]
            targets: labels/targets of each image of shape: [batch size, num_classes]
        Raises:
            AssertionError: if outputs and targets differ in shape, or if a
                computed gradient does not match the shape of its weight matrix.
        """
        assert targets.shape == outputs.shape,\
            f"Output shape: {outputs.shape}, targets: {targets.shape}"

        batch_size = X.shape[0]
        last = len(self.ws) - 1

        # Error term of the last layer.
        self.delta[last] = -(targets - outputs)

        # Walk from the last layer down to the first. Handling every layer in
        # the same loop also covers a network with a single layer, where the
        # input of the last layer is X itself.
        for i in range(last, -1, -1):
            if i < last:
                self.delta[i] = np.dot(self.delta[i+1], self.ws[i+1].T) \
                    * self._activate_diff(self.z_arr[i])
            layer_input = X if i == 0 else self.activations[i-1]
            self.grads[i] = np.dot(layer_input.T, self.delta[i])/batch_size

        for grad, w in zip(self.grads, self.ws):
            assert grad.shape == w.shape,\
                f"Expected the same shape. Grad shape: {grad.shape}, w: {w.shape}."

    def zero_grad(self) -> None:
        """Reset every stored gradient to None."""
        self.grads = [None for i in range(len(self.ws))]

In [None]:
def one_hot_encode(Y: np.ndarray, num_classes: int):
    """
    Convert integer class labels to one-hot rows.

    Args:
        Y: shape [Num examples, 1] (a flat array of shape [Num examples] is
            accepted as well)
        num_classes: Number of classes to use for one-hot encoding
    Returns:
        Y: shape [Num examples, num classes]
    """
    num_examples = len(Y)
    labels = np.asarray(Y).reshape(-1)
    res = np.zeros((num_examples, num_classes), dtype=int)
    if labels.size:
        # One vectorized assignment sets res[i, labels[i]] = 1 for every row,
        # replacing a Python-level loop over the examples.
        res[np.arange(num_examples), labels] = 1
    return res


In [None]:
def gradient_approximation_test(
        model: SoftmaxModel, X: np.ndarray, Y: np.ndarray):
    """
        Numerical approximation for gradients. Should not be edited.
        Details about this test is given in the appendix in the assignment.

        For every single weight in every layer, the weight is nudged by
        +epsilon and -epsilon, the loss is evaluated for both, and the
        resulting central-difference slope is compared with the gradient that
        model.backward stored in model.grads.

        Args:
            model: model whose forward/backward implementation is checked
            X: input batch passed to model.forward
            Y: targets passed to cross_entropy_loss and model.backward
        Raises:
            AssertionError: if, for any weight, the approximation and the
                stored gradient differ by more than epsilon**2.
    """
    epsilon = 1e-3
    for layer_idx, w in enumerate(model.ws):
        for i in range(w.shape[0]):
            for j in range(w.shape[1]):
                # Remember the weight so it can be restored after perturbing it.
                orig = model.ws[layer_idx][i, j].copy()
                # Loss with the weight nudged upwards.
                model.ws[layer_idx][i, j] = orig + epsilon
                logits = model.forward(X)
                cost1 = cross_entropy_loss(Y, logits)
                # Loss with the weight nudged downwards.
                model.ws[layer_idx][i, j] = orig - epsilon
                logits = model.forward(X)
                cost2 = cross_entropy_loss(Y, logits)
                # Central-difference estimate of the partial derivative.
                gradient_approximation = (cost1 - cost2) / (2 * epsilon)
                model.ws[layer_idx][i, j] = orig
                # Actual gradient
                logits = model.forward(X)
                model.backward(X, logits, Y)
                difference = gradient_approximation - \
                    model.grads[layer_idx][i, j]
                assert abs(difference) <= epsilon**2,\
                    f"Calculated gradient is incorrect. " \
                    f"Layer IDX = {layer_idx}, i={i}, j={j}.\n" \
                    f"Approximation: {gradient_approximation}, actual gradient: {model.grads[layer_idx][i, j]}\n" \
                    f"If this test fails there could be errors in your cross entropy loss function, " \
                    f"forward function or backward function"

In [None]:
# NOTE: the sanity-check script below is wrapped in a triple-quoted string, so
# none of it is executed; the cell only evaluates the string as a bare
# expression. Remove the surrounding quotes to actually run the checks.
'''
if __name__ == "__main__":
    # Simple test on one-hot encoding
    Y = np.zeros((1, 1), dtype=int)
    Y[0, 0] = 3
    Y = one_hot_encode(Y, 10)
    assert Y[0, 3] == 1 and Y.sum() == 1, \
        f"Expected the vector to be [0,0,0,1,0,0,0,0,0,0], but got {Y}"

    X_train, Y_train, *_ = utils.load_full_mnist()
    X_train = pre_process_images(X_train)
    Y_train = one_hot_encode(Y_train, 10)
    assert X_train.shape[1] == 785,\
        f"Expected X_train to have 785 elements per image. Shape was: {X_train.shape}"

    neurons_per_layer = [64, 10]
    use_improved_sigmoid = False
    use_improved_weight_init = False
    model = SoftmaxModel(
        neurons_per_layer, use_improved_sigmoid, use_improved_weight_init)

    # Gradient approximation check for 100 images
    X_train = X_train[:100]
    Y_train = Y_train[:100]
    for layer_idx, w in enumerate(model.ws):
        model.ws[layer_idx] = np.random.uniform(-1, 1, size=w.shape)

    gradient_approximation_test(model, X_train, Y_train)
'''


'\nif __name__ == "__main__":\n    # Simple test on one-hot encoding\n    Y = np.zeros((1, 1), dtype=int)\n    Y[0, 0] = 3\n    Y = one_hot_encode(Y, 10)\n    assert Y[0, 3] == 1 and Y.sum() == 1,         f"Expected the vector to be [0,0,0,1,0,0,0,0,0,0], but got {Y}"\n\n    X_train, Y_train, *_ = utils.load_full_mnist()\n    X_train = pre_process_images(X_train)\n    Y_train = one_hot_encode(Y_train, 10)\n    assert X_train.shape[1] == 785,        f"Expected X_train to have 785 elements per image. Shape was: {X_train.shape}"\n\n    neurons_per_layer = [64, 10]\n    use_improved_sigmoid = False\n    use_improved_weight_init = False\n    model = SoftmaxModel(\n        neurons_per_layer, use_improved_sigmoid, use_improved_weight_init)\n\n    # Gradient approximation check for 100 images\n    X_train = X_train[:100]\n    Y_train = Y_train[:100]\n    for layer_idx, w in enumerate(model.ws):\n        model.ws[layer_idx] = np.random.uniform(-1, 1, size=w.shape)\n\n    gradient_approximation_

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=fc7fa119-6ed4-4d50-aa34-54bc46270896' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>