# IMPORTS

In [1]:
import numpy as np
from keras.utils import np_utils
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import random
import sys
import time
import math
import csv
import numpy as np
from keras.datasets import mnist
from keras.utils import np_utils
import pandas as pd
import time

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def initalize_weights(dim_size, input_length):
    '''
    |Draw the initial weight matrix for layer L
    |dim_size : number of neurons in layer L (rows of the matrix)
    |input_length : number of inputs feeding each neuron in layer L (columns of the matrix)
    |Returns an array of shape (dim_size, input_length) holding standard-normal
    |        samples (mean 0, variance 1) divided by 10
    '''
    weights_shape = (dim_size, input_length)
    standard_normal_draws = np.random.standard_normal(weights_shape)
    # Scale the draws down so the initial weights start small.
    return standard_normal_draws / 10

def initalize_bias(dim_size):
    '''
    |Create the initial bias vector for layer L
    |dim_size : number of neurons in layer L
    |Returns a zero-filled column vector of shape (dim_size, 1)
    '''
    column_shape = (dim_size, 1)
    return np.zeros(shape=column_shape)


def initialize_parameters(layer_dims):
    '''
    |Build the initial parameters of every layer of the network
    |layer_dims : list of sizes; the first entry is the input size and every
    |             following entry is the number of neurons of one layer
    |Returns a dictionary with keys 'W1', 'b1', 'W2', 'b2', ... where 'Wl' comes
    |        from initalize_weights(size of layer l, size of layer l-1) and
    |        'bl' from initalize_bias(size of layer l)
    '''
    fan_in = layer_dims[0]
    params = {}
    for layer_number, layer_size in enumerate(layer_dims[1:], start=1):
        suffix = str(layer_number)
        params['W' + suffix] = initalize_weights(layer_size, fan_in)
        params['b' + suffix] = initalize_bias(layer_size)
        # The next layer receives this layer's outputs as its inputs.
        fan_in = layer_size
    return params
        
        
    

In [3]:
def linear_forward(A, W, B, dropout=False, training=False, keep_prob=0.75):
    '''
    Implement the linear part of a layer's forward propagation
    |A – the activations of the previous layer
    |W – the weight matrix of the current layer (of shape [size of current layer, size of previous layer])
    |B – the bias vector of the current layer (of shape [size of current layer, 1])
    |dropout : should use dropout
    |training : boolean - is in training mode
    |keep_prob : probability of keeping each incoming activation when dropout is active
    Returns:
        Z – the linear component W.A + B (the input of the activation function)
        linear_cache – dictionary {'A', 'W', 'b'}; 'A' holds the activations that were
                       actually multiplied by W (i.e. after dropout, when it was applied)
    Raises:
        ValueError – if dropout is applied with keep_prob outside (0, 1]
    '''
    # Work on a copy so the caller's activations are never modified.
    activations = np.copy(A)

    # ------------ DROPOUT BONUS ------------ #
    if training and dropout:
        if not 0 < keep_prob <= 1:
            raise ValueError('keep_prob must be in (0, 1], got {}'.format(keep_prob))
        keep_mask = np.random.rand(*activations.shape) < keep_prob
        # Inverted dropout: rescale the survivors so the expected activation is
        # unchanged. A fresh (float) array is produced instead of dividing in
        # place, which would raise for integer-typed activations.
        activations = np.multiply(activations, keep_mask) / keep_prob
    # ------------ DROPOUT BONUS ------------ #

    # Cache the values needed by the backward pass.
    linear_cache = {'A': activations, 'W': W, 'b': B}

    # Z[l] = W[l] . A[l-1] + b[l]
    # shapes: (n_l, m) = (n_l, n_{l-1}) . (n_{l-1}, m) + (n_l, 1), m = number of samples
    Z = np.dot(W, activations) + B

    return Z, linear_cache

# ACTIVATION FUNCTIONS

In [4]:
def softmax(Z):
    '''
    |Z – the linear component of the activation function; the softmax is taken
    |    over axis 0, i.e. separately for every column
    |Returns A - the activation of the layer
    |       activation_cache - returns Z for backpropagation
    '''
    # Subtracting the per-column maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (inf / inf = nan).
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_scores = np.exp(shifted)
    A = exp_scores / np.sum(exp_scores, axis=0, keepdims=True)
    activation_cache = {'Z': Z}
    return A, activation_cache

def relu(Z):
    '''
    |Z – the linear component of the activation function
    |Returns A - the activation of the layer: the element-wise maximum of 0 and Z
    |        activation_cache - dictionary {'Z': Z}, kept for backpropagation
    '''
    zero_floor = 0
    activation_cache = dict(Z=Z)
    return np.maximum(zero_floor, Z), activation_cache


In [5]:
def linear_activation_forward(A_prev, W, B, activation, dropout, training = False):
    '''
    Implement the forward propagation for the LINEAR->ACTIVATION layer
    |A_prev – activations of the previous layer
    |W – the weights matrix of the current layer
    |B – the bias vector of the current layer
    |Activation – the activation function to be used (a string, either “softmax” or “relu”, case-insensitive)
    |dropout : use dropout
    |training : is in training mode
    Returns:
    A – the activations of the current layer
    cache – a joint dictionary containing both linear_cache ('A', 'W', 'b') and activation_cache ('Z')
    Raises:
    ValueError – if the requested activation is not one of the implemented ones
    '''
    activations = {'softmax': softmax, 'relu': relu}

    # Fail loudly on an unknown activation. Previously a bare `except` printed a
    # message and returned A = 0 with a cache lacking 'Z', which also masked any
    # genuine error raised inside the activation function itself.
    activation_name = str(activation).lower()
    if activation_name not in activations:
        raise ValueError('activation {} was not implemented'.format(activation))

    Z, linear_cache = linear_forward(A_prev, W, B, dropout, training)
    A, activation_cache = activations[activation_name](Z)

    # merge dicts to A W b & Z
    cache = dict(linear_cache)
    cache.update(activation_cache)
    return A, cache


In [6]:
def L_model_forward(X, parameters, use_batchnorm, dropout, training):
    '''
    Implement forward propagation for the [LINEAR->RELU]*(L-1)->LINEAR->SOFTMAX computation
    |X – the data, numpy array of shape (input size, number of examples)
    |parameters – the W and b parameters of each layer, stored under the keys 'W1', 'b1', ..., 'WL', 'bL'
    |use_batchnorm - a boolean flag used to determine whether to apply batchnorm after the activation
    |                of the hidden layers (never applied to the softmax output)
    |dropout - a boolean flag enabling dropout on the inputs of every hidden layer except the first
    |training - boolean, dropout is only active in training mode
    Returns:
        AL – the last post-activation value
        caches – a list with one cache dictionary per layer, in layer order
    Raises:
        ValueError – if parameters contains no weight matrix
    NOTE(review): linear_backward / relu_backward in this file do not re-apply the dropout
    mask or the batchnorm scaling when propagating gradients – confirm this is acceptable.
    '''
    num_layers = sum(1 for key in parameters if 'W' in key)
    if num_layers == 0:
        raise ValueError('parameters must contain at least one weight matrix')

    caches = []

    # initialize input layer with inputs X
    A = X

    # Apply relu on inner layers. Parameters are looked up by layer number so
    # the result does not depend on the ordering of the dictionary.
    for layer in range(1, num_layers):
        W = parameters['W{}'.format(layer)]
        B = parameters['b{}'.format(layer)]
        # Never drop raw input features: layer 1 consumes X itself. A separate
        # per-layer flag is used because overwriting `dropout` here would switch
        # dropout off for every following layer as well.
        layer_dropout = bool(dropout) and layer > 1
        A, cache = linear_activation_forward(A, W, B, 'relu', layer_dropout, training)
        if use_batchnorm:
            A = apply_batchnorm(A)
        caches.append(cache)

    # Apply softmax on the last (output) layer, without dropout or batchnorm.
    W = parameters['W{}'.format(num_layers)]
    B = parameters['b{}'.format(num_layers)]
    AL, cache = linear_activation_forward(A, W, B, 'softmax', False, training)
    caches.append(cache)
    return AL, caches





In [7]:
def compute_cost(AL, Y):
    '''
    Calculate cost function as categorical cross-entropy loss.
    |AL – the last post-activation value, one column per example
    |Y  - the ground truth, same shape as AL
    |returns cross entropy cost: the mean over examples of -sum(Y * log(AL)) along axis 0
    '''
    # Clamp probabilities away from 0 before the log: log(0) is -inf, which
    # turns 0 * log(0) into nan and 1 * log(0) into an infinite cost.
    tiny = 1e-12
    safe_probabilities = np.clip(AL, tiny, None)
    per_example_loss = -np.sum(Y * np.log(safe_probabilities), axis=0)
    return np.mean(per_example_loss)


In [8]:
def apply_batchnorm(A, epsilon=1e-5):
    '''
    performs batchnorm on the received activation values of a given layer.
    |A - the activation values of a given layer, shape (layer size, number of examples)
    |epsilon - small constant added to the variance to avoid division by zero
    Returns:
        NA - the normalized activation values: (A - mean) / sqrt(variance + epsilon),
             where mean and variance are computed per unit (row) across the batch
             (axis 1). Input that is not 2-D has no separate batch axis and is
             normalized with the statistics of the whole array.
    '''
    A = np.asarray(A, dtype=float)
    # Batch normalization uses one mean/variance per unit, taken over the
    # examples of the batch, not a single statistic for the whole matrix.
    batch_axis = 1 if A.ndim == 2 else None
    batch_mean = np.mean(A, axis=batch_axis, keepdims=True)
    batch_variance = np.var(A, axis=batch_axis, keepdims=True)
    # Fall back to 0 where a statistic is undefined (nan), element-wise.
    batch_mean = np.where(np.isnan(batch_mean), 0.0, batch_mean)
    batch_variance = np.where(np.isnan(batch_variance), 0.0, batch_variance)
    return (A - batch_mean) / np.sqrt(batch_variance + epsilon)




# BACKWARDS

In [9]:

def linear_backward(dZ, cache):
    '''
    Implements the linear part of the backward propagation process for a single layer
    |dZ – the gradient of the cost with respect to the linear output of the current layer (layer l)
    |cache – dictionary holding 'A' (activations fed into the layer, shape
    |        [size of previous layer, number of examples]) and 'W' (the layer's weights),
    |        stored during the forward propagation of the current layer
    Returns:
        dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1)
        dW -- Gradient of the cost with respect to W (current layer l)
        db -- Gradient of the cost with respect to b (current layer l)
    '''
    A_prev, W = cache['A'], cache['W']
    # The cost is a mean over examples, so the parameter gradients are averaged
    # over the number of examples (columns of A_prev), not over the number of
    # input features (rows of A_prev).
    num_examples = A_prev.shape[1]
    dA_prev = np.dot(W.T, dZ)
    dW = np.dot(dZ, A_prev.T) / num_examples
    db = np.sum(dZ, axis=1, keepdims=True) / num_examples

    return dA_prev, dW, db

def relu_backward(dA, activation_cache):
    '''
    Implements backward propagation for a ReLU unit
    |dA – the post-activation gradient
    |activation_cache – Z, the pre-activation values stored during the forward propagation
    Returns:
        dZ – gradient of the cost with respect to Z: a copy of dA with every
             entry zeroed where Z <= 0 (dA itself is left untouched)
    '''
    pre_activation = activation_cache
    inactive_units = pre_activation <= 0
    gradient = np.array(dA, copy=True)
    # ReLU passes no gradient through units that were not active.
    gradient[inactive_units] = 0
    return gradient

        

def softmax_backward(dA, activation_cache):
    '''
    Implements backward propagation for a softmax unit
    |dA – the post-activation gradient
    |activation_cache – Z, the pre-activation values stored during the forward propagation
    Returns:
        dZ – dA * A * (1 - A), element-wise, where A = softmax(Z)
    Only the diagonal term A * (1 - A) of the softmax Jacobian is used, so the result
    equals the cross-entropy gradient A - Y only when dA is the element-wise derivative
    -(Y / A - (1 - Y) / (1 - A)).
    NOTE(review): confirm that every caller relies on that pairing.
    '''
    pre_activation = activation_cache
    # Recompute the activations from Z; the cache returned by softmax is not needed.
    probabilities, _ = softmax(pre_activation)
    diagonal_term = np.multiply(probabilities, 1 - probabilities)
    return np.multiply(dA, diagonal_term)


def linear_activation_backward(dA, cache, activation):
    '''
    Implements the backward propagation for the LINEAR->ACTIVATION layer.
    The activation's backward function first turns dA into dZ, then linear_backward
    turns dZ into the gradients of the layer.
    |dA – post activation gradient of the current layer
    |cache – dictionary containing both the linear cache ('A', 'W', 'b') and the activation cache ('Z')
    |activation – name of the layer's activation, "softmax" or "relu" (case-insensitive)
    Returns:
        dA_prev – Gradient of the cost with respect to the activation (of the previous layer l-1)
        dW – Gradient of the cost with respect to W (current layer l)
        db – Gradient of the cost with respect to b (current layer l)
    '''
    pre_activation = cache['Z']

    backward_by_name = {'softmax': softmax_backward, 'relu': relu_backward}
    activation_backward = backward_by_name[activation.lower()]

    dZ = activation_backward(dA, pre_activation)
    # linear_backward only reads the entries it needs, so the joint cache is passed as is.
    dA_prev, dW, db = linear_backward(dZ, cache)
    return (dA_prev, dW, db)



def L_model_backward(AL, Y, caches):
    '''
    Implement the backward propagation process for the entire network.
    |AL - the probabilities vector, the output of the forward propagation (L_model_forward); not modified
    |Y - the true labels vector (the "ground truth" - true classifications), same shape as AL
    |Caches - list of caches containing for each layer: a) the linear cache; b) the activation cache
    Returns:
        Grads - dictionary with the gradients. For every layer l it holds 'dW{l}' and 'db{l}'
                (gradients of that layer's parameters) and 'dA{l}' (the gradient with respect
                to the activations that were fed INTO layer l)
    '''
    num_of_layers = len(caches)
    grads = {}

    # Output layer: for softmax combined with cross-entropy the gradient with
    # respect to Z is simply AL - Y. Using it directly avoids dividing by AL and
    # 1 - AL (which needed an in-place epsilon patch of the caller's AL) and
    # avoids the multiplication by A * (1 - A), which returns a zero gradient
    # whenever the softmax saturates to exactly 0 or 1.
    dZ = np.subtract(AL, Y)
    dA_prev, dW, db = linear_backward(dZ, caches[num_of_layers - 1])
    grads['dA{}'.format(num_of_layers)] = dA_prev
    grads['dW{}'.format(num_of_layers)] = dW
    grads['db{}'.format(num_of_layers)] = db

    # Apply RELU backwards activation on the rest of the layers, last to first.
    for l in reversed(range(1, num_of_layers)):
        # gradient flowing out of layer l+1 into the activations of layer l
        dA_current = grads['dA{}'.format(l + 1)]
        dA_prev, dW, db = linear_activation_backward(dA_current, caches[l - 1], activation='relu')
        grads['dA{}'.format(l)] = dA_prev
        grads['dW{}'.format(l)] = dW
        grads['db{}'.format(l)] = db
    return grads




In [10]:
def update_parameters(parameters, grads, learning_rate):
    '''
    Updates parameters using gradient descent
    |parameters – a dictionary containing the DNN architecture’s parameters ('W1', 'b1', 'W2', ...)
    |grads – a dictionary containing the gradients (generated by L_model_backward), keyed 'dW1', 'db1', ...
    |learning_rate – the learning rate used to update the parameters (the “alpha”)
    Returns:
        parameters – the same dictionary that was passed in; its entries are updated in place
    '''
    # One layer per weight entry found in the dictionary.
    layer_count = sum(1 for name in parameters if 'W' in name)
    for layer_number in range(1, layer_count + 1):
        suffix = str(layer_number)
        # W first, then b:  P_l = P_l - alpha * dP_l
        for prefix in ('W', 'b'):
            parameters[prefix + suffix] -= grads['d' + prefix + suffix] * learning_rate
    return parameters


# IMPLEMENT Neural Network with L layers

In [11]:
def create_dataset(X, Y , num_of_classes, test_size = 0.2):
    '''
    Split the data into a training part and a validation part and bring both into
    the column-per-example layout.
    |X – the images; every image is flattened to 28*28 values
    |Y – the labels, one-hot encoded with np_utils.to_categorical
    |num_of_classes – number of classes used for the one-hot encoding
    |test_size – passed to train_test_split as the size of the validation part
    Returns:
        X_train, X_validation – arrays of shape (28*28, number of examples), values divided by 255
        Y_train, Y_validation – one-hot labels with axes 0 and 1 swapped (one column per example)
    '''
    pixels_per_image = 28 * 28
    max_pixel_value = 255

    def as_feature_columns(images):
        # (examples, 784) -> (784, examples), scaled by the maximal pixel value
        return images.reshape(-1, pixels_per_image).T / max_pixel_value

    def as_one_hot_columns(labels):
        return np.swapaxes(np_utils.to_categorical(labels, num_of_classes), 0, 1)

    X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=test_size)
    return (as_feature_columns(X_train), as_feature_columns(X_validation),
            as_one_hot_columns(Y_train), as_one_hot_columns(Y_validation))



In [12]:
def L_layer_model(X, Y, layers_dims, learning_rate, num_epochs, batch_size, use_batchnorm, dropout, improvement_epsilon, verbose = False):
    '''
    Implements a L-layer neural network
    Last layer apply softmax
    Rest of the layers apply Relu
    Network Output is equal to the number of labels within the data
    Logic:
    initialize -> L_model_forward -> compute_cost -> L_model_backward -> update parameters

    |X – the input images (GRAYSCALE); they are split and flattened by create_dataset
    |Y – the “real” labels of the data; they are one-hot encoded by create_dataset
    |Layer_dims – a list containing the dimensions of each layer (first entry: input size,
    |             last entry: size of the softmax output, used as the number of classes)
    |learning_rate – step size of the gradient descent update
    |num_epochs – maximal number of passes over the training set
    |batch_size – the number of examples in a single training batch.
    |use_batchnorm / dropout – forwarded to L_model_forward
    |improvement_epsilon – minimal decrease of the validation cost that counts as an improvement
    |verbose – print the detailed progress line instead of the short one
    Returns:
        parameters – the parameters learnt by the system during the training
        train_costs & validation_costs – the values of the cost function (one value per 100 iterations)
        train_accuracy & val_accuracy – accuracy (from predict) on the training and validation parts
        epochs_completed & iterations_completed – how far the training got before it stopped
    '''
    # Parameters initialization.
    epsilon = sys.float_info.epsilon
    patience = 100  # iterations without improvement tolerated before stopping
    min_validation_cost = math.inf
    min_validation_cost_age = 0
    # The softmax layer has one unit per class, so its size is the number of classes.
    num_of_classes = layers_dims[-1]

    parameters = initialize_parameters(layers_dims)
    train_costs, validation_costs = [], []

    X_train, X_validation, Y_train, Y_validation = create_dataset(X, Y, num_of_classes, 0.2)

    # Start gradient descent.
    num_training_samples = X_train.shape[1]
    shuffle_train = list(range(num_training_samples))
    iterations_completed = 0
    epochs_completed = 0
    stop = False

    for j in range(num_epochs):
        # shuffle indices (training samples order) before every epoch
        np.random.shuffle(shuffle_train)

        # run through the epoch batch by batch
        num_batches = math.ceil(num_training_samples / batch_size)
        print('num of samples {}'.format(num_training_samples))
        print('num of batched : {}'.format(num_batches))

        sum_validations_costs = 0
        batches_run = 0
        for i in range(num_batches):
            # Slicing clamps at the end of the list, so the last batch simply
            # holds however many samples are left (at most batch_size).
            batch_indices = shuffle_train[i * batch_size:(i + 1) * batch_size]

            # now do a single iteration
            X_batch = X_train[:, batch_indices]
            Y_batch = Y_train[:, batch_indices]

            # L_model_forward
            AL, caches = L_model_forward(X_batch, parameters, use_batchnorm = use_batchnorm, dropout = dropout, training = True)
            # Compute_cost
            train_cost = compute_cost(AL, Y_batch)
            # L_model_backward
            grads = L_model_backward(AL, Y_batch, caches)
            # Update parameters
            parameters = update_parameters(parameters, grads, learning_rate)

            # VALIDATION ( training = False)
            validation_AL, validation_Caches = L_model_forward(X_validation, parameters, use_batchnorm = use_batchnorm, dropout = dropout, training = False)
            validation_cost = compute_cost(validation_AL, Y_validation)

            # report and record the costs once every 100 iterations
            if iterations_completed % 100 == 0:
                if verbose:
                    # report the current costs, not the whole recorded history
                    print('finished iteration {} - train score - {} , validation score - {} '.format(iterations_completed, train_cost, validation_cost), end=': ')
                else:
                    print('finished iteration {} ,finished {} epochs'.format(iterations_completed, j), end=': ')
                print('train cost is {:.4f} ||| validation cost is {:.4f}'.format(train_cost, validation_cost))
                train_costs.append(train_cost + epsilon)
                validation_costs.append(validation_cost + epsilon)

            iterations_completed += 1
            batches_run += 1
            sum_validations_costs += validation_cost

            # stop if the validation cost does not improve for 100 iterations in a row
            if min_validation_cost - validation_cost > improvement_epsilon:
                min_validation_cost = validation_cost
                min_validation_cost_age = 1
            elif min_validation_cost_age < patience:
                min_validation_cost_age += 1
            else:
                print("cost score not improving for 100 iterations.")
                stop = True
                break

        epochs_completed += 1
        # Average over the batches that actually ran (an epoch may be cut short).
        print('Epoch {} completed. Average validation cost was {:.4f}'.format(epochs_completed, sum_validations_costs / max(batches_run, 1)))
        if stop:
            break

    print('train_costs {}:'.format(train_costs))
    print('validation_costs {}:'.format(validation_costs))

    train_accuracy = predict(X_train, Y_train, parameters, use_batchnorm, dropout)
    val_accuracy = predict(X_validation, Y_validation, parameters, use_batchnorm, dropout)
    return parameters, train_costs, validation_costs, train_accuracy, val_accuracy, epochs_completed, iterations_completed

In [13]:
def get_strongest_index(AL, index):
    '''
    return the index of the strongest label (the label which receives the highest confidence score)
    |AL – scores with one column per sample
    |index – the column (sample) to inspect
    '''
    scores_for_sample = AL[:, index]
    return scores_for_sample.argmax(axis=0)
    

In [14]:
from sklearn import metrics 
def predict(X, Y, parameters, use_batchnorm, dropout):
    '''
    receives an input data and the true labels and calculates the accuracy of the trained neural network on the data
    |X – the input data, a numpy array of shape (height*width, number_of_examples)
    |Y – the “real” labels of the data, a vector of shape (num_of_classes, number of examples)
    |Parameters – a dictionary containing the DNN architecture’s parameters
    |use_batchnorm / dropout – forwarded to L_model_forward, which is run with training = False
    Returns:
        accuracy – the accuracy measure of the neural net on the provided data
    '''
    samples = Y.shape[1]
    # apply model_forward over X
    AL, _ = L_model_forward(X, parameters, use_batchnorm=use_batchnorm, dropout=dropout, training=False)

    # One-hot encode the strongest label of every sample in a single indexing step.
    sample_ids = np.arange(samples)
    strongest_labels = np.argmax(AL[:, :samples], axis=0)
    Y_pred = np.zeros(Y.shape)
    Y_pred[strongest_labels, sample_ids] = 1

    # accuracy_score expects one row per sample
    predicted_rows = np.swapaxes(Y_pred, 0, 1)
    true_rows = np.swapaxes(Y, 0, 1)
    return accuracy_score(predicted_rows, true_rows)



In [None]:
def main(output_path, fields_name, class_num, epochs, batch_size, arch, learning_rate, verbose, improvement_epsilon=0.000001):
    '''
    Train the network on MNIST for every combination of dropout / batchnorm and record the results.
    |output_path – CSV file that receives one row of results per combination (overwritten at start)
    |fields_name – the column names of that CSV file
    |class_num – number of classes used to one-hot encode the test labels
    |epochs, batch_size, arch, learning_rate, verbose – forwarded to L_layer_model
    |improvement_epsilon – minimal validation-cost decrease that counts as an improvement
    |                      (forwarded to L_layer_model; previously read from a module-level global)
    Side effects:
        loads MNIST through keras, writes output_path and, per combination, the files
        'train_costs-{dropout}-{batchnorm}.csv' and 'validation_costs-{dropout}-{batchnorm}.csv'
    '''
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    IMAGE_SIZE = 28 * 28
    # Test set in the column-per-example layout, pixels scaled by 255.
    x_test = np.swapaxes(x_test.reshape(-1, IMAGE_SIZE), 0, 1)
    x_test = x_test / 255
    y_test = np.swapaxes(np_utils.to_categorical(y_test, class_num), 0, 1)

    # Start the results file with a header only; rows are appended per run.
    with open(output_path, mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fields_name)
        writer.writeheader()

    options = [True, False]
    for dropout in options:
        for bachnorm in options:
            print(' -------- WORKING ON DROP {} BACHNORM {} --------'.format(dropout , bachnorm))
            params, train_cost, validation_cost, train_accuracy, val_accuracy, epochs_completed, iterations_completed = L_layer_model(
                x_train, y_train, arch, learning_rate, epochs, batch_size, bachnorm, dropout, improvement_epsilon, verbose)
            test_accuracy = predict(x_test, y_test, params, bachnorm, dropout)

            with open(output_path, mode='a', newline='') as file:
                writer = csv.DictWriter(file, fieldnames=fields_name)
                writer.writerow({'Learning Rate': str(learning_rate), 'Bachnrom': str(bachnorm), 'dropout': str(dropout),'test accuracy': str(test_accuracy),
                                 'validation accuracy': str(val_accuracy), 'train accuracy': str(train_accuracy), 'epochs_completed': str(epochs_completed), 'iterations_completed': str(iterations_completed)
                                 })

            # The cost histories go to their own files; the results file is already closed here.
            pd.DataFrame.from_dict(train_cost).to_csv('train_costs-{}-{}.csv'.format(dropout,bachnorm))
            pd.DataFrame.from_dict(validation_cost).to_csv('validation_costs-{}-{}.csv'.format(dropout,bachnorm))


# Experiment configuration
arch = [28*28, 20, 7, 5, 10]
classes = 10
epochs = 200
batch_size = 1024
learning_rate = 0.009
improvement_epsilon = 0.000001
fields_name = ['Learning Rate', 'Bachnrom', 'dropout', 'test accuracy', 'validation accuracy', 'train accuracy', 'epochs_completed', 'iterations_completed']

# Only launch the (long) training run when executed directly, not on import.
if __name__ == '__main__':
    main('final_res.csv', fields_name, classes, epochs, batch_size, arch, learning_rate, False,
         improvement_epsilon=improvement_epsilon)
