# Deep Learning Tutorial - Modeling

In [1]:
import numpy as np
import pprint

In [2]:
# Width of the input layer and of the output (class) layer.
INPUT_SIZE = 2
NUM_CLASSES = 7

# Widths of every layer, input first and output last; the two hidden layers
# in between have 5 units each.
LAYER_SIZES = [INPUT_SIZE] + [5, 5] + [NUM_CLASSES]

# One activation name per non-input layer, in layer order.
LAYER_ACTIVATIONS = ['relu', 'relu', 'softmax']

In [3]:
# Show the final layer widths, including the input and output sizes added above.
print(f'The layer sizes include: {LAYER_SIZES}.')

The layer sizes include: [2, 5, 5, 7].


In [4]:
def initialize_network(seed=None, weight_scale=1):
    """Create randomly initialised parameters for every non-input layer.

    Layer widths come from the module-level ``LAYER_SIZES`` and activation
    names from ``LAYER_ACTIVATIONS``.

    Args:
        seed: Optional integer seed. When given, weights are drawn from a
            private ``np.random.RandomState(seed)`` so the result is
            reproducible and the global NumPy random state is left untouched.
            When ``None`` (default) the global ``np.random`` state is used.
        weight_scale: Factor applied to the standard-normal weights
            (default 1, i.e. unscaled).

    Returns:
        dict: Maps ``'layer_<i>'`` (``i`` starting at 1) to a dict with
        ``'w'`` of shape ``(LAYER_SIZES[i], LAYER_SIZES[i-1])``, ``'b'`` a
        zero vector of length ``LAYER_SIZES[i]``, and ``'activation'`` the
        activation name for that layer.
    """
    # Both np.random and RandomState expose randn(), so the draw below is
    # identical in either case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    architecture = {}
    for layer in range(1, len(LAYER_SIZES)):
        architecture[f'layer_{layer}'] = {
            'w': rng.randn(LAYER_SIZES[layer],
                           LAYER_SIZES[layer-1]) * weight_scale,
            'b': np.zeros(LAYER_SIZES[layer]),
            # LAYER_ACTIVATIONS has no entry for the input layer, hence layer-1.
            'activation': LAYER_ACTIVATIONS[layer-1]
        }
    return architecture

In [5]:
# Build a freshly initialised network and pretty-print its per-layer parameters.
network = initialize_network()
pprint.pprint(network)

{'layer_1': {'activation': 'relu',
             'b': array([0., 0., 0., 0., 0.]),
             'w': array([[ 0.6436408 ,  0.80006476],
       [ 0.73563585,  1.27752534],
       [ 0.09365557, -1.98953266],
       [-0.03147536,  1.17531548],
       [-1.21081442,  1.27439103]])},
 'layer_2': {'activation': 'relu',
             'b': array([0., 0., 0., 0., 0.]),
             'w': array([[ 1.47388233,  1.01047831, -0.43945397, -0.03241503, -0.7398889 ],
       [ 0.42247096,  0.73227551,  0.0689344 , -0.76239729,  0.49743781],
       [-0.43257698, -0.52342636,  0.64388953,  1.20068519, -0.28892381],
       [-0.07131636, -0.51847173, -1.12926377,  0.81151567, -1.44163607],
       [ 1.11800781, -1.05299348, -0.27996439, -1.63622441,  0.15015977]])},
 'layer_3': {'activation': 'softmax',
             'b': array([0., 0., 0., 0., 0., 0., 0.]),
             'w': array([[ 0.69802614, -1.76077029, -0.77230799, -1.28211465,  0.43704024],
       [ 0.30109275,  0.67040528, -0.96376584, -0.33572579,  0.0

In [6]:
def sigmoid_activation(Z):
    """Apply the logistic sigmoid ``1 / (1 + exp(-Z))`` element-wise.

    Args:
        Z: Pre-activation value(s); a scalar or NumPy array.

    Returns:
        The sigmoid of ``Z``, with the same shape as ``Z``.
    """
    denominator = 1 + np.exp(-1 * Z)
    return 1 / denominator

In [7]:
def softmax_activation(Z):
    """Compute a numerically stable softmax over the class axis (axis 0).

    A 1-D input is treated as the scores of a single sample. A 2-D input is
    treated as ``(classes, samples)``, matching the ``np.dot(W, A_previous)``
    layout used by the forward pass, and each column is normalised
    independently.

    Args:
        Z: Scores as a scalar, 1-D array, or 2-D ``(classes, samples)`` array.

    Returns:
        Array of the same shape as ``Z`` whose entries along axis 0 are
        non-negative and sum to 1.
    """
    Z = np.asarray(Z)
    # A 0-d input has no axis 0; reduce over everything in that case.
    axis = 0 if Z.ndim else None
    # Subtracting the per-sample maximum leaves the result unchanged
    # mathematically but keeps exp() from overflowing for large scores.
    shifted = Z - np.max(Z, axis=axis, keepdims=True)
    exponentials = np.exp(shifted)
    activation = exponentials / np.sum(exponentials, axis=axis, keepdims=True)
    return activation

In [8]:
def relu_activation(Z):
    """Apply the rectified linear unit ``max(0, z)`` element-wise.

    Args:
        Z: Pre-activation value(s); a scalar or NumPy array.

    Returns:
        ``Z`` with every negative entry replaced by 0, same shape as ``Z``.
    """
    # np.maximum is the element-wise comparison; np.max(0, Z) would instead
    # interpret Z as the ``axis`` argument of a reduction and fail.
    activation = np.maximum(0, Z)
    return activation

In [9]:
def dZ_sigmoid(dA, Z):
    """Back-propagate a gradient through the sigmoid activation.

    Args:
        dA: Gradient of the cost with respect to the sigmoid output.
        Z: Pre-activation values that were fed to the sigmoid.

    Returns:
        Gradient with respect to ``Z``: ``dA * s * (1 - s)`` where
        ``s = sigmoid_activation(Z)``.
    """
    s = sigmoid_activation(Z)
    scaled = dA * s
    return scaled * (1 - s)

In [10]:
def dZ_softmax(Z):
    """Return the Jacobian of the softmax function for a single sample.

    For softmax output ``s``, entry ``[i, j]`` of the result is
    ``s[i] * (delta_ij - s[j])``, i.e. ``diag(s) - outer(s, s)``.

    Args:
        Z: Scores of one sample, either a 1-D array of length ``n`` or a
            column vector of shape ``(n, 1)``.

    Returns:
        ``(n, n)`` array holding the softmax Jacobian at ``Z``.

    Raises:
        ValueError: If ``Z`` holds more than one sample; a single 2-D
            Jacobian is not defined for a batch.
    """
    softmax = np.asarray(softmax_activation(Z))
    if softmax.ndim == 2 and softmax.shape[1] == 1:
        # Accept a single-sample column vector by flattening it.
        softmax = softmax[:, 0]
    if softmax.ndim != 1:
        raise ValueError(
            f'dZ_softmax expects the scores of a single sample, got shape {softmax.shape}.'
        )
    # outer(s, s)[i, j] == s[i] * s[j], which is what tiling s into a square
    # matrix and multiplying by its transpose was meant to produce.
    dZ = np.diag(softmax) - np.outer(softmax, softmax)
    return dZ

In [11]:
def dZ_relu(dA, Z):
    """Back-propagate a gradient through the ReLU activation.

    Args:
        dA: Gradient of the cost with respect to the ReLU output.
        Z: Pre-activation values that were fed to the ReLU.

    Returns:
        A copy of ``dA`` with entries zeroed wherever ``Z <= 0``; ``dA``
        itself is not modified.
    """
    gradient = np.array(dA, copy=True)
    inactive = Z <= 0
    gradient[inactive] = 0
    return gradient

In [12]:
# Lookup from activation name to its forward function.
act_map = dict(
    sigmoid=sigmoid_activation,
    relu=relu_activation,
    softmax=softmax_activation,
)

In [13]:
# Lookup from activation name to the function used for it in backpropagation.
dZ_map = dict(
    sigmoid=dZ_sigmoid,
    relu=dZ_relu,
    softmax=dZ_softmax,
)

In [14]:
def single_forward_pass(A_previous, W, b, activation):
    """Run one layer forward: ``Z = W . A_previous + b`` then the activation.

    Args:
        A_previous: Activations of the previous layer, either 1-D
            ``(prev_units,)`` or 2-D ``(prev_units, samples)``.
        W: Weight matrix of shape ``(units, prev_units)``.
        b: Bias of the layer, as a 1-D vector of length ``units`` or a
            ``(units, 1)`` column.
        activation: Name of the activation; must be a key of ``act_map``.

    Returns:
        tuple: ``(A, Z)`` -- the layer output and its pre-activation.

    Raises:
        ValueError: If ``activation`` is not a key of ``act_map``.
    """
    try:
        act_function = act_map[activation]
    except KeyError:
        # Fail loudly: returning None here would only surface later as an
        # opaque unpacking error in the caller.
        raise ValueError(
            f'The activation {activation} is not recognized.\nIt must be one of the following: {list(act_map.keys())}'
        ) from None

    linear = np.dot(W, A_previous)
    bias = np.asarray(b)
    # A 1-D bias added to a (units, samples) matrix would broadcast along the
    # sample axis (or fail); make it a column for batches, flat otherwise.
    bias = bias.reshape(-1, 1) if linear.ndim == 2 else bias.reshape(-1)

    Z = linear + bias
    A = act_function(Z)

    return A, Z

In [15]:
def full_forward_pass(X, network):
    """Propagate ``X`` through every layer of ``network``.

    Args:
        X: Network input, passed to the first layer as its ``A_previous``.
        network: Dict as produced by ``initialize_network``: keys
            ``'layer_1'`` .. ``'layer_N'``, each holding ``'w'``, ``'b'`` and
            ``'activation'``.

    Returns:
        tuple: ``(A, cache)`` where ``A`` is the output of the last layer and
        ``cache`` maps ``'A_<i-1>'`` to the input of layer ``i`` and
        ``'Z_<i>'`` to its pre-activation, for use in backpropagation.
    """
    cache = {}
    A = X

    for layer in range(1, len(network) + 1):
        # Layers are stored under 'layer_<i>' with a lowercase 'w' key.
        params = network[f'layer_{layer}']

        A_previous = A
        A, Z = single_forward_pass(A_previous, params['w'], params['b'], params['activation'])

        cache[f'A_{layer-1}'] = A_previous
        cache[f'Z_{layer}'] = Z

    return A, cache

In [16]:
def compute_cross_entropy_cost(y_pred, y, epsilon=1e-12):
    """Compute the summed cross-entropy ``-sum(y * log(y_pred))``.

    Args:
        y_pred: Predicted probabilities, same shape as ``y``.
        y: Target values (e.g. one-hot labels), same shape as ``y_pred``.
        epsilon: Lower bound applied to ``y_pred`` before taking the log.

    Returns:
        The cross-entropy summed over all entries (not averaged).
    """
    # log(0) is -inf and 0 * -inf is NaN, so a single exact-zero prediction
    # would otherwise poison the whole cost.
    safe_pred = np.clip(y_pred, epsilon, None)

    cost = np.sum(-1*(y * np.log(safe_pred)))

    return cost

In [17]:
def single_backward_pass(dA, W, b, Z, A_previous, activation):
    """Back-propagate through one layer.

    Args:
        dA: Incoming gradient for this layer. For ``'relu'`` / ``'sigmoid'``
            this is the gradient with respect to the layer output. For
            ``'softmax'`` it is taken to already be the gradient with respect
            to ``Z`` (see note below).
        W: Weight matrix of the layer, shape ``(units, prev_units)``.
        b: Bias of the layer (unused; kept for a uniform signature).
        Z: Pre-activation values of the layer, shape ``(units, samples)``.
        A_previous: Input to the layer, shape ``(prev_units, samples)``.
        activation: Name of the activation; must be a key of ``dZ_map``.

    Returns:
        tuple: ``(dA_previous, dW, db)`` -- gradient with respect to the
        layer input, the weights, and the bias (``db`` has shape
        ``(units, 1)``).

    Raises:
        ValueError: If ``activation`` is not a key of ``dZ_map``.
    """
    try:
        backprop_activation = dZ_map[activation]
    except KeyError:
        # Fail loudly: returning None here would only surface later as an
        # opaque unpacking error in the caller.
        raise ValueError(
            f'The backprop activation {activation} is not recognized.\nIt must be one of the following: {list(dZ_map.keys())}'
        ) from None

    m = A_previous.shape[1]

    if activation == 'softmax':
        # NOTE(review): full_backward_pass seeds backprop with
        # (prediction - target), which for softmax + cross-entropy is already
        # dL/dZ, so it is passed through rather than multiplied by the
        # softmax Jacobian -- confirm if another loss is introduced.
        dZ = dA
    else:
        dZ = backprop_activation(dA, Z)

    dW = np.dot(dZ, np.transpose(A_previous)) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    # The gradient flowing to the previous layer uses the weights themselves,
    # not their gradient.
    dA_previous = np.dot(np.transpose(W), dZ)

    return dA_previous, dW, db

In [18]:
def full_backward_pass(y_pred, y, cache, network):
    """Back-propagate through every layer and collect parameter gradients.

    Args:
        y_pred: Output of the forward pass.
        y: Targets, same shape as ``y_pred``.
        cache: Dict from ``full_forward_pass`` holding ``'A_<i-1>'`` and
            ``'Z_<i>'`` for each layer ``i``.
        network: Dict as produced by ``initialize_network``: keys
            ``'layer_1'`` .. ``'layer_N'``, each holding ``'w'``, ``'b'`` and
            ``'activation'``.

    Returns:
        dict: Maps ``'dW_<i>'`` and ``'db_<i>'`` to the gradients of layer
        ``i``.
    """
    stored_grads = {}

    # Seed gradient: difference between prediction and target.
    dA_previous = y_pred - y

    for layer in reversed(range(1, len(network) + 1)):
        # Layers are stored under 'layer_<i>' with a lowercase 'w' key.
        params = network[f'layer_{layer}']
        activation = params['activation']
        layer_previous = layer - 1

        dA = dA_previous

        A_previous = cache[f'A_{layer_previous}']
        Z = cache[f'Z_{layer}']
        W = params['w']
        b = params['b']

        dA_previous, dW, db = single_backward_pass(dA, W, b, Z, A_previous, activation)
        stored_grads[f'dW_{layer}'] = dW
        stored_grads[f'db_{layer}'] = db

    return stored_grads

In [19]:
def update_network(network, stored_grads, learning_rate):
    """Apply one gradient-descent step to every layer, in place.

    Args:
        network: Dict as produced by ``initialize_network``: keys
            ``'layer_1'`` .. ``'layer_N'``, each holding ``'w'`` and ``'b'``.
        stored_grads: Dict mapping ``'dW_<i>'`` / ``'db_<i>'`` to the
            gradients of layer ``i``.
        learning_rate: Step size multiplied into each gradient.

    Returns:
        The same ``network`` object, with its arrays updated in place.
    """
    for layer in range(1, len(network) + 1):
        # Layers are stored under 'layer_<i>' with a lowercase 'w' key.
        params = network[f'layer_{layer}']
        dW = stored_grads[f'dW_{layer}']
        db = stored_grads[f'db_{layer}']

        params['w'] -= learning_rate * dW
        # db may arrive as a (units, 1) column while the bias is stored 1-D;
        # an in-place subtract cannot broadcast that, so match shapes first.
        params['b'] -= learning_rate * np.reshape(db, params['b'].shape)
    return network

In [20]:
# Training settings read by train_nn: number of epochs and gradient step size.
HYPER_PARAMS = dict(epochs=50, learning_rate=0.01)

In [21]:
def train_nn(X, y, network):
    """Train ``network`` for the number of epochs given in ``HYPER_PARAMS``.

    Each epoch runs a forward pass, records the cross-entropy cost, runs a
    backward pass, and updates the parameters with the learning rate from
    ``HYPER_PARAMS``.

    Args:
        X: Training inputs passed to ``full_forward_pass``.
        y: Training targets.
        network: Network parameters to train.

    Returns:
        tuple: ``(network, stored_cost)`` -- the updated network and the list
        of per-epoch costs.
    """
    cost_history = []

    for _ in range(HYPER_PARAMS['epochs']):
        predictions, forward_cache = full_forward_pass(X, network)
        cost_history.append(compute_cross_entropy_cost(predictions, y))
        gradients = full_backward_pass(predictions, y, forward_cache, network)
        network = update_network(network, gradients, HYPER_PARAMS['learning_rate'])

    return network, cost_history

### Resources
This notebook was inspired by the Towards Data Science post [Let’s code a Neural Network in plain NumPy](https://towardsdatascience.com/lets-code-a-neural-network-in-plain-numpy-ae7e74410795).

Additional resources include:

* [The Softmax Function Derivative (Part 1)](https://aimatters.wordpress.com/2019/06/17/the-softmax-function-derivative/).
* [Creating a Neural Network from Scratch in Python: Multi-class Classification](https://stackabuse.com/creating-a-neural-network-from-scratch-in-python-multi-class-classification/).
* [A Gentle Introduction to Cross-Entropy for Machine Learning](https://machinelearningmastery.com/cross-entropy-for-machine-learning/).