# Deep Learning Tutorial - Modeling

In [1]:
import numpy as np
import pandas as pd
import pprint

In [2]:
NUM_FEATURES = 20
NUM_CLASSES = 7
LAYER_SIZES = [25, 25]

LAYER_SIZES.insert(0, NUM_FEATURES)

LAYER_SIZES.append(NUM_CLASSES)

LAYER_ACTIVATIONS = ['relu', 'relu', 'softmax']

In [3]:
print(f'The layer sizes include: {LAYER_SIZES}.')

The layer sizes include: [20, 25, 25, 7].


In [4]:
def initialize_network():
    architecture = {}
    for layer in range(1, len(LAYER_SIZES)):
        architecture[f'layer_{layer}'] = {
            'W': np.random.randn(LAYER_SIZES[layer],
                                 LAYER_SIZES[layer-1]) * 0.1,
            'b': np.random.randn(LAYER_SIZES[layer], 1) * 0.1,
            'activation': LAYER_ACTIVATIONS[layer-1]
        }
    return architecture

In [5]:
network = initialize_network()
# pprint.pprint(network)

In [6]:
def sigmoid_activation(Z):
    activation = 1/(1 + np.exp(-1*Z))
    return activation

In [7]:
def softmax_activation(Z):
    activation = np.exp(Z) / np.sum(np.exp(Z))
    return activation

In [8]:
def relu_activation(Z):
    activation = np.maximum(0.0, Z)
    return activation

In [9]:
def dZ_sigmoid(dA, Z):
    sigmoid = sigmoid_activation(Z)
    dZ = dA * sigmoid * (1.0 - sigmoid)
    return dZ

In [10]:
def dZ_softmax(dA, Z):
    sigmoid = sigmoid_activation(Z)
    dZ = dA * sigmoid * (1.0 - sigmoid)
    return dZ

In [11]:
def _dZ_softmax(Z):
    softmax = softmax_activation(Z)
    softmax_matrix = np.tile(softmax)
    dZ = np.diag(softmax) - (softmax_matrix*np.transpose(softmax_matrix))

In [12]:
def dZ_relu(dA, Z):
    dZ = np.copy(dA)
    dZ[Z <= 0.0] = 0.0
    return dZ

In [13]:
act_map = {
    'sigmoid': sigmoid_activation,
    'relu': relu_activation,
    'softmax': softmax_activation
}

In [14]:
dZ_map = {
    'sigmoid': dZ_sigmoid,
    'relu': dZ_relu,
    'softmax': dZ_softmax
}

In [15]:
def single_forward_pass(A_previous, W, b, activation):
    try:
        act_function = act_map[activation]
    except KeyError:
        print(f'The activation {activation} is not recognized.\nIt must be one of the following: {list(act_map.keys())}')
        return None
    
    Z = np.dot(W, A_previous) + b
    A = act_function(Z)
    
    return A, Z

In [16]:
def full_forward_pass(X, network):
    
    cache = {}
    A = np.transpose(X)
    
    for layer in range(1, len(network) + 1):
        A_previous = A
        A, Z = single_forward_pass(A_previous, 
                                   network[f'layer_{layer}']['W'], 
                                   network[f'layer_{layer}']['b'], 
                                   network[f'layer_{layer}']['activation'])
        
        cache[f'A_{layer-1}'] = A_previous
        cache[f'Z_{layer}'] = Z
        
    return A, cache

In [17]:
def compute_cross_entropy_cost(y_pred, y):
    
    cost = np.sum(-1*(y * np.log(np.transpose(y_pred))))
    
    return cost

In [18]:
def single_backward_pass(dA, W, b, Z, A_previous, activation):
    
    try:
        backprop_activation = dZ_map[activation]
    except KeyError:
        print(f'The backprop activation {activation} is not recognized.\nIt must be one of the following: {list(dZ_map.keys())}')
        return None
    
    m = A_previous.shape[1]
    
    dZ = backprop_activation(dA, Z)
    
    dW = np.dot(dZ, np.transpose(A_previous)) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA_previous = np.dot(np.transpose(dW), dZ)
    
    return dA_previous, dW, db

In [19]:
def full_backward_pass(y_pred, y, cache, network):
    
    stored_grads = {}
    m = y.shape[1]
    
    dA_previous = y_pred - np.transpose(y)
    
    for layer in reversed(range(1, len(network) + 1)):
        activation = network[f'layer_{layer}']['activation']
        layer_previous = layer - 1
        
        dA = dA_previous
        
        A_previous = cache[f'A_{layer_previous}']
        Z = cache[f'Z_{layer}']
        W = network[f'layer_{layer}']['W']
        b = network[f'layer_{layer}']['b']
        
        dA_previous, dW, db = single_backward_pass(dA, W, b, Z, A_previous, activation)
        stored_grads[f'dW_{layer}'] = dW
        stored_grads[f'db_{layer}'] = db
        
    return stored_grads

In [20]:
def update_network(network, stored_grads, learning_rate):
    for layer in range(1, len(network) + 1):
        network[f'layer_{layer}']['W'] -= learning_rate * stored_grads[f'dW_{layer}']
        network[f'layer_{layer}']['b'] -= learning_rate * stored_grads[f'db_{layer}']
    return network

In [21]:
HYPER_PARAMS = {
    'epochs': 50,
    'learning_rate': 0.01
}

In [22]:
def train_nn(X, y, network):
    
    stored_cost = []
    
    for epoch in range(HYPER_PARAMS['epochs']):
        y_pred, cache = full_forward_pass(X, network)
        cost = compute_cross_entropy_cost(y_pred, y)
        print(f' * The cost at epoch {epoch} is {cost:0.3f}.')
        stored_cost.append(cost)
        stored_grads = full_backward_pass(y_pred, y, cache, network)
        network = update_network(network, stored_grads, HYPER_PARAMS['learning_rate'])
    return network, stored_cost

## Train Model
**Plan:**
1. Load data set.
2. Split into labels and features.
3. One-hot encode features.
4. One-hot encode labels.
5. Run through model.
### Load Dataset

In [23]:
header_values = []
with open('./data/zoo.dat', 'r') as zoo_file:
    for line in zoo_file:
        if '@attribute' in line:
            header_values.append(line.split()[1])

In [24]:
df = pd.read_csv('./data/zoo.dat', skiprows=21, header=None, names=header_values)

In [25]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Hair,101.0,0.425743,0.496921,0.0,0.0,0.0,1.0,1.0
Feathers,101.0,0.19802,0.400495,0.0,0.0,0.0,0.0,1.0
Eggs,101.0,0.584158,0.495325,0.0,0.0,1.0,1.0,1.0
Milk,101.0,0.405941,0.493522,0.0,0.0,0.0,1.0,1.0
Airborne,101.0,0.237624,0.42775,0.0,0.0,0.0,0.0,1.0
Aquatic,101.0,0.356436,0.481335,0.0,0.0,0.0,1.0,1.0
Predator,101.0,0.554455,0.499505,0.0,0.0,1.0,1.0,1.0
Toothed,101.0,0.60396,0.491512,0.0,0.0,1.0,1.0,1.0
Backbone,101.0,0.821782,0.384605,0.0,1.0,1.0,1.0,1.0
Breathes,101.0,0.792079,0.407844,0.0,1.0,1.0,1.0,1.0


### Separate Features and Labels

In [26]:
df_X = df.drop(columns='Type')

In [27]:
df_y = df['Type']

### One-hot Encode Features
All features except 'Legs' include 1-0 values. Since 'Legs' is a categorical variable, it needs to be one-hot encoded. We can do this using Pandas `get_dummies()` method.

In [28]:
df_X_one_hot = pd.get_dummies(df_X, columns=['Legs'], drop_first=True)

In [29]:
df_X_one_hot.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Hair,101.0,0.425743,0.496921,0.0,0.0,0.0,1.0,1.0
Feathers,101.0,0.19802,0.400495,0.0,0.0,0.0,0.0,1.0
Eggs,101.0,0.584158,0.495325,0.0,0.0,1.0,1.0,1.0
Milk,101.0,0.405941,0.493522,0.0,0.0,0.0,1.0,1.0
Airborne,101.0,0.237624,0.42775,0.0,0.0,0.0,0.0,1.0
Aquatic,101.0,0.356436,0.481335,0.0,0.0,0.0,1.0,1.0
Predator,101.0,0.554455,0.499505,0.0,0.0,1.0,1.0,1.0
Toothed,101.0,0.60396,0.491512,0.0,0.0,1.0,1.0,1.0
Backbone,101.0,0.821782,0.384605,0.0,1.0,1.0,1.0,1.0
Breathes,101.0,0.792079,0.407844,0.0,1.0,1.0,1.0,1.0


In [30]:
X = df_X_one_hot.values

### One-hot Encode Labels

In [31]:
NUM_CLASSES = len(pd.unique(df_y))
print(f'There are {NUM_CLASSES} unique classes for the labels, which are {pd.unique(df_y)}.')

There are 7 unique classes for the labels, which are [1 4 7 2 6 3 5].


In [32]:
def encode_labels(x):
    encoded = np.zeros(NUM_CLASSES)
    encoded[x-1] = 1
    return encoded

In [33]:
df_y_one_hot = df_y.apply(lambda x: encode_labels(x))

In [34]:
df_y_one_hot.head()

0    [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
1    [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
2    [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
3    [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
4    [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
Name: Type, dtype: object

In [35]:
y = df_y_one_hot.values

In [36]:
y = np.stack(y)

In [37]:
y[:5]

array([[1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0.]])

In [43]:
HYPER_PARAMS = {
    'epochs': 50,
    'learning_rate': 0.1
}

In [44]:
X.shape

(101, 20)

In [45]:
y.shape

(101, 7)

In [46]:
y[:3]

array([[1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.]])

In [47]:
network, stored_cost = train_nn(X, y, network)

The cost at epoch 0 is 655.358.
The cost at epoch 1 is 655.119.
The cost at epoch 2 is 654.882.
The cost at epoch 3 is 654.646.
The cost at epoch 4 is 654.412.
The cost at epoch 5 is 654.179.
The cost at epoch 6 is 653.948.
The cost at epoch 7 is 653.718.
The cost at epoch 8 is 653.489.
The cost at epoch 9 is 653.262.
The cost at epoch 10 is 653.037.
The cost at epoch 11 is 652.813.
The cost at epoch 12 is 652.590.
The cost at epoch 13 is 652.369.
The cost at epoch 14 is 652.150.
The cost at epoch 15 is 651.932.
The cost at epoch 16 is 651.716.
The cost at epoch 17 is 651.501.
The cost at epoch 18 is 651.288.
The cost at epoch 19 is 651.076.
The cost at epoch 20 is 650.866.
The cost at epoch 21 is 650.658.
The cost at epoch 22 is 650.451.
The cost at epoch 23 is 650.246.
The cost at epoch 24 is 650.042.
The cost at epoch 25 is 649.841.
The cost at epoch 26 is 649.640.
The cost at epoch 27 is 649.442.
The cost at epoch 28 is 649.244.
The cost at epoch 29 is 649.049.
The cost at epoch 30

### Resources
This notebook has been inspired by the Towards Data Science post [Let’s code a Neural Network in plain NumPy](https://towardsdatascience.com/lets-code-a-neural-network-in-plain-numpy-ae7e74410795).

Additional resources include:

* [The Softmax Function Derivative (Part 1)](https://aimatters.wordpress.com/2019/06/17/the-softmax-function-derivative/).
* [Creating a Neural Network from Scratch in Python: Multi-class Classification](https://stackabuse.com/creating-a-neural-network-from-scratch-in-python-multi-class-classification/).
* [A Gentle Introduction to Cross-Entropy for Machine Learning](https://machinelearningmastery.com/cross-entropy-for-machine-learning/).