# Deep Learning Tutorial - Modeling

In [1]:
import numpy as np
import pandas as pd
import pprint

In [2]:
NUM_FEATURES = 20
NUM_CLASSES = 7
LAYER_SIZES = [25, 25]

LAYER_SIZES.insert(0, NUM_FEATURES)

LAYER_SIZES.append(NUM_CLASSES)

LAYER_ACTIVATIONS = ['relu', 'relu', 'softmax']

In [3]:
print(f'The layer sizes include: {LAYER_SIZES}.')

The layer sizes include: [20, 25, 25, 7].


In [4]:
def initialize_network():
    architecture = {}
    for layer in range(1, len(LAYER_SIZES)):
        architecture[f'layer_{layer}'] = {
            'W': np.random.randn(LAYER_SIZES[layer],
                                 LAYER_SIZES[layer-1]) * 0.1,
            'b': np.random.randn(LAYER_SIZES[layer], 1) * 0.1,
            'activation': LAYER_ACTIVATIONS[layer-1]
        }
    return architecture

In [5]:
network = initialize_network()
# pprint.pprint(network)

In [6]:
def sigmoid_activation(Z):
    activation = 1/(1 + np.exp(-1*Z))
    return activation

In [7]:
def _softmax_activation(Z):
    activation = np.exp(Z) / np.sum(np.exp(Z))
    return activation

In [8]:
def softmax_activation(Z):
    exp_Z = np.exp(Z - np.max(Z))
    activation = exp_Z / np.sum(exp_Z, axis=0, keepdims=True)
    return activation

In [9]:
def relu_activation(Z):
    activation = np.maximum(0.0, Z)
    return activation

In [10]:
def dZ_sigmoid(dA, Z):
    sigmoid = sigmoid_activation(Z)
    dZ = dA * sigmoid * (1.0 - sigmoid)
    return dZ

In [11]:
def dZ_softmax(dA, Z):
    sigmoid = sigmoid_activation(Z)
    dZ = dA * sigmoid * (1.0 - sigmoid)
    return dZ

In [12]:
def _dZ_softmax(Z):
    softmax = softmax_activation(Z)
    softmax_matrix = np.tile(softmax)
    dZ = np.diag(softmax) - (softmax_matrix*np.transpose(softmax_matrix))

In [13]:
def dZ_relu(dA, Z):
    dZ = np.copy(dA)
    dZ[Z <= 0.0] = 0.0
    return dZ

In [14]:
act_map = {
    'sigmoid': sigmoid_activation,
    'relu': relu_activation,
    'softmax': softmax_activation
}

In [15]:
dZ_map = {
    'sigmoid': dZ_sigmoid,
    'relu': dZ_relu,
    'softmax': dZ_softmax
}

In [16]:
def single_forward_pass(A_previous, W, b, activation):
    try:
        act_function = act_map[activation]
    except KeyError:
        print(f'The activation {activation} is not recognized.\nIt must be one of the following: {list(act_map.keys())}')
        return None
    
    Z = np.dot(W, A_previous) + b
    A = act_function(Z)
    
    return A, Z

In [17]:
def full_forward_pass(X, network):
    
    cache = {}
    A = np.transpose(X)
    
    for layer in range(1, len(network) + 1):
        A_previous = A
        A, Z = single_forward_pass(A_previous, 
                                   network[f'layer_{layer}']['W'], 
                                   network[f'layer_{layer}']['b'], 
                                   network[f'layer_{layer}']['activation'])
        
        cache[f'A_{layer-1}'] = A_previous
        cache[f'Z_{layer}'] = Z
        
    return A, cache

In [18]:
def compute_cross_entropy_cost(y_pred, y):
    
    # cost = np.sum(-1*(y * np.log(np.transpose(y_pred))))
    cost = -1*np.mean(y * np.log(np.transpose(y_pred)))
    
    return cost

In [19]:
def single_backward_pass(dA, W, b, Z, A_previous, activation):
    
    try:
        backprop_activation = dZ_map[activation]
    except KeyError:
        print(f'The backprop activation {activation} is not recognized.\nIt must be one of the following: {list(dZ_map.keys())}')
        return None
    
    m = A_previous.shape[1]
    
    dZ = backprop_activation(dA, Z)
    
    dW = np.dot(dZ, np.transpose(A_previous)) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA_previous = np.dot(np.transpose(dW), dZ)
    
    return dA_previous, dW, db

In [20]:
def full_backward_pass(y_pred, y, cache, network):
    
    stored_grads = {}
    m = y.shape[1]
    
    dA_previous = y_pred - np.transpose(y)
    
    for layer in reversed(range(1, len(network) + 1)):
        activation = network[f'layer_{layer}']['activation']
        layer_previous = layer - 1
        
        dA = dA_previous
        
        A_previous = cache[f'A_{layer_previous}']
        Z = cache[f'Z_{layer}']
        W = network[f'layer_{layer}']['W']
        b = network[f'layer_{layer}']['b']
        
        dA_previous, dW, db = single_backward_pass(dA, W, b, Z, A_previous, activation)
        stored_grads[f'dW_{layer}'] = dW
        stored_grads[f'db_{layer}'] = db
        
    return stored_grads

In [21]:
def update_network(network, stored_grads, learning_rate):
    for layer in range(1, len(network) + 1):
        network[f'layer_{layer}']['W'] = network[f'layer_{layer}']['W'] - learning_rate * stored_grads[f'dW_{layer}']
        network[f'layer_{layer}']['b'] = network[f'layer_{layer}']['b'] - learning_rate * stored_grads[f'db_{layer}']
    return network

In [22]:
HYPER_PARAMS = {
    'epochs': 50,
    'learning_rate': 0.01
}

In [23]:
def compute_accuracy(y_pred, y):
    y_pred_transpose = np.transpose(y_pred)
    print(f'y_pred_transpose.shape: {y_pred_transpose.shape}')
    print(f'y.shape: {y.shape}')
    y_pred_flat = np.argmax(y_pred_transpose, 1)
    print(f'np.max(y_pred_flat): {np.max(y_pred_flat)}')
    y_flat = np.argmax(y, 1)
    print(list(zip(y_pred_flat, y_flat)))
    print(f'np.unique(y_pred_flat): {np.unique(y_pred_flat, return_counts=True)}')
    print(f'np.unique(y_flat): {np.unique(y_flat, return_counts=True)}')
    accuracy = np.mean(y_pred_flat == y_flat) / 100
    return accuracy

In [24]:
def train_nn(X, y, network):
    
    stored_cost = []
    
    for epoch in range(HYPER_PARAMS['epochs']):
        y_pred, cache = full_forward_pass(X, network)
        cost = compute_cross_entropy_cost(y_pred, y)
        # print(f' * The cost at epoch {epoch} is {cost:0.3f}.')
        stored_cost.append(cost)
        stored_grads = full_backward_pass(y_pred, y, cache, network)
        network = update_network(network, stored_grads, HYPER_PARAMS['learning_rate'])
    final_accuracy = compute_accuracy(y_pred, y)
    print(f' * Final accuracy: {final_accuracy}')
    return network, stored_cost

## Train Model
**Plan:**
1. Load data set.
2. Split into labels and features.
3. One-hot encode features.
4. One-hot encode labels.
5. Run through model.
### Load Dataset

In [25]:
header_values = []
with open('./data/zoo.dat', 'r') as zoo_file:
    for line in zoo_file:
        if '@attribute' in line:
            header_values.append(line.split()[1])

In [26]:
df = pd.read_csv('./data/zoo.dat', skiprows=21, header=None, names=header_values)

In [27]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Hair,101.0,0.425743,0.496921,0.0,0.0,0.0,1.0,1.0
Feathers,101.0,0.19802,0.400495,0.0,0.0,0.0,0.0,1.0
Eggs,101.0,0.584158,0.495325,0.0,0.0,1.0,1.0,1.0
Milk,101.0,0.405941,0.493522,0.0,0.0,0.0,1.0,1.0
Airborne,101.0,0.237624,0.42775,0.0,0.0,0.0,0.0,1.0
Aquatic,101.0,0.356436,0.481335,0.0,0.0,0.0,1.0,1.0
Predator,101.0,0.554455,0.499505,0.0,0.0,1.0,1.0,1.0
Toothed,101.0,0.60396,0.491512,0.0,0.0,1.0,1.0,1.0
Backbone,101.0,0.821782,0.384605,0.0,1.0,1.0,1.0,1.0
Breathes,101.0,0.792079,0.407844,0.0,1.0,1.0,1.0,1.0


### Separate Features and Labels

In [28]:
df_X = df.drop(columns='Type')

In [29]:
df_y = df['Type']

### One-hot Encode Features
All features except 'Legs' include 1-0 values. Since 'Legs' is a categorical variable, it needs to be one-hot encoded. We can do this using Pandas `get_dummies()` method.

In [30]:
df_X_one_hot = pd.get_dummies(df_X, columns=['Legs'], drop_first=True)

In [31]:
df_X_one_hot.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Hair,101.0,0.425743,0.496921,0.0,0.0,0.0,1.0,1.0
Feathers,101.0,0.19802,0.400495,0.0,0.0,0.0,0.0,1.0
Eggs,101.0,0.584158,0.495325,0.0,0.0,1.0,1.0,1.0
Milk,101.0,0.405941,0.493522,0.0,0.0,0.0,1.0,1.0
Airborne,101.0,0.237624,0.42775,0.0,0.0,0.0,0.0,1.0
Aquatic,101.0,0.356436,0.481335,0.0,0.0,0.0,1.0,1.0
Predator,101.0,0.554455,0.499505,0.0,0.0,1.0,1.0,1.0
Toothed,101.0,0.60396,0.491512,0.0,0.0,1.0,1.0,1.0
Backbone,101.0,0.821782,0.384605,0.0,1.0,1.0,1.0,1.0
Breathes,101.0,0.792079,0.407844,0.0,1.0,1.0,1.0,1.0


In [32]:
X = df_X_one_hot.values

### One-hot Encode Labels

In [33]:
NUM_CLASSES = len(pd.unique(df_y))
print(f'There are {NUM_CLASSES} unique classes for the labels, which are {pd.unique(df_y)}.')

There are 7 unique classes for the labels, which are [1 4 7 2 6 3 5].


In [34]:
def encode_labels(x):
    encoded = np.zeros(NUM_CLASSES)
    encoded[x-1] = 1
    return encoded

In [35]:
df_y_one_hot = df_y.apply(lambda x: encode_labels(x))

In [36]:
df_y_one_hot.head()

0    [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
1    [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
2    [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
3    [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
4    [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
Name: Type, dtype: object

In [37]:
y = df_y_one_hot.values

In [38]:
y = np.stack(y)

In [39]:
y[:5]

array([[1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0.]])

In [40]:
HYPER_PARAMS = {
    'epochs': 1000,
    'learning_rate': 10
}

In [41]:
X.shape

(101, 20)

In [42]:
y.shape

(101, 7)

In [43]:
y[:3]

array([[1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.]])

In [44]:
network, stored_cost = train_nn(X, y, network)

y_pred_transpose.shape: (101, 7)
y.shape: (101, 7)
np.max(y_pred_flat): 3
[(0, 0), (0, 0), (0, 0), (0, 0), (0, 3), (0, 6), (0, 1), (0, 1), (0, 3), (0, 0), (0, 0), (0, 0), (0, 0), (0, 3), (0, 6), (0, 1), (0, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 3), (0, 6), (0, 1), (0, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 3), (0, 6), (0, 1), (0, 1), (0, 5), (3, 2), (0, 0), (0, 0), (0, 0), (0, 0), (0, 3), (0, 6), (0, 1), (0, 1), (0, 3), (0, 0), (0, 0), (0, 0), (0, 0), (0, 3), (0, 6), (0, 1), (0, 1), (0, 5), (0, 3), (0, 0), (0, 0), (0, 0), (0, 0), (0, 3), (0, 6), (0, 1), (0, 1), (0, 5), (0, 5), (0, 0), (0, 2), (0, 2), (0, 2), (0, 4), (0, 0), (0, 0), (0, 0), (0, 0), (0, 3), (0, 6), (0, 1), (0, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 3), (0, 6), (0, 1), (0, 1), (0, 5), (0, 5), (0, 2), (0, 4), (0, 0), (0, 0), (0, 0), (0, 0), (0, 3), (0, 6), (0, 1), (0, 1), (0, 5), (0, 5), (0, 4), (0, 4)]
np.unique(y_pred_flat): (array([0, 3]), array([100,   1]))
np.unique(y_flat): (array([0, 1, 2, 3, 4, 5, 6]), array([4

In [47]:
41/100

0.41

In [48]:
20 / 100

0.2

In [52]:
print(pd.Series(df_y).value_counts(normalize=True).reset_index().sort_values(by='index'))

   index      Type
0      1  0.405941
1      2  0.198020
5      3  0.049505
2      4  0.128713
6      5  0.039604
4      6  0.079208
3      7  0.099010


### Resources
This notebook has been inspired by the Towards Data Science post [Let’s code a Neural Network in plain NumPy](https://towardsdatascience.com/lets-code-a-neural-network-in-plain-numpy-ae7e74410795).

Additional resources include:

* [A Gentle Introduction to Cross-Entropy for Machine Learning](https://machinelearningmastery.com/cross-entropy-for-machine-learning/).
* [Creating a Neural Network from Scratch in Python: Multi-class Classification](https://stackabuse.com/creating-a-neural-network-from-scratch-in-python-multi-class-classification/).
* [The Softmax Function Derivative (Part 1)](https://aimatters.wordpress.com/2019/06/17/the-softmax-function-derivative/).
* [Understanding and implementing Neural Network with SoftMax in Python from scratch](http://www.adeveloperdiary.com/data-science/deep-learning/neural-network-with-softmax-in-python/)