# MLP Walkthrough

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Function to read .csv file data into a Pandas data frame

In [None]:
def df_read(datafile):
    """Load a comma-separated file into a pandas DataFrame.

    Parameters
    ----------
    datafile : str, path object or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_csv(datafile, sep=',')

# Function to Z score normalise the dataframe

In [None]:
def z_score(df):
    """Return a z-score normalised copy of ``df``.

    Every column is shifted by its mean and divided by its sample standard
    deviation (pandas' default ``ddof=1``). The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all numeric.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with each column standardised. A constant column
        (standard deviation of exactly 0) is returned as all zeros instead
        of the NaNs that a 0/0 division would produce.
    """
    df_std = df.copy()
    for column in df_std.columns:
        centred = df_std[column] - df_std[column].mean()
        spread = df_std[column].std()
        if spread == 0:
            # Constant feature: dividing would give 0/0 = NaN for every row,
            # which would then propagate through the whole network.
            df_std[column] = centred
        else:
            df_std[column] = centred / spread
    return df_std

# Define softmax activation function

In [None]:
def softmax(Z):
    """Apply the softmax function to each row of ``Z``.

    Parameters
    ----------
    Z : numpy.ndarray
        2-D array; the normalisation runs along ``axis=1``.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``Z`` whose rows are non-negative and
        sum to 1.
    """
    # Subtracting the per-row maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (and the ratio from
    # becoming NaN) when the logits are large.
    shifted = Z - np.max(Z, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

# Define loss function

In [None]:
def softmax_loss(Y,Y_hat):
    """Cross-entropy between targets ``Y`` and predictions ``Y_hat``.

    Parameters
    ----------
    Y : numpy.ndarray
        Target array; its first dimension is used as the sample count.
    Y_hat : numpy.ndarray
        Predicted values, same shape as ``Y``.

    Returns
    -------
    float
        ``-sum(Y * log(Y_hat)) / Y.shape[0]``.
    """
    # Floor the predictions so np.log is never evaluated at exactly 0
    floor = 1e-12
    n_samples = Y.shape[0]
    log_probs = np.log(Y_hat.clip(min=floor))
    return (-1.0 / n_samples) * np.sum(Y * log_probs)

# Define softmax loss derivative

In [None]:
def softmax_loss_derivative(Y,Y_hat):
    """Return ``Y_hat - Y``, the output-layer error signal.

    For a softmax output trained with cross-entropy, this difference is the
    derivative of the loss with respect to the pre-softmax values.

    Parameters
    ----------
    Y : numpy.ndarray
        Target values.
    Y_hat : numpy.ndarray
        Predicted values, broadcast-compatible with ``Y``.

    Returns
    -------
    numpy.ndarray
        Element-wise difference ``Y_hat - Y``.
    """
    return Y_hat - Y

# Define tanh derivative

In [None]:
def tanh_derivative(x):
    """Evaluate the derivative of tanh at ``x``.

    Parameters
    ----------
    x : float or numpy.ndarray
        Point(s) at which to evaluate the derivative.

    Returns
    -------
    float or numpy.ndarray
        ``1 - tanh(x)**2``, element-wise.
    """
    squashed = np.tanh(x)
    return 1.0 - np.power(squashed, 2)

# Function to create network layer sizes

In [None]:
def layer_sizes(X, nn_h1, nn_h2, Y):
    """Collect the width of every layer of the network.

    Parameters
    ----------
    X : array with a ``shape`` attribute
        Input data; its second dimension is the input-layer width.
    nn_h1 : int
        Width of the first hidden layer (returned unchanged).
    nn_h2 : int
        Width of the second hidden layer (returned unchanged).
    Y : array with a ``shape`` attribute
        Target data; its second dimension is the output-layer width.

    Returns
    -------
    tuple
        ``(input width, nn_h1, nn_h2, output width)``.
    """
    n_inputs, n_outputs = X.shape[1], Y.shape[1]
    return n_inputs, nn_h1, nn_h2, n_outputs

# Function to initialize W & b values for each network layer

In [None]:
def network_parameters(nn_X, nn_h1, nn_h2, nn_Y):
    """Create the initial weights and biases of the three-layer network.

    Weights are drawn from a standard normal distribution via
    ``np.random.randn`` (NumPy's global random state); biases start at zero.

    Parameters
    ----------
    nn_X : int
        Width of the input layer.
    nn_h1 : int
        Width of the first hidden layer.
    nn_h2 : int
        Width of the second hidden layer.
    nn_Y : int
        Width of the output layer.

    Returns
    -------
    dict
        Keys ``'W1', 'b1', 'W2', 'b2', 'W3', 'b3'``. ``Wk`` has shape
        (inputs to layer k, outputs of layer k) and ``bk`` has shape
        (1, outputs of layer k).
    """
    widths = [nn_X, nn_h1, nn_h2, nn_Y]
    parameters = {}
    # Walk over consecutive (fan-in, fan-out) pairs: layer 1, 2, then 3 (output)
    for layer, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
        parameters['W' + str(layer)] = np.random.randn(fan_in, fan_out)
        parameters['b' + str(layer)] = np.zeros((1, fan_out))
    return parameters

# Function to apply forward propagation

In [None]:
def forward_prop(X, parameters):
    """Run one forward pass through the three-layer network.

    Layers 1 and 2 use a tanh activation; layer 3 uses ``softmax``.

    Parameters
    ----------
    X : numpy.ndarray
        Input data with one sample per row.
    parameters : dict
        Must contain ``'W1', 'b1', 'W2', 'b2', 'W3', 'b3'``.

    Returns
    -------
    dict
        The linear outputs ``'Z1'..'Z3'`` and activations ``'A1'..'A3'`` of
        every layer; ``'A3'`` holds the network's output.
    """
    weights_1, bias_1 = parameters["W1"], parameters["b1"]
    weights_2, bias_2 = parameters["W2"], parameters["b2"]
    weights_3, bias_3 = parameters["W3"], parameters["b3"]

    # Hidden layer 1: linear step, then tanh
    Z1 = np.dot(X, weights_1) + bias_1
    A1 = np.tanh(Z1)

    # Hidden layer 2: linear step, then tanh
    Z2 = np.dot(A1, weights_2) + bias_2
    A2 = np.tanh(Z2)

    # Output layer: linear step, then softmax
    Z3 = np.dot(A2, weights_3) + bias_3
    A3 = softmax(Z3)

    return {'Z1': Z1, 'A1': A1, 'Z2': Z2, 'A2': A2, 'Z3': Z3, 'A3': A3}

# Function to apply backward propagation

In [None]:
def backward_prop(X, parameters, cache, Y):
    """Compute the gradients of the loss for every weight and bias.

    Parameters
    ----------
    X : numpy.ndarray
        Input data used for the forward pass, one sample per row.
    parameters : dict
        Network parameters; only ``'W2'`` and ``'W3'`` are read here.
    cache : dict
        Output of ``forward_prop``; ``'Z1', 'A1', 'Z2', 'A2', 'A3'`` are read.
    Y : numpy.ndarray
        Targets; ``Y.shape[0]`` is used as the sample count.

    Returns
    -------
    dict
        Keys ``'dW3', 'db3', 'dW2', 'db2', 'dW1', 'db1'``. Each ``dWk`` has
        the shape of ``Wk``. Each ``dbk`` is 1-D (summed over ``axis=0``
        without ``keepdims``).
    """
    # Only the weights of the downstream layers are needed to push the error
    # signal backwards; W1 and the biases do not appear in any gradient.
    W2 = parameters["W2"]
    W3 = parameters["W3"]

    Z1, A1 = cache["Z1"], cache["A1"]
    Z2, A2 = cache["Z2"], cache["A2"]
    A3 = cache["A3"]

    # Gradients are averaged over the samples
    inv_m = 1.0 / Y.shape[0]

    # 3rd layer derivatives:
    dZ3 = softmax_loss_derivative(Y, A3)
    dW3 = inv_m * np.dot(A2.T, dZ3)
    db3 = inv_m * np.sum(dZ3, axis=0)

    # 2nd layer derivatives:
    dZ2 = np.multiply(np.dot(dZ3, W3.T), tanh_derivative(Z2))
    dW2 = inv_m * np.dot(A1.T, dZ2)
    db2 = inv_m * np.sum(dZ2, axis=0)

    # 1st layer derivatives:
    dZ1 = np.multiply(np.dot(dZ2, W2.T), tanh_derivative(Z1))
    dW1 = inv_m * np.dot(X.T, dZ1)
    db1 = inv_m * np.sum(dZ1, axis=0)

    # Store gradients in dictionary
    return {'dW3': dW3, 'db3': db3, 'dW2': dW2, 'db2': db2, 'dW1': dW1, 'db1': db1}

# Function to update network parameters

In [None]:
def update_parameters(parameters, grads, learning_rate):
    """Take one gradient-descent step on every weight and bias.

    Parameters
    ----------
    parameters : dict
        Current values under ``'W1', 'b1', 'W2', 'b2', 'W3', 'b3'``.
    grads : dict
        Gradients under the same names prefixed with ``'d'``.
    learning_rate : float
        Step size applied to every gradient.

    Returns
    -------
    dict
        New dictionary with the same keys as ``parameters``, each holding
        ``value - learning_rate * gradient``. The input arrays are not
        modified.
    """
    updated = {}
    for layer in ("1", "2", "3"):
        for kind in ("W", "b"):
            key = kind + layer
            updated[key] = parameters[key] - learning_rate * grads["d" + key]
    return updated

# Bring everything together to train the network

In [None]:
def train(parameters, X, Y, learning_rate, epochs):
    """Train the network with full-batch gradient descent.

    Each epoch runs a forward pass, backpropagation and one parameter
    update. The loss and accuracy recorded for an epoch come from that
    epoch's forward pass, i.e. they are measured before its update is
    applied. Progress is printed every 500 epochs.

    Parameters
    ----------
    parameters : dict
        Initial weights and biases.
    X : numpy.ndarray
        Training inputs, one sample per row.
    Y : numpy.ndarray
        One-hot encoded training targets.
    learning_rate : float
        Gradient-descent step size.
    epochs : int
        Number of passes over the training data.

    Returns
    -------
    tuple
        ``(parameters, epoch_array, losses_array, accur_array)``: the
        trained parameters, followed by three float arrays of length
        ``epochs`` holding the epoch index, the loss and the accuracy in
        percent.
    """
    epoch_array = np.arange(epochs, dtype=float)
    losses_array = np.zeros(epochs)
    accur_array = np.zeros(epochs)

    # The targets never change, so decode the one-hot labels once up front
    Y_true = Y.argmax(axis=1)

    for i in range(epochs):
        # Apply forward propagation
        cache = forward_prop(X, parameters)
        # Apply backpropagation
        grads = backward_prop(X, parameters, cache, Y)
        # Update parameters using derivatives
        parameters = update_parameters(parameters, grads, learning_rate)

        # Calculate the loss and accuracy from this epoch's forward pass
        A3 = cache['A3']
        Y_hat = np.argmax(A3, axis=1)
        loss = softmax_loss(Y, A3)
        # accuracy_score expects (y_true, y_pred); compute once and reuse
        accuracy = accuracy_score(Y_true, Y_hat)*100

        losses_array[i] = loss
        accur_array[i] = accuracy
        if i % 500 == 0:
            print('Loss after epoch        ',i,':',loss)
            print('Accuracy after iteration',i,':',accuracy,'%')

    return parameters, epoch_array, losses_array, accur_array

# Read in data, separate class labels & encode class labels to create Y
# CAREFUL THIS MUST BE IN THE CORRECT FORMAT!
# The classification column must be called 'target' & range from 0 to N

In [None]:
# Load the data set with the df_read helper defined earlier.
# (To try the wine data instead, pass 'wine.data' here.)
df = df_read('iris_data.csv')

# Split the class labels off the feature columns
Y_classes = df.pop('target').to_numpy()
# Force integer labels so they can serve as an array dimension and as
# indices even if pandas parsed the column as floats.
Y_classes = Y_classes.astype(int)
# Labels are expected to run from 0 to N, so the class count is max label + 1
num_classes = int(Y_classes.max()) + 1

print('Number of classes:', num_classes)
print('Length of Y_classes array:', len(Y_classes))
# One-hot encode in a single vectorised step: row k of the identity matrix
# is the encoding of class k, so indexing by the labels builds Y directly.
Y = np.eye(num_classes)[Y_classes]
print('Length of Y array:', len(Y))
print('Shape of Y array:', Y.shape)

# Standardise the remaining feature columns and convert them to a NumPy array
X = z_score(df).to_numpy()
print('Length of X array:', (len(X)))
print('Shape of X array:', X.shape)

# Split into Training & Testing Data

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Set up network for input data

In [None]:
# Seed NumPy's global random state so the weight initialisation is repeatable
np.random.seed(42)

# Two hidden layers with 3 and 4 neurons; input/output widths come from the data
nn_x, h1, h2, nn_y = layer_sizes(X_train, 3, 4, Y_train)
parameters = network_parameters(nn_x, h1, h2, nn_y)

print('X_train shape:', X_train.shape)
print('Y_train shape:', Y_train.shape)
print('Input features:', nn_x)
print('Neurons in hidden layer 1:', h1)
print('Neurons in hidden layer 2:', h2)
print('Output features:', nn_y)

# Report the shape of every weight matrix and bias vector, layer by layer
for name in ('W1', 'b1', 'W2', 'b2', 'W3', 'b3'):
    print(name + ' shape:', parameters[name].shape)

# Train the network

In [None]:
# Fit the network on the training split. `model` receives the trained
# parameters; the remaining three arrays hold the per-epoch history.
model, epochs, losses, accuracy = train(
    parameters,
    X_train,
    Y_train,
    learning_rate=0.005,
    epochs=10000,
)

# Plot the loss vs the epoch number

In [None]:
# Training loss recorded at every epoch
plt.plot(epochs, losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training loss')
# Render the figure explicitly so it also appears when run as a script
plt.show()

# Plot the accuracy vs the epoch number

In [None]:
# Training accuracy (in percent) recorded at every epoch
plt.plot(epochs, accuracy)
plt.xlabel('Epoch')
plt.ylabel('Accuracy (%)')
plt.title('Training accuracy')
# Render the figure explicitly so it also appears when run as a script
plt.show()

# Evaluate the performance of the neural network on the test set
# (Backpropagation isn't used! We are using the final network parameters after training has finished)

In [None]:
# Forward pass only: the trained parameters are applied to the unseen samples
model_test = forward_prop(X_test, model)
A3 = model_test['A3']
print(A3.shape)
# Predicted class = column with the largest output; true class = position of
# the 1 in the one-hot target row.
Y_hat = np.argmax(A3, axis=1)
Y_true = Y_test.argmax(axis=1)
# accuracy_score expects the ground truth first, then the predictions
Test_accuracy = accuracy_score(Y_true, Y_hat)*100
print('Test_accuracy', Test_accuracy, '%')