# Implementing a neural network from scratch using NumPy
Source: https://studymachinelearning.com/implementation-of-neural-network-from-scratch-using-numpy/

Here we explain how to build a neural network from scratch using the NumPy library. 

*  Training the neural network consists mainly of **4 steps**, repeated iteratively: 
1.   Forward Propagation
2.   Cost Function
3.   Back Propagation
4.   Gradient Descent

 

In [None]:
# Import required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(2)  # Seed NumPy's global RNG so later np.random draws are repeatable

# Read the data ("dataset.csv" is resolved relative to the working directory)
df = pd.read_csv("dataset.csv")
# NOTE: when run as a notebook cell only the last expression is displayed,
# so df.shape is evaluated but not shown; df.head() is the cell's output.
df.shape
df.head()

In [None]:
# Let's print how many samples fall into each class (0 and 1) of the target variable
df['target'].value_counts()

In [None]:
# Let's plot the samples in the (x1, x2) plane, coloured by target class
# (0 : red, 1 : blue with the Spectral colormap).
# The colour array is taken directly from the column, which is already 1-D,
# so the plot does not depend on the dataset having exactly 200 rows.
plt.scatter(df['x1'], df['x2'], c=df['target'].values, cmap=plt.cm.Spectral)
plt.title('Distribution of the target variable')

In [None]:
# Let's prepare the data for model training.
# Samples are stored column-wise: the transpose puts the two features
# (x1, x2) on the rows of X, and the labels are reshaped into a single
# row vector Y, so X and Y have one column per training sample.
X = df[['x1','x2']].values.T
Y = df['target'].values.reshape(1,-1)
X.shape,Y.shape

In [None]:
m = X.shape[1]             # m - Number of training samples (one column of X per sample)
m

In [None]:
# Set the hyperparameters
n_x = 2                    # No. of neurons in first layer  (x1 and x2)
n_h = 10                   # No. of neurons in hidden layer
n_y = 1                    # No. of neurons in output layer  (target)
num_of_iters = 1000        # Iteration count passed to model()
learning_rate = 0.3        # Step size applied to the gradients in update_parameters()

# 1. Forward Propagation

In [None]:
# Define the sigmoid activation function
def sigmoid(z):
    """Return the logistic sigmoid 1 / (1 + exp(-z)), element-wise.

    The value is computed as exp(-log(1 + exp(-z))) through np.logaddexp,
    which is mathematically identical to the textbook formula but never
    evaluates exp() on a large positive number.  The naive form overflows
    (and emits a RuntimeWarning) for strongly negative inputs.

    Parameters
    ----------
    z : scalar or np.ndarray
        Pre-activation value(s).

    Returns
    -------
    Same shape as ``z``, with values in [0, 1].
    """
    # logaddexp(0, -z) == log(exp(0) + exp(-z)) == log(1 + exp(-z))
    return np.exp(-np.logaddexp(0, -z))

In [None]:
# Initialize weight & bias parameters
def initialize_parameters(n_x, n_h, n_y):
    """Create the starting weights and biases of the two-layer network.

    Weights are drawn from a standard normal distribution with
    np.random.randn (W1 first, then W2, so the global RNG is consumed in
    that order); biases start at zero.

    Parameters
    ----------
    n_x, n_h, n_y : int
        Sizes of the input, hidden and output layers.

    Returns
    -------
    dict
        "W1" (n_h, n_x), "b1" (n_h, 1), "W2" (n_y, n_h), "b2" (n_y, 1).
    """
    # (layer suffix, number of units in the layer, number of inputs to the layer)
    layer_shapes = (("1", n_h, n_x), ("2", n_y, n_h))

    parameters = {}
    for suffix, n_units, n_inputs in layer_shapes:
        parameters["W" + suffix] = np.random.randn(n_units, n_inputs)
        parameters["b" + suffix] = np.zeros((n_units, 1))
    return parameters

In [None]:
# Function for forward propagation
def forward_prop(X, parameters):
    """Run one forward pass through the two-layer network.

    The hidden layer uses a tanh activation and the output layer uses the
    sigmoid defined in this file.

    Parameters
    ----------
    X : np.ndarray
        Input matrix that W1 is multiplied with (one sample per column).
    parameters : dict
        Must contain "W1", "b1", "W2" and "b2".

    Returns
    -------
    (A2, cache)
        A2 is the output activation; cache holds the activations "A1" and
        "A2" for use in back-propagation.
    """
    hidden_pre = np.dot(parameters["W1"], X) + parameters["b1"]
    hidden_act = np.tanh(hidden_pre)

    output_pre = np.dot(parameters["W2"], hidden_act) + parameters["b2"]
    output_act = sigmoid(output_pre)

    return output_act, {"A1": hidden_act, "A2": output_act}

# 2. Cost Function

Objective: Adjust the parameters w and b in order to obtain the best model (minimum error between Actual and Predicted values) 

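> **Cost function**: Quantifies the errors made by the model. For this binary classifier it is the cross-entropy loss averaged over the $m$ training samples, where $a_2^{(i)}$ is the network output for sample $i$:
>
> $$J = -\frac{1}{m}\sum_{i=1}^{m}\left[\,y^{(i)}\log a_2^{(i)} + \left(1-y^{(i)}\right)\log\left(1-a_2^{(i)}\right)\right]$$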







In [None]:
# Function to calculate the loss
def calculate_cost(A2, Y):
    """Return the mean binary cross-entropy between predictions and labels.

    Parameters
    ----------
    A2 : np.ndarray
        Predicted probabilities, one column per sample.
    Y : np.ndarray
        True labels (0 or 1), same shape as ``A2``.

    Returns
    -------
    Scalar cost: -mean(Y * log(A2) + (1 - Y) * log(1 - A2)).
    """
    # Take the sample count from the inputs themselves rather than from a
    # module-level variable, so the function works for any batch.
    n_samples = Y.shape[-1]

    # Keep probabilities strictly inside (0, 1): a saturated output of
    # exactly 0 or 1 would otherwise give log(0) = -inf and a nan cost.
    eps = 1e-15
    probs = np.clip(A2, eps, 1 - eps)

    log_likelihood = np.multiply(Y, np.log(probs)) + np.multiply(1 - Y, np.log(1 - probs))
    cost = -np.sum(log_likelihood) / n_samples
    return np.squeeze(cost)

# 3. Backward Propagation

In [None]:
# Function for back-propagation
def backward_prop(X, Y, cache, parameters):
    """Compute the gradients of the cost with respect to all parameters.

    Parameters
    ----------
    X : np.ndarray
        Input matrix, one sample per column.
    Y : np.ndarray
        True labels, one column per sample.
    cache : dict
        Activations from the forward pass; must contain "A1" and "A2".
    parameters : dict
        Current parameters; only "W2" is read.

    Returns
    -------
    dict
        Gradients "dW1", "db1", "dW2" and "db2", each averaged over the
        samples in ``X``.
    """
    A1 = cache["A1"]
    A2 = cache["A2"]

    W2 = parameters["W2"]

    # Number of samples in this batch, derived from the input itself rather
    # than from a module-level variable.
    m = X.shape[1]

    # Output layer: sigmoid + cross-entropy gives dZ2 = A2 - Y.
    dZ2 = A2 - Y
    dW2 = np.dot(dZ2, A1.T) / m
    db2 = np.sum(dZ2, axis=1, keepdims=True) / m

    # Hidden layer: derivative of tanh is 1 - tanh^2, i.e. 1 - A1^2.
    dZ1 = np.multiply(np.dot(W2.T, dZ2), 1 - np.power(A1, 2))
    dW1 = np.dot(dZ1, X.T) / m
    db1 = np.sum(dZ1, axis=1, keepdims=True) / m

    grads = {
        "dW1": dW1,
        "db1": db1,
        "dW2": dW2,
        "db2": db2,
    }

    return grads

In [None]:
# Function to update the weight & bias parameters
def update_parameters(parameters, grads, learning_rate):
    """Apply one gradient-descent step and return the new parameters.

    Each parameter P in ("W1", "W2", "b1", "b2") is replaced by
    P - learning_rate * dP, where dP is read from ``grads`` under the key
    "d" + name.  The input dictionaries are not modified.

    Parameters
    ----------
    parameters : dict
        Current "W1", "b1", "W2", "b2".
    grads : dict
        Gradients "dW1", "db1", "dW2", "db2".
    learning_rate : float
        Step size.

    Returns
    -------
    dict
        Updated parameters keyed "W1", "W2", "b1", "b2".
    """
    updated = {}
    for name in ("W1", "W2", "b1", "b2"):
        step = learning_rate * grads["d" + name]
        updated[name] = parameters[name] - step
    return updated

In [None]:
# Define the Model
def model(X, Y, n_x, n_h, n_y, num_of_iters, learning_rate, display_loss=False):
    """Train the two-layer network with batch gradient descent.

    Every iteration performs a forward pass, computes the cost,
    back-propagates and updates the parameters.  Iterations are numbered
    0..num_of_iters inclusive, so num_of_iters + 1 updates are applied.

    Parameters
    ----------
    X, Y : np.ndarray
        Training inputs and labels.
    n_x, n_h, n_y : int
        Layer sizes handed to initialize_parameters.
    num_of_iters : int
        Index of the last iteration.
    learning_rate : float
        Gradient-descent step size.
    display_loss : bool
        When true, print the cost on every 100th iteration; the printed
        cost is the one computed before that iteration's update.

    Returns
    -------
    dict
        The trained parameters.
    """
    parameters = initialize_parameters(n_x, n_h, n_y)

    for step in range(num_of_iters + 1):
        activations, cache = forward_prop(X, parameters)
        cost = calculate_cost(activations, Y)

        gradients = backward_prop(X, Y, cache, parameters)
        parameters = update_parameters(parameters, gradients, learning_rate)

        if display_loss and step % 100 == 0:
            print('Cost after iteration# {:d}: {:f}'.format(step, cost))

    return parameters

In [None]:
# Train the network on the full dataset with the hyperparameters set above,
# printing the cost every 100 iterations.
trained_parameters = model(X, Y, n_x, n_h, n_y, num_of_iters, learning_rate,display_loss=True)


In [None]:
# Define function for prediction
def predict(parameters, X):
    """Classify the samples in X with the given parameters.

    Runs a forward pass and thresholds the output activation at 0.5.

    Parameters
    ----------
    parameters : dict
        Network parameters as consumed by forward_prop.
    X : np.ndarray
        Input samples in the layout forward_prop expects.

    Returns
    -------
    np.ndarray of bool
        True where the output activation is strictly greater than 0.5.
    """
    output_act, _ = forward_prop(X, parameters)
    return output_act > 0.5

In [None]:
# Define function to plot the decision boundary
def plot_decision_boundary(model, X, y, h=0.01):
    """Draw a classifier's decision regions together with the training points.

    Parameters
    ----------
    model : callable
        Takes an array with one grid point per row (two columns: x1, x2)
        and returns one prediction per point.
    X : np.ndarray
        Training inputs; row 0 holds x1 and row 1 holds x2.
    y : np.ndarray
        Training labels, one per sample, in any shape that flattens to the
        number of samples.
    h : float, optional
        Spacing of the evaluation grid (default 0.01).
    """
    # Set min and max values and give it some padding
    x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
    y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
    Z = model(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    # Flatten the labels instead of reshaping to a fixed length, so the
    # function works for any number of samples.
    plt.scatter(X[0, :], X[1, :], c=np.ravel(y), cmap=plt.cm.Spectral)

In [None]:
# Plot the decision boundary.
# plot_decision_boundary hands the grid over with one point per row, while
# predict expects one sample per column, hence the transpose in the lambda.
plot_decision_boundary(lambda x: predict(trained_parameters, x.T), X, Y)

In [None]:
# Let's see how our neural network behaves with different hidden layer sizes
plt.figure(figsize=(15, 10))
hidden_layer_sizes = [1, 2, 3, 5, 10, 20]
# The loop variable has its own name so that the sweep does not overwrite
# the n_h hyperparameter used by the training cell.
for i, hidden_size in enumerate(hidden_layer_sizes):
    plt.subplot(2, 3, i+1)
    plt.title('Hidden Layer of size %d' % hidden_size)

    # Train a fresh network of this size and draw its decision regions.
    parameters = model(X, Y, n_x, hidden_size, n_y, num_of_iters, learning_rate)
    plot_decision_boundary(lambda x: predict(parameters, x.T), X, Y)

From the above results, we can say that the model gives **better performance with more hidden units**. However, too many hidden units can sometimes **overfit** the data.

An overfitted model performs best on the training data but performs worse on test data. The best model architecture (**number of hidden layers + number of neurons in each hidden layer**) is also **dependent on the training dataset**.

Finding a suitable number of hidden units is a tedious task. In the above example, the three isolated red data points might be outliers. If they are outliers, the model overfits with hidden layer sizes 10 and 20. **In that case, the best hidden layer size seems to be 3.**