In [1]:
import pandas
from sklearn.model_selection import train_test_split
import math
import numpy as np
import tensorflow as tf
import os

4 neurons in input layer,
3 neurons in output layer,
2 nodes in 1 hidden layer

In [2]:
iris = pandas.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',names=['sepal.length','sepal.width','petal.length','petal.width','species'])
X = iris.loc[:, iris.columns != 'species']
X =(X-X.min())/(X.max()-X.min())
y = pandas.get_dummies(iris['species'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)


### Forward Propogation

$$f(t) = \frac{1}{1 + e^{-t}}$$
$\hat{y} = f(f(XW^{(1)} +  B^{(1)})W^{(2)}+B^{(2)})$

In [3]:
batch_size =  X_train.shape[0]
input_layer_size = X_train.shape[1]
output_layer_size = 3
hidden_layer_size = 2

def sigmoid(z):
    return(1/(1 + np.exp(z)))
           
def sigmoid_prime(z):
    return(np.exp(-z)/(1 + np.exp(z))**2)

def cost(y,y_pred):
    return(0.5 * np.sum(np.square(np.matrix(y) - np.matrix(y_pred))))

def initialize_weights(rows, columns):
    W = []
    for row in range(rows):
        W.append(np.random.uniform(low = -.5, high = .5, size = columns))
    return(np.matrix(W))

W1 = initialize_weights(input_layer_size, hidden_layer_size)
W2 = initialize_weights(hidden_layer_size, output_layer_size)
b1 = np.random.uniform(size = hidden_layer_size) 
B1 = np.matrix([b1 for i in range(batch_size)])
b2 = np.random.uniform(size = output_layer_size) 
B2 = np.matrix([b2 for i in range(batch_size)])

def forward_prop(X, y, W1, B1, W2, B2):
    Z2 = X.dot(W1)
    A2 = sigmoid(Z2 + B1)
    Z3 = A2.dot(W2)
    y_pred = sigmoid(Z3 + B2)
    return(y_pred)

In [4]:
y_pred = forward_prop(X_train, y_train, W1, B1, W2, B2)
cost(y_train, y_pred)

41.564568717854414

### Back Propogation

Cost Function:
$$J = 0.5 \sum (y - \hat{y})^2 = 0.5 \sum (y - f(f(XW^{(1)} +  B^{(1)})W^{(2)}+B^{(2)}))^2$$

Gradient of Cost Function:
$$\frac{\partial J }{\partial W^{(1)}} = X^{T}-(y - \hat{y})^2  *f'(z^{(3)}+B^{(1)})*W^{(2)T}f'(z^{(3)})$$
$$\frac{\partial J }{\partial W^{(2)}} = a^{(2)}-(y - \hat{y})^2  *f'(z^{(3)}+B^{(1)})$$
$$\frac{\partial J }{\partial B^{(1)}} = a^{(2)}-(y - \hat{y})^2  *f'(z^{(3)}+B^{(1)})$$
$$\frac{\partial J }{\partial B^{(2)}} = a^{(2)}-(y - \hat{y})^2  *f'(z^{(3)}+B^{(1)})$$

### Gradient Descent

In [5]:
def train_model(X, y, W1, B1, W2, B2, rate, iters):
    y = np.matrix(y)
    for i in range(iters):
        Z2 = X.dot(W1)
        A2 = sigmoid(Z2 + B1)
        Z3 = A2.dot(W2)
        y_pred = sigmoid(Z3 + B2)
        cost(y, y_pred)
        delta3 = -(y - y_pred).multiply(sigmoid_prime(Z3 + B2))
        delta2 = delta3.dot(W2.transpose()).multiply(sigmoid_prime(Z2 + B1))
        djdb2 = np.ones(X.shape[0]).dot(delta3)
        djdw2 = A2.transpose().dot(delta3)
        djdb1 = np.ones(X.shape[0]).dot(delta2)
        djdw1 = X.transpose().dot(delta2)
        W1 = W1 - djdw1 * rate
        B1 = B1 - djdb1.transpose() * rate
        W2 = W2 - djdw2 * rate
        B2 = B2 - djdb2.transpose() * rate
    return(W1, B1, W2, B2)

W1 = initialize_weights(input_layer_size, hidden_layer_size)
W2 = initialize_weights(hidden_layer_size, output_layer_size)
b1 = np.random.uniform(size = hidden_layer_size) 
B1 = np.matrix([b1 for i in range(batch_size)])
b2 = np.random.uniform(size = output_layer_size) 
B2 = np.matrix([b2 for i in range(batch_size)])

W1, B1, W2, B2 = train_model(X_train, y_train ,W1, B1, W2, B2, 0.01, 8)
y_pred = forward_prop(X_train, y_train, W1, B1, W2, B2)
cost(y_train, y_pred)

67.05698980367399

### TensorFlow