In [103]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Activation, Dense
from keras import optimizers
from keras.layers import Dropout

In [104]:
iris = load_iris()

In [105]:
iris.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [106]:
data = pd.DataFrame(iris['data'], columns = iris['feature_names'] )
target = pd.DataFrame(iris['target'],columns = ['target'])

In [107]:
#combine the input predictors and target so that it can be split into training and testing
data_target = data.join(target)

In [108]:
data_target.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [109]:
data_target['target'].value_counts()

2    50
1    50
0    50
Name: target, dtype: int64

In [110]:
#for this implementaion, let's make the problem a binary. So considering only '0' and '1' as the target
X = np.array(data_target[(data_target['target'] == 0) | (data_target['target'] == 1)].drop('target', axis=1))
y = np.array(data_target[(data_target['target'] == 0) | (data_target['target'] == 1)]['target']).reshape(100,1)

In [111]:
train_x,test_x,train_y,test_y = train_test_split(X,y,test_size = 0.2)

print('Shape of train_x:', train_x.shape)
print('Shape of train_y:', train_y.shape)
print('Shape of test_x:', test_x.shape)
print('Shape of test_y:', test_y.shape)

Shape of train_x: (80, 4)
Shape of train_y: (80, 1)
Shape of test_x: (20, 4)
Shape of test_y: (20, 1)


Splitting the data into 9 mini batches

In [112]:
batches = 9

#get the total number of batches
batch_count = train_x.shape[0] // batches

batch_trainx = []
batch_trainy = []

for i in range(0, batch_count):
  begin = i * batches
  end = (i + 1) * batches

  batch_trainx.append(train_x[begin:end])
  batch_trainy.append(train_y[begin:end])

#when the total count is not exactly divisible by batches
left_out = train_x.shape[0] % batches

if left_out != 0:
  batch_trainx.append(train_x[end: end + left_out])
  batch_trainy.append(train_y[end: end + left_out])

In [113]:
# intializing the weights for each layer
def intialize_weights(layer_dims):
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)
    
    #initialise the value of weights based on the number of layers
    for i in range(1,L-1):
        #for the hidden layers we will use He initialisation because of relu activation
        parameters['W'+str(i)] = np.random.randn(layer_dims[i-1],layer_dims[i]) * np.sqrt(2/layer_dims[i-1])
        parameters['b'+str(i)] = np.zeros([1, layer_dims[i]])   
            
    #for the last layer we can use Xavier initialisation 
    parameters['W' + str(i+1)] = np.random.randn(layer_dims[i],layer_dims[i+1]) * np.sqrt(1/layer_dims[i])
    parameters['b'+str(i+1)] = np.zeros([1, layer_dims[i+1]])    
    
    return parameters

In [114]:
#forward propagation
def forward_propagation(layer_dims,train_x,parameters, keep_probs,mode):
    
    caches = []
    Aprev = train_x
    L = len(layer_dims)
    
    #forward propagation for all the layers except last layer
    for i in range(1,L-1): 
        W = parameters['W'+ str(i)]
        b = parameters['b' + str(i)] 
        Z = np.dot(Aprev, W) + b  
        Aprev = np.maximum(0,Z)       
        cache = Aprev, W, b
        caches.append(cache)     
    
    #forward propagation for the last layer
    W = parameters['W'+ str(L-1)]
    b = parameters['b' + str(L-1)]
    Zlast = np.dot(Aprev, W) + b    
    Alast = 1/(1 + np.exp(-Zlast))   
    cache = Alast, W, b

    caches.append(cache)
    return caches

In [115]:
#cost function calculation
def cost_calculate(predict_y,train_y):
    m = train_y.shape[0]
    cost = -(np.dot(train_y.T, np.log(predict_y)) + np.dot((1-train_y).T, np.log(1-predict_y)))/m
    return cost

In [116]:
def backward_propagation(layer_dims, caches, parameters, train_x, train_y, learning_rate):
    #backward propagation for the last layer
    #Extract the last array from the caches, as this corresponds to the final output
    L = len(caches)    
    Acurr,Wcurr,bcurr = caches[L - 1]  
    Aprev,Wprev,bprev = caches[L - 2]

    m = train_y.shape[0]   
    
    dzprev = (Acurr - train_y)    
    dwlast = np.dot(Aprev.T, dzprev)/m    
    dblast = np.sum(dzprev, keepdims = True, axis = 0)/m        
    parameters['W' + str(L)]= parameters['W' + str(L)] - (learning_rate * dwlast)    
    parameters['b' + str(L)]= parameters['b' + str(L)] - (learning_rate * dblast)   
            
    for i in reversed(range(L-1)):
        Anext,Wnext,bnext = caches[i+1]
        Acurr,Wcurr,bcurr = caches[i]  
        if i == 0:
            Aprev = train_x
        else:            
            Aprev,Wprev,bprev = caches[i-2]
                
        dzcurr = np.where(Acurr > 0,1,Acurr)                     
        dzprev = np.multiply(np.dot(dzprev,Wnext.T), dzcurr)
        
        dW = np.dot(Aprev.T,dzprev)/m
        db = np.sum(dzprev, keepdims = True, axis = 0)/m  
        parameters['W' + str(i+1)]= parameters['W' + str(i+1)] - (learning_rate * dW)
        parameters['b' + str(i+1)]= parameters['b' + str(i+1)] - (learning_rate * db)     
    return parameters

In [117]:
def complete_model(layer_dims, train_x, train_y, learning_rate, iterations, keep_probs,mode):
    
    L = len(layer_dims)
    # Intialize the weights
    parameters = intialize_weights(layer_dims)
    
    for i in range(iterations):
      for j in range(len(batch_trainx)):
        #forward propagation
        caches = forward_propagation(layer_dims,batch_trainx[j],parameters, keep_probs,mode)
        
        #calculate the cost 
        A,W,b = caches[-1]
        cost = cost_calculate(A,batch_trainy[j])
        if i%1000 == 0:
            print('The cost after iteration {}: {}'.format(i, np.squeeze(cost)))
        #backward propagation
        parameters = backward_propagation(layer_dims, caches, parameters,batch_trainx[j], batch_trainy[j], learning_rate)

    return parameters

In [118]:
layer_dims = [4,5,3,1]
learning_rate = 0.15
iterations = 5000
keep_probs = 0.8
mode = 'train'
parameters = complete_model(layer_dims, train_x, train_y, learning_rate, iterations, keep_probs, mode)

The cost after iteration 0: 0.5998951433889075
The cost after iteration 0: 0.4940940578040702
The cost after iteration 0: 0.435549694259776
The cost after iteration 0: 0.5115873715437167
The cost after iteration 0: 0.5166993127752532
The cost after iteration 0: 0.5001224461195737
The cost after iteration 0: 0.43298035075996216
The cost after iteration 0: 0.22453076395646365
The cost after iteration 0: 0.3652804271854403
The cost after iteration 1000: 0.0010456320689345992
The cost after iteration 1000: 0.00045916666005238753
The cost after iteration 1000: 0.0007032498779519961
The cost after iteration 1000: 0.0010466535059201589
The cost after iteration 1000: 0.0010425825095893326
The cost after iteration 1000: 0.0010419275588093916
The cost after iteration 1000: 0.0008791508581443363
The cost after iteration 1000: 0.0003805516174784836
The cost after iteration 1000: 0.0007962785353275919
The cost after iteration 2000: 0.0005193358979705577
The cost after iteration 2000: 0.000226834799