In [1]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from keras.models import Sequential

from keras.layers import Activation, Dense
from keras import optimizers
from keras.layers import Dropout

In [2]:
iris = load_iris()

In [3]:
iris.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [4]:
data = pd.DataFrame(iris['data'], columns = iris['feature_names'] )
target = pd.DataFrame(iris['target'],columns = ['target'])

In [5]:
#combine the input predictors and target so that it can be split into training and testing
data_target = data.join(target)

In [6]:
data_target.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [7]:
data_target['target'].value_counts()

2    50
1    50
0    50
Name: target, dtype: int64

In [8]:
#for this implementaion, let's make the problem a binary. So considering only '0' and '1' as the target
X = np.array(data_target[(data_target['target'] == 0) | (data_target['target'] == 1)].drop('target', axis=1))
y = np.array(data_target[(data_target['target'] == 0) | (data_target['target'] == 1)]['target']).reshape(100,1)

In [9]:
train_x,test_x,train_y,test_y = train_test_split(X,y,test_size = 0.2)

print('Shape of train_x:', train_x.shape)
print('Shape of train_y:', train_y.shape)
print('Shape of test_x:', test_x.shape)
print('Shape of test_y:', test_y.shape)

Shape of train_x: (80, 4)
Shape of train_y: (80, 1)
Shape of test_x: (20, 4)
Shape of test_y: (20, 1)


Splitting the data into 9 mini batches

In [10]:
batches = 9

#get the total number of batches
batch_count = train_x.shape[0] // batches

batch_trainx = []
batch_trainy = []

for i in range(0, batch_count):
  begin = i * batches
  end = (i + 1) * batches

  batch_trainx.append(train_x[begin:end])
  batch_trainy.append(train_y[begin:end])

#when the total count is not exactly divisible by batches
left_out = train_x.shape[0] % batches

if left_out != 0:
  batch_trainx.append(train_x[end: end + left_out])
  batch_trainy.append(train_y[end: end + left_out])

In [31]:
# intializing the weights for each layer
def intialize_weights(layer_dims):
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)
    
    #initialise the value of weights based on the number of layers
    for i in range(1,L-1):
        #for the hidden layers we will use He initialisation because of relu activation
        parameters['W'+str(i)] = np.random.randn(layer_dims[i-1],layer_dims[i]) * np.sqrt(2/layer_dims[i-1])
        parameters['b'+str(i)] = np.zeros([1, layer_dims[i]])   
        parameters['gamma'+str(i)] = np.ones([1, layer_dims[i]])   
        parameters['beta'+str(i)] = np.zeros([1, layer_dims[i]])   
        parameters['mean'+str(i)] = 0 
        parameters['var'+str(i)] = 0
            
    #for the last layer we can use Xavier initialisation 
    parameters['W' + str(i+1)] = np.random.randn(layer_dims[i],layer_dims[i+1]) * np.sqrt(1/layer_dims[i])
    parameters['b'+str(i+1)] = np.zeros([1, layer_dims[i+1]])    
    parameters['gamma'+str(i+1)] = np.ones([1, layer_dims[i+1]])
    parameters['beta'+str(i+1)] = np.zeros([1, layer_dims[i+1]])
    parameters['mean'+str(i+1)] = 0
    parameters['var'+str(i+1)] = 0
    
    return parameters

In [32]:
#forward propagation
def forward_propagation(layer_dims,train_x,parameters, keep_probs, mode, momentum, running_mean, running_var):
    
    caches = []
    Aprev = train_x
    L = len(layer_dims)
    
    #forward propagation for all the layers except last layer
    for i in range(1,L-1): 
        W = parameters['W'+ str(i)]
        b = parameters['b' + str(i)] 
        gamma = parameters['gamma' + str(i)]
        beta = parameters['beta' + str(i)]
        running_mean = parameters['mean' + str(i)]
        running_var = parameters['var' + str(i)]

        Z = np.dot(Aprev, W) + b  

        if mode == 'train':          
          mean = Z.mean(axis=0, keepdims=True)
          var = Z.var(axis=0, keepdims=True)
          std = np.sqrt(var + 1e-8)
          Z_centered = (Z - mean)
          Z_norm = Z_centered / std 
          output = (gamma * Z_norm) + beta

          running_mean = (momentum * running_mean) + (1 - momentum) * mean
          running_var = (momentum * running_var) + (1 - momentum) * var
          parameters['mean' + str(i)] = running_mean
          parameters['var' + str(i)] = running_var

          Aprev = np.maximum(0,output)       
          cache = Aprev, W, b,Z_norm, std, Z_centered

        else:
          Z_norm = (Z - running_mean) / np.sqrt(running_var + 1e-8)
          output = (gamma * Z_norm) + beta

          Aprev = np.maximum(0,output) 
          cache = Aprev, W, b
        
        caches.append(cache)     
    
    #forward propagation for the last layer
    W = parameters['W'+ str(L-1)]
    b = parameters['b' + str(L-1)]
    gamma = parameters['gamma' + str(L-1)]
    beta = parameters['beta' + str(L-1)]
    running_mean = parameters['mean' + str(L-1)]
    running_var = parameters['var' + str(L-1)]

    Zlast = np.dot(Aprev, W) + b  
    if mode == 'train':   
      mean = Zlast.mean(axis=0, keepdims=True)
      var = Zlast.var(axis=0, keepdims=True)
      std = np.sqrt(var + 1e-8)
      Z_centered = (Zlast - mean)
      Z_norm = Z_centered / std 
      output = (gamma * Z_norm) + beta   

      running_mean = (momentum * running_mean) + (1 - momentum) * mean
      running_var = (momentum * running_var) + (1 - momentum) * var
      parameters['mean' + str(L-1)] = running_mean
      parameters['var' + str(L-1)] = running_var

      Alast = 1/(1 + np.exp(-output))   
      cache = Alast, W, b, Z_norm, std, Z_centered

    else:
      Z_norm = (Zlast - running_mean) / np.sqrt(running_var + 1e-8)
      output = (gamma * Z_norm) + beta

      Alast = 1/(1 + np.exp(-output)) 
      cache = Aprev, W, b

    caches.append(cache)
    return caches

In [33]:
#cost function calculation
def cost_calculate(predict_y,train_y):
    m = train_y.shape[0]
    cost = -(np.dot(train_y.T, np.log(predict_y)) + np.dot((1-train_y).T, np.log(1-predict_y)))/m
    return cost

In [34]:
def backward_propagation(layer_dims, caches, parameters, train_x, train_y, learning_rate):
    #backward propagation for the last layer
    #Extract the last array from the caches, as this corresponds to the final output
    L = len(caches)    
    Acurr,Wcurr,bcurr,Znorm_curr,std_curr,Zcenter_curr= caches[L - 1]  
    Aprev,Wprev,bprev,Znorm_prev,std_prev,Zcenter_prev = caches[L - 2]

    m = train_y.shape[0]   
    
    dzprev = (Acurr - train_y)    
    dwlast = np.dot(Aprev.T, dzprev)/m    
    dblast = np.sum(dzprev, keepdims = True, axis = 0)/m   
    dgamma = (dzprev * Znorm_curr).sum(axis=0) 
    dbeta  = dzprev.sum(axis=0)
    parameters['W' + str(L)]= parameters['W' + str(L)] - (learning_rate * dwlast)    
    parameters['b' + str(L)]= parameters['b' + str(L)] - (learning_rate * dblast)
    gamma = parameters['gamma' + str(L)]

    parameters['gamma' + str(L)]= parameters['gamma' + str(L)] - (learning_rate * dgamma)    
    parameters['beta' + str(L)]= parameters['beta' + str(L)] - (learning_rate * dbeta)      
    
    dZnorm = dzprev * gamma    
    dZcentered = dZnorm / std_curr
    dmean = -(dZcentered.sum(axis=0) + 2/m * Zcenter_curr.sum(axis=0))
    dstd = (dZnorm * Zcenter_curr * -std_curr**(-2)).sum(axis=0)
    dvar = dstd / 2 / std_curr
    dzprev = dZcentered + (dmean + dvar * 2 * Zcenter_curr) / m
            
    for i in reversed(range(L-1)):
        Anext,Wnext,bnext,Znorm_next,std_next,Zcenter_next = caches[i+1]
        Acurr,Wcurr,bcurr,Znorm_curr,std_curr,Zcenter_curr = caches[i]  
        if i == 0:
            Aprev = train_x
        else:            
            Aprev,Wprev,bprev,Znorm_prev,std_prev,Zcenter_prev = caches[i-1]
                
        dzcurr = np.where(Acurr > 0,1,Acurr)                     
        dzprev = np.multiply(np.dot(dzprev,Wnext.T), dzcurr)
        
        dW = np.dot(Aprev.T,dzprev)/m
        db = np.sum(dzprev, keepdims = True, axis = 0)/m  
        dgamma = (dzprev * Znorm_curr).sum(axis=0) 
        dbeta  = dzprev.sum(axis=0)
        parameters['W' + str(i+1)]= parameters['W' + str(i+1)] - (learning_rate * dW)
        parameters['b' + str(i+1)]= parameters['b' + str(i+1)] - (learning_rate * db)  
        gamma = parameters['gamma' + str(i+1)]

        parameters['gamma' + str(i+1)]= parameters['gamma' + str(i+1)] - (learning_rate * dgamma)    
        parameters['beta' + str(i+1)]= parameters['beta' + str(i+1)] - (learning_rate * dbeta)  

        dZnorm = dzprev * gamma    
        dZcentered = dZnorm / std_curr
        dmean = -(dZcentered.sum(axis=0) + 2/m * Zcenter_curr.sum(axis=0))
        dstd = (dZnorm * Zcenter_curr * -std_curr**(-2)).sum(axis=0)
        dvar = dstd / 2 / std_curr
        dzprev = dZcentered + (dmean + dvar * 2 * Zcenter_curr) / m   

    return parameters

In [37]:
def complete_model(layer_dims, train_x, train_y, learning_rate, iterations, keep_probs,mode,momentum):
    
    L = len(layer_dims)
    # Intialize the weights
    parameters = intialize_weights(layer_dims)
    running_mean=0
    running_var = 0
    
    for i in range(iterations):
      for j in range(len(batch_trainx)):
        #forward propagation
        caches = forward_propagation(layer_dims,batch_trainx[j],parameters, keep_probs,mode,momentum,running_mean, running_var)
        
        #calculate the cost 
        A,W,b,_,_,_ = caches[-1]
        cost = cost_calculate(A,batch_trainy[j])
        if i%1000 == 0:
            print('The cost after iteration {}: {}'.format(i, np.squeeze(cost)))
        #backward propagation
        parameters = backward_propagation(layer_dims, caches, parameters,batch_trainx[j], batch_trainy[j], learning_rate)

    return parameters

In [41]:
layer_dims = [4,5,3,1]
learning_rate = 0.15
iterations = 2000
keep_probs = 0.8
momentum = 0.9
mode = 'train'
parameters = complete_model(layer_dims, train_x, train_y, learning_rate, iterations, keep_probs, mode, momentum=0.9)

The cost after iteration 0: 1.2101867439489764
The cost after iteration 0: 0.7520314867826491
The cost after iteration 0: 0.5474389831384234
The cost after iteration 0: 0.6688163750304513
The cost after iteration 0: 0.4416965752526794
The cost after iteration 0: 0.28954219535274184
The cost after iteration 0: 0.23832935767689267
The cost after iteration 0: 0.2821308861140581
The cost after iteration 0: 0.15666508173759636
The cost after iteration 1000: 9.259146607144412e-05
The cost after iteration 1000: 1.3022761554795474e-05
The cost after iteration 1000: 0.0007565095045377697
The cost after iteration 1000: 5.7340811672004426e-05
The cost after iteration 1000: 5.7813368250757745e-05
The cost after iteration 1000: 1.296708644961703e-05
The cost after iteration 1000: 5.93523245804709e-05
The cost after iteration 1000: 0.0005729326734101812
The cost after iteration 1000: 1.8026234726226896e-05


In [69]:
from keras.models import Sequential
from keras.layers import Activation, Dense
from keras import optimizers
from keras.layers import BatchNormalization, Dropout

In [70]:
model = Sequential()
model.add(Dense(50, input_shape = (4, ),kernel_initializer='he_normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(50,kernel_initializer='he_normal'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(1,kernel_initializer='glorot_normal'))
model.add(BatchNormalization())
model.add(Activation('sigmoid'))

In [71]:
sgd = optimizers.SGD(lr = 0.001)
model.compile(optimizer = sgd, loss = 'binary_crossentropy', metrics = ['accuracy'])

In [72]:
model.fit(train_x, train_y, batch_size=9, validation_data=(test_x, test_y), epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f2ca4ed7b70>