In [1]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Activation, Dense
from keras import optimizers
from keras.layers import Dropout

Using TensorFlow backend.


In [2]:
iris = load_iris()

In [3]:
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [4]:
data = pd.DataFrame(iris['data'], columns = iris['feature_names'] )
target = pd.DataFrame(iris['target'],columns = ['target'])

In [5]:
#combine the input predictors and target so that it can be split into training and testing
data_target = data.join(target)

In [6]:
data_target.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [7]:
data_target['target'].value_counts()

2    50
1    50
0    50
Name: target, dtype: int64

In [8]:
#for this implementaion, let's make the problem a binary. So considering only '0' and '1' as the target
X = np.array(data_target[(data_target['target'] == 0) | (data_target['target'] == 1)].drop('target', axis=1))
y = np.array(data_target[(data_target['target'] == 0) | (data_target['target'] == 1)]['target']).reshape(100,1)

In [9]:
train_x,test_x,train_y,test_y = train_test_split(X,y,test_size = 0.2)

print('Shape of train_x:', train_x.shape)
print('Shape of train_y:', train_y.shape)
print('Shape of test_x:', test_x.shape)
print('Shape of test_y:', test_y.shape)

Shape of train_x: (80, 4)
Shape of train_y: (80, 1)
Shape of test_x: (20, 4)
Shape of test_y: (20, 1)


Splitting the data into 9 mini batches

In [10]:
batches = 9

#get the total number of batches
batch_count = train_x.shape[0] // batches

batch_trainx = []
batch_trainy = []

for i in range(0, batch_count):    
    begin = i * batches
    end = (i + 1) * batches

    batch_trainx.append(train_x[begin:end])
    batch_trainy.append(train_y[begin:end])

#when the total count is not exactly divisible by batches
left_out = train_x.shape[0] % batches

if left_out != 0:    
    batch_trainx.append(train_x[end: end + left_out])
    batch_trainy.append(train_y[end: end + left_out])

In [26]:
# intializing the weights for each layer
def intialize_weights(layer_dims):
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)
    
    #initialise the value of weights based on the number of layers
    for i in range(1,L-1):
        #for the hidden layers we will use He initialisation because of relu activation
        parameters['W'+str(i)] = np.random.randn(layer_dims[i-1],layer_dims[i]) * np.sqrt(2/layer_dims[i-1])
        parameters['v1'+str(i)] = np.zeros([layer_dims[i-1],layer_dims[i]])
        parameters['v2'+str(i)] = np.zeros([layer_dims[i-1],layer_dims[i]])
        parameters['b'+str(i)] = np.zeros([1, layer_dims[i]])   
        parameters['u1'+str(i)] = np.zeros([1, layer_dims[i]])   
        parameters['u2'+str(i)] = np.zeros([1, layer_dims[i]])   
        
            
    #for the last layer we can use Xavier initialisation 
    parameters['W' + str(i+1)] = np.random.randn(layer_dims[i],layer_dims[i+1]) * np.sqrt(1/layer_dims[i])
    parameters['v1' + str(i+1)] = np.zeros([layer_dims[i],layer_dims[i+1]])
    parameters['v2' + str(i+1)] = np.zeros([layer_dims[i],layer_dims[i+1]])
    parameters['b'+str(i+1)] = np.zeros([1, layer_dims[i+1]])    
    parameters['u1'+str(i+1)] = np.zeros([1, layer_dims[i+1]])    
    parameters['u2'+str(i+1)] = np.zeros([1, layer_dims[i+1]])    
    
    return parameters

In [27]:
#forward propagation
def forward_propagation(layer_dims,train_x,parameters):
    
    caches = []
    Aprev = train_x
    L = len(layer_dims)
    
    #forward propagation for all the layers except last layer
    for i in range(1,L-1): 
        W = parameters['W'+ str(i)]
        b = parameters['b' + str(i)] 
        Z = np.dot(Aprev, W) + b  
        Aprev = np.maximum(0,Z)       
        cache = Aprev, W, b
        caches.append(cache)     
    
    #forward propagation for the last layer
    W = parameters['W'+ str(L-1)]
    b = parameters['b' + str(L-1)]
    Zlast = np.dot(Aprev, W) + b    
    Alast = 1/(1 + np.exp(-Zlast))   
    cache = Alast, W, b

    caches.append(cache)
    return caches

In [28]:
#cost function calculation
def cost_calculate(predict_y,train_y):
    m = train_y.shape[0]
    cost = -(np.dot(train_y.T, np.log(predict_y)) + np.dot((1-train_y).T, np.log(1-predict_y)))/m
    return cost

In [38]:
def backward_propagation(layer_dims, caches, parameters, train_x, train_y, learning_rate, beta1, beta2, iteration):
    #backward propagation for the last layer
    #Extract the last array from the caches, as this corresponds to the final output
    L = len(caches)    
    Acurr,Wcurr,bcurr = caches[L - 1]  
    Aprev,Wprev,bprev = caches[L - 2]
    v1 = parameters['v1'+str(L)]
    u1 = parameters['u1'+str(L)]
    v2 = parameters['v2'+str(L)]
    u2 = parameters['u2'+str(L)]
    epsilon = 10e-8

    m = train_y.shape[0]   
    
    dzprev = (Acurr - train_y)    
    dwlast = np.dot(Aprev.T, dzprev)/m    
    dblast = np.sum(dzprev, keepdims = True, axis = 0)/m   
    dv1last = (beta1 * v1) + ((1-beta1) * dwlast)/(1-np.power(beta1, iteration))
    du1last = (beta1 * u1) + ((1-beta1) * dblast)/(1-np.power(beta1, iteration))
    dv2last = (beta2 * v2) + ((1-beta2) * np.power(dwlast,2))/(1-np.power(beta2, iteration))
    du2last = (beta2 * u2) + ((1-beta2) * np.power(dblast,2))/(1-np.power(beta2, iteration))
    
    parameters['W' + str(L)]= parameters['W' + str(L)] - (learning_rate * dv1last/np.sqrt(dv2last+epsilon))    
    parameters['b' + str(L)]= parameters['b' + str(L)] - (learning_rate * du1last/np.sqrt(du2last+epsilon))   
    parameters['v1' + str(L)]= dv1last
    parameters['u1' + str(L)]= du1last
    parameters['v2' + str(L)]= dv2last
    parameters['u2' + str(L)]= du2last
            
    for i in reversed(range(L-1)):
        Anext,Wnext,bnext = caches[i+1]
        Acurr,Wcurr,bcurr = caches[i]  
        v1 = parameters['v1'+str(i+1)]
        u1 = parameters['u1'+str(i+1)]
        v2 = parameters['v2'+str(i+1)]
        u2 = parameters['u2'+str(i+1)]
        
        if i == 0:
            Aprev = train_x
        else:            
            Aprev,Wprev,bprev = caches[i-1]
                
        dzcurr = np.where(Acurr > 0,1,Acurr)                     
        dzprev = np.multiply(np.dot(dzprev,Wnext.T), dzcurr)
        
        dW = np.dot(Aprev.T,dzprev)/m
        db = np.sum(dzprev, keepdims = True, axis = 0)/m  
        dv1 = (beta1 * v1) + ((1-beta1) * dW)/(1-np.power(beta1, iteration))
        du1 = (beta1 * u1) + ((1-beta1) * db)/(1-np.power(beta1, iteration))
        dv2 = (beta2 * v2) + ((1-beta2) * np.power(dW,2))/(1-np.power(beta2, iteration))
        du2 = (beta2 * u2) + ((1-beta2) * np.power(db,2))/(1-np.power(beta2, iteration))
        
        parameters['W' + str(i+1)]= parameters['W' + str(i+1)] - (learning_rate * dv1/np.sqrt(dv2+epsilon))
        parameters['b' + str(i+1)]= parameters['b' + str(i+1)] - (learning_rate * du1/np.sqrt(du2+epsilon))    
        parameters['v1' + str(i+1)]= dv1
        parameters['u1' + str(i+1)]= du1
        parameters['v2' + str(i+1)]= dv2
        parameters['u2' + str(i+1)]= du2
        
    return parameters

In [39]:
def complete_model(layer_dims, batch_trainx, batch_trainy, learning_rate, beta1, beta2, iteration):
        
    L = len(layer_dims)
    # Intialize the weights
    parameters = intialize_weights(layer_dims)
    act_iteration = 0
    
    for i in range(iterations):        
        for j in range(len(batch_trainx)):            
            #forward propagation
            caches = forward_propagation(layer_dims,batch_trainx[j],parameters)
        
            #calculate the cost 
            A,W,b = caches[-1]
            cost = cost_calculate(A,batch_trainy[j])
            act_iteration += 1
            parameters = backward_propagation(layer_dims, caches, parameters,batch_trainx[j], batch_trainy[j], learning_rate, beta1, beta2, act_iteration)
            
        if i%1000 == 0:
            print('The cost after iteration {}: {}'.format(i, np.squeeze(cost)))
        #backward propagation            

    return parameters

In [40]:
layer_dims = [4,5,3,1]
learning_rate = 0.001
iterations = 5000
beta1 = 0.9
beta2 = 0.99
parameters = complete_model(layer_dims, batch_trainx, batch_trainy, learning_rate, beta1, beta2, iterations)

The cost after iteration 0: 0.3845395014051043
The cost after iteration 1000: 0.00016910230289556455
The cost after iteration 2000: 2.2398821097787057e-05
The cost after iteration 3000: 1.1788991810088925e-05
The cost after iteration 4000: 7.994388214255581e-06


In [41]:
caches = forward_propagation(layer_dims,test_x,parameters)
A,W,b = caches[-1]
cost = cost_calculate(A, test_y)
cost


array([[6.82274379e-06]])

#### RMSprop using keras implementation

In [42]:
from keras.models import Sequential
from keras.layers import Activation, Dense
from keras import optimizers
from keras.layers import BatchNormalization, Dropout

In [43]:
model = Sequential()
model.add(Dense(50, input_shape = (4, ),kernel_initializer='he_normal'))
model.add(Activation('relu'))
model.add(Dense(50,kernel_initializer='he_normal'))
model.add(Activation('relu'))
model.add(Dense(1,kernel_initializer='glorot_normal'))
model.add(Activation('sigmoid'))

In [44]:
adam = optimizers.Adam(lr = 0.001)
model.compile(optimizer = adam, loss = 'binary_crossentropy', metrics = ['accuracy'])

In [45]:
model.fit(train_x, train_y, batch_size=9, validation_data=(test_x, test_y), epochs=50)

Train on 80 samples, validate on 20 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x2a18d2347c8>