In [1]:
import numpy as np

In [2]:
#AND dataset
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) #X1 and X2
Y = np.array([[0, 0, 0, 1]]).T #Y 

In [3]:
#sigmoid function for the single unit in the o/p unit
def sig(z):
    return 1/(1 + np.exp(-z))

In [6]:
#d(sig(z))/d(z)
def derivative_sig(z):
    return sig(z) * (1 - sig(z))

In [25]:
#initializing weights
weights = 2 * np.random.random((2, 1)) - 1
bias = 2 * np.random.random(1) - 1
lr = 0.1 #learning rate for updating weights using gradient descent
weights, bias

(array([[ 0.20007817],
        [-0.16827016]]),
 array([0.11647475]))

In [22]:
for iter in range(10000):
    #Forward propagation
    output0 = X
    output = sig(np.dot(output0, weights) + bias)

    #gradient descent for weight optimization
    first_term = output - Y #d(E)/d(Oj) = y_pred - y_actual
    input_for_last_layer = np.dot(output0, weights) + bias #i/p_j
    second_term = derivative_sig(input_for_last_layer) #d(Oj)/d(i/p_j) = sig(z)(1 - sig(z))
    first_two = first_term * second_term
    #Now, we have our first two terms of gradient descent ready. And, as expected their dimension is 4X1, as we said in the notes.
    #Use this to verify: first_two.shape
    #Now, let's compute the third term: d(i/p_j)/d(w_ij), and then we need to  multipy it with the first two terms and do summation for each training data point in order to get our gradient descent.
    #In our code, to do that we just need to:
    #For w11: multiply each x1 value with its corresponding element of first_two
    #For w21: multiply each x2 value with its corresponding element of first_two
    changes = np.array([[0.0], [0.0]]) # array in which we will store the summation (the sum values). Same dimension as that of weights i.e. 2X1 
    #So, changes, will be our updated weights!
    for i in range(2): #we have to go through values of 2 features x1 nad x2
        for j in range(4): #we have 4 rows i.e. 4 values of each feature
            #If we're dealing with w11, then changes[0][0], for w21 -> changes[1][0]; therefore, changes[i_th feature][0_th column]
            changes[i][0] += first_two[j][0] * output0[j][i] 
    weights = weights - lr * changes
    #We're done with weights updation, but, we still have to find a way to update the biases.
    #It's exactly same, we just need to multiply  with 1 instead of output[j][i]. Also, as we just have a single bias, we don't iterate twice, we just iterate once over the 4 rows of i/p
    bias_change = 0.0
    for j in range(4):
        bias_change += first_two[j][0] * 1
    bias = bias - lr * bias_change
    #Finally, we have successfully updated our bias as well as our weights

    #We'll run this cell many times, say 10000, as we updating just once won't be enough to get the optimal weights
output = sig(np.dot(X, weights) + bias) #calculating o/p final time
weights, bias, output

(array([[5.47433843],
        [5.47433846]]),
 array([-8.30422242]),
 array([[2.47408472e-04],
        [5.57305043e-02],
        [5.57305029e-02],
        [9.33668371e-01]]))

In [23]:
#The third array shows the output. We expect it to be 0,0,0,1 but it is what you are seeing above. However, it does tell us
#that we're going in the correct direction uptil now, as the last one is >0.5, the first three are <0.5.
#But, we want the last one to be very close to 1 and the first three to be very close to 0.
#So, first of all let's try and change the learning rate, make it 0.01

#Yes, as we incresed the learning rate from 0.001 to 0.01, we got much better results.
#Let's give another shot and make it 0.1
#Yay! We're getting much better results now!

In [24]:
#Thus, we have successfully trained our first neural network having no hidden layers.

In [26]:
#Optimizing above cell's code by replacing for loops for weight updation with vector operations, here, metrix multiplication
#And replacing for loop for bias updation with np.sum
for iter in range(10000):
    #Forward propagation
    output0 = X
    output = sig(np.dot(output0, weights) + bias)

    #gradient descent for weight optimization
    first_term = output - Y #d(E)/d(Oj) = y_pred - y_actual
    input_for_last_layer = np.dot(output0, weights) + bias #i/p_j
    second_term = derivative_sig(input_for_last_layer) #d(Oj)/d(i/p_j) = sig(z)(1 - sig(z))
    first_two = first_term * second_term
    #Now, we have our first two terms of gradient descent ready. And, as expected their dimension is 4X1, as we said in the notes.
    #Use this to verify: first_two.shape
    #Now, let's compute the third term: d(i/p_j)/d(w_ij), and then we need to  multipy it with the first two terms and do summation for each training data point in order to get our gradient descent.
    #In our code, to do that we just need to:
    #For w11: multiply each x1 value with its corresponding element of first_two
    #For w21: multiply each x2 value with its corresponding element of first_two
    changes = np.dot(output0.T, first_two)
    weights = weights - lr * changes
    #We're done with weights updation, but, we still have to find a way to update the biases.
    #It's exactly same, we just need to multiply  with 1 instead of output[j][i]. Also, as we just have a single bias, we don't iterate twice, we just iterate once over the 4 rows of i/p
    bias_change = np.sum(first_two)
    bias = bias - lr * bias_change
    #Finally, we have successfully updated our bias as well as our weights

    #We'll run this cell many times, say 10000, as we updating just once won't be enough to get the optimal weights
output = sig(np.dot(X, weights) + bias) #calculating o/p final time
weights, bias, output

#Before running this cell run the weight initialization cell once again to avoid the neural network to be pre-trained due to the above un-optimized code.

(array([[5.47567819],
        [5.47567819]]),
 array([-8.30622607]),
 array([[2.46913370e-04],
        [5.56955759e-02],
        [5.56955762e-02],
        [9.33710215e-01]]))

In [27]:
#Thus, we have successfully trained our first neural network having no hidden layers.