# Forward and Backpropagation Lecture - Elementary Neural Network

In [504]:
import numpy as np

In [505]:
# Single training example as a 1x3 row vector.
# NOTE(review): the trailing 1 presumably serves as the bias input - confirm.
X = np.array([[0.333, 0.013, 1]])

In [506]:
# Target value the network output is compared against in the loss.
Y = np.array([-2])

In [507]:
# Hidden-layer weights: one row per hidden unit (4), one column per entry of X (3).
Theta_1 = np.asarray([
    [0.947, -0.010, 2.205],
    [2.299, 0.121, -2.276],
    [-2.465, -0.855, 0.695],
    [-2.309, -2.961, 1.107],
])

In [508]:
# Output-layer weights: 1 output x 5 inputs (4 hidden activations plus the
# constant 1 appended to them in the forward pass).
Theta_2 = np.asarray([[-2.883, -1.067, 0.987, -1.758, -1.435]])

In [509]:
# Keep an independent snapshot of the starting hidden-layer weights for the
# gradient check further down. A plain assignment would only alias Theta_1,
# so any in-place update of Theta_1 would silently change the "initial" values.
Init_Theta_1 = Theta_1.copy()

In [510]:
# Keep an independent snapshot of the starting output-layer weights for the
# gradient check further down. A plain assignment would only alias Theta_2,
# so any in-place update of Theta_2 would silently change the "initial" values.
Init_Theta_2 = Theta_2.copy()

In [511]:
def TANH(x):
    """Hyperbolic-tangent activation, applied element-wise via ``np.tanh``."""
    return np.tanh(x)

In [512]:
def TANH_GRAD(x):
    """Derivative of tanh evaluated at ``x``: ``1 - tanh(x)**2``, element-wise."""
    activation = np.tanh(x)
    return 1 - activation**2

In [513]:
# Number of passes made by the training loop below.
iters = 500

In [514]:
lr = 0.01 # Learning rate: step size applied to each gradient in the weight update

In [515]:
# Start every run of this cell from the initial weights so that re-running it
# reproduces the same training trace instead of continuing from weights that
# an earlier run already trained.
Theta_1 = Init_Theta_1.copy()
Theta_2 = Init_Theta_2.copy()

for i in range(iters):
    # Forward propagation
    z = np.dot(X,Theta_1.T)                  # hidden-layer pre-activations
    a = np.asarray([np.append(TANH(z),1)])   # hidden activations plus a constant 1 for the output bias
    out = np.dot(a,Theta_2.T)                # linear output unit
    J = 0.5*(out - Y)**2                     # squared-error loss

    # Backpropagation
    J_prime = out - Y                        # dJ/d(out)
    Delta_2 = J_prime*a                      # gradient w.r.t. Theta_2
    # The bias column of Theta_2 is dropped: the appended 1 has no incoming weights.
    Delta_1 = np.dot((np.dot(J_prime,Theta_2[:,:-1])*TANH_GRAD(z)).T,X)

    # SGD weight update
    Theta_1 = Theta_1 - lr*Delta_1
    Theta_2 = Theta_2 - lr*Delta_2

    # Report the first iteration and then every 50th one.
    if (i + 1) % 50 == 0 or i == 0:
        print("Iteration :",i+1," Loss: ",J)
    

Iteration : 1  Loss:  [[ 1.91939201]]
Iteration : 50  Loss:  [[ 0.00134328]]
Iteration : 100  Loss:  [[  8.92859860e-07]]
Iteration : 150  Loss:  [[  5.97532208e-10]]
Iteration : 200  Loss:  [[  3.99960222e-13]]
Iteration : 250  Loss:  [[  2.67715972e-16]]
Iteration : 300  Loss:  [[  1.79197569e-19]]
Iteration : 350  Loss:  [[  1.19953685e-22]]
Iteration : 400  Loss:  [[  8.02275485e-26]]
Iteration : 450  Loss:  [[  6.66587465e-29]]
Iteration : 500  Loss:  [[  4.83177304e-30]]


In [516]:
# Network output from the last training iteration (the target Y is -2).
out

array([[-2.]])

In [517]:
# Show the weight matrices as left by the training loop.
print(Theta_1, Theta_2)

[[ 0.94043579 -0.01025626  2.18528767]
 [ 2.28071549  0.12028619 -2.33090845]
 [-2.3760237  -0.85152645  0.96219609]
 [-2.46386524 -2.96704579  0.64193922]] [[-2.61207966 -1.31728155  0.98864537 -1.74388351 -1.16047389]]


In [518]:
def NumericalGradients(x, y, initTh1, initTh2, e=1e-4, activation=None):
    """Estimate the loss gradients of the two-layer network by central differences.

    The loss is ``0.5 * (out - y)**2`` where ``out`` is the network output for
    the single example ``x``: a hidden layer ``activation(x . initTh1^T)``, a
    constant 1 appended to the hidden activations, then a linear output layer
    ``initTh2``. Each weight is perturbed by ``+e`` and ``-e`` in turn and the
    gradient entry is ``(J(+e) - J(-e)) / (2 * e)``.

    Parameters
    ----------
    x : array, one example as a row vector.
    y : target value for that example.
    initTh1 : hidden-layer weight matrix at which to evaluate the gradient.
    initTh2 : output-layer weight matrix at which to evaluate the gradient.
    e : perturbation size for the finite difference (default ``1e-4``).
    activation : element-wise hidden activation. When omitted, the notebook's
        ``TANH`` function is used (looked up when the function is called).

    Returns
    -------
    (numgrad_1, numgrad_2) : float arrays with the shapes of ``initTh1`` and
        ``initTh2``. The input weight arrays are not modified.

    Raises
    ------
    ValueError
        If the loss does not reduce to a single value (for example when more
        than one target is supplied).
    """
    if activation is None:
        activation = TANH

    def _loss(th1, th2):
        # Forward pass; the appended 1 is the bias input of the output layer.
        hidden = np.asarray([np.append(activation(np.dot(x, th1.T)), 1)])
        out = np.dot(hidden, th2.T)
        # .item() turns the 1x1 loss into a Python float, so it can be stored
        # in a single gradient slot without relying on NumPy's deprecated
        # implicit array-to-scalar conversion.
        return (0.5 * (out - y) ** 2).item()

    def _central_differences(theta, loss_of):
        # Work on a float copy so the caller's weights are never touched.
        theta = np.array(theta, dtype=float)
        grad = np.zeros_like(theta)
        for idx in np.ndindex(*theta.shape):
            original = theta[idx]
            theta[idx] = original + e
            loss_plus = loss_of(theta)
            theta[idx] = original - e
            loss_minus = loss_of(theta)
            theta[idx] = original  # restore before moving to the next weight
            grad[idx] = (loss_plus - loss_minus) / (2 * e)
        return grad

    numgrad_1 = _central_differences(initTh1, lambda th: _loss(th, initTh2))
    numgrad_2 = _central_differences(initTh2, lambda th: _loss(initTh1, th))
    return numgrad_1, numgrad_2
    

In [519]:
# Numerical gradients evaluated at the initial (pre-training) weights, so they
# can be compared with the backpropagation gradients computed at the same point.
numgrad_Th1, numgrad_Th2 = NumericalGradients(X,Y,Init_Theta_1,Init_Theta_2)
print(numgrad_Th1,numgrad_Th2)

[[  4.80625850e-02   1.87631713e-03   1.44332088e-01]
 [  1.23796317e-01   4.83288926e-03   3.71760714e-01]
 [ -6.32029070e-01  -2.46738075e-02  -1.89798519e+00]
 [  1.04989209e+00   4.09867781e-02   3.15282907e+00]] [[-1.93408795  1.77655213  0.26667783 -0.57006406 -1.95928151]]


In [520]:
# One forward/backward pass at the initial weights, giving the analytic
# (backpropagation) gradients to compare with the numerical ones.

# Forward pass
z = X @ Init_Theta_1.T
a = np.append(TANH(z), 1).reshape(1, -1)   # hidden activations plus the constant 1
out = a @ Init_Theta_2.T
J = 0.5 * (out - Y) ** 2

# Backward pass
J_prime = out - Y
Delta_2 = J_prime * a
# Only the non-bias columns of the output weights feed back into the hidden units.
Delta_1 = ((J_prime @ Init_Theta_2[:, :-1]) * TANH_GRAD(z)).T @ X

print(Delta_1,Delta_2)

[[  4.80625850e-02   1.87631713e-03   1.44332087e-01]
 [  1.23796317e-01   4.83288926e-03   3.71760712e-01]
 [ -6.32029070e-01  -2.46738075e-02  -1.89798519e+00]
 [  1.04989209e+00   4.09867781e-02   3.15282909e+00]] [[-1.93408795  1.77655213  0.26667783 -0.57006406 -1.95928151]]


In [521]:
# Relative difference between the backprop and numerical gradients for Theta_1:
# ||analytic - numeric|| / ||analytic + numeric||. A tiny value means they agree.
np.linalg.norm(Delta_1-numgrad_Th1)/np.linalg.norm(Delta_1+numgrad_Th1)

2.200108735243527e-09

In [522]:
# Relative difference between the backprop and numerical gradients for Theta_2:
# ||analytic - numeric|| / ||analytic + numeric||. A tiny value means they agree.
np.linalg.norm(Delta_2-numgrad_Th2)/np.linalg.norm(Delta_2+numgrad_Th2)

1.0615781173761242e-12

## Linear Regression via the "Normal Equation"

In [523]:
# Normal equation: theta = pinv(X^T X) X^T y.
# The target comes from Y rather than a hard-coded -2, so the fit stays
# consistent with the data defined at the top of the notebook.
# pinv is used because X^T X is not invertible for a single example.
LR_coef = np.dot(np.dot(np.linalg.pinv(np.dot(X.T, X)), X.T), Y.reshape(-1, 1))

In [524]:
# Prediction of the fitted linear model for the example X.
np.dot(LR_coef.T,X.T)

array([[-2.]])

In [525]:
# Display the fitted linear-regression coefficients (one per entry of X).
LR_coef

array([[-0.59942865],
       [-0.02340112],
       [-1.80008604]])

### Linear regression has a global minimum and reaches it with only 3 parameters. The neural net gets very close to the same target, but needs 17 parameters, over 5 times as many.