# Problem 5.6.2 
Consider multiclass classification (3 classes) with the case of 2 features and 3 data points (m=3):

$X = \begin{bmatrix} 1 & 2 & 4 \\ -2 & -5 & -8 \end{bmatrix}, Y = \begin{bmatrix} 0 & 1 & 2 \end{bmatrix}$

Assume that layer 1 has 2 units and that layer 2 has 1 units with parameter matrices:

$W^{[1]} = \begin{bmatrix} 0.5 & 0.5 \\ 0.5 & -0.5 \end{bmatrix}, b^{[1]} = \begin{bmatrix} 0.5 \\ 0.5 \end{bmatrix},  W^{[2]} = \begin{bmatrix} -1 & 1 \\ 1 & -1 \\ -2 & 1 \end{bmatrix}, b^{[2]} = \begin{bmatrix} -0.1 \\ -0.1 \\ -0.1  \end{bmatrix}$

Assume activation functions $f^{[1]}  (z)=log⁡(1+e^z$) and $f^{[2]}  (z)=softmax(z) $ and  cross entropy loss function.

In [48]:
import numpy as np

In [49]:
# inputs
X = np.array([[1,2,4],[-2,-5,-8]])
Y = np.array([[0,1,2]])
W1 = np.array([[0.5,0.5],[0.5,-0.5]])
b1 = np.array([[0.5],[0.5]])
W2 = np.array([[-1,1],[1,-1],[-2,1]])
b2 = np.array([[-0.1],[-0.1],[-0.1]])
m = X.shape[1]
nclasses = 3

**(a)**	Compute the value of the loss function for the above $W^{[1]}, b^{[1]},  W^{[2]}, b^{[2]}$

In [50]:
def onehot(Y,nclass):
    ndata = Y.shape[1]
    Y_onehot = np.zeros((nclass,ndata))
    for count in range(ndata):
        Y_onehot[int(Y[0,count]),count] = 1.0
    return Y_onehot

In [51]:
# forward propagation
# layer 1
Z1 = np.dot(W1,X) + b1
print("Z1: \n{}".format(Z1))
A1 = np.log(1+np.exp(Z1))
print("A1: \n{}".format(A1))
# layer 2
Z2 = np.dot(W2,A1) + b2
print("Z2: \n{}".format(Z2))
Z2exp = np.exp(Z2)
print("Z2exp: \n{}".format(Z2exp))
Sum = np.sum(Z2exp,axis=0,keepdims=True)
print("Sum: {}".format(Sum))
A2 = Z2exp/Sum
print("A2: \n{}".format(A2))

Z1: 
[[ 0.  -1.  -1.5]
 [ 2.   4.   6.5]]
A1: 
[[0.69314718 0.31326169 0.20141328]
 [2.12692801 4.01814993 6.50150231]]
Z2: 
[[ 1.33378083  3.60488824  6.20008903]
 [-1.53378083 -3.80488824 -6.40008903]
 [ 0.64063365  3.29162655  5.99867575]]
Z2exp: 
[[3.79536593e+00 3.67775734e+01 4.92792914e+02]
 [2.15718528e-01 2.22616850e-02 1.66140935e-03]
 [1.89768297e+00 2.68865605e+01 4.02894908e+02]]
Sum: [[  5.90876742  63.68639559 895.68948316]]
A2: 
[[6.42327859e-01 5.77479272e-01 5.50182762e-01]
 [3.65082111e-02 3.49551655e-04 1.85489433e-06]
 [3.21163930e-01 4.22171176e-01 4.49815383e-01]]


In [52]:
Yh = onehot(Y,nclasses)
print("Yh: \n{}".format(Yh))
logA2 = np.log(A2)
print("logA2: \n{}".format(logA2))
YhlogA2 = Yh*logA2
print("Yh*logA2: \n{}".format(YhlogA2))
# compute loss
Loss = -np.sum(Yh*np.log(A2))/m
print("Loss: {}".format(Loss))

Yh: 
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
logA2: 
[[ -0.44265642  -0.54908273  -0.59750476]
 [ -3.31021808  -7.95885921 -13.19768283]
 [ -1.1358036   -0.86234442  -0.79891804]]
Yh*logA2: 
[[-0.44265642 -0.         -0.        ]
 [-0.         -7.95885921 -0.        ]
 [-0.         -0.         -0.79891804]]
Loss: 3.066811223752603


**(b)**	Perform 1  epoch of training using Gradient Descent with learning rate of 0.1 and recompute the loss function with the updated $W^{[1]}, b^{[1]},  W^{[2]}, b^{[2]}$

In [53]:
# back propagation
# dL/dA2
print("Yonehot: \n{}".format(Yh))
grad_A2_L = -Yh/A2/m
print("grad_A2_L: \n{}".format(grad_A2_L))
# layer 2
# dL/dZ2
prod2 = A2*grad_A2_L
print("A2*grad_A2_L: \n{}".format(prod2))
sumterm = np.sum(prod2,axis=0,keepdims=True)
print("sumterm: {}".format(sumterm))
grad_Z2_L = prod2 - A2*sumterm
print("grad_Z2_L: \n{}".format(grad_Z2_L))
grad_W2_L = np.dot(grad_Z2_L,A1.T)
grad_b2_L = np.sum(grad_Z2_L,axis=1,keepdims=True)
grad_A1_L = np.dot(W2.T,grad_Z2_L)
print("grad_W2_L: {}".format(grad_W2_L))
print("grad_b2_L: {}".format(grad_b2_L))
print("grad_A1_L: {}".format(grad_A1_L))
# layer 1
dA1dZ1 = 1-A1*A1
print("dA1/dZ1: {}".format(dA1dZ1))
grad_Z1_L = grad_A1_L*dA1dZ1
print("grad_Z1_L: {}".format(grad_Z1_L))
grad_W1_L = np.dot(grad_Z1_L,X.T)
grad_b1_L = np.sum(grad_Z1_L,axis=1,keepdims=True)
print("grad_W1_L: {}".format(grad_W1_L))
print("grad_b1_L: {}".format(grad_b1_L))
# gradient descent epoch 1
alpha = 0.1
# update parameters
W1 = W1 - alpha*grad_W1_L
b1 = b1 - alpha*grad_b1_L
W2 = W2 - alpha*grad_W2_L
b2 = b2 - alpha*grad_b2_L
print("W1 update: \n{}".format(W1))
print("b1 update: \n{}".format(b1))
print("W2 update: {}".format(W2))
print("b2 update: {}".format(b2))
# forward propagation
# layer 1
Z1 = np.dot(W1,X) + b1
print("Z1: \n{}".format(Z1))
A1 = np.log(1+np.exp(Z1))
print("A1: \n{}".format(A1))
# layer 2
Z2 = np.dot(W2,A1) + b2
print("Z2: \n{}".format(Z2))
Z2exp = np.exp(Z2)
print("Z2exp: \n{}".format(Z2exp))
Sum = np.sum(Z2exp,axis=0,keepdims=True)
print("Sum: {}".format(Sum))
A2 = Z2exp/Sum
print("A2: \n{}".format(A2))
logA2 = np.log(A2)
print("logA2: \n{}".format(logA2))
YhlogA2 = Yh*logA2
print("Yh*logA2: \n{}".format(YhlogA2))
# recompute loss
Loss = -np.sum(Yh*np.log(A2))/m
print("Loss: {}".format(Loss))

Yonehot: 
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
grad_A2_L: 
[[-5.18945782e-01 -0.00000000e+00 -0.00000000e+00]
 [-0.00000000e+00 -9.53602503e+02 -0.00000000e+00]
 [-0.00000000e+00 -0.00000000e+00 -7.41044761e-01]]
A2*grad_A2_L: 
[[-0.33333333 -0.         -0.        ]
 [-0.         -0.33333333 -0.        ]
 [-0.         -0.         -0.33333333]]
sumterm: [[-0.33333333 -0.33333333 -0.33333333]]
grad_Z2_L: 
[[-1.19224047e-01  1.92493091e-01  1.83394254e-01]
 [ 1.21694037e-02 -3.33216816e-01  6.18298111e-07]
 [ 1.07054643e-01  1.40723725e-01 -1.83394872e-01]]
grad_W2_L: [[ 0.01459894  1.7122233 ]
 [-0.09594875 -1.31302766]
 [ 0.08134981 -0.39919564]]
grad_b2_L: [[ 0.2566633 ]
 [-0.32104679]
 [ 0.0643835 ]]
grad_A1_L: [[-8.27158358e-02 -8.07157358e-01  1.83396109e-01]
 [-2.43388074e-02  6.66433632e-01 -1.23659622e-06]]
dA1/dZ1: [[  0.51954699   0.90186712   0.95943269]
 [ -3.52382276 -15.14552884 -41.26953229]]
grad_Z1_L: [[-4.29747632e-02 -7.27948678e-01  1.75956222e-01]
 [ 8.57656435e-02 -

**(c)**	Compute the prediction based on input feature matrix X above after the 1 epoch.

In [54]:
def onehot_inverse(A):
    return np.expand_dims(np.argmax(A,axis=0),axis=0)

In [55]:
# forward propagation
# layer 1
Z1 = np.dot(W1,X) + b1
print("Z1: {}".format(Z1))
A1 = np.log(1+np.exp(Z1))
print("A1: {}".format(A1))
# layer 2
Z2 = np.dot(W2,A1) + b2
print("Z2: {}".format(Z2))
Z2exp = np.exp(Z2)
A2 = Z2exp/np.sum(Z2exp,axis=0,keepdims=True)
print("A2: {}".format(A2))
# prediction
Y_pred = onehot_inverse(A2)
print("Prediction: {}".format(Y_pred))

Z1: [[ 0.60261007  0.37752774  0.73195012]
 [15.06997018 34.168724   55.77757879]]
A1: [[ 1.03917394  0.89962214  1.12464748]
 [15.06997047 34.168724   55.77757879]]
Z2: [[ 11.32329766  27.29167364  44.97525611]
 [-12.1099923  -28.84191744 -47.3862854 ]
 [ 13.47831723  33.61972352  55.63931311]]
A2: [[1.03863094e-01 1.78232997e-03 2.33694635e-05]
 [6.91055193e-12 7.45554255e-28 1.80526485e-45]
 [8.96136906e-01 9.98217670e-01 9.99976631e-01]]
Prediction: [[2 2 2]]


**(d)**	Compute the accuracy of the prediction in (c) when compared against the actual Y specified above.

In [56]:
# compute accuracy
accuracy = np.mean(np.absolute(Y-Y_pred)<1e-7)
print("accuracy: {}".format(accuracy))

accuracy: 0.3333333333333333
