# Problem 5.6.2 
Consider multiclass classification (3 classes) with the case of 2 features and 3 data points (m=3):

$X = \begin{bmatrix} 1 & 2 & 4 \\ -2 & -5 & -8 \end{bmatrix}, Y = \begin{bmatrix} 0 & 1 & 2 \end{bmatrix}$

Assume that layer 1 has 2 units and that layer 2 has 3 units (one per class), with parameter matrices:

$W^{[1]} = \begin{bmatrix} 0.5 & 0.5 \\ 0.5 & -0.5 \end{bmatrix}, b^{[1]} = \begin{bmatrix} 0.5 \\ 0.5 \end{bmatrix},  W^{[2]} = \begin{bmatrix} -1 & 1 \\ 1 & -1 \\ -2 & 1 \end{bmatrix}, b^{[2]} = \begin{bmatrix} -0.1 \\ -0.1 \\ -0.1  \end{bmatrix}$

Assume activation functions $f^{[1]}(z)=\log(1+e^{z})$ and $f^{[2]}(z)=\mathrm{softmax}(z)$, and the cross-entropy loss function.

In [None]:
import numpy as np

In [None]:
# Problem data: one column of X per data point, one label in Y per column
X = np.array([
    [ 1,  2,  4],
    [-2, -5, -8],
])
Y = np.array([[0, 1, 2]])
# Layer 1 parameters (2 units acting on 2 features)
W1 = np.array([
    [0.5,  0.5],
    [0.5, -0.5],
])
b1 = np.array([[0.5], [0.5]])
# Layer 2 parameters (3 units acting on the 2 layer-1 activations)
W2 = np.array([
    [-1,  1],
    [ 1, -1],
    [-2,  1],
])
b2 = np.array([[-0.1], [-0.1], [-0.1]])
# Problem sizes
nclasses = 3       # number of classes
m = X.shape[1]     # number of data points (columns of X)

**(a)**	Compute the value of the loss function for the above $W^{[1]}, b^{[1]},  W^{[2]}, b^{[2]}$

In [None]:
def onehot(Y,nclass):
    """One-hot encode a row vector of class labels.

    Args:
        Y: array of shape (1, ndata); entry [0, j] is the class label of data
            point j. Labels are truncated to integers before use.
        nclass: number of classes, i.e. the number of rows of the result.

    Returns:
        Float array of shape (nclass, ndata) whose column j holds 1.0 in row
        Y[0, j] and 0.0 everywhere else.

    Raises:
        IndexError: if any label lies outside the range [0, nclass).
    """
    ndata = Y.shape[1]
    labels = Y[0, :].astype(int)
    # Negative labels must be rejected explicitly: plain indexing would wrap
    # them around to the last rows and silently produce a wrong encoding.
    if np.any((labels < 0) | (labels >= nclass)):
        raise IndexError(
            "class labels must lie in the range [0, {})".format(nclass)
        )
    Y_onehot = np.zeros((nclass,ndata))
    # one (row, column) pair per data point: row = label, column = data index
    Y_onehot[labels, np.arange(ndata)] = 1.0
    return Y_onehot

In [None]:
# Forward pass with the initial parameters
# Layer 1: affine map, then the activation f1(z) = log(1 + e^z)
Z1 = W1.dot(X) + b1
A1 = np.log(1 + np.exp(Z1))
print("Z1: \n{}".format(Z1))
print("A1: \n{}".format(A1))
# Layer 2: affine map on the layer-1 activations
Z2 = W2.dot(A1) + b2
print("Z2: \n{}".format(Z2))
# Softmax, spelled out step by step: exponentiate, then normalise each column
Z2exp = np.exp(Z2)
print("Z2exp: \n{}".format(Z2exp))
Sum = Z2exp.sum(axis=0, keepdims=True)  # one normaliser per data point
print("Sum: {}".format(Sum))
A2 = Z2exp / Sum
print("A2: \n{}".format(A2))

In [None]:
# One-hot encode the labels so they line up with the rows/columns of A2
Yh = onehot(Y, nclasses)
print("Yh: \n{}".format(Yh))
# Log of every softmax output
logA2 = np.log(A2)
print("logA2: \n{}".format(logA2))
# Multiplying by the one-hot mask zeroes every entry except the true class
YhlogA2 = np.multiply(Yh, logA2)
print("Yh*logA2: \n{}".format(YhlogA2))
# Cross entropy loss: negated sum of the masked logs, averaged over m points
Loss = -YhlogA2.sum() / m
print("Loss: {}".format(Loss))

**(b)**	Perform 1  epoch of training using Gradient Descent with learning rate of 0.1 and recompute the loss function with the updated $W^{[1]}, b^{[1]},  W^{[2]}, b^{[2]}$

In [None]:
# Backward pass, output layer
# dL/dA2 of the averaged cross entropy loss
print("Yonehot: \n{}".format(Yh))
grad_A2_L = -Yh / A2 / m
print("grad_A2_L: \n{}".format(grad_A2_L))
# dL/dZ2 through the softmax: A2*dL/dA2 minus A2 times the column sum of A2*dL/dA2
prod2 = np.multiply(A2, grad_A2_L)
print("A2*grad_A2_L: \n{}".format(prod2))
sumterm = prod2.sum(axis=0, keepdims=True)
print("sumterm: {}".format(sumterm))
grad_Z2_L = prod2 - A2 * sumterm
print("grad_Z2_L: \n{}".format(grad_Z2_L))
# Gradient handed back to layer 1, then the layer-2 parameter gradients
grad_A1_L = W2.T.dot(grad_Z2_L)
grad_b2_L = grad_Z2_L.sum(axis=1, keepdims=True)  # summed over data points
grad_W2_L = grad_Z2_L.dot(A1.T)
print("grad_W2_L: {}".format(grad_W2_L))
print("grad_b2_L: {}".format(grad_b2_L))
print("grad_A1_L: {}".format(grad_A1_L))

In [None]:
# Backward pass, layer 1
# Derivative of log(1 + e^z) written in terms of the activation itself:
# with A1 = log(1 + e^Z1), 1 - e^(-A1) = e^Z1 / (1 + e^Z1)
dA1dZ1 = 1 - np.exp(np.negative(A1))
print("dA1/dZ1: {}".format(dA1dZ1))
# Chain rule: elementwise product with the gradient coming from layer 2
grad_Z1_L = np.multiply(grad_A1_L, dA1dZ1)
print("grad_Z1_L: {}".format(grad_Z1_L))
# Layer-1 parameter gradients (bias gradient is summed over data points)
grad_b1_L = grad_Z1_L.sum(axis=1, keepdims=True)
grad_W1_L = grad_Z1_L.dot(X.T)
print("grad_W1_L: {}".format(grad_W1_L))
print("grad_b1_L: {}".format(grad_b1_L))

In [None]:
# Gradient descent, epoch 1
# NOTE: this cell rebinds W1, b1, W2 and b2 to their updated values, so
# running it again without re-running the inputs cell applies a further step
# with the same (now stale) gradients.
alpha = 0.1  # learning rate
# Move every parameter one step against its gradient
W1, b1, W2, b2 = (
    W1 - alpha * grad_W1_L,
    b1 - alpha * grad_b1_L,
    W2 - alpha * grad_W2_L,
    b2 - alpha * grad_b2_L,
)
print("W1 update: \n{}".format(W1))
print("b1 update: \n{}".format(b1))
print("W2 update: {}".format(W2))
print("b2 update: {}".format(b2))

In [None]:
# Forward pass with the updated parameters
# Layer 1: affine map, then the activation f1(z) = log(1 + e^z)
Z1 = W1.dot(X) + b1
A1 = np.log(1 + np.exp(Z1))
print("Z1: \n{}".format(Z1))
print("A1: \n{}".format(A1))
# Layer 2: affine map, then softmax spelled out step by step
Z2 = W2.dot(A1) + b2
print("Z2: \n{}".format(Z2))
Z2exp = np.exp(Z2)
print("Z2exp: \n{}".format(Z2exp))
Sum = Z2exp.sum(axis=0, keepdims=True)  # one normaliser per data point
print("Sum: {}".format(Sum))
A2 = Z2exp / Sum
print("A2: \n{}".format(A2))
# Masked log-probabilities, using the one-hot labels Yh computed earlier
logA2 = np.log(A2)
print("logA2: \n{}".format(logA2))
YhlogA2 = np.multiply(Yh, logA2)
print("Yh*logA2: \n{}".format(YhlogA2))
# Cross entropy loss after the update, averaged over the m data points
Loss = -YhlogA2.sum() / m
print("Loss: {}".format(Loss))

**(c)**	Compute the prediction for the input feature matrix $X$ above, using the parameters obtained after the 1 epoch of training.

In [None]:
def onehot_inverse(A):
    """Turn class scores back into class labels.

    Args:
        A: array whose axis 0 runs over the classes, e.g. a softmax output
            with one column per data point.

    Returns:
        Array holding, for each position along the remaining axes, the index
        of the largest entry along axis 0, with a leading axis of length 1
        prepended (shape (1, ndata) for a 2-D input).
    """
    winners = np.argmax(A, axis=0)
    # prepend a length-1 axis so the result is a row vector
    return np.asarray(winners)[np.newaxis, ...]

In [None]:
# Forward pass with the current (updated) parameters, followed by prediction
# Layer 1: affine map, then the activation f1(z) = log(1 + e^z)
Z1 = W1.dot(X) + b1
print("Z1: {}".format(Z1))
A1 = np.log(1 + np.exp(Z1))
print("A1: {}".format(A1))
# Layer 2: affine map, then softmax normalised over each column
Z2 = W2.dot(A1) + b2
print("Z2: {}".format(Z2))
Z2exp = np.exp(Z2)
A2 = Z2exp / Z2exp.sum(axis=0, keepdims=True)
print("A2: {}".format(A2))
# Predicted label of each data point = row index of its largest probability
Y_pred = onehot_inverse(A2)
print("Prediction: {}".format(Y_pred))

**(d)**	Compute the accuracy of the prediction in (c) when compared against the actual Y specified above.

In [None]:
# Accuracy = fraction of data points whose predicted label matches Y
label_gap = np.abs(Y - Y_pred)
is_correct = label_gap < 1e-7  # labels agree when their difference is ~0
accuracy = is_correct.mean()
print("accuracy: {}".format(accuracy))