In [1]:
import numpy as np

#### softmax activation function

$$softmax(n_{i})=\frac{e^{z_{ni}}}{\sum_{j=1}^{k} e^{z_{nj}}}$$

In [2]:
def softmax(x):
    exp_x = np.exp(x - np.max(x))
    return exp_x / np.sum(exp_x)

* Example

In [4]:
z = np.array([1.5, -0.5, 0.75])
softmax(z)

array([0.62200588, 0.08417934, 0.29381477])

##### Exercises
* (1) Suppose a neural netowrk has 5 output nodes $n_1,\dots,n_5$

* (a) $ z_{n1} = 3, z_{n2} = 5, z_{n_3}=0.5, z_{n4}=-2, z_{n5} =1.7$

In [5]:
z_a = np.array([3,5,0.5,-2,1.7])
softmax(z_a)

array([1.14280348e-01, 8.44423906e-01, 9.38070225e-03, 7.70014931e-04,
       3.11450283e-02])

* (b) $ z_{n1} = -2, z_{n2} = -5, z_{n_3}=-0.5, z_{n4}=0.1, z_{n5} = -1.5$

In [6]:
z_b = np.array([-2, -5, -.5, 0.1, -1.5])
softmax(z_b)

array([0.065162  , 0.00324422, 0.29203583, 0.53212397, 0.10743398])

* (c) $z_{n1} = 0, z_{n2} =0.2, z_{n_3}= -0.1, z_{n4}=0.1, z_{n5} =-0.7$

In [7]:
z_c = np.array([0,0.2,-0.1,0.1, -0.7])
softmax(z_c)

array([0.21150608, 0.25833411, 0.19137862, 0.23375037, 0.10503081])

### Cross-entropy loss function

$$ L_{CE}(y,t) = -\sum_{i=1}^{k} t_{i}\ln(y_i)$$

In [8]:
def cross_entropy_loss(y_pred,y_true):
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    loss = -np.sum(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    return loss

* (2) Compute $L_{CE}(y,t)$ for the following probability distributions
* (i) $y = (0.5, 0.3, 0.2),t = ( 1, 0, 0)$

In [9]:
y_i = np.array([0.5, 0.3,0.2])
t_i = np.array([1,0,0])
cross_entropy_loss(y_i,t_i)

1.2729656758128876

In [10]:
def ce_loss(y, t):
    loss = -np.sum(t * np.log(y)+ (1-t) * np.log(1 - y))
    return loss

In [11]:
ce_loss(y_i,t_i)

1.2729656758128876

* (ii) $y = (0.5, 0.3, 0.2),t = (0, 1, 0)$

In [12]:
y_ii = np.array([0.5, 0.3, 0.2])
t_ii = np.array([0,1,0])
ce_loss(y_ii, t_ii)

2.120263536200091

* (iii) $y = (0.5, 0.3, 0.2),t = ( 0, 0, 1)$

In [13]:
y_iii = np.array([0.5,0.3,0.2])
t_iii = np.array([0,0,1])
ce_loss(y_iii,t_iii)

2.6592600369327783

* (iv)$y = (0.1, 0.2, 0.7),t = ( 0.3, 0.3, 0.4)$

In [14]:
y_iv = np.array([0.1, 0.2, 0.7])
t_iv = np.array([0.3, 0.3, 0.4])
ce_loss(y_iv, t_iv)

2.2686134086799234

* (v) $y = (0.1, 0.2, 0.7),t = ( 0.2, 0.2, 0.6)$

In [15]:
y_v = np.array([0.1, 0.2, 0.7])
t_v = np.array([0.2, 0.2, 0.6])
ce_loss(y_v, t_v)

1.7408019427568717

##### Exercises II


* Consider a network with 2 input nodes, one hiden layer with 2 nodes,2 output nodes.
* activation function in the hidden layer is sigmoid
* output layer uses softmax and targets are ine-hot encoded

In [16]:
w1 = np.array([[-2,1], [3,0]])
w2 = np.array([[2,3], [-1,2]])
b1 = np.array([0.5,1.5])
b2 = np.array([2,-1])
x  = np.array([-1,1])
t  = np.array([1,0])

* first feed input into the network to get output and compute the loss

In [17]:
def sigmoid(x):
    return 1 / 1 + np.exp(-x)

In [18]:
hidden_1 = sigmoid(np.dot(x,w1) + b1)
output_1 = softmax(np.dot(hidden_1,w2) + b2)
print("Output", np.round(output_1,4))

Output [0.0561 0.9439]


In [19]:
def sumofsquares(y,t):
    loss_sum = 0.5 * np.sum(np.square(y-t))
    return loss_sum

In [20]:
sumofsquares(output_1,t)

0.8910265888763901

* (b) Perform one iteration of backpropagation training with n=0.1

In [21]:
def sigmoid_dt(x):
    return x*(1-x)

In [22]:
lr = 0.1
delta_err = (output_1 - t) * sigmoid_dt(output_1)
hidden_err = np.dot(delta_err, w2.T) * sigmoid_dt(hidden_1)

In [24]:
w2 = w2.astype('float64')
b2 = b2.astype('float64')
w1 = w1.astype('float64')
b1 = b1.astype('float64')

In [25]:
w1 -= lr * np.dot(x.T, hidden_err)
b1 -= lr * np.sum(hidden_err, axis=0)
w2 -= lr * np.dot(hidden_1, delta_err)
b2 -= lr * np.sum(delta_err, axis=0)

In [26]:
print("Updated weights W_1:", w1)
print("Updated weights W_2:", w2)
print("Updated biases b_1:", b1)
print("Updated biases b_2:", b2)

Updated weights W_1: [[-1.98541922  1.01458078]
 [ 3.01458078  0.01458078]]
Updated weights W_2: [[ 1.99699085  2.99699085]
 [-1.00300915  1.99699085]]
Updated biases b_1: [0.51462178 1.51462178]
Updated biases b_2: [ 2. -1.]


* Feed the input into network and check for the updated loss

In [28]:
hidden_1 = sigmoid(np.dot(x,w1)+b1)
updated_out = softmax(np.dot(hidden_1, w2)+b2)
print("updated output", np.round(updated_out,4))

updated output [0.0575 0.9425]


In [29]:
sumofsquares(updated_out,t)

0.8883528604776105