## Loss Error

In [6]:
import numpy as np
import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F

#### MSE: Mean Square Error

$$
\begin{equation}
\begin{aligned}
loss &= \sum [y - (wx + b)]^{2} \\
&= \sum [y - f_{\theta}(x)]^{2} \\
\frac{\partial{loss}}{\partial{\theta}} &= -2 \cdot \sum \left( [y - f_{\theta}(x)] \cdot \frac{\partial{f_{\theta}(x)}}{\partial{\theta}} \right) \\
\end{aligned}
\end{equation}
$$

where:

$loss$: error between target **y** and prediction/output; 
$y$: target;
$w$: weights;
$x$: input;
$b$: bias;

$f_{\theta}(x)$: function **f** with parameters **$\theta$**, which here is the same as $wx + b$;

#### Method 1 to get gradient: autograd.grad( )

In [22]:
# Method 1: ask autograd directly for the gradients of the loss w.r.t. w and b.
# input x: 1-element tensor holding the value 1.
x = torch.ones(1)
# weight w: 1-element tensor holding the value 2.
# its gradient is needed, so 'requires_grad' is set to True
w = torch.full([1], 2., requires_grad=True)
# bias b: 1-element tensor holding the value 2.
# its gradient is needed, so 'requires_grad' is set to True
b = torch.full([1], 2., requires_grad=True)
# target t: 1-element tensor holding the value 1.
t = torch.ones(1)
# prediction/output y: w*x + b
y = w*x + b
print('input is {}, weight is {}, target is: {}, output is {}'.format(x,w,t,y))
# MSE: torch.nn.functional.mse_loss(input, target)
# the prediction goes first and the target second, matching the documented signature
loss = F.mse_loss(y, t)
print('loss is {}'.format(loss))
# get the gradients of weight w and bias b
# attention: all parameters that need gradients are passed together as a list, e.g. [w, b];
# the result is a tuple with one gradient per parameter, in the same order
grad = torch.autograd.grad(loss, [w, b])
print('gradient is: {} with type of: {}'.format(grad, type(grad)))
print('gradient of weight is: {} with type of: {}'.format(grad[0], type(grad[0])))
print('gradient of bias is: {} with type of: {}'.format(grad[1], type(grad[1])))

input is tensor([1.]), weight is tensor([2.], requires_grad=True), target is: tensor([1.]), output is tensor([4.], grad_fn=<AddBackward0>)
loss is 9.0
gradient is: (tensor([6.]), tensor([6.])) with type of: <class 'tuple'>
gradient of weight is: tensor([6.]) with type of: <class 'torch.Tensor'>
gradient of bias is: tensor([6.]) with type of: <class 'torch.Tensor'>


#### Method 2 to get gradient: loss.backward( )

In [23]:
# Method 2: run backpropagation from the loss and read the gradients from '.grad'.
# same setup as Method 1: input x, weight w, bias b and target t are 1-element tensors;
# w and b are created with requires_grad=True because their gradients are wanted
x = torch.ones(1)
w = torch.full([1], 2., requires_grad=True)
b = torch.full([1], 2., requires_grad=True)
t = torch.ones(1)
# prediction/output y: w*x + b
y = w*x + b
print('input is {}, weight is {}, target is: {}, output is {}'.format(x,w,t,y))
# MSE: torch.nn.functional.mse_loss(input, target)
# the prediction goes first and the target second, matching the documented signature
loss = F.mse_loss(y, t)
print('loss is {}'.format(loss))
# apply backpropagation to get gradients of parameters;
# the gradients are then read from the 'grad' attribute of each parameter
loss.backward()
w_grad = w.grad
print('gradient of weight is: {} with type of: {}'.format(w_grad, type(w_grad)))
b_grad = b.grad
print('gradient of bias is: {} with type of: {}'.format(b_grad, type(b_grad)))

input is tensor([1.]), weight is tensor([2.], requires_grad=True), target is: tensor([1.]), output is tensor([4.], grad_fn=<AddBackward0>)
loss is 9.0
gradient of weight is: tensor([6.]) with type of: <class 'torch.Tensor'>
gradient of bias is: tensor([6.]) with type of: <class 'torch.Tensor'>


#### Softmax: soft version of max

<font size=2>

Squeeze all elements into the range (0, 1) so that they can be treated as probabilities whose sum is 1. The element with the largest probability is taken to be the **max**.

Assume we have an output with **N dimensions**, which means:

$$ output = [a_{1},a_{2},...,a_{N}] $$

and softmax value of $a_{i}$ from output is denoted as $p_{i}$:
    
$$
\begin{equation}
\begin{aligned}
p_{i} &= \frac{e^{a_{i}}}{\sum_{j=1}^{N} e^{a_{j}}} \\
\end{aligned}
\end{equation}
$$
    
The derivative of softmax is:

$$
\begin{equation}
\begin{aligned}
\frac{\partial{p_{i}}}{\partial{a_{j}}} = \begin{cases}
p_{i}(1 - p_{j}) & i = j \\ - p_{j}p_{i} & i \neq j
\end{cases}
\end{aligned}
\end{equation}
$$

In [46]:
# output: three equal values; their gradients are wanted, so requires_grad=True
output = torch.tensor([1.,1.,1.], requires_grad=True)
# prob: softmax along dim 0 turns the three values into probabilities
prob = F.softmax(output, dim=0)
print('prob: ', prob)
# attention:
# backpropagation normally starts from a single 'error/loss' value.
# 'prob' is not a single value but several probabilities, so calling
# '.backward()' on it without arguments raises a RuntimeError.
# The error is caught and printed here on purpose, to show that limitation;
# the next case takes the gradient of each probability separately instead.
try:
    prob.backward(retain_graph=True)
except RuntimeError as err:
    # RuntimeError: grad can be implicitly created only for scalar outputs
    print(err)
else:
    o_grad = output.grad
    print(f'gradient of output is {o_grad}')

prob:  tensor([0.3333, 0.3333, 0.3333], grad_fn=<SoftmaxBackward0>)
grad can be implicitly created only for scalar outputs


In [43]:
# output: three equal values; their gradients are wanted, so requires_grad=True
output = torch.tensor([1.,1.,1.], requires_grad=True)
# prob: softmax along dim 0 turns the three values into probabilities
prob = F.softmax(output, dim=0)
print('prob: ', prob)
# unlike the previous case, differentiate one probability at a time:
# each prob[i] is a single value, so its gradient w.r.t. 'output' can be taken.
# retain_graph=True keeps the graph alive so it can be reused on the next iteration.
for i, p_i in enumerate(prob):
    o_grad = torch.autograd.grad(p_i, output, retain_graph=True)
    print(f'gradient of prob[{i}] r.w.t output is {o_grad}')

prob:  tensor([0.3333, 0.3333, 0.3333], grad_fn=<SoftmaxBackward0>)
gradient of prob[0] r.w.t output is (tensor([ 0.2222, -0.1111, -0.1111]),)
gradient of prob[1] r.w.t output is (tensor([-0.1111,  0.2222, -0.1111]),)
gradient of prob[2] r.w.t output is (tensor([-0.1111, -0.1111,  0.2222]),)
