In [1]:
import torch

## Activation Functions

### sigmoid
sigmoid(x) = 1/(1+e^(-x))    in (0,1)

d sigmoid/d x = sigmoid(x)(1-sigmoid(x))

In [2]:
# Ten evenly spaced inputs over a wide range so both saturated ends of the
# sigmoid (output ~0 and ~1) show up in the result.
a = torch.linspace(start=-100, end=100, steps=10)
a.sigmoid()

tensor([0.0000e+00, 1.6655e-34, 7.4564e-25, 3.3382e-15, 1.4945e-05, 9.9999e-01,
        1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00])

### Tanh
tanh(x) = (e^x-e^(-x))/(e^x+e^(-x)) = 2sigmoid(2x)-1    in (-1,1)

d tanh(x)/d x = 1-tanh^2(x)

In [3]:
# Ten evenly spaced inputs in [-1, 1]; tanh is odd, so the outputs are
# symmetric around zero.
a = torch.linspace(start=-1, end=1, steps=10)
a.tanh()

tensor([-0.7616, -0.6514, -0.5047, -0.3215, -0.1107,  0.1107,  0.3215,  0.5047,
         0.6514,  0.7616])

### Rectified Linear Unit (ReLU)

In [4]:
# Reuses `a` (the linspace over [-1, 1] from the tanh cell): negative entries
# are clamped to 0, non-negative entries pass through unchanged.
torch.relu(a)

tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1111, 0.3333, 0.5556, 0.7778,
        1.0000])

## Loss Functions
* Mean Squared Error
* Cross Entropy Loss
    * binary
    * multi-class
    * +softmax
    * Leave it to Logistic Regression Part
* torch.autograd.grad(loss,\[w1,w2,...\])
    * \[w1 grad, w2 grad,...\]
* loss.backward()
    * w1.grad
    * w2.grad

In [2]:
from torch.nn import functional as F

In [19]:
# Toy regression: a single input x = 1, a single weight w = 2, target y = 1.
# w is deliberately created WITHOUT requires_grad so that asking autograd
# for d(mse)/dw afterwards fails and motivates requires_grad_().
x = torch.ones(1)
w = torch.full([1], 2.)
# F.mse_loss takes (input, target): the prediction goes first, the ground
# truth second. MSE is symmetric, so the value is still (1 - 2)^2 = 1.
mse = F.mse_loss(x * w, torch.ones(1))
mse

tensor(1.)

In [21]:
# `mse` was built from a `w` that does not track gradients, so it has no
# grad_fn and autograd cannot differentiate it. The error is the point of
# this cell: catch it and show the message so that Restart & Run All keeps
# going instead of stopping here.
try:
    torch.autograd.grad(mse, [w])
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [22]:
# Switch on gradient tracking for w in place. The loss must be rebuilt
# afterwards: a loss computed before tracking was enabled has no graph to
# differentiate through.
w.requires_grad_()
# (input, target) order: prediction first, ground truth second.
mse = F.mse_loss(x * w, torch.ones(1))
# Returns a tuple holding one gradient tensor per tensor listed in [w].
torch.autograd.grad(mse, [w])

(tensor([2.]),)

### loss.backward

In [23]:
# backward() ADDS into w.grad rather than overwriting it; clear any previous
# value so re-running this cell always shows the gradient of a single pass.
w.grad = None
# (input, target) order: prediction first, ground truth second.
mse = F.mse_loss(x * w, torch.ones(1))
# Unlike torch.autograd.grad, backward() returns nothing and stores the
# gradient on the leaf tensor itself.
mse.backward()
w.grad

tensor([2.])

### Softmax
* soft version of max

    S(y_i) = e^(y_i)/sum_j(e^(y_j))

    d S(y_i)/d y_j = S(y_i)(1-S(y_i)), if i = j

    d S(y_i)/d y_j = -S(y_j)S(y_i), if i != j

In [25]:
# Three random logits in [0, 1), created as a leaf tensor that tracks
# gradients so softmax outputs can be differentiated with respect to it.
a = torch.rand(3, requires_grad=True)
a

tensor([0.9387, 0.6220, 0.5901], requires_grad=True)

In [31]:
p = F.softmax(a, dim=0)
# autograd.grad needs a scalar output, so differentiate one softmax
# component at a time. retain_graph=True on the first call keeps the graph
# alive for the second call.
grad_of_p1 = torch.autograd.grad(p[1], [a], retain_graph=True)
grad_of_p2 = torch.autograd.grad(p[2], [a])
grad_of_p1, grad_of_p2

((tensor([-0.1230,  0.2097, -0.0868]),),
 (tensor([-0.1191, -0.0868,  0.2059]),))

### Perceptron

In [32]:
# Single-output perceptron: one sample with 10 features and one row of 10
# trainable weights.
x = torch.randn(1, 10)
w = torch.randn(1, 10, requires_grad=True)
# (1, 10) @ (10, 1) -> (1, 1), then squash into (0, 1) with the sigmoid.
o = torch.matmul(x, w.t()).sigmoid()
o.shape

torch.Size([1, 1])

In [33]:
# F.mse_loss takes (input, target): the network output goes first, the
# target of ones second. The default reduction averages to a 0-dim scalar.
loss = F.mse_loss(o, torch.ones(1, 1))
loss.shape

torch.Size([])

In [34]:
# Backpropagate the scalar loss; autograd stores d(loss)/dw on the leaf
# tensor as w.grad, with the same shape as w.
loss.backward()
w.grad

tensor([[-0.0905,  0.3336,  0.0114, -0.5023,  0.0928,  0.0549, -0.0823, -0.0502,
         -0.2528, -0.1147]])

### MLP and Grad

In [35]:
# Two-output layer: one sample with 10 features, and a (2, 10) weight matrix
# holding one row of weights per output unit.
x = torch.randn(1, 10)
w = torch.randn(2, 10, requires_grad=True)
# (1, 10) @ (10, 2) -> (1, 2), then an element-wise sigmoid.
o = torch.matmul(x, w.t()).sigmoid()
o.shape

torch.Size([1, 2])

In [37]:
# F.mse_loss takes (input, target): the network output goes first, the
# target of ones second. The result is the mean over both output units.
loss = F.mse_loss(o, torch.ones(1, 2))
loss

tensor(0.7157, grad_fn=<MseLossBackward>)

In [38]:
# Backpropagate the scalar loss; w.grad gets one row of gradients per
# output unit, matching the shape of w.
loss.backward()
w.grad

tensor([[-0.0116, -0.1854, -0.1572,  0.0973, -0.1311, -0.1948, -0.1331, -0.0420,
          0.0169, -0.0711],
        [-0.0014, -0.0224, -0.0190,  0.0118, -0.0158, -0.0235, -0.0161, -0.0051,
          0.0020, -0.0086]])

### Chain rule


In [41]:
# Numerically verify the chain rule on a two-layer scalar "network":
#   y1 = x*w1 + b1,  y2 = y1*w2 + b2
x = torch.tensor(1.)
w1, b1 = torch.tensor(2., requires_grad=True), torch.tensor(1.)
w2, b2 = torch.tensor(2., requires_grad=True), torch.tensor(1.)

y1 = x * w1 + b1
y2 = y1 * w2 + b2

# autograd.grad returns a tuple with one gradient per requested input, so
# unpack the single entry. retain_graph=True keeps the graph alive for the
# calls that follow.
(dy2_dy1,) = torch.autograd.grad(y2, [y1], retain_graph=True)
(dy1_dw1,) = torch.autograd.grad(y1, [w1], retain_graph=True)
(dy2_dw1,) = torch.autograd.grad(y2, [w1], retain_graph=True)

# Chain-rule product dy2/dy1 * dy1/dw1.
dy2_dy1 * dy1_dw1

tensor(2.)

In [42]:
# Gradient of y2 w.r.t. w1 as computed by autograd in one step; compare it
# with the chain-rule product dy2_dy1*dy1_dw1 shown in the previous cell.
dy2_dw1

tensor(2.)

## Logistic Regression
* Recap
    * for continuous: y = xw + b
    * for probability output: y = sigmoid(xw + b)
* Binary Classification
    * interpret network as f: x -> p(y|x;theta)
    * output in \[0,1\]
    * which is exactly where the *logistic function* comes in!

* Goal vs. Approach
    * For regression:
        * Goal: *pred* = y
        * Approach: minimize *dist(pred,y)*
    * For classification:
        * Goal: maximize benchmark, e.g. *accuracy*
        * Approach 1: minimize *dist(p_theta(y|x),p_r(y|x))*
        * Approach 2: minimize *divergence(p_theta(y|x),p_r(y|x))*

* Q1. why not maximize accuracy directly?
    * *acc.* = sum(I(pred_i == y_i))/len(Y)
    * issue 1: gradient = 0 whenever a change in the weights does not flip any prediction (accuracy stays the same)
    * issue 2: gradient is not continuous, since the number of correct predictions jumps in integer steps

* Q2. why is it called logistic regression?
    * use sigmoid
    * controversial!
        * MSE => regression
        * Cross Entropy => classification

## Loss for classification
* MSE
* Cross Entropy Loss
* Hinge Loss
    sum(max(0,1-y_i*h_theta(x_i)))

### Entropy
* Uncertainty
* measure of surprise
* higher entropy = more uncertainty (less is known about the outcome in advance)
    * *Entropy* = - sum_i(P(i)logP(i))

### Cross Entropy
* H(p,q) = - sum_i(p(i)log q(i))
* H(p,q) = H(p) + D_KL(p||q)
* when p = q
    * D_KL(p||q) = 0, so Cross Entropy = Entropy
* for one-hot encoding
    * entropy H(p) = -1log1 = 0, so H(p,q) = D_KL(p||q)
* why not use MSE for classification?
    * sigmoid + MSE => vanishing gradients
    * converges slower
    * but it is sometimes still used, e.g. in meta-learning

In [8]:
# One flattened 28x28 input (784 features) and a 10-class linear classifier.
x = torch.randn(1, 784)
w = torch.randn(10, 784)

# Raw, unnormalized class scores of shape (1, 10).
logits = x @ w.t()
# Class probabilities; each row sums to 1.
pred = F.softmax(logits, dim=1)

# Log-probabilities for F.nll_loss. Use log_softmax rather than
# torch.log(softmax(...)): with logits this large a probability can underflow
# to 0 and its log becomes -inf, whereas log_softmax stays finite.
pre_log = F.log_softmax(logits, dim=1)

In [9]:
# F.cross_entropy takes raw logits, while F.nll_loss takes log-probabilities.
# Both are evaluated against the same class index (3) to compare the results.
target = torch.tensor([3])
ce_from_logits = F.cross_entropy(logits, target)
nll_from_log_probs = F.nll_loss(pre_log, target)
ce_from_logits, nll_from_log_probs

(tensor(87.2855), tensor(87.2855))