In [2]:
import torch
from torch import Tensor
from torch.autograd import Variable
import numpy as np

## Practice problem #1: an arbitrary, complex function, broken into pieces
$$ f(x,y) = \frac{x + \sigma(y)}{\sigma(x) + (x+y)^2} $$

### Forwards pass

In [23]:
x = Variable(Tensor([2]))
y = Variable(Tensor([-2]))
print(x, y)

Variable containing:
 2
[torch.FloatTensor of size 1]
 Variable containing:
-2
[torch.FloatTensor of size 1]



In [3]:
def forwards(x, y):
    σy = torch.sigmoid(y)
    num = x + σy
    σx = torch.sigmoid(x)
    xpy = x + y
    xpysqr = xpy ** 2
    den = σx + xpysqr

    invden = 1. / den
    f = num * invden
    return f, σy, σx, num, xpy, xpysqr, den, invden

f, *values = forwards(x, y)
print(f)

Variable containing:
 2.4060
[torch.FloatTensor of size 1]



### Backwards pass

In [4]:
def backwards(σy, σx, num, xpy, xpysqr, den, invden):
    δf = 1
    δnum = invden * δf
    δinvden = num * δf
    δden = -(1/den**2) * δinvden
    δxpysqr = 1 * δden
    δσx = 1 * δden
    δxpy = 2 * xpy * δxpysqr
    δx = 1 * δxpy
    δy = 1 * δxpy
    δx += σx * (1 - σx) * δσx
    δx += 1 * δnum
    δσy = 1 * δnum
    δy += σy * (1 - σy) * δσy
    return δx, δy

δx, δy = backwards(*values)    
print(δx, δy)

Variable containing:
 0.8485
[torch.FloatTensor of size 1]
 Variable containing:
 0.1192
[torch.FloatTensor of size 1]



### Check result with PyTorch

In [5]:
xpt = Variable(Tensor([2]), requires_grad=True)
ypt = Variable(Tensor([-2]), requires_grad=True)
print(xpt, ypt)

Variable containing:
 2
[torch.FloatTensor of size 1]
 Variable containing:
-2
[torch.FloatTensor of size 1]



In [6]:
fpt, *valuespt = forwards(xpt, ypt)
print(fpt)

Variable containing:
 2.4060
[torch.FloatTensor of size 1]



Before the first time a gradient has been computed, `xpt.grad` and `ypt.grad` will be `None`, so attempting to zero their gradient values will cause an error.

In [7]:
if xpt.grad is not None:
    xpt.grad.data.zero_()
    ypt.grad.data.zero_()

If we don't pass the `retain_graph` option, PyTorch will free the memory that was used for the graph, and we won't be able to call `.backward()` more than once. Not strictly necessary, but convenient for iterating in a ipynb notebook.

In [8]:
fpt.backward(retain_graph=True)

In [9]:
print(xpt.grad, ypt.grad)

Variable containing:
 0.8485
[torch.FloatTensor of size 1]
 Variable containing:
 0.1192
[torch.FloatTensor of size 1]



`torch.equal` and `torch.eq` check for exact equality - `np.allclose` only checks that they're within a certain tolerance (1.e-5 by default). Convert to np arrays so that they're valid arguments to `np.allclose`.

In [170]:
assert np.allclose(δx.data.numpy(), xpt.grad.data.numpy())
assert np.allclose(δy.data.numpy(), ypt.grad.data.numpy())

AssertionError: 

## Practice problem #2: deriving with tensors

The goal is to derive the gradients for W, x, and b for this function:

$$ f(W, x, b) = \sum\sigma(Wx + b) $$



In [179]:
torch.manual_seed(3)
x = Variable(Tensor([1, 2, 3]).unsqueeze(1))
W = Variable(Tensor(3, 3))
b = Variable(Tensor(3, 1))
torch.set_printoptions(precision=1)
print(x, W, b)

Variable containing:
 1
 2
 3
[torch.FloatTensor of size 3x1]
 Variable containing:
 0.0e+00  1.6e+29  0.0e+00
 1.6e+29  5.6e-45  4.6e-41
 3.5e-31  1.4e-45  0.0e+00
[torch.FloatTensor of size 3x3]
 Variable containing:
    0.0e+00
    1.6e+29
    0.0e+00
[torch.FloatTensor of size 3x1]



### Forwards pass

In [180]:
def forwards(W, x, b):
    print(W.data.shape, x.data.shape)
    wx = W @ x
    print(wx.data.shape)
    wxb = wx + b
    σwxb = torch.sigmoid(wxb)
    f = torch.sum(σwxb)
    return f, wx, wxb, σwxb

f, *values = forwards(W, x, b)
print(f)

torch.Size([3, 3]) torch.Size([3, 1])
torch.Size([3, 1])
Variable containing:
    2.5
[torch.FloatTensor of size 1]



### Backwards pass

In [133]:
x = Variable(Tensor([1, 2, 3]))
W = Variable(Tensor(3, 3).fill_(1))
b = Variable(Tensor(3).fill_(-1))
print(x, W, b)

def forwards(W, x, b):
    wx = W @ x
    wxb = wx + b
    σwxb = torch.sigmoid(wxb)
    f = torch.sum(σwxb)
    return f, wx, wxb, σwxb

f, *values = forwards(W, x, b)
print(f)

def backwards(wx, wxb, σwxb):
    δf = 1
    δwxb = σwxb * (1 - σwxb) * δf
    δwx = 1 * δwxb
    δb = 1 * δwxb
    δx = W * δwx
    δW = x * δwx
    return δx, δy

δx, δy = backwards(*values)
print(δx, δy)

Variable containing:
 1
 2
 3
[torch.FloatTensor of size 3]
 Variable containing:
 1  1  1
 1  1  1
 1  1  1
[torch.FloatTensor of size 3x3]
 Variable containing:
-1
-1
-1
[torch.FloatTensor of size 3]

Variable containing:
 2.9799
[torch.FloatTensor of size 1]

Variable containing:
1.00000e-03 *
  6.6480  6.6480  6.6480
  6.6480  6.6480  6.6480
  6.6480  6.6480  6.6480
[torch.FloatTensor of size 3x3]
 Variable containing:
 0.1192
[torch.FloatTensor of size 1]



### Check result with PyTorch

In [119]:
x = Variable(Tensor([1, 2, 3]), requires_grad=True)
W = Variable(Tensor(3, 3).fill_(1), requires_grad=True)
b = Variable(Tensor(3).fill_(-1), requires_grad=True)

def forwards(W, x, b):
    wx = W @ x
    wxb = wx + b
    σwxb = torch.sigmoid(wxb)
    f = torch.sum(σwxb)
    return f, wx, wxb, σwxb

f, *values = forwards(W, x, b)

f.backward(retain_graph=True)
# print(x, x.grad, W, W.grad, b, b.grad)
f2, *values = forwards(W - W.grad, x - x.grad, b - b.grad)
print(f, f2)

Variable containing:
 2.9799
[torch.FloatTensor of size 1]
 Variable containing:
 2.9765
[torch.FloatTensor of size 1]



## Backpropagating softmax in PyTorch


In [9]:
x = Variable(Tensor([[4, 5, 6]]), requires_grad=True)

softmax = torch.nn.Softmax()
print(softmax(x))


# f, *values = forwards(W, x, b)

# f.backward(retain_graph=True)
# # print(x, x.grad, W, W.grad, b, b.grad)
# f2, *values = forwards(W - W.grad, x - x.grad, b - b.grad)
# print(f, f2)

Variable containing:
 0.0900  0.2447  0.6652
[torch.FloatTensor of size 1x3]



RuntimeError: grad can be implicitly created only for scalar outputs

## Practice problem #3: backprop'ing a two layer sigmoidal net

### Forward pass

### Backwards pass

In [191]:
np.log10(np.sqrt(10))

0.5