In [None]:
%matplotlib inline


PyTorch: Tensors and autograd
-------------------------------

A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation computes the forward pass using operations on PyTorch
Tensors, and uses PyTorch autograd to compute gradients.


A PyTorch Tensor represents a node in a computational graph. If ``x`` is a
Tensor that has ``x.requires_grad=True``, then after calling ``backward()`` on a
scalar value computed from ``x``, ``x.grad`` is another Tensor holding the
gradient of that scalar value with respect to ``x``.



In [1]:
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Seed PyTorch's global random number generator so that the random data and the
# initial weights below are drawn identically on every Restart & Run All
# (same machine, same PyTorch version).
SEED = 0
torch.manual_seed(SEED)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# requires_grad is left at its default (False), so autograd does not compute
# gradients with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors.
    # clamp(min=0) is the ReLU non-linearity. We do not need to keep references
    # to intermediate values since we are not implementing the backward pass
    # by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Tensors.
    # sum() reduces over every element, so loss is a 0-dimensional (scalar)
    # Tensor; loss.item() returns the value it holds as a Python number.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights: backward()
        # accumulates into .grad, so stale values would otherwise be added to
        # the next iteration's gradients.
        w1.grad.zero_()
        w2.grad.zero_()

0 27865624.0
1 23825618.0
2 21576090.0
3 18784510.0
4 15060504.0
5 10972514.0
6 7421361.5
7 4813767.0
8 3112750.5
9 2063882.625
10 1430036.0
11 1041382.8125
12 794501.375
13 629892.875
14 514337.5625
15 428984.375
16 363471.6875
17 311446.84375
18 269090.5625
19 234004.8125
20 204543.421875
21 179571.296875
22 158224.125
23 139853.828125
24 123976.3359375
25 110188.515625
26 98171.546875
27 87660.671875
28 78449.0859375
29 70355.7578125
30 63218.98046875
31 56915.16796875
32 51340.4765625
33 46389.7109375
34 41984.99609375
35 38057.69140625
36 34547.11328125
37 31403.005859375
38 28585.07421875
39 26052.73828125
40 23772.955078125
41 21716.482421875
42 19859.75390625
43 18180.65625
44 16661.08203125
45 15282.51953125
46 14031.8994140625
47 12896.6455078125
48 11863.3212890625
49 10921.9912109375
50 10063.6484375
51 9279.6083984375
52 8562.7900390625
53 7907.1015625
54 7306.4990234375
55 6755.859375
56 6250.31591796875
57 5785.9755859375
58 5359.21728515625
59 4966.45849609375
60 4605.0