In [1]:
%matplotlib inline


PyTorch: Defining New autograd Functions
----------------------------------------

A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation computes the forward pass using operations on PyTorch
Tensors, and uses PyTorch autograd to compute gradients.

Here we define our own custom autograd Function, with explicit forward and
backward passes, to compute the ReLU nonlinearity.



In [2]:
import torch


class MyReLU(torch.autograd.Function):
    """
    ReLU expressed as a custom autograd Function.

    A new differentiable operation is defined by subclassing
    torch.autograd.Function and supplying static forward and backward
    methods that work on Tensors. Invoke it through ``MyReLU.apply``.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Compute the output Tensor for the given input Tensor.

        ctx is the context object shared with backward. The input Tensor is
        stashed on it via ctx.save_for_backward so that the backward pass can
        read it back from ctx.saved_tensors.

        Returns a Tensor equal to input with every entry clamped to be >= 0.
        """
        output = torch.clamp(input, min=0)
        ctx.save_for_backward(input)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """
        Turn the gradient of the loss w.r.t. the output (grad_output) into the
        gradient of the loss w.r.t. the input.

        The incoming gradient is passed through unchanged wherever the saved
        input was >= 0 and replaced by zero wherever the saved input was
        strictly negative.
        """
        (saved_input,) = ctx.saved_tensors
        # Out-of-place fill: grad_output itself is left untouched.
        was_negative = saved_input < 0
        return grad_output.masked_fill(was_negative, 0)


# dtype and device used for every tensor created below.
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Problem sizes: N = batch size, D_in = input dimension,
# H = hidden dimension, D_out = output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input batch and random targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random initial weights; requires_grad=True makes autograd populate .grad
# on them when loss.backward() is called.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6

# A custom Function is invoked through its .apply method; alias it as 'relu'.
relu = MyReLU.apply

for t in range(500):
    # Forward pass: x @ w1 -> custom ReLU -> @ w2 gives the predicted y.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Loss is the sum of squared differences; print it every iteration.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Backward pass: autograd fills in w1.grad and w2.grad.
    loss.backward()

    # Plain gradient-descent step. no_grad keeps the in-place weight updates
    # out of the autograd graph.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # Gradients accumulate across backward() calls, so reset them
            # once the step has been taken.
            weight.grad.zero_()

0 28665102.0
1 22242076.0
2 19175632.0
3 16746028.0
4 13989446.0
5 10927047.0
6 7988110.5
7 5575290.5
8 3800734.75
9 2593030.75
10 1802356.0
11 1292487.375
12 960795.75
13 740016.8125
14 588100.375
15 479581.5
16 399223.8125
17 337492.25
18 288598.75
19 248962.5625
20 216266.9375
21 188897.28125
22 165741.390625
23 145974.296875
24 129011.5078125
25 114325.6796875
26 101566.09375
27 90450.0078125
28 80721.2109375
29 72187.1171875
30 64682.09765625
31 58066.81640625
32 52219.3515625
33 47041.6953125
34 42445.73046875
35 38358.31640625
36 34722.578125
37 31477.57421875
38 28577.6953125
39 25977.51171875
40 23645.791015625
41 21551.630859375
42 19668.943359375
43 17975.5078125
44 16449.666015625
45 15070.86328125
46 13823.9453125
47 12694.0576171875
48 11669.1298828125
49 10738.884765625
50 9892.7880859375
51 9123.10546875
52 8421.263671875
53 7780.97802734375
54 7196.2490234375
55 6661.98828125
56 6173.39013671875
57 5725.5732421875
58 5314.638671875
59 4937.32958984375
60 4590.629882812