# Implementation of a neural network using automatic differentiation

In [0]:
import torch

# Seed the random number generator so that the random data and the initial
# weights are the same on every Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

dtype = torch.float
# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# requires_grad is left at its default (False): we do not need to compute
# gradients with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

In [2]:
# Step size used for the plain gradient-descent updates below.
learning_rate = 1e-6

for t in range(500):
    # Forward pass: linear layer -> ReLU (clamp at zero) -> linear layer.
    hidden = x.mm(w1)
    hidden_relu = hidden.clamp(min=0)
    y_pred = hidden_relu.mm(w2)

    # Sum-of-squared-errors loss; report it for this iteration.
    residual = y_pred - y
    loss = residual.pow(2).sum()
    print(t, loss.item())

    # Backward pass: fills in w1.grad and w2.grad.
    loss.backward()

    # Gradient-descent step, done under no_grad() so the in-place weight
    # updates are not themselves tracked by autograd.
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            # Zero the gradient by hand once it has been applied, ready for
            # the next iteration's backward pass.
            weight.grad.zero_()
  

0 32345430.0
1 28734634.0
2 25848822.0
3 21119088.0
4 15192328.0
5 9724154.0
6 5885759.0
7 3579550.25
8 2303165.0
9 1596866.75
10 1188409.5
11 934348.75
12 762917.875
13 638689.0625
14 543689.3125
15 468178.3125
16 406533.875
17 355341.84375
18 312348.5
19 275901.125
20 244724.015625
21 217890.5
22 194671.3125
23 174455.375
24 156790.59375
25 141290.4375
26 127622.9375
27 115544.984375
28 104829.828125
29 95306.046875
30 86805.59375
31 79203.859375
32 72386.1796875
33 66258.25
34 60739.85546875
35 55759.25
36 51261.66015625
37 47192.2734375
38 43499.26953125
39 40138.8515625
40 37077.91015625
41 34288.921875
42 31738.447265625
43 29404.10546875
44 27264.900390625
45 25303.828125
46 23502.40234375
47 21845.951171875
48 20324.45703125
49 18924.93359375
50 17633.60546875
51 16441.515625
52 15340.7822265625
53 14322.423828125
54 13380.95703125
55 12508.14453125
56 11699.154296875
57 10948.169921875
58 10250.7939453125
59 9602.4931640625
60 9000.140625
61 8439.724609375
62 7917.45849609375


# Defining new autograd functions

In [0]:
class MyReLU(torch.autograd.Function):
    """
    ReLU written as a custom autograd Function.

    Subclasses torch.autograd.Function and supplies explicit forward and
    backward passes that operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Return a Tensor equal to ``input`` with every element clamped to a
        minimum of zero.

        ``ctx`` is the context object shared with backward; the input Tensor
        is saved on it so that backward can tell which elements were negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Given the gradient of the loss with respect to the output, return the
        gradient of the loss with respect to the input.

        The incoming gradient is passed through unchanged, except that it is
        set to zero wherever the saved input was strictly negative.
        """
        (saved_input,) = ctx.saved_tensors
        # masked_fill returns a new Tensor, so grad_output itself is not modified.
        return grad_output.masked_fill(saved_input < 0, 0)


In [4]:
# Draw a fresh pair of random weight Tensors; requires_grad=True so that
# autograd computes gradients with respect to them.
weight_options = dict(device=device, dtype=dtype, requires_grad=True)
w1 = torch.randn(D_in, H, **weight_options)
w2 = torch.randn(H, D_out, **weight_options)

learning_rate = 1e-6

# A custom Function is invoked through its .apply method; alias it as 'relu'.
relu = MyReLU.apply

for t in range(500):
    # Forward pass: the hidden activation goes through our custom ReLU
    # instead of a built-in clamp.
    hidden = x.mm(w1)
    y_pred = relu(hidden).mm(w2)

    # Sum-of-squared-errors loss; report it for this iteration.
    residual = y_pred - y
    loss = residual.pow(2).sum()
    print(t, loss.item())

    # Let autograd run the backward pass.
    loss.backward()

    # Gradient-descent step, done under no_grad() so the in-place weight
    # updates are not themselves tracked by autograd.
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            # Zero the gradient by hand once it has been applied, ready for
            # the next iteration's backward pass.
            weight.grad.zero_()

0 35314108.0
1 33651092.0
2 38535692.0
3 41738804.0
4 36248908.0
5 23074016.0
6 11085772.0
7 4806511.0
8 2336868.5
9 1419892.625
10 1028383.3125
11 817545.125
12 678056.625
13 573793.1875
14 490920.0625
15 423279.5
16 367169.4375
17 320093.5625
18 280315.03125
19 246488.625
20 217544.15625
21 192657.09375
22 171152.9375
23 152506.078125
24 136293.125
25 122128.265625
26 109705.8359375
27 98779.5703125
28 89140.875
29 80619.140625
30 73049.84375
31 66306.0859375
32 60295.6328125
33 54916.375
34 50097.1171875
35 45766.0390625
36 41865.7578125
37 38347.25390625
38 35168.171875
39 32290.5
40 29681.611328125
41 27311.9140625
42 25156.54296875
43 23193.794921875
44 21403.9453125
45 19769.951171875
46 18276.46875
47 16909.447265625
48 15657.05078125
49 14509.318359375
50 13456.73046875
51 12489.7265625
52 11600.05078125
53 10781.5
54 10027.0
55 9331.125
56 8689.548828125
57 8097.6142578125
58 7550.31640625
59 7044.0244140625
60 6575.15234375
61 6141.02001953125
62 5738.498046875
63 5365.22607