In [2]:
import numpy as np
import torch

In [3]:
# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets, drawn from a standard normal
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Weight matrices of the two layers, also standard-normal initialised
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squared-errors loss, reported on every iteration
    residual = y_pred - y
    loss = np.sum(residual * residual)
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the output back to w1
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # The ReLU passes no gradient wherever its input was negative
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step, applied to the weights in place
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 29676595.348928213
1 24253119.379572064
2 23002742.91536378
3 22278213.6549623
4 20128243.50274066
5 16185614.188472334
6 11477726.996204652
7 7391390.066063774
8 4527552.548451962
9 2778199.5661319443
10 1776981.2527006946
11 1211308.935203616
12 882738.5937551991
13 681447.8005171997
14 549444.3689138666
15 456771.19491888845
16 387693.349834183
17 333751.96925914136
18 290144.8774181137
19 254029.91330112936
20 223624.26936864015
21 197725.89607537488
22 175510.07375188707
23 156383.3514227459
24 139747.92816846116
25 125219.4639034857
26 112535.99906019543
27 101379.29505258944
28 91527.39995303101
29 82803.41687474199
30 75048.33059720413
31 68144.19617952046
32 61981.08277357409
33 56468.94054612742
34 51522.052083454924
35 47076.5981802726
36 43069.96395238317
37 39461.9727989564
38 36220.84959761619
39 33284.840644805474
40 30620.284302278917
41 28198.61711709039
42 25999.042588937053
43 23994.908793059036
44 22166.001922669384
45 20493.246574912584
46 18962.77700310002
47 17

In [8]:
dtype = torch.float
# Use the first GPU when CUDA is actually available and fall back to the CPU
# otherwise, so this cell runs unchanged on machines with or without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y (linear -> ReLU -> linear)
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute and print loss; .item() converts the summed squared error
    # into a plain Python number
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    # The ReLU passes no gradient wherever its input was negative
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 38104156.0
1 32482886.0
2 27263696.0
3 20297444.0
4 13250225.0
5 7915971.0
6 4682388.0
7 2913009.0
8 1966539.875
9 1434216.75
10 1110150.625
11 893892.5
12 738173.75
13 619747.5
14 526144.5625
15 450312.8125
16 387862.96875
17 335892.40625
18 292263.625
19 255371.296875
20 224020.96875
21 197214.703125
22 174205.015625
23 154390.15625
24 137220.5
25 122278.5703125
26 109265.3828125
27 97872.3046875
28 87865.609375
29 79042.265625
30 71238.9921875
31 64324.01171875
32 58183.3125
33 52716.87109375
34 47842.6875
35 43484.10546875
36 39579.33203125
37 36080.8125
38 32933.07421875
39 30096.318359375
40 27536.517578125
41 25223.962890625
42 23130.89453125
43 21232.587890625
44 19508.740234375
45 17942.216796875
46 16519.572265625
47 15223.9482421875
48 14042.3251953125
49 12963.044921875
50 11976.2373046875
51 11072.8173828125
52 10244.7353515625
53 9485.5732421875
54 8788.4912109375
55 8147.76123046875
56 7558.412109375
57 7016.1923828125
58 6516.73388671875
59 6056.4873046875
60 5631.670

In [5]:
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Batch size N, input width D_in, hidden width H, output width D_out.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets. requires_grad is left at its default (False), so
# autograd computes no gradients with respect to these Tensors.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights. requires_grad=True asks autograd to accumulate d(loss)/d(weight)
# into the .grad attribute of each of these Tensors during backward().
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> clamp at zero (ReLU) -> linear. The backward
    # pass is derived by autograd from these Tensor operations, so nothing
    # here has to be differentiated by hand.
    hidden = torch.mm(x, w1)
    activated = torch.clamp(hidden, min=0)
    y_pred = torch.mm(activated, w2)

    # Sum-of-squared-errors loss. The result is a zero-dimensional Tensor;
    # loss.item() extracts its value as a Python number for printing.
    residual = y_pred - y
    loss = residual.pow(2).sum()
    print(t, loss.item())

    # Backpropagate: afterwards w1.grad and w2.grad hold the gradient of the
    # loss with respect to w1 and w2.
    loss.backward()

    # Gradient-descent step. The in-place updates run under torch.no_grad()
    # because the weights have requires_grad=True and the update itself must
    # not be recorded by autograd. torch.optim.SGD is the packaged form of
    # this step.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # Clear the accumulated gradient before the next backward() call
            weight.grad.zero_()

0 26919260.0
1 20365338.0
2 16251628.0
3 12804300.0
4 9687291.0
5 7042076.0
6 4985142.0
7 3502401.25
8 2484529.25
9 1801921.25
10 1345532.5
11 1035829.3125
12 820486.1875
13 665967.0625
14 551388.0
15 463865.65625
16 395133.4375
17 339838.46875
18 294501.15625
19 256775.609375
20 225063.3125
21 198128.40625
22 175073.1875
23 155231.6875
24 138068.859375
25 123156.25
26 110148.609375
27 98766.84375
28 88785.859375
29 80000.6796875
30 72235.484375
31 65359.2578125
32 59253.62890625
33 53813.953125
34 48958.8046875
35 44617.16796875
36 40725.82421875
37 37238.30859375
38 34103.30078125
39 31277.560546875
40 28725.591796875
41 26415.798828125
42 24321.783203125
43 22420.861328125
44 20691.498046875
45 19116.78125
46 17681.056640625
47 16369.296875
48 15170.4560546875
49 14072.3759765625
50 13065.4677734375
51 12141.275390625
52 11292.458984375
53 10511.794921875
54 9792.6181640625
55 9129.85546875
56 8517.91015625
57 7952.95068359375
58 7430.74609375
59 6947.37158203125
60 6499.9609375
61 

In [6]:
class TwoLayerNet(torch.nn.Module):
    """Two fully connected layers with a clamp-at-zero (ReLU) in between."""

    def __init__(self, D_in, H, D_out):
        """
        Create the two nn.Linear layers and store them as attributes:
        ``linear1`` maps D_in -> H features and ``linear2`` maps H -> D_out.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Map an input Tensor ``x`` to the network output: apply ``linear1``,
        clamp negative activations to zero, then apply ``linear2``.
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Construct our model by instantiating the TwoLayerNet class
model = TwoLayerNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. model.parameters() hands the
# SGD optimizer the learnable parameters of the two nn.Linear modules that
# are members of the model. reduction='sum' makes the criterion return the
# summed (not averaged) squared error.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    # Gradients are cleared first so that backward() does not add to the
    # values left over from the previous iteration.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 731.9601440429688
1 674.9591064453125
2 626.0187377929688
3 583.9705810546875
4 547.1212158203125
5 514.1104736328125
6 484.1178894042969
7 456.6460876464844
8 431.2762756347656
9 407.6583251953125
10 385.6162414550781
11 365.0662841796875
12 345.6960754394531
13 327.2996826171875
14 309.82720947265625
15 293.2251892089844
16 277.40435791015625
17 262.2981872558594
18 247.9268341064453
19 234.25070190429688
20 221.2759552001953
21 208.91700744628906
22 197.1165313720703
23 185.9020538330078
24 175.2347869873047
25 165.09738159179688
26 155.4714813232422
27 146.36404418945312
28 137.7325897216797
29 129.57041931152344
30 121.85502624511719
31 114.5605239868164
32 107.67020416259766
33 101.18092346191406
34 95.0775146484375
35 89.34203338623047
36 83.94772338867188
37 78.8713607788086
38 74.09322357177734
39 69.60028839111328
40 65.38580322265625
41 61.429439544677734
42 57.69900894165039
43 54.19886779785156
44 50.91796112060547
45 47.82477569580078
46 44.92083740234375
47 42.19113540