In [4]:
from __future__ import print_function
import torch
import numpy as np

# Sanity check that torch is usable: build a 5x3 tensor of samples drawn
# uniformly from [0, 1) and show it.
x = torch.rand((5, 3))
print(x)

tensor([[0.9884, 0.0783, 0.7838],
        [0.9073, 0.7749, 0.8065],
        [0.4266, 0.0912, 0.7084],
        [0.9988, 0.3934, 0.5264],
        [0.5820, 0.7630, 0.4728]])


In [5]:

# Two-layer fully connected network (linear -> ReLU -> linear) trained with
# plain gradient descent, with the backward pass written out by hand in NumPy.
#
# Sizes: N samples per batch, D_in input features, H hidden units,
# D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Standard-normal inputs and regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Standard-normal starting weights for the two linear layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass. h (pre-activation) and h_relu are kept because the
    # hand-written backward pass below needs both.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squared-errors loss, reported on every iteration.
    loss = ((y_pred - y) ** 2).sum()
    print(t, loss)

    # Backward pass: apply the chain rule from the loss back to each weight.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step, written in place so w1/w2 stay the same arrays.
    np.subtract(w1, learning_rate * grad_w1, out=w1)
    np.subtract(w2, learning_rate * grad_w2, out=w2)

0 36269185.08582856
1 36717384.56105453
2 44211919.416184515
3 48956967.851280525
4 41997646.57347813
5 24917159.65645901
6 10868168.950380016
7 4386460.052024925
8 2175244.4913519323
9 1407154.7005000464
10 1070286.6606546734
11 872432.0024120247
12 731599.320235576
13 622044.9531791848
14 533379.0748088064
15 460274.97954610223
16 399369.7276794842
17 348232.17337891797
18 305026.76202622347
19 268340.75997846166
20 236998.59493103105
21 210039.49091410122
22 186742.85291044466
23 166539.80626033203
24 148935.7542182072
25 133544.05911204318
26 120034.96808265874
27 108144.22198467856
28 97654.25214777772
29 88367.64778938262
30 80117.65618742246
31 72768.32801003236
32 66203.48417850079
33 60328.05474579806
34 55061.585385207814
35 50336.730020515926
36 46079.64062868742
37 42239.1688456907
38 38766.190569907565
39 35624.10714249723
40 32775.47359926369
41 30189.715997678803
42 27840.23858878542
43 25700.36464635798
44 23747.54338116231
45 21964.89894218874
46 20335.488913481975
47 

422 0.00022278907603108782
423 0.00021308106344122748
424 0.00020379651128707665
425 0.00019491783839622667
426 0.00018643361978900195
427 0.00017831816491053893
428 0.00017055784726470574
429 0.0001631386925634242
430 0.0001560436057781481
431 0.00014925644925725874
432 0.00014276719614135297
433 0.00013656012429271266
434 0.0001306235340518023
435 0.00012494635268850196
436 0.00011951894622401439
437 0.00011432323733963167
438 0.00010935488684014524
439 0.00010460524331640226
440 0.0001000593342797513
441 9.571502362151966e-05
442 9.155990742120551e-05
443 8.758319959742393e-05
444 8.377969244269056e-05
445 8.0141920187125e-05
446 7.666442658229241e-05
447 7.333630265497995e-05
448 7.015289353572863e-05
449 6.71103110841746e-05
450 6.41981652398437e-05
451 6.141296452263903e-05
452 5.8749998234660205e-05
453 5.620058913268197e-05
454 5.376215891105726e-05
455 5.143207505930092e-05
456 4.9202240212884395e-05
457 4.7068870858106274e-05
458 4.5028579929836666e-05
459 4.307645010822406e-

In [6]:
# Same two-layer network (linear -> ReLU -> linear), now trained with PyTorch
# Tensors and autograd instead of a hand-written backward pass.
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Sizes: N samples per batch, D_in input features, H hidden units,
# D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets. requires_grad is left at its default (False), so
# autograd does not compute gradients with respect to these Tensors.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights. requires_grad=True asks autograd to compute gradients with respect
# to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: the same matmul -> clamp-at-zero -> matmul sequence as a
    # manual implementation, but intermediate values need no names because
    # autograd records whatever the backward pass requires.
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # Sum-of-squared-errors loss. The full reduction yields a zero-dimensional
    # Tensor; loss.item() extracts it as a Python number.
    loss = torch.sum((y_pred - y) ** 2)
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Backpropagate. Afterwards w1.grad and w2.grad hold the gradient of the
    # loss with respect to w1 and w2 respectively.
    loss.backward()

    # Gradient-descent step. It runs under torch.no_grad() because the weights
    # have requires_grad=True and the update itself must not be tracked by
    # autograd. (torch.optim.SGD is the packaged way to do the same thing.)
    with torch.no_grad():
        w1.sub_(learning_rate * w1.grad)
        w2.sub_(learning_rate * w2.grad)

        # .grad accumulates across backward() calls, so clear it for the
        # next iteration.
        w1.grad.zero_()
        w2.grad.zero_()

99 636.923828125
199 2.441917657852173
299 0.01646672934293747
399 0.0003320028481539339
499 4.6964218199718744e-05
