In [1]:
# Two-layer fully connected network in plain NumPy: the forward pass, the backward pass and the weight update are all written out by hand.

import numpy as np

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output values.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets, drawn from a standard normal.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Weight matrices of the two layers, also standard-normal initialised.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = x @ w1
    h_relu = np.maximum(h, 0)
    y_pred = h_relu @ w2

    # Sum-of-squares loss over the whole batch, reported at every step.
    residual = y_pred - y
    loss = (residual * residual).sum()
    print(t, loss)

    # Backward pass: gradients of the loss with respect to w2 and w1,
    # obtained by applying the chain rule by hand from the output backwards.
    grad_y_pred = 2.0 * residual
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Gradient-descent step, applied to the weight arrays in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 30541024.03595002
1 27714031.232440047
2 30241520.68167589
3 33289894.745095782
4 32263136.555530805
5 25260351.141372025
6 15670029.274481699
7 8174279.246829413
8 4076345.8552536336
9 2199230.5060913265
10 1367770.5477852535
11 971532.2930852395
12 755123.3888904683
13 617380.2301233415
14 518559.3926973414
15 442130.52600352163
16 380540.3572048346
17 329687.5942982977
18 287129.86658650666
19 251193.2324921179
20 220652.7121365447
21 194544.94143133512
22 172110.17273135926
23 152766.0676232214
24 136006.68073707813
25 121422.28454747882
26 108693.18439638303
27 97542.01461193622
28 87746.96683528912
29 79116.39379038269
30 71484.93900045665
31 64718.25896833277
32 58702.036865288836
33 53344.70566053638
34 48564.9622333598
35 44287.198119144865
36 40448.288215495835
37 36997.5797001878
38 33890.05063169455
39 31085.32970268223
40 28550.347119257687
41 26255.559529415958
42 24170.136457658085
43 22276.32576029586
44 20553.552570521686
45 18984.27053582153
46 17552.671207010546
47

In [2]:
import torch


# Every tensor in this cell is created with this dtype on this device.
dtype = torch.float
device = torch.device("cpu")
# To run on a GPU instead, use: device = torch.device("cuda:0")

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output values.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weight matrices of the two layers, randomly initialised.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss, converted to a Python number for printing.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(t, loss)

    # Backward pass: gradients of the loss with respect to w2 and w1,
    # derived by hand with the chain rule.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, applied to the weight tensors in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 47293272.0
1 54658648.0
2 57527472.0
3 43294912.0
4 21532082.0
5 8037773.5
6 3369303.75
7 1971986.0
8 1444978.375
9 1156182.25
10 955237.625
11 800894.5
12 677798.375
13 577936.9375
14 496089.1875
15 428394.875
16 371871.625
17 324329.65625
18 284101.625
19 249920.796875
20 220725.015625
21 195612.65625
22 173913.40625
23 155104.84375
24 138727.921875
25 124404.3125
26 111830.1171875
27 100758.921875
28 90983.9140625
29 82328.203125
30 74638.03125
31 67791.59375
32 61684.0859375
33 56220.63671875
34 51322.19140625
35 46922.18359375
36 42962.6875
37 39389.98828125
38 36163.16015625
39 33248.4765625
40 30607.4453125
41 28208.798828125
42 26028.232421875
43 24042.41015625
44 22232.44921875
45 20579.599609375
46 19068.142578125
47 17685.220703125
48 16417.14453125
49 15253.365234375
50 14184.453125
51 13202.0126953125
52 12297.7763671875
53 11464.78515625
54 10696.044921875
55 9986.078125
56 9330.052734375
57 8723.12109375
58 8161.40966796875
59 7640.994140625
60 7158.3369140625
61 6710.

In [3]:
import torch

# Every tensor in this cell is created with this dtype on this device.
dtype = torch.float
device = torch.device("cpu")
# To run on a GPU instead, use: device = torch.device("cuda:0")

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output values.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets. requires_grad is left at its default (False), so autograd
# does not compute gradients with respect to the data.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights. requires_grad=True asks autograd to record operations on these
# tensors and to fill in their .grad attribute during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear. Because the backward pass is not
    # written by hand here, the hidden activations are only needed to build
    # the prediction.
    hidden = torch.clamp(torch.mm(x, w1), min=0)
    y_pred = torch.mm(hidden, w2)

    # Sum-of-squares loss. The result is a zero-dimensional tensor;
    # .item() extracts its value as a Python number for printing.
    loss = torch.sum((y_pred - y) ** 2)
    print(t, loss.item())

    # Autograd computes d(loss)/d(w) for every tensor created with
    # requires_grad=True and stores the results in w1.grad and w2.grad.
    loss.backward()

    # The update itself must not be recorded by autograd, hence no_grad().
    # Each weight is stepped in place and its gradient buffer is then cleared,
    # because backward() accumulates into .grad rather than overwriting it.
    # torch.optim.SGD provides the same update as a ready-made optimizer.
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            weight.grad.zero_()

0 26047828.0
1 20972716.0
2 20949448.0
3 23002512.0
4 24822308.0
5 23906930.0
6 19683988.0
7 13563834.0
8 8188627.0
9 4591754.0
10 2600576.5
11 1571022.875
12 1044499.125
13 761006.9375
14 595174.0
15 487866.5
16 411766.9375
17 353791.46875
18 307451.75
19 269156.8125
20 236908.84375
21 209457.125
22 185872.90625
23 165501.25
24 147790.3125
25 132322.8125
26 118769.1171875
27 106867.0703125
28 96367.28125
29 87075.4609375
30 78827.875
31 71488.578125
32 64945.515625
33 59095.3671875
34 53856.73828125
35 49159.828125
36 44937.484375
37 41137.53125
38 37704.8125
39 34603.25
40 31797.470703125
41 29252.9609375
42 26945.212890625
43 24846.283203125
44 22934.1796875
45 21190.14453125
46 19597.384765625
47 18140.583984375
48 16808.12890625
49 15586.9873046875
50 14466.7392578125
51 13437.541015625
52 12490.8974609375
53 11619.548828125
54 10816.3759765625
55 10075.8681640625
56 9392.30859375
57 8760.7109375
58 8177.33837890625
59 7637.28369140625
60 7136.74951171875
61 6672.48681640625
62 62

In [4]:
import torch
"""
define own autograd functions
"""

class MyReLU(torch.autograd.Function):
    """
    ReLU expressed as a custom autograd Function.

    A custom Function subclasses torch.autograd.Function and supplies two
    static methods, forward and backward, both of which work on Tensors.
    It is invoked through MyReLU.apply(tensor).
    """

    @staticmethod
    def forward(ctx, input):
        """
        Return the input with every negative element replaced by zero.

        ctx is the context object shared with backward; the input tensor is
        stored on it with ctx.save_for_backward so that backward can tell
        which elements were negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Return the gradient of the loss with respect to the input.

        grad_output is the gradient of the loss with respect to the output of
        forward. It is passed through unchanged, except that positions where
        the saved input was negative receive a gradient of zero. The incoming
        grad_output tensor itself is not modified.
        """
        (saved_input,) = ctx.saved_tensors
        return grad_output.masked_fill(saved_input < 0, 0)


# Every tensor in this cell is created with this dtype on this device.
dtype = torch.float
device = torch.device("cpu")
# To run on a GPU instead, use: device = torch.device("cuda:0")

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output values.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights, tracked by autograd.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# A custom Function is invoked through its apply method; 'relu' is a short
# alias for it.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> custom ReLU -> linear.
    hidden = relu(torch.mm(x, w1))
    y_pred = torch.mm(hidden, w2)

    # Sum-of-squares loss, printed as a Python number.
    loss = torch.sum((y_pred - y) ** 2)
    print(t, loss.item())

    # Autograd runs the backward pass, calling MyReLU.backward on the way.
    loss.backward()

    # Gradient-descent step outside of autograd tracking; each gradient buffer
    # is cleared afterwards because backward() accumulates into .grad.
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            weight.grad.zero_()

0 29020492.0
1 22605222.0
2 18610390.0
3 14931603.0
4 11303966.0
5 8066131.0
6 5539452.5
7 3755747.25
8 2575647.5
9 1814553.5
10 1324425.5
11 1002163.5625
12 783761.9375
13 630009.75
14 517636.5
15 432537.40625
16 366224.8125
17 313399.8125
18 270424.8125
19 234928.875
20 205244.15625
21 180166.890625
22 158810.78125
23 140501.5625
24 124727.203125
25 111043.609375
26 99126.75
27 88709.734375
28 79571.328125
29 71532.421875
30 64442.7109375
31 58163.09375
32 52587.64453125
33 47623.29296875
34 43194.3671875
35 39235.84375
36 35690.63671875
37 32503.81640625
38 29637.50390625
39 27055.4765625
40 24724.546875
41 22619.1953125
42 20713.078125
43 18985.62109375
44 17418.0234375
45 15994.1923828125
46 14699.169921875
47 13520.201171875
48 12446.6181640625
49 11466.583984375
50 10571.255859375
51 9752.47265625
52 9002.873046875
53 8317.1005859375
54 7688.27734375
55 7111.07861328125
56 6580.7529296875
57 6093.2421875
58 5644.8701171875
59 5232.0322265625
60 4851.62646484375
61 4500.88671875


In [5]:
import tensorflow as tf
import numpy as np

# Stage 1: describe the whole computation as a static graph. No arithmetic is
# carried out by these statements; they only add nodes to the graph, which is
# executed later inside a session.
# NOTE(review): tf.placeholder, tf.random_normal, tf.Session and
# tf.global_variables_initializer look like TensorFlow 1.x graph-mode APIs;
# this cell presumably needs changes to run on TensorFlow 2.x -- confirm
# against the installed version.

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create placeholders for the input and target data; these will be filled
# with real data when we execute the graph. The leading dimension is None,
# so the batch size is not fixed by the graph.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Create Variables for the weights and initialize them with random data.
# A TensorFlow Variable persists its value across executions of the graph.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass: compute the predicted y using operations on TensorFlow Tensors.
# The element-wise maximum against a zero tensor acts as the ReLU here.
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Sum-of-squares loss, again expressed as graph operations.
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Ask TensorFlow for graph nodes that compute the gradient of the loss with
# respect to w1 and w2.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Update the weights using gradient descent. To actually update the weights
# we need to evaluate new_w1 and new_w2 when executing the graph. Note that
# in TensorFlow the act of updating the value of the weights is part of
# the computational graph; in PyTorch this happens outside the computational
# graph.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Stage 2: now that the graph is built, open a TensorFlow session to actually
# execute it.
with tf.Session() as sess:
    # Run the graph once to initialize the Variables w1 and w2.
    sess.run(tf.global_variables_initializer())

    # Create numpy arrays holding the actual data for the inputs x and targets
    # y. The same arrays are fed on every iteration.
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    for _ in range(500):
        # Execute the graph many times. Each time it executes we want to bind
        # x_value to x and y_value to y, specified with the feed_dict argument.
        # Requesting new_w1 and new_w2 alongside loss is what triggers the
        # weight update; their returned values are not needed and are
        # discarded into `_` (the same throwaway name as the loop variable).
        loss_value, _, _ = sess.run([loss, new_w1, new_w2],
                                    feed_dict={x: x_value, y: y_value})
        print(loss_value)

Instructions for updating:
Colocations handled automatically by placer.
35442780.0
36571704.0
39629576.0
37383532.0
27523912.0
16209927.0
8752696.0
4520337.5
2498213.5
1573527.6
1123344.2
874911.44
716836.5
603689.9
516414.4
446208.44
388257.75
339694.12
298507.8
263413.88
233324.75
207351.62
184792.78
165108.23
147893.25
132788.22
119482.03
107720.83
97304.19
88054.99
79815.0
72454.91
65868.27
59962.67
54656.207
49882.277
45579.113
41707.5
38209.562
35040.79
32165.816
29554.207
27178.678
25013.402
23037.566
21234.596
19587.203
18079.676
16700.123
15436.533
14276.857
13209.895
12228.912
11327.129
10497.422
9733.902
9030.524
8381.901
7783.1606
7229.3716
6717.87
6245.302
5808.421
5404.293
5030.123
4683.801
4363.1533
4065.7559
3789.9995
3534.296
3296.8762
3076.428
2871.6223
2681.2593
2504.17
2339.4507
2186.2944
2043.574
1910.7566
1787.0215
1671.7319
1564.223
1464.0071
1370.5579
1283.5366
1202.5315
1126.8892
1056.2343
990.25525
928.6113
870.9834
817.07855
766.69507
719.53723
675.4172
634.1

In [6]:
import torch

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output values.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# The network is a chain of nn Modules. nn.Sequential is itself a Module that
# feeds its input through the contained Modules in order. Each Linear Module
# applies a linear function to its input and owns the weight and bias Tensors
# for it.
layers = [
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
]
model = torch.nn.Sequential(*layers)

# nn also ships common loss functions. MSELoss with reduction='sum' adds up
# the squared errors instead of averaging them.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass. Modules are callable: calling the model on a Tensor of
    # inputs returns a Tensor of outputs.
    y_pred = model(x)

    # The loss function takes the predictions and the targets and returns a
    # Tensor holding the loss, which is printed as a Python number.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear the gradients left over from the previous iteration.
    model.zero_grad()

    # Backward pass. The learnable parameters of every Module are Tensors with
    # requires_grad=True, so this fills in the gradient of each of them.
    loss.backward()

    # Gradient-descent step. Parameters are Tensors, so each one is updated in
    # place from its own .grad, with autograd tracking switched off.
    with torch.no_grad():
        for weight in model.parameters():
            weight -= learning_rate * weight.grad

0 696.5569458007812
1 644.9361572265625
2 600.3717651367188
3 560.9060668945312
4 525.7178955078125
5 494.32806396484375
6 465.71795654296875
7 439.3870544433594
8 414.987060546875
9 392.5325622558594
10 371.64984130859375
11 352.0174865722656
12 333.3733825683594
13 315.6193542480469
14 298.8123779296875
15 282.9483947753906
16 267.87652587890625
17 253.4488983154297
18 239.61268615722656
19 226.4390411376953
20 213.93923950195312
21 202.03057861328125
22 190.6658935546875
23 179.88600158691406
24 169.65072631835938
25 159.882568359375
26 150.64276123046875
27 141.89244079589844
28 133.57847595214844
29 125.70649719238281
30 118.26515197753906
31 111.24663543701172
32 104.63664245605469
33 98.41358947753906
34 92.54393768310547
35 87.02767181396484
36 81.8392562866211
37 76.9708023071289
38 72.37564086914062
39 68.05683135986328
40 63.983577728271484
41 60.1573600769043
42 56.56723403930664
43 53.20025634765625
44 50.03168869018555
45 47.06732940673828
46 44.27090072631836
47 41.64638

In [7]:
import torch

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output values.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Model and loss function, both taken from the nn package.
layers = [
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
]
model = torch.nn.Sequential(*layers)
loss_fn = torch.nn.MSELoss(reduction='sum')

# An Optimizer from the optim package performs the weight updates. Adam is
# used here; optim offers many other optimization algorithms. The first
# constructor argument tells the optimizer which Tensors it is responsible
# for updating.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: run the inputs through the model.
    y_pred = model(x)

    # Evaluate the loss and print it as a Python number.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Gradients are accumulated across backward() calls rather than
    # overwritten, so the optimizer first clears the gradients of every
    # parameter it manages (the learnable weights of the model). See the
    # torch.autograd.backward documentation for details.
    optimizer.zero_grad()

    # Backward pass: gradient of the loss with respect to the model parameters.
    loss.backward()

    # step() applies one update to the parameters held by the optimizer.
    optimizer.step()

0 710.9492797851562
1 694.2800903320312
2 678.144775390625
3 662.455322265625
4 647.1746826171875
5 632.2796020507812
6 617.7559814453125
7 603.570556640625
8 589.7734375
9 576.3124389648438
10 563.2774047851562
11 550.5986328125
12 538.2666015625
13 526.2684936523438
14 514.5733642578125
15 503.20123291015625
16 492.0708923339844
17 481.21240234375
18 470.60675048828125
19 460.2120056152344
20 450.0552062988281
21 440.1452331542969
22 430.4493103027344
23 420.95379638671875
24 411.6967468261719
25 402.6528015136719
26 393.7955322265625
27 385.1625061035156
28 376.72796630859375
29 368.47320556640625
30 360.3594665527344
31 352.4029235839844
32 344.59942626953125
33 336.98138427734375
34 329.5643615722656
35 322.3186950683594
36 315.243408203125
37 308.3241882324219
38 301.517822265625
39 294.8563232421875
40 288.3172912597656
41 281.8915100097656
42 275.5853271484375
43 269.39788818359375
44 263.3323059082031
45 257.3753356933594
46 251.51580810546875
47 245.7725067138672
48 240.13331

In [8]:
import torch


class TwoLayerNet(torch.nn.Module):
    """Fully connected network with one hidden layer: Linear -> ReLU -> Linear."""

    def __init__(self, D_in, H, D_out):
        """
        Create the two nn.Linear layers and register them as attributes.

        D_in is the input dimension, H the hidden dimension and D_out the
        output dimension.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Map a Tensor of inputs to a Tensor of predictions.

        The first linear layer is applied, negative activations are clamped
        to zero, and the second linear layer produces the output. Besides the
        Modules created in the constructor, forward may use any Tensor
        operation.
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)


# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output values.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the custom Module defined earlier in this cell.
model = TwoLayerNet(D_in, H, D_out)

# Loss function and optimizer. model.parameters() hands the SGD optimizer the
# learnable parameters of the two nn.Linear layers owned by the model.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: run the inputs through the model.
    y_pred = model(x)

    # Evaluate the loss and print it as a Python number.
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Clear the old gradients before computing new ones.
    optimizer.zero_grad()
    # Backward pass: gradient of the loss with respect to the parameters.
    loss.backward()
    # Apply one SGD update to the parameters.
    optimizer.step()

0 693.3182373046875
1 641.7061767578125
2 597.06982421875
3 558.0242309570312
4 523.2684326171875
5 491.86187744140625
6 463.6939697265625
7 437.9382019042969
8 414.1200866699219
9 391.7523498535156
10 370.77532958984375
11 350.97430419921875
12 332.25347900390625
13 314.5902404785156
14 297.90277099609375
15 282.0373229980469
16 266.9511413574219
17 252.5845947265625
18 238.88516235351562
19 225.75404357910156
20 213.23345947265625
21 201.30718994140625
22 189.93565368652344
23 179.13687133789062
24 168.87387084960938
25 159.06712341308594
26 149.75509643554688
27 140.93309020996094
28 132.58255004882812
29 124.69144439697266
30 117.23127746582031
31 110.19950103759766
32 103.5298843383789
33 97.24795532226562
34 91.31658172607422
35 85.74190521240234
36 80.50448608398438
37 75.58103942871094
38 70.9612045288086
39 66.63707733154297
40 62.568565368652344
41 58.74722671508789
42 55.16654586791992
43 51.81181716918945
44 48.67140197753906
45 45.72787857055664
46 42.97789001464844
47 40.