# PyTorch Tutorial

Learning with self-contained examples.

In [4]:
import torch
from torch.autograd import Variable

In [12]:
#Simple forward and backwards pass
batch_size, D_in, hidden_size, D_out = 64, 1000, 100, 10
lr = 1e-6
dtype = torch.FloatTensor #CPU
dtype = torch.cuda.FloatTensor #GPU

x = Variable(torch.randn(batch_size, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(batch_size, D_out).type(dtype), requires_grad=False)

w1 = Variable(torch.randn(D_in, hidden_size).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(hidden_size, D_out).type(dtype), requires_grad=True)

for epoch in range(500):
    #Clamp @ min=0 is equivalent to relu
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    
    #L2 loss - Loss is Variable shape (1,), so loss.data is Tensor of shape (1,)
    loss = (y_pred-y).pow(2).sum()
    print("Loss at epoch %d: %f" % (epoch, loss.data[0]))
    
    #Autograd. Computes backward pass wrt all Variables with requires_grad=True
    #After this op, all Variables with requires_grad will hold the gradient of the loss with w1 and w2
    #NOTE: PyTorch ADDS to the current grad.data in Variables in backward(), so remember to reset
    loss.backward()
    
    #SGD
    w1.data -= lr * w1.grad.data
    w2.data -= lr * w2.grad.data
    
    w1.grad.data.zero_()
    w2.grad.data.zero_()


Loss at epoch 0: 31326668.000000
Loss at epoch 1: 32817700.000000
Loss at epoch 2: 41571508.000000
Loss at epoch 3: 49915588.000000
Loss at epoch 4: 46967600.000000
Loss at epoch 5: 30265800.000000
Loss at epoch 6: 13298404.000000
Loss at epoch 7: 4960097.000000
Loss at epoch 8: 2200034.500000
Loss at epoch 9: 1328253.375000
Loss at epoch 10: 984668.562500
Loss at epoch 11: 796770.437500
Loss at epoch 12: 666703.062500
Loss at epoch 13: 566347.312500
Loss at epoch 14: 485390.906250
Loss at epoch 15: 418794.093750
Loss at epoch 16: 363340.375000
Loss at epoch 17: 316857.312500
Loss at epoch 18: 277654.156250
Loss at epoch 19: 244298.734375
Loss at epoch 20: 215763.156250
Loss at epoch 21: 191215.953125
Loss at epoch 22: 170003.656250
Loss at epoch 23: 151595.531250
Loss at epoch 24: 135629.484375
Loss at epoch 25: 121680.085938
Loss at epoch 26: 109438.171875
Loss at epoch 27: 98637.984375
Loss at epoch 28: 89085.796875
Loss at epoch 29: 80627.343750
Loss at epoch 30: 73102.242188
Loss 

## Defining new autograd functions
Each autograd operator is just a class with two functions: forward() and backward()
- `forward()` : Compute output Tensors from input Tensors
- `backward()`: Compute gradient of input Tensors with respect to that some scalar value. Receives the gradient of the output Tensor wrt to some scalar value (loss)

In [18]:
class MyReLU(torch.autograd.Function):
    """
    Subclass torch.autograd.Function, then implement forward() and backward()
    """
    def forward(self, input):
        """
        We can cache any Tensor for use in backward pass by using save_for_backward() method
        """
        self.save_for_backward(input) #NOTE: this only saves input or output, but NOT intermediate
        return input.clamp(min=0)
    
    def backward(self, grad_output):
        """Input to this will be a Tensor (not variable), containing gradient wrt to output"""
        input, = self.saved_tensors #Outputs in a tuple
        grad_input = grad_output.clone()
        grad_input[grad_input < 0] = 0.
        return grad_input
    
dtype = torch.cuda.FloatTensor
batch_size, D_in, hidden_size, D_out = 64, 1000, 100, 10
lr = 1e-6

x = Variable(torch.randn(batch_size, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(batch_size, D_out).type(dtype), requires_grad=False)

w1 = Variable(torch.randn(D_in, hidden_size).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(hidden_size, D_out).type(dtype), requires_grad=True)

for epoch in range(500):
    relu = MyReLU()
    #Clamp @ min=0 is equivalent to relu
    y_pred = relu(x.mm(w1)).mm(w2)
    
    #L2 loss - Loss is Variable shape (1,), so loss.data is Tensor of shape (1,)
    loss = (y_pred-y).pow(2).sum()
    print("Loss at epoch %d: %f" % (epoch, loss.data[0]))
    
    #Autograd. Computes backward pass wrt all Variables with requires_grad=True
    #After this op, all Variables with requires_grad will hold the gradient of the loss with w1 and w2
    #NOTE: PyTorch ADDS to the current grad.data in Variables in backward(), so remember to reset
    loss.backward()
    
    #SGD
    w1.data -= lr * w1.grad.data
    w2.data -= lr * w2.grad.data
    
    w1.grad.data.zero_()
    w2.grad.data.zero_()

Loss at epoch 0: 35289344.000000
Loss at epoch 1: 35238464.000000
Loss at epoch 2: 34307684.000000
Loss at epoch 3: 28130868.000000
Loss at epoch 4: 18151094.000000
Loss at epoch 5: 9870788.000000
Loss at epoch 6: 5244858.500000
Loss at epoch 7: 3235786.250000
Loss at epoch 8: 2369459.000000
Loss at epoch 9: 1927296.875000
Loss at epoch 10: 1642875.500000
Loss at epoch 11: 1428844.375000
Loss at epoch 12: 1256006.500000
Loss at epoch 13: 1112217.875000
Loss at epoch 14: 990908.312500
Loss at epoch 15: 887374.187500
Loss at epoch 16: 798356.250000
Loss at epoch 17: 721344.375000
Loss at epoch 18: 654369.750000
Loss at epoch 19: 595734.125000
Loss at epoch 20: 544118.062500
Loss at epoch 21: 498466.312500
Loss at epoch 22: 457903.031250
Loss at epoch 23: 421703.468750
Loss at epoch 24: 389290.250000
Loss at epoch 25: 360161.562500
Loss at epoch 26: 333908.312500
Loss at epoch 27: 310173.500000
Loss at epoch 28: 288620.875000
Loss at epoch 29: 269008.437500
Loss at epoch 30: 251126.078125

## Higher level layers - Example with nn module
Sometimes, raw autograd can be too low-level. PyTorch offers a higher level abstraction with `nn` module

In [19]:
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)
print(x.requires_grad)
# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Variables for its weight and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
loss_fn = torch.nn.MSELoss(size_average=False)

learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Variable of input data to the Module and it produces
    # a Variable of output data.
    y_pred = model(x)

    # Compute and print loss. We pass Variables containing the predicted and true
    # values of y, and the loss function returns a Variable containing the
    # loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.data[0])

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Variables with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Variable, so
    # we can access its data and gradients like we did before.
    for param in model.parameters():
        param.data -= learning_rate * param.grad.data

False
0 662.49462890625
1 611.0245361328125
2 565.9374389648438
3 526.3433227539062
4 491.1385803222656
5 459.2030029296875
6 430.5286865234375
7 404.2904052734375
8 380.19427490234375
9 358.1907653808594
10 337.7488098144531
11 318.6429748535156
12 300.6151428222656
13 283.73333740234375
14 267.9111328125
15 253.01820373535156
16 238.95181274414062
17 225.61273193359375
18 212.92849731445312
19 200.87100219726562
20 189.3870849609375
21 178.47413635253906
22 168.12863159179688
23 158.35069274902344
24 149.1097412109375
25 140.37469482421875
26 132.1210479736328
27 124.33000946044922
28 116.9700927734375
29 110.01622009277344
30 103.43307495117188
31 97.22976684570312
32 91.37891387939453
33 85.88009643554688
34 80.70825958251953
35 75.85533142089844
36 71.29464721679688
37 67.01573181152344
38 63.00053024291992
39 59.232120513916016
40 55.7006950378418
41 52.38496398925781
42 49.27217102050781
43 46.351600646972656
44 43.61233139038086
45 41.04492950439453
46 38.63540267944336
47 36.3

## Standard model wrapper
We have seen how nn.Module can help us simplify our code. We can also use that to wrap around arbitrary number of operations to create a model. Using nn.module also allows us to use more complicated pre-built optimizers (as oppose to just looping through params and manually doing naive SGD).

This becomes our bread-and-butter way of writing models in PyTorch: using a nn.Module wrapper.

In [42]:
class twoLayerNet(torch.nn.Module):
    def __init__(self, D_in, hidden_size, D_out):
        super(twoLayerNet, self).__init__() # ???
        self.dense1 = torch.nn.Linear(D_in, hidden_size)
        self.dense2 = torch.nn.Linear(hidden_size, D_out)
        
    def forward(self, x):
        h_out = torch.nn.ReLU()(self.dense1(x))
        pred = self.dense2(h_out)
        return pred

model = twoLayerNet(D_in, hidden_size, D_out)

x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

#Construct loss function and Optimizer.
#model.parameters() returns all learnable parameters of the nn.Linear modules
criterion = torch.nn.MSELoss(size_average=True)
opt = torch.optim.SGD(model.parameters(), lr=1e-2)

for epoch in range(500):
    y_pred = model(x)
    loss = criterion(y_pred, y)
    print(epoch, loss.data[0])
    
    opt.zero_grad()
    loss.backward()
    opt.step()


0 1.0392284393310547
1 1.027318000793457
2 1.0156762599945068
3 1.0043089389801025
4 0.9931926727294922
5 0.9823950529098511
6 0.9718162417411804
7 0.9614356756210327
8 0.9512289762496948
9 0.9411896467208862
10 0.931373119354248
11 0.9217436909675598
12 0.9122875928878784
13 0.9030243754386902
14 0.8939418792724609
15 0.8849756121635437
16 0.8761796951293945
17 0.8675147294998169
18 0.8589730262756348
19 0.8505562543869019
20 0.8422958254814148
21 0.8341854810714722
22 0.8262068033218384
23 0.8183488845825195
24 0.8105946779251099
25 0.8029730916023254
26 0.7954890727996826
27 0.7881265878677368
28 0.7808834314346313
29 0.773739755153656
30 0.7666952610015869
31 0.759775698184967
32 0.75295090675354
33 0.7462307214736938
34 0.739641547203064
35 0.7331834435462952
36 0.7268016338348389
37 0.7205035090446472
38 0.7142890691757202
39 0.7081447839736938
40 0.7020708322525024
41 0.696076512336731
42 0.6901798844337463
43 0.6843438744544983
44 0.6785628199577332
45 0.6728572249412537
46 0.6

In [49]:
#Example of using the dynamic graph to construct variable sized model
# Here we have a variable number of (re-used) middle hidden layer
import random
class DynamicNet(torch.nn.Module):
    def __init__(self, D_in, hidden_size, D_out):
        super(DynamicNet, self).__init__()
        self.dense_in = torch.nn.Linear(D_in, hidden_size)
        self.dense_middle = torch.nn.Linear(hidden_size, hidden_size)
        self.dense_out = torch.nn.Linear(hidden_size, D_out)
        
    def forward(self, x):
        h_relu = torch.nn.ReLU()(self.dense_in(x))
        for _ in range(random.randint(0,3)):
            h_relu = torch.nn.ReLU()(self.dense_middle(h_relu))
        y_pred = self.dense_out(h_relu)
        return y_pred

x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out))

model = DynamicNet(D_in, hidden_size, D_out)
model.cuda()

criterion = torch.nn.MSELoss()
opt = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
for epoch in range(500):
    y_pred = model(x.cuda())
    loss = criterion(y_pred, y.cuda())
    print(epoch, loss.data[0])
    
    opt.zero_grad()
    loss.backward()
    opt.step()

0 1.0263596773147583
1 0.9831148386001587
2 0.9827384948730469
3 0.9589020013809204
4 0.9592604637145996
5 1.020407795906067
6 0.9803665280342102
7 0.9587608575820923
8 0.9787673950195312
9 1.0124536752700806
10 1.0095927715301514
11 0.9580065608024597
12 0.9749660491943359
13 0.9991628527641296
14 0.9571253657341003
15 0.9573929905891418
16 0.9709998965263367
17 0.9570616483688354
18 0.9560775756835938
19 0.9799356460571289
20 0.9555155038833618
21 0.9734967350959778
22 0.9696098566055298
23 0.9649871587753296
24 0.9558753967285156
25 0.963191032409668
26 0.9622885584831238
27 0.9536253809928894
28 0.9602544903755188
29 0.9550212621688843
30 0.9548455476760864
31 0.9546681642532349
32 0.956295371055603
33 0.9552812576293945
34 0.9517379999160767
35 0.9514510035514832
36 0.9511398077011108
37 0.9229507446289062
38 0.9504969716072083
39 0.9501649737358093
40 0.916441798210144
41 0.9480305910110474
42 0.9110943078994751
43 0.9077723622322083
44 0.9525190591812134
45 0.9447699785232544
46