In [2]:
import torch

In [43]:
t1 = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float32)
t2 = torch.ones((2, 3))
t3 = torch.rand((2, 3))
t1, t2, t3

(tensor([[1., 2., 3.],
         [4., 5., 6.]]),
 tensor([[1., 1., 1.],
         [1., 1., 1.]]),
 tensor([[0.1619, 0.0282, 0.7678],
         [0.6931, 0.2147, 0.3768]]))

In [34]:
torch.cat([t1, t2, t3], dim=0)

tensor([[1.0000, 2.0000, 3.0000],
        [4.0000, 5.0000, 6.0000],
        [1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000],
        [0.2993, 0.6715, 0.1830],
        [0.5794, 0.2616, 0.4787]])

In [32]:
torch.stack([t1, t2, t3], dim=2)

tensor([[[1.0000, 1.0000, 0.2993],
         [2.0000, 1.0000, 0.6715],
         [3.0000, 1.0000, 0.1830]],

        [[4.0000, 1.0000, 0.5794],
         [5.0000, 1.0000, 0.2616],
         [6.0000, 1.0000, 0.4787]]])

In [44]:
t1 @ t2.T

tensor([[ 6.,  6.],
        [15., 15.]])

In [None]:
torch.einsum('ij,kj->ik', t1, t2)

42.0

In [58]:
t1.requires_grad

True

In [59]:
t2.requires_grad

False

In [63]:
r1 = torch.einsum('ij,kj->ik', t1, t2)
r1

tensor([[36., 36.],
        [90., 90.]], grad_fn=<ViewBackward0>)

In [75]:
r1.grad_fn.next_functions[0][0].next_functions[0][0].next_functions

((<BmmBackward0 at 0x7f27a20ef3d0>, 0),)

In [80]:
r1.sum().backward()

In [121]:
b1 = torch.ones((2, 3, 4), dtype=torch.float32)
b2 = torch.arange(4)
b1, b2

(tensor([[[1., 1., 1., 1.],
          [1., 1., 1., 1.],
          [1., 1., 1., 1.]],
 
         [[1., 1., 1., 1.],
          [1., 1., 1., 1.],
          [1., 1., 1., 1.]]]),
 tensor([0, 1, 2, 3]))

In [122]:
b1 + b2[:, None, None]

RuntimeError: The size of tensor a (2) must match the size of tensor b (4) at non-singleton dimension 0

In [146]:
b1.to('cuda:3')

tensor([[[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]],

        [[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]], device='cuda:3')

In [166]:
# Cell 1: setup
import torch, gc, inspect, psutil, os

device = "cuda" if torch.cuda.is_available() else "cpu"
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats(device)
print("Start alloc:", torch.cuda.memory_allocated(device) / 1e6, "MB")


Start alloc: 25.29792 MB


In [163]:
# Cell 2: deliberate leak
losses = []                    # bad: grows every iteration
for _ in range(2000):
    x = torch.randn(1024, 1024, device=device, requires_grad=True)
    y = torch.randn(1024, 1024, device=device)
    loss = (x @ y).mean()
    losses.append(loss)        # leak happens here
print("After loop:", torch.cuda.max_memory_allocated(device) / 1e6, "MB")


After loop: 16799.344128 MB


In [177]:
import torch, torch.nn as nn

class TinyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 4, bias=True)          # parameter auto-registered
        self.W = nn.Parameter(torch.ones((4, 4)))  # parameter
        self.register_parameter("W2", nn.Parameter(torch.ones((4, 4))))  # parameter
        self.register_buffer("running_sum", torch.zeros(4))  # buffer

    def forward(self, x):
        return self.linear(x) + self.running_sum

net = TinyNet()
print("Parameters:", [n for n, _ in net.named_parameters()])
print("Buffers:", [n for n, _ in net.named_buffers()])


Parameters: ['W', 'W2', 'linear.weight', 'linear.bias']
Buffers: ['running_sum']


In [180]:
net.state_dict()

OrderedDict([('W',
              tensor([[1., 1., 1., 1.],
                      [1., 1., 1., 1.],
                      [1., 1., 1., 1.],
                      [1., 1., 1., 1.]])),
             ('W2',
              tensor([[1., 1., 1., 1.],
                      [1., 1., 1., 1.],
                      [1., 1., 1., 1.],
                      [1., 1., 1., 1.]])),
             ('running_sum', tensor([0., 0., 0., 0.])),
             ('linear.weight',
              tensor([[-0.4288, -0.2519, -0.1056,  0.3239],
                      [ 0.1157, -0.0893,  0.1409, -0.1096],
                      [ 0.1950,  0.3643,  0.0233,  0.2756],
                      [ 0.4444, -0.2664, -0.0512,  0.2670]])),
             ('linear.bias', tensor([ 0.4106, -0.4493, -0.2060,  0.3858]))])

In [181]:
class BadReLU(nn.Module):
    def forward(self, x):
        return torch.relu_(x)      # in-place

bad = BadReLU()
t = torch.randn(3, requires_grad=True)
try:
    bad(t).sum().backward()
except RuntimeError as e:
    print("Error:", e)


Error: a leaf Variable that requires grad is being used in an in-place operation.
