In [1]:
import gc
import math
import numpy as np

import torch
import torch.nn as nn

# 1. Data

### Examples

In [16]:
# Example 1
z = torch.zeros(3,4)
z, z.dtype, z.shape, z.device, z.requires_grad

(tensor([[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]),
 torch.float32,
 torch.Size([3, 4]),
 device(type='cpu'),
 False)

In [17]:
# Example 2
q = torch.zeros(3,4, dtype=torch.float64, device='cuda', requires_grad=True).view(2,6)
q, q.dtype, q.shape, q.device, q.requires_grad

(tensor([[0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.]], device='cuda:0', dtype=torch.float64,
        grad_fn=<ViewBackward0>),
 torch.float64,
 torch.Size([2, 6]),
 device(type='cuda', index=0),
 True)

In [18]:
# Example 3
w = torch.tensor([[1,2,3], [4,5,6]], dtype=torch.float64, device='cuda', requires_grad=True)
w, w.dtype, w.shape, w.device, w.requires_grad

(tensor([[1., 2., 3.],
         [4., 5., 6.]], device='cuda:0', dtype=torch.float64,
        requires_grad=True),
 torch.float64,
 torch.Size([2, 3]),
 device(type='cuda', index=0),
 True)

In [28]:
# Printing all the contents of a tensor --> doesn't work ???
t_large = torch.Tensor(1000, 1000)
torch.set_printoptions(threshold=10)
t_large

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

### 1.1 Init form values -> random

In [48]:
torch.manual_seed(12)

print(torch.rand(2,3))
print(torch.rand(2,3))

tensor([[0.5, 0.2, 0.5],
        [0.6, 0.4, 0.1]])
tensor([[0.6, 0.2, 0.7],
        [0.7, 0.2, 0.7]])


In [49]:
# the same
torch.manual_seed(12)

print(torch.rand(2,3))
print(torch.rand(2,3))

tensor([[0.5, 0.2, 0.5],
        [0.6, 0.4, 0.1]])
tensor([[0.6, 0.2, 0.7],
        [0.7, 0.2, 0.7]])


In [None]:
# can't control the output of empty() but can for uniform
torch.manual_seed(12)

# usage of empty() -> weight init 
z1 = torch.empty(2,3)
print(z1)
nn.init.uniform_(z1)
print(z1)

print("=================")

z2 = torch.empty(2,3)
print(z2)
# uniform(-1/sqrt(in_features), 1/sqrt(in_features)) [He init]
nn.init.kaiming_uniform_(z2, a=math.sqrt(5))
print(z2)

### 1.2 Init from values -> const

In [73]:
# zeros
z = torch.zeros(1,3)
print(z)

# ones
z = torch.ones(1,3)
print(z)

# const val
z = torch.full((1,3), 3.)
print(z)


tensor([[0., 0., 0.]])
tensor([[1., 1., 1.]])
tensor([[3., 3., 3.]])


In [83]:
# diag
z = torch.eye(3)
print(z)

# diag const values
z = torch.diag(torch.rand(3))
print(z)

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])
tensor([[0.7, 0.0, 0.0],
        [0.0, 0.2, 0.0],
        [0.0, 0.0, 0.3]])


In [84]:
# range
z = torch.arange(3,8,2)
print(z)

z = torch.linspace(3,8,20)
print(z)

tensor([3, 5, 7])
tensor([3.0, 3.3, 3.5,  ..., 7.5, 7.7, 8.0])


### 2 Init from memory hash

In [122]:
# totally different pointers !!!
z = torch.empty(2,3)
print(z)

z = torch.Tensor(2,3)
print(z)

z = torch.FloatTensor(2,3)
print(z)

tensor([[ 4.4e+34,  4.6e-41,  3.3e-18],
        [ 3.1e-41, -1.3e+06,  4.6e-41]])
tensor([[4.4e+34, 4.6e-41, 1.3e+23],
        [3.1e-41, 4.5e-44, 0.0e+00]])
tensor([[ 4.4e+34,  4.6e-41,  1.3e+23],
        [ 3.1e-41, -1.3e+06,  4.6e-41]])


### 3.1 Init from data -> list

In [40]:
d_list = [[1,2], [3,4]]
t_list = torch.tensor(d_list)
t_list, t_list.dtype

(tensor([[1, 2],
         [3, 4]]),
 torch.int64)

In [41]:
t_list = torch.Tensor(d_list)
t_list

tensor([[1., 2.],
        [3., 4.]])

### 3.2 Init from data -> np

In [42]:
d_np = np.array(d_list)
t_np = torch.tensor(d_np)
print(t_np)

# with link
t_np_linked = torch.from_numpy(d_np)
d_np[0][0] = 9
print(t_np_linked)

tensor([[1, 2],
        [3, 4]])
tensor([[9, 2],
        [3, 4]])


In [43]:
a_t = torch.Tensor(2,2)
print(a_t)
a_np = a_t.numpy()
a_np

tensor([[-3.1835e-15,  3.0747e-41],
        [-3.1863e-15,  3.0747e-41]])


array([[-3.1834581e-15,  3.0747291e-41],
       [-3.1863313e-15,  3.0747291e-41]], dtype=float32)

In [44]:
a_np[0][0] = 2
a_t

tensor([[ 2.0000e+00,  3.0747e-41],
        [-3.1863e-15,  3.0747e-41]])

In [45]:
# torch.as_tensor == torch.from_numpy
v_np = np.array([1,2,3])
v_t = torch.as_tensor(v_np)
print(v_t)

v_t[0] = 9.
v_np

tensor([1, 2, 3])


array([9, 2, 3])

### 3.3 Init from data -> tensor

In [46]:
t_tensor = torch.empty_like(t_np)
print(t_tensor)

t_tensor = torch.zeros_like(t_np)
print(t_tensor)

t_tensor = torch.ones_like(t_np)
print(t_tensor)

t_tensor = torch.rand_like(t_list, dtype=torch.float)
print(t_tensor)

tensor([[140550944836736, 140550944836736],
        [             32,             112]])
tensor([[0, 0],
        [0, 0]])
tensor([[1, 1],
        [1, 1]])
tensor([[0.5770, 0.3337],
        [0.0947, 0.5099]])


In [47]:
s_tensor = torch.as_tensor(t_tensor)
print(s_tensor)

tensor([[0.5770, 0.3337],
        [0.0947, 0.5099]])


# 2. Type

In [84]:
# way 1
a = torch.tensor([1,2,3], dtype=torch.float16)
a.dtype

torch.float16

In [89]:
# way 2
a = torch.tensor([1,2,3])
print(a.dtype)

a = a.to(torch.float32)
print(a.dtype)

torch.int64
torch.float32


In [14]:
a = torch.tensor([1,2,3], dtype=torch.int16)
print(a.dtype)
a = a*1.
a.dtype, type(a)

torch.int16


(torch.float32, torch.Tensor)

In [3]:
a_cuda = torch.cuda.FloatTensor(2,3)
a_cuda

tensor([[0., 0., 0.],
        [0., 0., 0.]], device='cuda:0')

In [4]:
a_cpu = torch.FloatTensor(2,3)
a_cpu

tensor([[1.0072e-11, 7.7199e-10, 4.2970e-05],
        [4.1199e-11, 2.7001e-06, 2.9573e-18]])

# 3. Shape

# 4. Device

#### Keep in mind that copying large tensors across devices can be expensive in terms of time and memory!

In [5]:
%%timeit
t_cpu1 = torch.zeros(10000,100000)

301 ms ± 4.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
t_cpu2 = torch.zeros(10000,100000)

In [7]:
%%timeit
t_gpu_1 = t_cpu2.to('cuda')

RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero.

In [8]:
%%timeit
# the time is creation on cpu + moving to cuda
# more useful than .cuda() as you can specify it as device on tne top of the code
t_gpu_2 = torch.zeros(10000,100000).to('cuda')

721 ms ± 2.22 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
%%timeit
# the time is creation on cpu + moving to cuda
t_gpu_3 = torch.zeros(10000,100000).cuda()

729 ms ± 10.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%%timeit
# directly on cuda
t_gpu_4 = torch.zeros(10000,100000, device='cuda')

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.73 GiB (GPU 0; 5.81 GiB total capacity; 3.73 GiB already allocated; 667.19 MiB free; 3.73 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [9]:
del t_gpu_1

NameError: name 't_gpu_2' is not defined

In [8]:
torch.cuda.empty_cache()
gc.collect()

28

In [16]:
torch.cuda.memory_cached(), torch.cuda.memory_allocated(), torch.cuda.memory_reserved()



(0, 0, 0)

#### tensor.to()

In [3]:
t1 = torch.Tensor(3,4)
t1.device

device(type='cpu')

In [4]:
t1.to("cuda")
t1.device

device(type='cpu')

In [10]:
t2 = t1.to("cuda", copy=False)
t2.device, t1.device

(device(type='cuda', index=0), device(type='cpu'))

####

# 5. Gradients

In [20]:
# 2 ways of setting track of the grads, and their are almost the same
a = torch.Tensor(2,3)
a.requires_grad = True
a

tensor([[-3.1455e-15,  3.0747e-41,  7.3984e+24],
        [ 3.0747e-41,  1.1210e-43,  0.0000e+00]], requires_grad=True)

In [9]:
b = torch.Tensor(2,3)
b.requires_grad_(True)
b

tensor([[-3.1570e-15,  3.0747e-41, -1.2921e-28],
        [ 4.5856e-41,  8.9683e-44,  0.0000e+00]], requires_grad=True)

In [10]:
# can't if dtype is not floating point
x = torch.tensor([2,3], requires_grad=True)
x


RuntimeError: Only Tensors of floating point and complex dtype can require gradients

For example if you have a non-leaf tensor, setting it to True using self.requires_grad=True will produce an error, but not when you do requires_grad_(True).
Both perform some error checking, such as verifying that the tensor is a leaf, before calling into the same set_requires_grad function (implemented in cpp).

In [59]:
x = torch.rand((2,3), requires_grad=True)
y = x + 2
x, y

# you can't
y.requires_grad = True

RuntimeError: you can only change requires_grad flags of leaf variables.

In [60]:
# you can with requires_grad_
y.requires_grad_(True)

tensor([[2.4252, 2.0157, 2.1997],
        [2.9721, 2.9665, 2.9220]], grad_fn=<AddBackward0>)

In [61]:
a = torch.rand(2,3, requires_grad=True)
b = a * 4
s = torch.sum(b)
s.backward()
print(a, b, s)

# b has no gradients
a.grad, b.grad

tensor([[0.5712, 0.5696, 0.2435],
        [0.3342, 0.5906, 0.6291]], requires_grad=True) tensor([[2.2848, 2.2783, 0.9741],
        [1.3367, 2.3626, 2.5166]], grad_fn=<MulBackward0>) tensor(11.7531, grad_fn=<SumBackward0>)


  a.grad, b.grad


(tensor([[4., 4., 4.],
         [4., 4., 4.]]),
 None)

In [63]:
# retain_grad() can save grads for non-leaf elements
a = torch.rand(2,3, requires_grad=True)
b = a * 4
b.retain_grad()
s = torch.sum(b)
s.backward()
print(a, b, s)

b.grad

tensor([[0.0417, 0.8857, 0.1298],
        [0.7915, 0.9485, 0.5585]], requires_grad=True) tensor([[0.1667, 3.5430, 0.5192],
        [3.1659, 3.7941, 2.2339]], grad_fn=<MulBackward0>) tensor(13.4228, grad_fn=<SumBackward0>)


tensor([[1., 1., 1.],
        [1., 1., 1.]])

# 6. Ops

Over 100 tensor operations, including 
- arithmetic, 
- linear algebra, 
- matrix manipulation
    - indexing,
    - transposing / permuting / reshaping,
    - slicing,
    - joining 
- sampling 
and more

In [126]:
# math ops
r = (torch.rand(2,2) - 0.5) * 2
print(r)

r = torch.abs(r)
print(r)

r1 = torch.det(r)
print(r1)

r2 = torch.svd(r)
print(r2)

m = torch.max(r)
print(m)

tensor([[ 0.2, -0.8],
        [ 0.1,  0.9]])
tensor([[0.2, 0.8],
        [0.1, 0.9]])
tensor(0.1)
torch.return_types.svd(
U=tensor([[ 0.7,  0.7],
        [ 0.7, -0.7]]),
S=tensor([1.2, 0.1]),
V=tensor([[ 0.2,  1.0],
        [ 1.0, -0.2]]))
tensor(0.9)


In [63]:
t = torch.Tensor(2,3)

t_agg = torch.sum(t)
t_agg.item()

15818893033472.0

In [49]:
# cool
z = torch.tensor([[[1]]])
z.item()

1

##### Joining

In [28]:
t = torch.rand(2,3)
t

tensor([[0.5950, 0.6926, 0.7652],
        [0.7031, 0.5762, 0.0528]])

In [29]:
t1 = torch.cat([t,t])
t1

tensor([[0.5950, 0.6926, 0.7652],
        [0.7031, 0.5762, 0.0528],
        [0.5950, 0.6926, 0.7652],
        [0.7031, 0.5762, 0.0528]])

In [30]:
t2 = torch.cat([t,t], dim=0)
t2

tensor([[0.5950, 0.6926, 0.7652],
        [0.7031, 0.5762, 0.0528],
        [0.5950, 0.6926, 0.7652],
        [0.7031, 0.5762, 0.0528]])

In [31]:
t2 = torch.cat([t,t], dim=1)
t2

tensor([[0.5950, 0.6926, 0.7652, 0.5950, 0.6926, 0.7652],
        [0.7031, 0.5762, 0.0528, 0.7031, 0.5762, 0.0528]])

In [39]:
t = torch.Tensor(1,3)

In [45]:
t3 = torch.stack([t,t,t], dim=0)
print(t3.shape)
t3

torch.Size([3, 1, 3])


tensor([[[7.8562e+12, 3.0781e-41, 7.9047e+12]],

        [[7.8562e+12, 3.0781e-41, 7.9047e+12]],

        [[7.8562e+12, 3.0781e-41, 7.9047e+12]]])

In [46]:
t4 = torch.stack([t,t,t], dim=1)
print(t4.shape)
t4

torch.Size([1, 3, 3])


tensor([[[7.8562e+12, 3.0781e-41, 7.9047e+12],
         [7.8562e+12, 3.0781e-41, 7.9047e+12],
         [7.8562e+12, 3.0781e-41, 7.9047e+12]]])

In [47]:
t4 = torch.stack([t,t,t], dim=-1)
print(t4.shape)
t4

torch.Size([1, 3, 3])


tensor([[[7.8562e+12, 7.8562e+12, 7.8562e+12],
         [3.0781e-41, 3.0781e-41, 3.0781e-41],
         [7.9047e+12, 7.9047e+12, 7.9047e+12]]])

In [55]:
# vstack == cat(dim=0)
t5_vstack = torch.vstack([t,t,t])
t5_cat = torch.cat([t,t,t], dim=0)
print(t5_vstack.shape, t5_cat.shape)
t5_vstack, t5_cat

torch.Size([3, 3]) torch.Size([3, 3])


(tensor([[7.8562e+12, 3.0781e-41, 7.9047e+12],
         [7.8562e+12, 3.0781e-41, 7.9047e+12],
         [7.8562e+12, 3.0781e-41, 7.9047e+12]]),
 tensor([[7.8562e+12, 3.0781e-41, 7.9047e+12],
         [7.8562e+12, 3.0781e-41, 7.9047e+12],
         [7.8562e+12, 3.0781e-41, 7.9047e+12]]))

In [52]:
t6 = torch.hstack([t,t,t])
t6.shape

torch.Size([1, 9])

#### Matrix multiplication / linear algebra

In [56]:
t = torch.Tensor(2,3)

In [60]:
t.matmul(t.T)

tensor([[1.2753e+26, 2.4778e-28],
        [2.4778e-28, 0.0000e+00]])

In [59]:
t @ t.T

tensor([[1.2753e+26, 2.4778e-28],
        [2.4778e-28, 0.0000e+00]])