In [1]:
import torch
torch.__version__

'2.6.0'

In [2]:
torch.backends.mps.is_available(), torch.cuda.is_available()

(True, False)

### Device (Apple Silicon here)

In [3]:
device = torch.device("mps")
device

device(type='mps')

## Initializing tensors

In [4]:
t1 = torch.tensor([[1, 2, 3]], dtype=torch.int16, device=device)
t1

tensor([[1, 2, 3]], device='mps:0', dtype=torch.int16)

In [5]:
t1.device, t1.data, t1.ndim, t1.dim, t1.shape, t1.size

(device(type='mps', index=0),
 tensor([[1, 2, 3]], device='mps:0', dtype=torch.int16),
 2,
 <function Tensor.dim()>,
 torch.Size([1, 3]),
 <function Tensor.size>)

In [6]:
t1[0][0], t1[0]

(tensor(1, device='mps:0', dtype=torch.int16),
 tensor([1, 2, 3], device='mps:0', dtype=torch.int16))

In [7]:
scalar = torch.tensor(5)
scalar, scalar.ndim, scalar.shape, scalar.dtype

(tensor(5), 0, torch.Size([]), torch.int64)

In [8]:
vector = torch.tensor([1, 3])
vector, vector.ndim, vector.shape, vector.dtype

(tensor([1, 3]), 1, torch.Size([2]), torch.int64)

In [9]:
# t1.item(), scalar.item(), vector.item()

In [10]:
# So, .item() only works for scalar element

scalar.item()

5

In [11]:
matrix = torch.tensor([[1, 4], [5, 3]], dtype=torch.float32, device=device)
scalar2 = torch.tensor([6])
scalar2, scalar2.item(), scalar2.ndim, scalar2.shape, matrix, matrix.ndim, matrix.shape

(tensor([6]),
 6,
 1,
 torch.Size([1]),
 tensor([[1., 4.],
         [5., 3.]], device='mps:0'),
 2,
 torch.Size([2, 2]))

In [12]:
tensor = torch.tensor([[[1, 3, 5], [3, 6, 8], [1, 2, 3]]])
tensor, tensor.ndim, tensor.shape, tensor.size()

(tensor([[[1, 3, 5],
          [3, 6, 8],
          [1, 2, 3]]]),
 3,
 torch.Size([1, 3, 3]),
 torch.Size([1, 3, 3]))

## Initializing Random tensors

In [13]:
rand = torch.rand(size=(3, 3, 3), dtype=torch.float16)
rand2 = rand.to(device=device)
rand, rand2

(tensor([[[0.0283, 0.8301, 0.2676],
          [0.7378, 0.4502, 0.0269],
          [0.9785, 0.8896, 0.8062]],
 
         [[0.0439, 0.1187, 0.5405],
          [0.1138, 0.7256, 0.6338],
          [0.4810, 0.5610, 0.6934]],
 
         [[0.7461, 0.6890, 0.1426],
          [0.1719, 0.8516, 0.8970],
          [0.4233, 0.4683, 0.6089]]], dtype=torch.float16),
 tensor([[[0.0283, 0.8301, 0.2676],
          [0.7378, 0.4502, 0.0269],
          [0.9785, 0.8896, 0.8062]],
 
         [[0.0439, 0.1187, 0.5405],
          [0.1138, 0.7256, 0.6338],
          [0.4810, 0.5610, 0.6934]],
 
         [[0.7461, 0.6890, 0.1426],
          [0.1719, 0.8516, 0.8970],
          [0.4233, 0.4683, 0.6089]]], device='mps:0', dtype=torch.float16))

In [14]:
rand2.to(dtype=torch.int16)

tensor([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]], device='mps:0', dtype=torch.int16)

In [15]:
rand2[0], rand2[0][0], rand2[0][0][0]

(tensor([[0.0283, 0.8301, 0.2676],
         [0.7378, 0.4502, 0.0269],
         [0.9785, 0.8896, 0.8062]], device='mps:0', dtype=torch.float16),
 tensor([0.0283, 0.8301, 0.2676], device='mps:0', dtype=torch.float16),
 tensor(0.0283, device='mps:0', dtype=torch.float16))

In [16]:
# Random int
torch.randint(low=10, high=30, size=(3, 3), dtype=torch.float16, device=device)

tensor([[18., 22., 18.],
        [21., 10., 21.],
        [18., 17., 25.]], device='mps:0', dtype=torch.float16)

In [17]:
# Random distribution
torch.randn(5), torch.randn(3, 2)

(tensor([-0.6607, -0.9045, -0.5073, -0.7541,  0.0207]),
 tensor([[-0.6822,  1.0291],
         [-0.2884, -1.6073],
         [ 0.5737, -0.1332]]))

## Other tensors

In [18]:
# Zeros

torch.zeros(2, 3)

tensor([[0., 0., 0.],
        [0., 0., 0.]])

In [19]:
# Ones
torch.ones(3, 1)

tensor([[1.],
        [1.],
        [1.]])

In [20]:
# range
torch.arange(start=0, end=1.1, step=0.1)

tensor([0.0000, 0.1000, 0.2000, 0.3000, 0.4000, 0.5000, 0.6000, 0.7000, 0.8000,
        0.9000, 1.0000])

In [21]:
empty = torch.empty(3, 3)
empty, empty.size(), empty.ndim, empty.sum(), empty.dtype

(tensor([[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]]),
 torch.Size([3, 3]),
 2,
 tensor(0.),
 torch.float32)

In [22]:
# Alike tensors: of the same size
torch.ones_like(empty)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])

## Tensor datatypes

- **int, uint:** 8bit, 16bit, 32bit, 64 bit
- **float, complex:** 16bit, 32bit, 64bit
- **Quantized int:** quint8, qint8
- etc

Lower precisions may faster models but will lower the performances.

In [23]:
dt1 = torch.tensor([3, 6])
dt2 = torch.tensor([3, 6.0])
dt3 = torch.tensor([3, 6.0], dtype=torch.float16)
dt4 = torch.tensor([3.1, 6.64])

dt1, dt1.dtype, dt2, dt2.dtype, dt3, dt3.dtype, dt4, dt4.dtype

(tensor([3, 6]),
 torch.int64,
 tensor([3., 6.]),
 torch.float32,
 tensor([3., 6.], dtype=torch.float16),
 torch.float16,
 tensor([3.1000, 6.6400]),
 torch.float32)

So, float32 is default

## Manipulating tensors

In [24]:
# addition

dt1 + 10, dt1 * 10, dt1 + 10 * 10

(tensor([13, 16]), tensor([30, 60]), tensor([103, 106]))

In [25]:
dt1.sum()

tensor(9)

In [26]:
tensor, tensor.sum(dim=2), tensor.sum(dim=1)

(tensor([[[1, 3, 5],
          [3, 6, 8],
          [1, 2, 3]]]),
 tensor([[ 9, 17,  6]]),
 tensor([[ 5, 11, 16]]))

In [27]:
# Multiplication

tensor.multiply(10), tensor.mul(10)

(tensor([[[10, 30, 50],
          [30, 60, 80],
          [10, 20, 30]]]),
 tensor([[[10, 30, 50],
          [30, 60, 80],
          [10, 20, 30]]]))

In [28]:
# multiplication 

tensor, tensor.mul(tensor), tensor * tensor

(tensor([[[1, 3, 5],
          [3, 6, 8],
          [1, 2, 3]]]),
 tensor([[[ 1,  9, 25],
          [ 9, 36, 64],
          [ 1,  4,  9]]]),
 tensor([[[ 1,  9, 25],
          [ 9, 36, 64],
          [ 1,  4,  9]]]))

## Dot product

In [29]:
tensor, torch.matmul(tensor, tensor), tensor @ tensor

(tensor([[[1, 3, 5],
          [3, 6, 8],
          [1, 2, 3]]]),
 tensor([[[15, 31, 44],
          [29, 61, 87],
          [10, 21, 30]]]),
 tensor([[[15, 31, 44],
          [29, 61, 87],
          [10, 21, 30]]]))

In [30]:
%%time

torch.matmul(tensor, tensor)

CPU times: user 49 μs, sys: 6 μs, total: 55 μs
Wall time: 37.2 μs


tensor([[[15, 31, 44],
         [29, 61, 87],
         [10, 21, 30]]])

In [31]:
%%time

tensor @ tensor

CPU times: user 176 μs, sys: 161 μs, total: 337 μs
Wall time: 269 μs


tensor([[[15, 31, 44],
         [29, 61, 87],
         [10, 21, 30]]])

So, @ is faster

In [32]:
tensor_A = torch.tensor([[1, 2],
                         [3, 4],
                         [5, 6]], dtype=torch.float32)

tensor_B = torch.tensor([[7, 10],
                         [8, 11], 
                         [9, 12]], dtype=torch.float32)

tensor_A.shape, tensor_B.shape, tensor_A.T.shape

(torch.Size([3, 2]), torch.Size([3, 2]), torch.Size([2, 3]))

In [33]:
tensor_A * tensor_A, torch.mul(tensor_A, tensor_A)

(tensor([[ 1.,  4.],
         [ 9., 16.],
         [25., 36.]]),
 tensor([[ 1.,  4.],
         [ 9., 16.],
         [25., 36.]]))

In [34]:
tensor_A * tensor_B, torch.mul(tensor_A, tensor_B)

(tensor([[ 7., 20.],
         [24., 44.],
         [45., 72.]]),
 tensor([[ 7., 20.],
         [24., 44.],
         [45., 72.]]))

In [35]:
torch.matmul(tensor_A, tensor_A.T)

tensor([[ 5., 11., 17.],
        [11., 25., 39.],
        [17., 39., 61.]])

In [36]:
torch.mm(tensor_A, tensor_A.T)

tensor([[ 5., 11., 17.],
        [11., 25., 39.],
        [17., 39., 61.]])

### Linear layer example

In [39]:
linear = torch.nn.Linear(in_features=4, out_features=1, device=device)

inp = torch.rand(size=(30, 4), device=device)
linear, inp.shape

(Linear(in_features=4, out_features=1, bias=True), torch.Size([30, 4]))

In [40]:
linear(inp)

tensor([[-0.2907],
        [-0.4820],
        [-0.3252],
        [-0.2800],
        [-0.2584],
        [ 0.0684],
        [-0.4656],
        [-0.1906],
        [-0.2500],
        [-0.1288],
        [-0.2551],
        [-0.1237],
        [-0.5341],
        [-0.3932],
        [-0.2758],
        [-0.3471],
        [-0.1721],
        [-0.5357],
        [-0.6416],
        [-0.1785],
        [-0.1923],
        [-0.1468],
        [-0.1921],
        [-0.2264],
        [-0.5574],
        [-0.2977],
        [-0.4666],
        [-0.1192],
        [-0.0720],
        [-0.3443]], device='mps:0', grad_fn=<LinearBackward0>)

In [41]:
x = torch.randint(low=10, high=100, size=(1, 20))
x

tensor([[65, 66, 34, 41, 12, 18, 46, 95, 82, 63, 11, 27, 25, 94, 22, 53, 71, 81,
         47, 26]])

In [47]:
x.min(), x.max()

(tensor(11), tensor(95))

In [51]:
# x.mean() This won't work as mean requires float datatype

In [52]:
x.to(dtype=torch.float16).mean()

tensor(48.9375, dtype=torch.float16)

In [56]:
x.median().type(torch.float16), x.mode()

(tensor(46., dtype=torch.float16),
 torch.return_types.mode(
 values=tensor([11]),
 indices=tensor([10])))

In [55]:
x.sort()

torch.return_types.sort(
values=tensor([[11, 12, 18, 22, 25, 26, 27, 34, 41, 46, 47, 53, 63, 65, 66, 71, 81, 82,
         94, 95]]),
indices=tensor([[10,  4,  5, 14, 12, 19, 11,  2,  3,  6, 18, 15,  9,  0,  1, 16, 17,  8,
         13,  7]]))

In [58]:
from collections import Counter

Counter(x[0])

Counter({tensor(65): 1,
         tensor(66): 1,
         tensor(34): 1,
         tensor(41): 1,
         tensor(12): 1,
         tensor(18): 1,
         tensor(46): 1,
         tensor(95): 1,
         tensor(82): 1,
         tensor(63): 1,
         tensor(11): 1,
         tensor(27): 1,
         tensor(25): 1,
         tensor(94): 1,
         tensor(22): 1,
         tensor(53): 1,
         tensor(71): 1,
         tensor(81): 1,
         tensor(47): 1,
         tensor(26): 1})

In [60]:
# Finding index of min and max

x, x.argmax(), x.argmin()

(tensor([[65, 66, 34, 41, 12, 18, 46, 95, 82, 63, 11, 27, 25, 94, 22, 53, 71, 81,
          47, 26]]),
 tensor(7),
 tensor(10))

## Calculating grads

In [65]:
x = torch.tensor([[2, 6.3, 9.9], [3.1, 6, 0.2]], device=device, requires_grad=True)
x, x.ndim, x.shape

(tensor([[2.0000, 6.3000, 9.9000],
         [3.1000, 6.0000, 0.2000]], device='mps:0', requires_grad=True),
 2,
 torch.Size([2, 3]))

In [None]:
out = x.pow(2).sum() # x^2
out

tensor(187.3500, device='mps:0', grad_fn=<SumBackward0>)

In [68]:
out.backward()

In [71]:
x.grad # returns 2x (d/dx of x^2)

tensor([[ 4.0000, 12.6000, 19.8000],
        [ 6.2000, 12.0000,  0.4000]], device='mps:0')

## Reshaping, Stacking, Squeezing and Unsqueezing

In [76]:
tensor_A, tensor_A.reshape(shape=(2, 3))

(tensor([[1., 2.],
         [3., 4.],
         [5., 6.]], device='mps:0'),
 tensor([[1., 2., 3.],
         [4., 5., 6.]], device='mps:0'))

In [78]:
tensor_A.view(torch.int32)

tensor([[1065353216, 1073741824],
        [1077936128, 1082130432],
        [1084227584, 1086324736]], device='mps:0', dtype=torch.int32)

In [81]:
torch.stack([tensor_A, tensor_B.to(device=device)])

tensor([[[ 1.,  2.],
         [ 3.,  4.],
         [ 5.,  6.]],

        [[ 7., 10.],
         [ 8., 11.],
         [ 9., 12.]]], device='mps:0')

In [85]:
torch.stack([tensor_A, tensor_B.to(device=device)], dim=2), torch.stack([tensor_A, tensor_B.to(device=device)], dim=1)

(tensor([[[ 1.,  7.],
          [ 2., 10.]],
 
         [[ 3.,  8.],
          [ 4., 11.]],
 
         [[ 5.,  9.],
          [ 6., 12.]]], device='mps:0'),
 tensor([[[ 1.,  2.],
          [ 7., 10.]],
 
         [[ 3.,  4.],
          [ 8., 11.]],
 
         [[ 5.,  6.],
          [ 9., 12.]]], device='mps:0'))

In [97]:
# unsqueeze: add a dimension to the specified dimension

tensor_B, tensor_B.squeeze(), tensor_B.unsqueeze(dim=0), tensor_B.unsqueeze(dim=1), tensor_B.unsqueeze(dim=1).shape

(tensor([[ 7., 10.],
         [ 8., 11.],
         [ 9., 12.]]),
 tensor([[ 7., 10.],
         [ 8., 11.],
         [ 9., 12.]]),
 tensor([[[ 7., 10.],
          [ 8., 11.],
          [ 9., 12.]]]),
 tensor([[[ 7., 10.]],
 
         [[ 8., 11.]],
 
         [[ 9., 12.]]]),
 torch.Size([3, 1, 2]))

In [102]:
# Permute: view as different shape

print(x, x.shape)
x.permute(dims=(1, 0))

tensor([[2.0000, 6.3000, 9.9000],
        [3.1000, 6.0000, 0.2000]], device='mps:0', requires_grad=True) torch.Size([2, 3])


tensor([[2.0000, 3.1000],
        [6.3000, 6.0000],
        [9.9000, 0.2000]], device='mps:0', grad_fn=<PermuteBackward0>)

In [104]:
y = torch.rand(size=(225, 220, 3))
print(y.shape)

perm = y.permute(2, 1, 0)
perm.shape

torch.Size([225, 220, 3])


torch.Size([3, 220, 225])

## Indexing

In [105]:
x = torch.arange(1, 10).reshape(1, 3, 3)
x

tensor([[[1, 2, 3],
         [4, 5, 6],
         [7, 8, 9]]])

In [111]:
x[0][0], x[0, 0]

(tensor([1, 2, 3]), tensor([1, 2, 3]))

In [110]:
x[0][:, 0], x[0, :, 0]

(tensor([1, 4, 7]), tensor([1, 4, 7]))

## Pytorch and numpy

In [112]:
import numpy as np

In [118]:
np1 = np.array([[2, 6, 7], [1, 8, 3]])
np1

array([[2, 6, 7],
       [1, 8, 3]])

In [123]:
t1 = torch.from_numpy(np1)
t1, t1.ndim, t1.shape, t1.device, t1.dtype, t1.requires_grad

(tensor([[2, 6, 7],
         [1, 8, 3]]),
 2,
 torch.Size([2, 3]),
 device(type='cpu'),
 torch.int64,
 False)

In [128]:
t1 = t1.to(device=device, dtype=torch.float32)
t1.requires_grad_() # Only for float types
t1, t1.ndim, t1.shape, t1.device, t1.dtype, t1.requires_grad

(tensor([[2., 6., 7.],
         [1., 8., 3.]], device='mps:0', requires_grad=True),
 2,
 torch.Size([2, 3]),
 device(type='mps', index=0),
 torch.float32,
 True)

In [131]:
obj = t1.square().sum() + t1.pow(3).sum()
obj

tensor(1270., device='mps:0', grad_fn=<AddBackward0>)

In [132]:
obj.backward()

In [135]:
t1.grad # would return 2t + 3t^2

tensor([[ 16., 120., 161.],
        [  5., 208.,  33.]], device='mps:0')

In [137]:
# tensor_A, tensor_A.numpy() This won't work as tensor_A is in mps. We need to convert it to cpu first

In [138]:
tensor_A, tensor_A.to(device="cpu").numpy()

(tensor([[1., 2.],
         [3., 4.],
         [5., 6.]], device='mps:0'),
 array([[1., 2.],
        [3., 4.],
        [5., 6.]], dtype=float32))

# Reproducability

Trying to make results less random: producing similar results on multiple iterations.

As long as ***torch.manual_seed()*** is set to a constant at the beginning of an application and all other sources of nondeterminism have been eliminated, the same series of random numbers will be generated each time the application is run in the same environment.

In [140]:
tensor_A.to(device='cpu').equal(tensor_B)

False

In [142]:
tensor_A.to(device='cpu') == tensor_B

tensor([[False, False],
        [False, False],
        [False, False]])

In [143]:
tensor_C = torch.rand(2, 3)
tensor_D = torch.rand(2, 3)

tensor_C, tensor_D, tensor_C == tensor_D

(tensor([[0.7355, 0.2723, 0.0496],
         [0.6839, 0.5021, 0.8806]]),
 tensor([[0.5721, 0.4580, 0.6320],
         [0.0926, 0.1080, 0.6762]]),
 tensor([[False, False, False],
         [False, False, False]]))

In [145]:
torch.manual_seed(53)
tensor_C = torch.rand(2, 3)
tensor_D = torch.rand(2, 3)

tensor_C, tensor_D, tensor_C == tensor_D

(tensor([[0.7456, 0.7613, 0.6584],
         [0.6439, 0.4481, 0.7004]]),
 tensor([[0.1568, 0.2288, 0.9780],
         [0.4037, 0.1519, 0.9458]]),
 tensor([[False, False, False],
         [False, False, False]]))

In [147]:
torch.manual_seed(53)
tensor_C = torch.rand(2, 3)

torch.manual_seed(53)
tensor_D = torch.rand(2, 3)

tensor_C, tensor_D, tensor_C == tensor_D

(tensor([[0.7456, 0.7613, 0.6584],
         [0.6439, 0.4481, 0.7004]]),
 tensor([[0.7456, 0.7613, 0.6584],
         [0.6439, 0.4481, 0.7004]]),
 tensor([[True, True, True],
         [True, True, True]]))

So, we have initialized same random values using similar random_seed value

In [148]:
torch.manual_seed(53)
tensor_C = torch.rand(2, 3)

torch.manual_seed(5)
tensor_D = torch.rand(2, 3)

tensor_C, tensor_D, tensor_C == tensor_D

(tensor([[0.7456, 0.7613, 0.6584],
         [0.6439, 0.4481, 0.7004]]),
 tensor([[0.8303, 0.1261, 0.9075],
         [0.8199, 0.9201, 0.1166]]),
 tensor([[False, False, False],
         [False, False, False]]))

## CPU <> GPU <> Apple Silicon

In [149]:
tensor_A

tensor([[1., 2.],
        [3., 4.],
        [5., 6.]], device='mps:0')

In [151]:
tensor_A.cpu(), tensor_A.to(device="cpu")

(tensor([[1., 2.],
         [3., 4.],
         [5., 6.]]),
 tensor([[1., 2.],
         [3., 4.],
         [5., 6.]]))