In [None]:
# In this notebook, you learn:
#
# 1) How matrix multiplication is carried out with tensors.
# 2) How broadcasting works in PyTorch / Python.
# 3) How torch.scatter_ works in PyTorch.

In [2]:
import torch

## [torch.matmul](https://pytorch.org/docs/stable/generated/torch.matmul.html#torch-matmul)

In [3]:
# 2x2 integer matrix used as the left operand in the matmul examples that follow.
t1 = torch.tensor([[1, 2], [3, 4]], dtype=torch.int32)
print(t1, t1.shape, sep="\n")

tensor([[1, 2],
        [3, 4]], dtype=torch.int32)
torch.Size([2, 2])


In [4]:
# Second 2x2 integer matrix; it is multiplied with t1 in the next cell.
t2 = torch.tensor([[5, 6], [7, 8]], dtype=torch.int32)
print(t2, t2.shape, sep="\n")

tensor([[5, 6],
        [7, 8]], dtype=torch.int32)
torch.Size([2, 2])


In [6]:
# '@' is shorthand notation for matrix multiplication. It performs the same operation as torch.matmul.
#
# This performs the following matrix multiplication:
# |1, 2| x |5, 6| = |19 22|
# |3, 4|   |7, 8|   |43 50|
t3 = t1 @ t2
t4 = torch.matmul(t1, t2)
print(t3, t3.shape, "\n")
print(t4, t4.shape, "\n")

# torch.equal is True only when both tensors have the same shape and the same elements.
if torch.equal(t3, t4):
    print("Both t3 and t4 tensors contain the same values i.e., t3 = t4")
else:
    print("Elements in t3 are different from elements in t4 i.e., t3 != t4")

tensor([[19, 22],
        [43, 50]], dtype=torch.int32) torch.Size([2, 2]) 

tensor([[19, 22],
        [43, 50]], dtype=torch.int32) torch.Size([2, 2]) 

Both t3 and t4 tensors contain the same values i.e., t3 = t4


In [7]:
# A 1D tensor is effectively a vector.
t5 = torch.tensor([9, 10], dtype=torch.int32)
print(t5, t5.shape, sep="\n")

tensor([ 9, 10], dtype=torch.int32)
torch.Size([2])


In [9]:
# When one operand of '@' is a 1D tensor (a vector), '@' carries out a matrix-vector product.
#
# Here the product being computed is:
# |1, 2| x |9 | = |29|
# |3, 4|   |10|   |67|
t6 = t1 @ t5
print(t6, t6.shape, sep="\n")

tensor([29, 67], dtype=torch.int32)
torch.Size([2])


In [10]:
# Second vector; it is combined with t5 in the dot-product example below.
t7 = torch.tensor([11, 12], dtype=torch.int32)
print(t7, t7.shape, sep="\n")

tensor([11, 12], dtype=torch.int32)
torch.Size([2])


In [11]:
# When both operands of '@' are 1D tensors (vectors), '@' computes their dot product.
#
# Here the dot product being computed is:
# |9, 10| . |11, 12| = (9 * 11) + (10 * 12) = 219
t8 = t5 @ t7
print(t8, t8.shape, sep="\n")

tensor(219, dtype=torch.int32)
torch.Size([])


## [Broadcasting in pytorch](https://pytorch.org/docs/stable/notes/broadcasting.html#broadcasting-semantics)

In [2]:
# Resources to go through to understand broadcasting before continuing further in this notebook:
# https://www.youtube.com/watch?v=tKcLaGdvabM 
#       -- Explains what broadcasting is and how it works on matrices with examples.
# https://pytorch.org/docs/stable/notes/broadcasting.html#broadcasting-semantics
#       -- Explains how shape of the input tensors and resultant tensors get manipulated during broadcasting.
#
# Copied the below conditions from pytorch official documentation:
#
# Two tensors are broadcastable if the following conditions hold:
# 1) Each tensor has at least one dimension.
# 2) When iterating over the dimension sizes, starting at the trailing dimension (last dimension), 
#    the dimension sizes must either be equal, one of them is 1, or one of them does not exist.


In [5]:
# 3D float tensor of shape (2, 2, 4); it is the left operand in the broadcasting examples below.
t9 = torch.tensor(
    [
        [[1, 2, 3, 4], [5, 6, 7, 8]],
        [[9, 10, 11, 12], [13, 14, 15, 16]],
    ],
    dtype=torch.float32,
)
print(t9, t9.shape, sep="\n")

tensor([[[ 1.,  2.,  3.,  4.],
         [ 5.,  6.,  7.,  8.]],

        [[ 9., 10., 11., 12.],
         [13., 14., 15., 16.]]])
torch.Size([2, 2, 4])


In [6]:
# As covered in the 'understanding_tensors.ipynb' notebook, a '1D' tensor is neither a column
# vector nor a row vector; how it behaves depends on the context in which it is used.
t10 = torch.tensor([2, 3, 4, 5], dtype=torch.float32)
print(t10, t10.shape, sep="\n")

tensor([2., 3., 4., 5.])
torch.Size([4])


In [7]:
# Here '*' is a simple element-by-element multiplication. However, the shapes of t9 (2, 2, 4)
# and t10 (4) are not the same. So, broadcasting does its magic here to bring both the tensors
# to the same shape before applying the multiplication operation.
#
# In this context t10 ([2, 3, 4, 5]) behaves as a row vector. Let's find out how the shape of the result (t11)
# is obtained from the shapes of t9, t10 as explained in the official PyTorch documentation (link above).
# t9  --> (2, 2, 4)
# t10 --> (_, _, 4)
# t11 --> (2, 2, 4)
#
# Steps involved in broadcasting:
# 1) Size (4) of dimension 2 is the same for both tensors t9, t10. So, no changes here.
# 2) Dimension 1 of t10 needs to be made of size 2. So, the 1D tensor is copied to give the 2D tensor.
#           -- [2, 3, 4, 5] of t10 gets broadcasted to the 2D tensor [[2, 3, 4, 5], [2, 3, 4, 5]]
# 3) Dimension 0 of t10 needs to be made of size 2. So, the 2D tensor is copied to give the 3D tensor.
#           -- [[2, 3, 4, 5], [2, 3, 4, 5]] gets broadcasted to the 3D tensor [[[2, 3, 4, 5], [2, 3, 4, 5]], [[2, 3, 4, 5], [2, 3, 4, 5]]]
#
# Now element-wise multiplication happens between the two tensors to give t11.
# [[[1 * 2, 2 * 3, 3 * 4, 4 * 5], [5 * 2, 6 * 3, 7 * 4, 8 * 5]], [[9 * 2, 10 * 3, 11 * 4, 12 * 5], [13 * 2, 14 * 3, 15 * 4, 16 * 5]]]
# The left elements belong to t9 and the right elements belong to t10 (after broadcasting) in each
# multiplication.
t11 = t9 * t10

# Sanity check of the steps above: expand_as gives t10 the explicit (2, 2, 4) shape of t9, and
# multiplying by that expanded tensor must match the implicitly broadcast result.
assert torch.equal(t11, t9 * t10.expand_as(t9))
print(t11)
print(t11.shape)

tensor([[[ 2.,  6., 12., 20.],
         [10., 18., 28., 40.]],

        [[18., 30., 44., 60.],
         [26., 42., 60., 80.]]])
torch.Size([2, 2, 4])


In [9]:
# Float tensor of shape (1, 2, 1); it is added to t9 in the next broadcasting example.
t12 = torch.tensor([[[10], [20]]], dtype=torch.float32)
print(t12, t12.shape, sep="\n")

tensor([[[10.],
         [20.]]])
torch.Size([1, 2, 1])


In [10]:
# Here '+' is a simple element-by-element addition. However, the shapes of t9 (2, 2, 4)
# and t12 (1, 2, 1) are not the same. So, broadcasting does its magic here to bring both the tensors
# to the same shape before applying the addition operation.
#
# Let's find out how the shape of the result (t13) is obtained from the shapes of t9, t12 according
# to the rules in the official PyTorch documentation (link above).
# t9  --> (2, 2, 4)
# t12 --> (1, 2, 1)
# t13 --> (2, 2, 4)
#
# Steps involved in broadcasting:
# 1) Dimension 2 of t12 needs to be made 4. So, the element in each 1D tensor in the last dimension is broadcasted (copied).
#           -- [10] --> [10, 10, 10, 10]
#           -- [20] --> [20, 20, 20, 20]
#           -- So, [[[10], [20]]] gets broadcasted to [[[10, 10, 10, 10], [20, 20, 20, 20]]]
# 2) Dimension 1 is already 2 in both the tensors (t9, t12). So, no changes here.
# 3) Dimension 0 of t12 needs to be made of size 2. So, the 2D tensor gets copied to give the 3D tensor.
#           -- [[[10, 10, 10, 10], [20, 20, 20, 20]]] gets broadcasted to [[[10, 10, 10, 10], [20, 20, 20, 20]], [[10, 10, 10, 10], [20, 20, 20, 20]]]
#
# Now, element-wise addition happens between the two tensors to give t13.
# The left elements belong to t9 and the right elements belong to t12 (after broadcasting) in each
# addition.
t13 = t9 + t12

# Sanity check of the steps above: expand_as gives t12 the explicit (2, 2, 4) shape of t9, and
# adding that expanded tensor must match the implicitly broadcast result.
assert torch.equal(t13, t9 + t12.expand_as(t9))
print(t13)
print(t13.shape)

tensor([[[11., 12., 13., 14.],
         [25., 26., 27., 28.]],

        [[19., 20., 21., 22.],
         [33., 34., 35., 36.]]])
torch.Size([2, 2, 4])


In [13]:
# Next, an example in which both tensors have to be expanded by the broadcast. In the earlier
# examples only the right-hand tensor was ever broadcasted.
t14 = torch.arange(12).reshape(1, 3, 4)
print(t14, t14.shape, sep="\n")

tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]]])
torch.Size([1, 3, 4])


In [16]:
# 4D tensor of shape (2, 2, 1, 4) holding the values 0..15; it is added to t14 in the next cell.
t15 = torch.arange(end=16).reshape(2, 2, 1, 4)
print(t15, t15.shape, sep="\n")

tensor([[[[ 0,  1,  2,  3]],

         [[ 4,  5,  6,  7]]],


        [[[ 8,  9, 10, 11]],

         [[12, 13, 14, 15]]]])
torch.Size([2, 2, 1, 4])


In [17]:
# Here '+' is a simple element-by-element addition. However, the shapes of t14 (1, 3, 4)
# and t15 (2, 2, 1, 4) are not the same. So, broadcasting does its magic here to bring both the tensors
# to the same shape before applying the addition operation.
#
# Let's find out how the shape of the result (t16) is obtained from the shapes of t14, t15 according to
# the rules in the official PyTorch documentation (link above).
# t14 --> (_, 1, 3, 4)
# t15 --> (2, 2, 1, 4)
# t16 --> (2, 2, 3, 4)
#
# Steps involved in broadcasting (dimensions are numbered 0..3 on the aligned 4D shapes above):
# 1) No changes in dimension 3 since both the tensors have the same size (4) in dimension 3.
# 2) Dimension 2 of t15 needs to be made of size 3. So, each 1D tensor along dimension 3 gets copied 3 times to give a 2D tensor.
#           -- [0, 1, 2, 3]     --> [[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]]
#           -- [4, 5, 6, 7]     --> [[4, 5, 6, 7], [4, 5, 6, 7], [4, 5, 6, 7]]
#           -- [8, 9, 10, 11]   --> [[8, 9, 10, 11], [8, 9, 10, 11], [8, 9, 10, 11]]
#           -- [12, 13, 14, 15] --> [[12, 13, 14, 15], [12, 13, 14, 15], [12, 13, 14, 15]]
# 3) Dimension 1 of t14 needs to be made 2. So, the 2D tensor gets copied to create a 3D tensor.
#           -- [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]] --> [[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]]
# 4) Dimension 0 of t14 needs to be made 2. So, the 3D tensor gets copied to create a 4D tensor.
#           -- [[[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]], [[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]]]
#
# Now, element-wise addition happens between the two tensors to give t16.
# The left elements belong to t14 (after broadcasting) and right elements belong to t15 (after broadcasting) in each
# addition.
t16 = t14 + t15

# Sanity check of the steps above: expanding both operands explicitly to the result shape
# (2, 2, 3, 4) and adding them must match the implicitly broadcast result.
assert torch.equal(t16, t14.expand(2, 2, 3, 4) + t15.expand(2, 2, 3, 4))
print(t16)
print(t16.shape)

tensor([[[[ 0,  2,  4,  6],
          [ 4,  6,  8, 10],
          [ 8, 10, 12, 14]],

         [[ 4,  6,  8, 10],
          [ 8, 10, 12, 14],
          [12, 14, 16, 18]]],


        [[[ 8, 10, 12, 14],
          [12, 14, 16, 18],
          [16, 18, 20, 22]],

         [[12, 14, 16, 18],
          [16, 18, 20, 22],
          [20, 22, 24, 26]]]])
torch.Size([2, 2, 3, 4])


## [torch.scatter_](https://pytorch.org/docs/stable/generated/torch.Tensor.scatter_.html#torch-tensor-scatter)

In [3]:
# Excellent resource to understand scatter:
# 1) https://yuyangyy.medium.com/understand-torch-scatter-b0fd6275331c
#       -- Please go through this link to understand scatter in detail with examples before continuing further.
# 
# _ at the end of 'scatter_' indicates that this is an in-place operation. 
# scatter basically scatters the inputs from the src tensor to the destination tensor according to the index tensor.
# The index tensor is used to determine the position in the destination tensor where the elements from the src tensor
# need to be placed. The src, index and destination tensors need to have the same number of dimensions. Note that
# it is the same number of dimensions and not the same shape.

In [5]:
# Start with the case in which src is a tensor: a (2, 5) tensor holding the values 1..10.
src = torch.arange(1, 11).reshape(2, 5)
print("shape: ", src.shape)
print("src: \n", src)

shape:  torch.Size([2, 5])
src: 
 tensor([[ 1,  2,  3,  4,  5],
        [ 6,  7,  8,  9, 10]])


In [13]:
# index is 2D, just like src, but its shape (1, 4) is different from the shape of src (2, 5).
index = torch.tensor([[0, 1, 2, 0]], dtype=torch.long)
print("shape: ", index.shape)
print("index: \n", index)

shape:  torch.Size([1, 4])
index: 
 tensor([[0, 1, 2, 0]])


In [14]:
# All-zero (3, 5) tensor that receives the scattered values in the next cell.
destination = torch.zeros((3, 5), dtype=torch.long)
print("shape: ", destination.shape)
print("destination: \n", destination)

shape:  torch.Size([3, 5])
destination: 
 tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]])


In [15]:
# src is [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]] and index is [[0, 1, 2, 0]].
#
# With dim=0, every position (i, j) that exists in index sends src[i][j] to destination[index[i][j]][j]:
# index[0][0] = 0 --> destination[0][0] = src[0][0] = 1
# index[0][1] = 1 --> destination[1][1] = src[0][1] = 2
# index[0][2] = 2 --> destination[2][2] = src[0][2] = 3
# index[0][3] = 0 --> destination[0][3] = src[0][3] = 4
#
# index has shape (1, 4), so it has no entry at position [0][4] and no second row at all. An element
# of src whose position has no matching entry in index (src[0][4] = 5 and every element of src[1])
# is never placed in destination. For example, destination[0][4] stays 0.
#
# Every destination entry that is not written by one of the four assignments above keeps its
# initial value of 0.
destination.scatter_(0, index, src=src)
print("shape: ", destination.shape)
print("destination: \n", destination)

shape:  torch.Size([3, 5])
destination: 
 tensor([[1, 0, 0, 4, 0],
        [0, 2, 0, 0, 0],
        [0, 0, 3, 0, 0]])


In [None]:
# To have an intuitive understanding, scatter_ can be thought of as a way to scatter the elements from the src tensor
# to the destination tensor in a specific dimension. Lets say the destination tensor is a 2D tensor. If scatter_
# is used to scatter the elements along dimension 0, it means that the elements in a specific column in the src tensor
# are only scattered to other positions in the same column in the destination tensor. 
# 
# Consider the scatter_ formula --> destination[index[i][j]][j] = src[i][j] for all i, j.
# The elements in column j of the src tensor always stay in column j of the destination tensor. The only thing that
# changes is the row position of the elements in column j of the destination tensor. So, scatter_ has a specific 
# pattern intuitively on how it scatters the elements from the src tensor to the destination tensor. You can 
# extrapolate the same intuition to a different dimension (that is not zero) or higher dimensions as well.

In [25]:
# Second case: the value being scattered is a single scalar instead of a tensor.
scalar_src: int = 100

In [22]:
# Same (1, 4) index tensor as in the tensor-src example.
index = torch.tensor([[0, 1, 2, 0]], dtype=torch.long)
print("shape: ", index.shape)
print("index: \n", index)

shape:  torch.Size([1, 4])
index: 
 tensor([[0, 1, 2, 0]])


In [23]:
# Start again from an all-zero (3, 5) tensor, because the earlier scatter_ call wrote into
# destination in place.
destination = torch.zeros((3, 5), dtype=torch.long)
print("shape: ", destination.shape)
print("destination: \n", destination)

shape:  torch.Size([3, 5])
destination: 
 tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]])


In [26]:
# The value being scattered is the scalar 100 and index is unchanged: index = [[0, 1, 2, 0]].
#
# With a scalar there is no src tensor to read from. The same scalar is written to every position
# of destination that the index tensor selects, so only index decides where the writes happen.
#
# General formula for scatter_ with a scalar and dim=0:
#       destination[index[i][j]][j] = scalar_src for all i, j.
# index[0][0] = 0 --> destination[0][0] = 100
# index[0][1] = 1 --> destination[1][1] = 100
# index[0][2] = 2 --> destination[2][2] = 100
# index[0][3] = 0 --> destination[0][3] = 100
#
# index is defined at these four positions only, so these are the only entries of destination
# that receive the scalar value.
destination.scatter_(0, index, value=scalar_src)
print("shape: ", destination.shape)
print("destination: \n", destination)

shape:  torch.Size([3, 5])
destination: 
 tensor([[100,   0,   0, 100,   0],
        [  0, 100,   0,   0,   0],
        [  0,   0, 100,   0,   0]])
