In [74]:
import torch
from torch import nn
import random

In [75]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, hidden_dim, num_heads):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
    
        # for each head, we create a linear layer for query, key, and value
        self.q_weights = [nn.Linear(hidden_dim, hidden_dim) for _ in range(self.num_heads)]
        self.k_weights = [nn.Linear(hidden_dim, hidden_dim) for _ in range(self.num_heads)]
        self.v_weights = [nn.Linear(hidden_dim, hidden_dim) for _ in range(self.num_heads)]
        
        self.softmax = nn.Softmax(dim = -1)
        self.linear = nn.Linear(num_heads*hidden_dim, hidden_dim)

    def forward(self, X):
        for x in X:
            x_result = []
            for head in range(self.num_heads):
                q = self.q_weights[head](x)
                k = self.k_weights[head](x)
                v = self.v_weights[head](x)
                
        

In [76]:
# A matrix like self.q_weights
w = [nn.Linear(4, 2) for _ in range(3)]
for i in range(len(w)):
    print(f'shape of {i}th weight matrix in w: {w[i].weight.shape}')
    print(f'shape of {i}th bias matrix in w: {w[i].bias.shape}')
    if i == 0:
        break
print("---")

for i in range(len(w)):
    print(f'{i}th weight matrix: {w[i].weight}')
    print(f'{i}th bias matrix: {w[i].bias}')
    if i == 0:
        break
print("---")

list = [0 for _ in range(4)]
print(list)
list = torch.tensor(list, dtype=torch.float32)
print(list.shape)
print('---')

input_data = list
results =[]
for linear_layer in w:
    output = linear_layer(input_data)
    if linear_layer == 0:
        print(output)
        print(output.shape)
    results.append(output)

print(results)
print(f'num rows in results: {len(results)}')
print(f'num cols in results: {len(results[0])}')





shape of 0th weight matrix in w: torch.Size([2, 4])
shape of 0th bias matrix in w: torch.Size([2])
---
0th weight matrix: Parameter containing:
tensor([[-0.1985, -0.1633,  0.3393,  0.0411],
        [-0.1433, -0.1576,  0.1237,  0.1431]], requires_grad=True)
0th bias matrix: Parameter containing:
tensor([-0.1693,  0.0828], requires_grad=True)
---
[0, 0, 0, 0]
torch.Size([4])
---
[tensor([-0.1693,  0.0828], grad_fn=<ViewBackward0>), tensor([ 0.1212, -0.1277], grad_fn=<ViewBackward0>), tensor([-0.3210,  0.1686], grad_fn=<ViewBackward0>)]
num rows in results: 3
num cols in results: 2


Running the code above shows that linear_layer(input_data) performs the following operation: $$((2, 4) \cdot (4,1)) + (2, 1) = (2, 1).$$ A more detailed explanation of the same idea is the following code:

In [77]:
# Initialize a list of linear layers (two linear layers).
t_weights = [nn.Linear(3, 3) for _ in range(2)]

print(f'The data type of t_weights is: {type(t_weights)}')
print(f'The data type of t_weights[0] is: {type(t_weights[0])}\n')

x = torch.tensor([[1.0, 1.0, 1.0]], dtype=torch.float32)
print(f'The torch tensor x is {x}, and has shape {x.shape}\n')

for i in range(len(t_weights)):

    print(f't_weights weights: {t_weights[i].weight}')
    print(f't_weights bias: {t_weights[i].bias}\n')

    print(f't_weights[i]: {t_weights[i]}')
    print(f't_weights[{i}]({x}): {t_weights[i](x)}\n')
    break

The data type of t_weights is: <class 'list'>
The data type of t_weights[0] is: <class 'torch.nn.modules.linear.Linear'>

The torch tensor x is tensor([[1., 1., 1.]]), and has shape torch.Size([1, 3])

t_weights weights: Parameter containing:
tensor([[ 0.1195,  0.3768,  0.3752],
        [ 0.1162, -0.1320, -0.3679],
        [-0.1940, -0.0523, -0.1250]], requires_grad=True)
t_weights bias: Parameter containing:
tensor([-0.3283,  0.0247,  0.1290], requires_grad=True)

t_weights[i]: Linear(in_features=3, out_features=3, bias=True)
t_weights[0](tensor([[1., 1., 1.]])): tensor([[ 0.5432, -0.3590, -0.2423]], grad_fn=<AddmmBackward0>)



t_weights = [nn.Linear(3, 3) for _ in range(2)] is a collection of 2 linear layers. Each linear layer is a matrix of weights plus a bias vector. The first weight matrix, t_weights[0], is: 
$$
\begin{pmatrix}
0.34 & 0.57 & -0.53 \\
-0.03 & -0.53 & 0.42 \\
-0.16 & 0.50 & 0.28
\end{pmatrix}
$$ 
and the vector $x$ is simply [1, 1, 1]. The bias vector is [-0.50, -0.43, 0.402].

These elements have shapes:

- t_weights[0].weight.shape = torch.Size([3,3])
- t_weights[0].bias.shape = torch.Size([3])
- x.shape = torch.Size([1,3])

The product t_weights[0] (x) is simply 
$
\begin{pmatrix}
0.34 & 0.57 & -0.53 \\
-0.03 & -0.53 & 0.42 \\
-0.16 & 0.50 & 0.28
\end{pmatrix} \cdot \begin{pmatrix}1 \\ 1 \\ 1 \end{pmatrix}
$


In [85]:
# Initialize a list of linear layers (three linear layers).
r = [nn.Linear(3, 3) for _ in range(3)]

print('THE LINEAR LAYER r[0]')
for i in range(len(r)):
    print(f'r[{i}]: {r[i]}')
    print(f'r[{i}].weight: {r[i].weight}')
    print(f'r[{i}].bias: {r[i].bias}\n')
    if i == 0:
        break

#Stores the results of the linear layers applied to the vector x
r_x = [r[i](x) for i in range(len(r))]

print("--------\nRESULT OF AFFINE TRANSFORMATIONS r[i](x):")
# Prints the affine transformations r[i](x)
for i in range(len(r)):
    print(f'r_x[{i}]: {r_x[i]}')


THE LINEAR LAYER r[0]
r[0]: Linear(in_features=3, out_features=3, bias=True)
r[0].weight: Parameter containing:
tensor([[ 0.3664,  0.1359, -0.2147],
        [ 0.2572,  0.0596, -0.4002],
        [-0.0569, -0.1019, -0.2516]], requires_grad=True)
r[0].bias: Parameter containing:
tensor([ 0.4570, -0.4915, -0.1332], requires_grad=True)

--------
RESULT OF AFFINE TRANSFORMATIONS r[i](x):
r_x[0]: tensor([[ 0.7446, -0.5749, -0.5436]], grad_fn=<AddmmBackward0>)
r_x[1]: tensor([[-0.7463, -0.9879,  0.5693]], grad_fn=<AddmmBackward0>)
r_x[2]: tensor([[-0.9665,  0.6212, -0.1540]], grad_fn=<AddmmBackward0>)


The list $r$ is a collection of three linear layers. Consider the first layer, $r[0]$ - it consists of a weight matrix $r[0].weight$, and a bias term, $r[0].bias$. The list $r_x$ consists of the linear layers $r[i]$ applied to $x$. The product of $x$ - $(3,1)$ -  with $r[0]$ - $(3,3)$ is a vector of shape $(3,1)$ to which we add the bias $(3,1)$

In [90]:
#Multiplying vectors using @
print(f'vector_product = r[{0}](x) @ r[{1}](x): {r[0](x) @ r[1](x).T}')
vector_product = r[0](x) @ r[1](x).T
print('---------\n')

vector_product_2 = vector_product @  r[2](x)
print(f'vector_product: {vector_product}')
print(f'r[2](x): {r[2](x)}')
print(f'vector_product_2: {vector_product_2}')

print('---------\n')

print(f'r[0](x).shape: {r[0](x).shape}')
print(f'vector_product_2.shape: {vector_product_2.shape}')

vector_product = r[0](x) @ r[1](x): tensor([[-0.2973]], grad_fn=<MmBackward0>)
---------

vector_product: tensor([[-0.2973]], grad_fn=<MmBackward0>)
r[2](x): tensor([[-0.9665,  0.6212, -0.1540]], grad_fn=<AddmmBackward0>)
vector_product_2: tensor([[ 0.2873, -0.1847,  0.0458]], grad_fn=<MmBackward0>)
---------

r[0](x).shape: torch.Size([1, 3])
vector_product_2.shape: torch.Size([1, 3])


The dot product for the vectors $r[0](x)$ and $r[1](x)$ - both of shape (3, 1): rows, columns - is a scalar. We take the product of this scalar with @ again to obtain another (3,1) vector.

In [79]:

softmax = nn.Softmax(dim = 1)
list = [[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]]

list = torch.tensor(list, dtype=torch.float32)
print(f'Origianl list (as tensor):\n {list}')
softmax_list = softmax(list)
print(f'List after softmax:\n {softmax_list}')

Origianl list (as tensor):
 tensor([[1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9.]])
List after softmax:
 tensor([[0.0900, 0.2447, 0.6652],
        [0.0900, 0.2447, 0.6652],
        [0.0900, 0.2447, 0.6652]])


Our list tensor has two dimensions since it is two dimensional. We can select dim = 1 or 0 (or -1 = 1). Applying softmax along dim = 0 applies softmax along columns in our 2d array. Selecting dim = 1 applies softmax along rows.