In [None]:
import torch
from torch import nn
import random

In [None]:
class MultiHeadSelfAttention(nn.Module):
    """Holds the layers of a multi-head self-attention block.

    Every head gets its own query, key and value projection, each one an
    ``nn.Linear(hidden_dim, hidden_dim)``. A final linear layer maps
    ``num_heads * hidden_dim`` features down to ``hidden_dim`` (presumably
    applied to the concatenated head outputs -- no ``forward`` is defined here,
    so confirm against the code that uses these layers).

    Args:
        hidden_dim: Input and output feature size of every per-head projection.
        num_heads: Number of heads, i.e. how many query/key/value layers to create.
    """

    def __init__(self, hidden_dim, num_heads):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads

        # For each head, we create a linear layer for query, key, and value.
        # The layers are wrapped in nn.ModuleList instead of plain Python lists:
        # a plain list does not register its layers as submodules, so their
        # parameters would be missing from .parameters() / .state_dict() and
        # would not follow .to(device). ModuleList still supports indexing,
        # iteration and len().
        self.q_weights = nn.ModuleList(
            [nn.Linear(hidden_dim, hidden_dim) for _ in range(num_heads)]
        )
        self.k_weights = nn.ModuleList(
            [nn.Linear(hidden_dim, hidden_dim) for _ in range(num_heads)]
        )
        self.v_weights = nn.ModuleList(
            [nn.Linear(hidden_dim, hidden_dim) for _ in range(num_heads)]
        )

        # Softmax over the last dimension.
        self.softmax = nn.Softmax(dim=-1)
        # Maps num_heads * hidden_dim features back to hidden_dim.
        self.linear = nn.Linear(num_heads * hidden_dim, hidden_dim)
        

In [None]:
# A list of linear layers, like self.q_weights: three layers, each mapping 4 features to 2
w = [nn.Linear(4, 2) for _ in range(3)]
for i, layer in enumerate(w):
    print(f'shape of {i}th weight matrix in w: {layer.weight.shape}')
    print(f'shape of {i}th bias matrix in w: {layer.bias.shape}')
print("\n")

for i, layer in enumerate(w):
    print(f'{i}th weight matrix: {layer.weight}')
    print(f'{i}th bias matrix: {layer.bias}')
    print("---")

# Named `zeros` rather than `list`: rebinding `list` would shadow the built-in
# type for the rest of the kernel session and break any later `list(...)` call.
zeros = [0 for _ in range(4)]
print(zeros)
input_data = torch.tensor(zeros, dtype=torch.float32)
# Printed explicitly: a bare expression in the middle of a cell displays nothing.
print(input_data.shape)

# Feed the same input through every layer and collect the outputs.
# With an all-zero input, each output equals that layer's bias.
results = []
for linear_layer in w:
    output = linear_layer(input_data)
    print(output)
    print(output.shape)
    results.append(output)


Running the code above shows that `linear_layer(input_data)` computes $Wx + b$. Treating the length-4 input and the length-2 bias as column vectors, the shapes are: $$((2, 4) \cdot (4,1)) + (2, 1) = (2, 1)$$

In [None]:

# Softmax over dim 0: values are normalized down each column of a 2-D tensor.
softmax = nn.Softmax(dim = 0)

# Named `matrix` rather than `list`: rebinding `list` would shadow the built-in
# type for the rest of the kernel session and break any later `list(...)` call.
matrix = torch.tensor([[1, 2, 3],
                       [4, 5, 6],
                       [7, 8, 9]], dtype=torch.float32)
print(f'Original list (as tensor):\n {matrix}')
softmax_list = softmax(matrix)
print(f'List after softmax:\n {softmax_list}')

Our tensor is two-dimensional, so softmax can be applied along dim = 0 or dim = 1 (dim = -1 refers to the last dimension, which is dim = 1 here). Applying softmax along dim = 0 normalizes down each column of our 2d array, so the entries of every column of the result sum to 1.

In [None]:
# Initialize a list of linear layers (two linear layers).
# Each linear layer consists of a (3, 3) weight matrix and a length-3 bias vector
# (bias.shape is torch.Size([3])).
t_weights = [nn.Linear(3, 3) for _ in range(2)]

print(f'The data type of t_weights is: {type(t_weights)}')
print(f'The data type of t_weights[0] is: {type(t_weights[0])}\n')

# A single input row of ones, shape (1, 3).
x = torch.tensor([[1.0, 1.0, 1.0]], dtype=torch.float32)
print(f'The torch tensor x is {x}, and has shape {x.shape}\n')

for i, layer in enumerate(t_weights):

    # Every label carries the layer index so the output of the two layers
    # can be told apart.
    print(f't_weights[{i}] weights: {layer.weight}')
    print(f't_weights[{i}] bias: {layer.bias}\n')

    print(f't_weights[{i}]: {layer}')
    # Calling the layer applies its weights and bias to x.
    print(f't_weights[{i}]({x}): {layer(x)}\n')



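`t_weights = [nn.Linear(3, 3) for _ in range(2)]` is a collection of 2 linear layers. Each linear layer is a matrix of weights plus a bias vector. The layers are randomly initialized, so the exact numbers differ from run to run; in the run described here, the first weight matrix, `t_weights[0].weight`, is: 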
$$
\begin{pmatrix}
0.34 & 0.57 & -0.53 \\
-0.03 & -0.53 & 0.42 \\
-0.16 & 0.50 & 0.28
\end{pmatrix}
$$ 
and the vector $x$ is simply [1, 1, 1]. The bias vector is [-0.50, -0.43, 0.402].

These elements have shapes:

- t_weights[0].weight.shape = torch.Size([3,3])
- t_weights[0].bias.shape = torch.Size([3])
- x.shape = torch.Size([1,3])

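Calling t_weights[0] (x) computes the matrix-vector product and then adds the bias (written here in column-vector form): 
$
\begin{pmatrix}
0.34 & 0.57 & -0.53 \\
-0.03 & -0.53 & 0.42 \\
-0.16 & 0.50 & 0.28
\end{pmatrix} \cdot \begin{pmatrix}1 \\ 1 \\ 1 \end{pmatrix} + \begin{pmatrix}-0.50 \\ -0.43 \\ 0.402 \end{pmatrix}
$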
