In [115]:
import torch
from torch import nn
import random

In [116]:


class MultiHeadSelfAttention(nn.Module):
    """Multi-head self-attention where every head has its own full-width projections.

    Each head owns three ``nn.Linear(hidden_dim, hidden_dim)`` layers (query, key
    and value). The outputs of all heads are concatenated along the feature axis
    and mapped back to ``hidden_dim`` by a final linear layer.

    Args:
        hidden_dim: Size of the feature (last) dimension of the input.
        num_heads: Number of independent attention heads.
    """

    def __init__(self, hidden_dim, num_heads):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads

        # For each head, we create a linear layer for query, key, and value.
        # nn.ModuleList (rather than a plain Python list) registers the layers
        # with this module, so their weights show up in parameters() and
        # state_dict() and follow .to(device) calls.
        self.q_weights = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for _ in range(num_heads)])
        self.k_weights = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for _ in range(num_heads)])
        self.v_weights = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for _ in range(num_heads)])

        # dim=-1 normalizes over the last axis: in the (seq_len, seq_len) score
        # matrix built in forward(), each row (one query position) is turned into
        # weights over all key positions that sum to 1.
        self.softmax = nn.Softmax(dim=-1)
        # Projects the concatenated head outputs back down to hidden_dim.
        self.linear = nn.Linear(num_heads * hidden_dim, hidden_dim)

    def forward(self, X):
        """Apply self-attention to ``X``.

        Args:
            X: Tensor of shape ``(..., seq_len, hidden_dim)``, typically
                ``(batch, seq_len, hidden_dim)``. It must have at least two
                dimensions.

        Returns:
            Tensor with the same shape as ``X``.
        """
        # Every head projects to hidden_dim features, so that is the key size
        # used for the usual 1/sqrt(d_k) scaling of the scores.
        scale = self.hidden_dim ** 0.5

        head_outputs = []
        for head in range(self.num_heads):
            # Applies the head's affine transformations to the input.
            q = self.q_weights[head](X)
            k = self.k_weights[head](X)
            v = self.v_weights[head](X)

            # (..., seq_len, seq_len): similarity of every query with every key.
            scores = q @ k.transpose(-2, -1) / scale
            attention = self.softmax(scores)

            # Weighted average of the value vectors for each query position.
            head_outputs.append(attention @ v)

        # (..., seq_len, num_heads * hidden_dim) -> (..., seq_len, hidden_dim)
        return self.linear(torch.cat(head_outputs, dim=-1))
                
        

In [138]:
# Demo: an nn.Linear layer maps a 4-entry vector to a 3-entry vector.
x = torch.randn((4,))
print(f'Shape of x: {x.shape}')
print(f'Input x: {x}')


# The layer holds the weight matrix and bias of the affine map.
linear = nn.Linear(in_features=4, out_features=3)
print(f'linear.weight.shape: {linear.weight.shape}')
print(f'linear.weight: {linear.weight}')

# Calling the layer applies the affine map to x; y holds the result.
y = linear(x)
print(f'Shape of y: {y.shape}')
print(f'y: {y}')


Shape of x: torch.Size([4])
Input x: tensor([-0.0198, -0.4477,  0.5764,  0.8019])
linear.weight.shape: torch.Size([3, 4])
linear.weight: Parameter containing:
tensor([[-0.4017, -0.0048, -0.1204,  0.0214],
        [ 0.0603,  0.3952,  0.1923, -0.1752],
        [ 0.4260,  0.1708, -0.3359, -0.2139]], requires_grad=True)
Shape of y: torch.Size([3])
y: tensor([ 0.3387, -0.5468, -0.3428], grad_fn=<ViewBackward0>)


Running the code above shows that linear(x) performs the following operation: $$((3, 4) \cdot (4,1)) + (3, 1) = (3, 1).$$ A more detailed explanation of the same idea is the following example:

t_weights = [nn.Linear(3, 3) for _ in range(2)] is a collection of 2 linear layers. Each linear layer is a matrix of weights plus a bias vector. The first weight matrix, t_weights[0], is: 
$$
\begin{pmatrix}
0.34 & 0.57 & -0.53 \\
-0.03 & -0.53 & 0.42 \\
-0.16 & 0.50 & 0.28
\end{pmatrix}
$$ 
and the vector $x$ is simply [0, 0, 0]. The bias vector is [-0.50, -0.43, 0.402].

These elements have shapes:

- t_weights[0].weight.shape = torch.Size([3,3])
- t_weights[0].bias.shape = torch.Size([3])
- x.shape = torch.Size([1,3])

The product of t_weights[0].weight and x (written as a column vector) is simply 
$
\begin{pmatrix}
0.34 & 0.57 & -0.53 \\
-0.03 & -0.53 & 0.42 \\
-0.16 & 0.50 & 0.28
\end{pmatrix} \cdot \begin{pmatrix}0 \\ 0 \\ 0 \end{pmatrix} = \begin{pmatrix}0 \\ 0 \\ 0 \end{pmatrix}
$

Adding the bias, we have t_weights[0] (x) = $ 
\begin{pmatrix}
0.34 & 0.57 & -0.53 \\
-0.03 & -0.53 & 0.42 \\
-0.16 & 0.50 & 0.28
\end{pmatrix} \cdot \begin{pmatrix}0 \\ 0 \\ 0 \end{pmatrix} + \begin{pmatrix}-0.50 \\ -0.43 \\ 0.402 \end{pmatrix} = \begin{pmatrix}-0.50 \\ -0.43 \\ 0.402 \end{pmatrix}
$

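represented by the tensor tensor([[-0.50, -0.43, 0.402]], grad_fn=<AddmmBackward0>). Note that this tensor has shape (1, 3) rather than (3, 1): x has shape (1, 3), and PyTorch computes $x W^T + b$ with $x$ as a row vector, which is the transpose of the column-vector calculation above. The values are the same; only the orientation differs.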

We may also apply t_weights[i] to a matrix of shape $(3, 3)$. The result is $(3, 3) \cdot (3, 3) = (3, 3)$, to which the bias is added: PyTorch stores the bias with shape $(3,)$ and broadcasts it across every row.

The list $r$ is a collection of three linear layers. Consider the first layer, $r[0]$ - it consists of a weight matrix $r[0].weight$, and a bias term, $r[0].bias$. The list $r_x$ consists of the linear layers $r[i]$ applied to $x$. The product of $x$ - $(3,1)$ -  with $r[0]$ - $(3,3)$ is a vector of shape $(3,1)$ to which we add the bias $(3,1)$

The dot product for the vectors $r[0](x)$ and $r[1](x)$ - both of shape (3, 1): rows, columns - is a scalar. We take the product of this scalar with @ again to obtain another (3,1) vector.

In [122]:

# Softmax over dim=1 of a 2-D tensor normalizes each row so its entries sum to 1.
softmax = nn.Softmax(dim = 1)

# Named `scores` rather than `list` so the built-in `list` type is not shadowed
# for the rest of the notebook, and the name is not rebound from a nested list
# to a tensor.
scores = torch.tensor(
    [[1, 2, 3],
     [4, 5, 6],
     [7, 8, 9]],
    dtype=torch.float32,
)
print(f'Origianl list (as tensor):\n {scores}')
softmax_list = softmax(scores)
print(f'List after softmax:\n {softmax_list}')

Origianl list (as tensor):
 tensor([[1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9.]])
List after softmax:
 tensor([[0.0900, 0.2447, 0.6652],
        [0.0900, 0.2447, 0.6652],
        [0.0900, 0.2447, 0.6652]])


Our list tensor has two dimensions (rows and columns), so we can select dim = 0 or dim = 1 (dim = -1 refers to the last dimension, which here is dim = 1). Applying softmax along dim = 0 normalizes each column so that its entries sum to 1. Selecting dim = 1 normalizes each row, as in the output above, where every row sums to 1.

In [127]:
# Build a 4-input, 3-output linear layer and inspect the weight it stores.
in_features, out_features = 4, 3
y = nn.Linear(in_features=in_features, out_features=out_features)
print(y.weight)
print(y.weight.shape)


Parameter containing:
tensor([[ 0.4494,  0.1101, -0.4059, -0.4762],
        [-0.1340, -0.4289, -0.3559,  0.4126],
        [-0.3729,  0.3202, -0.2928, -0.2099]], requires_grad=True)
torch.Size([3, 4])


In [134]:
import torch
import torch.nn as nn

# A random vector with 4 entries stands in for the output of a 4-node layer.
x = torch.randn((4,))  # Shape (4,)
print(x)
print(x.shape)

# Layer mapping 4 input features to 3 output features.
linear = nn.Linear(in_features=4, out_features=3)

# Parameters held by the layer:
print(f"Weight shape: {linear.weight.shape}")  # (3, 4), i.e. (out, in) - not (4, 3)!
print(f"Bias shape: {linear.bias.shape}")      # (3,)
print(linear.weight)
print(linear.bias)

# Calling the layer computes y = x @ W^T + b.
y = linear(x)  # Shape (3,)
print(y)

tensor([-0.9926, -0.9342, -0.5142,  1.2954])
torch.Size([4])
Weight shape: torch.Size([3, 4])
Bias shape: torch.Size([3])
Parameter containing:
tensor([[ 0.0319,  0.4415, -0.2730,  0.0101],
        [ 0.3719,  0.0518, -0.2535, -0.4235],
        [ 0.4518, -0.1185, -0.1704,  0.2925]], requires_grad=True)
Parameter containing:
tensor([-0.4185,  0.3588, -0.4593], requires_grad=True)
tensor([-0.7092, -0.4770, -0.3304], grad_fn=<ViewBackward0>)


In [136]:
# Show the transposed weight W^T used in y = x @ W^T + b; for a (3, 4) weight
# this is a (4, 3) matrix.
print(linear.weight.T)

tensor([[ 0.0319,  0.3719,  0.4518],
        [ 0.4415,  0.0518, -0.1185],
        [-0.2730, -0.2535, -0.1704],
        [ 0.0101, -0.4235,  0.2925]], grad_fn=<PermuteBackward0>)


In [129]:
# Reproduce the layer's output by hand from its stored parameters.
# Relies on `linear`, `x` and `y` from the cell that builds the 4 -> 3 layer.
W, b = linear.weight, linear.bias    # expected shapes (3, 4) and (3,)
y_manual = torch.matmul(x, W.T) + b  # Same as linear(x)

print(torch.allclose(y, y_manual))  # True

True
