In [52]:
import numpy as np

### Single Head Attention in Numpy

Let us assume we have already split the input image into patches, linearly projected them, prepended the class embedding and added the position embeddings, as discussed in the main notebook for the project - vision_transformer.ipynb. Now we have an input tensor of dimensions:<br>[1, 197, 768] <br>
[batch_size, number_of_patches, embedding_dimension] (the 197 entries along the second axis include the prepended class embedding).

 Below we will implement the attention mechanism for a single attention head using only NumPy arrays (`np.ndarray`) instead of tensors.

In [53]:
# Dimensions of the stand-in input: (batch_size, number_of_patches, embedding_dimension)
input_shape = (1, 197, 768)
# Fill the input with uniform random values between 0.0 and 1.0, stored as float32
patch_embeddings = np.random.uniform(
    low=0.0,
    high=1.0,
    size=input_shape,
).astype(np.float32)
# Display the array as the cell output
patch_embeddings

array([[[0.0662793 , 0.20086902, 0.88743186, ..., 0.02730168,
         0.70829993, 0.87831014],
        [0.5703314 , 0.26718548, 0.51736575, ..., 0.8105333 ,
         0.5822567 , 0.6799193 ],
        [0.73056495, 0.03794412, 0.8885933 , ..., 0.08197571,
         0.8888563 , 0.6545716 ],
        ...,
        [0.8376981 , 0.43273795, 0.10236892, ..., 0.27505141,
         0.08347566, 0.32058024],
        [0.5240691 , 0.70952046, 0.05152313, ..., 0.93939435,
         0.40975556, 0.4971096 ],
        [0.50677985, 0.9401297 , 0.0452047 , ..., 0.8495476 ,
         0.6295712 , 0.20522721]]], dtype=float32)

In [None]:
# Single-head scaled dot-product attention over `patch_embeddings`:
#   1. project the input to queries, keys and values,
#   2. score every query against every key and scale by sqrt(d_k),
#   3. softmax the scores over the key axis,
#   4. take the weighted sum of the values and project back to embedding_dim.

# Input dimensions
batch_size, num_patches, embedding_dim = patch_embeddings.shape


# Set query/key dimension (usually embedding dimension / number of heads).
# Only one head is computed here; num_heads just fixes the width of that head.
num_heads = 12
d_k = embedding_dim // num_heads
# Randomly initialised projection weights, scaled down by 0.1
W_q = np.random.randn(embedding_dim, d_k).astype(np.float32) * 0.1
W_k = np.random.randn(embedding_dim, d_k).astype(np.float32) * 0.1
W_v = np.random.randn(embedding_dim, d_k).astype(np.float32) * 0.1
W_o = np.random.randn(d_k, embedding_dim).astype(np.float32) * 0.1
print(f"Shape of projection weights is {W_q.shape}")


# Linear projections for Q, K, and V
# Here each 2d slice of the 3d input is multiplied with the weight matrices
Q = patch_embeddings @ W_q # Shape: (batch, number of patches, d_k)
K = patch_embeddings @ W_k # Shape: (batch, number of patches, d_k)
V = patch_embeddings @ W_v # Shape: (batch, number of patches, d_k)
print(f"Shape of Q, K and V is: {Q.shape}")

# Calculate attention scores.
# The scale is built as a float32 scalar: np.sqrt on a Python int returns a
# float64 scalar, which under NumPy 2 promotion rules would upcast the float32
# scores (and everything computed from them) to float64.
scale = np.sqrt(np.float32(d_k))
attention_scores = Q @ K.transpose(0, 2, 1) / scale # Shape: (batch, number of queries, number of keys)
print(f"Shape attention score is: {attention_scores.shape}")

# Apply softmax over the key axis.
# Subtracting each row's maximum leaves the softmax result mathematically
# unchanged but keeps np.exp from overflowing on large scores; the exponentials
# are computed once and reused for the normalisation.
shifted_scores = attention_scores - attention_scores.max(axis=-1, keepdims=True)
exp_scores = np.exp(shifted_scores)
attention_weights = exp_scores / exp_scores.sum(axis=-1, keepdims=True)
# Sanity check: every query's weights over the keys must sum to 1
np.testing.assert_allclose(attention_weights.sum(axis=-1), 1.0, atol=1e-4)
print(f"Shape attention weights is: {attention_weights.shape}")

# Weight values
attention_output = attention_weights @ V # Shape: (batch, number of patches, d_k)

print(f"Shape of attention_output is: {attention_output.shape}")
# Output projection back to the embedding dimension
output = attention_output @ W_o # Shape: (batch, number of patches, embedding_dim)
print(f"Shape of input  is: {patch_embeddings.shape}")
print(f"Shape of output after projection is: {output.shape}")

Shape of projection weights is (768, 64)
Shape of Q, K and V is: (1, 197, 64)
Shape attention score is: (1, 197, 197)
Shape attention weights is: (1, 197, 197)
Shape of attention_output is: (1, 197, 64)
Shape of input  is: (1, 197, 768)
Shape of output after projection is: (1, 197, 768)


We can see that for the attention layer the shape of the output matches the shape of the input.