In [1]:
import numpy as np
from scipy.special import logsumexp
from scipy.special import softmax

In [2]:
# Toy input: three embedding vectors (one per row), each of width d_model = 4.
print("Step 1: Input : 3 inputs, d_model=4")
x = np.array(
    [
        [1.0, 0.0, 1.0, 0.0],
        [0.0, 2.0, 0.0, 2.0],
        [1.0, 1.0, 1.0, 1.0],
    ]
)
print("x:", x)

Step 1: Input : 3 inputs, d_model=4
x: [[1. 0. 1. 0.]
 [0. 2. 0. 2.]
 [1. 1. 1. 1.]]


In [3]:
print("Step 2: weights 3 dimensions x d_model=4")

# Hand-picked integer projection matrices. Each has d_model = 4 rows and
# 3 columns, so multiplying an input row by one of them yields a 3-vector.
w_query = np.array(
    [
        [1, 0, 1],
        [1, 0, 0],
        [0, 0, 1],
        [0, 1, 1],
    ]
)
w_key = np.array(
    [
        [0, 0, 1],
        [1, 1, 0],
        [0, 1, 0],
        [1, 1, 0],
    ]
)
w_value = np.array(
    [
        [0, 2, 0],
        [0, 3, 0],
        [1, 0, 3],
        [1, 1, 0],
    ]
)

# Display the three matrices in query / key / value order.
print("w_query:", w_query)
print("w_key:", w_key)
print("w_value:", w_value)

Step 2: weights 3 dimensions x d_model=4
w_query: [[1 0 1]
 [1 0 0]
 [0 0 1]
 [0 1 1]]
w_key: [[0 0 1]
 [1 1 0]
 [0 1 0]
 [1 1 0]]
w_value: [[0 2 0]
 [0 3 0]
 [1 0 3]
 [1 1 0]]


In [4]:
print("Step 3: Matrix multiplication to obtain Q,K,V")

# Project the inputs with each weight matrix; `@` is the matmul operator.
print("Query: x * w_query")
Q = x @ w_query
print("Q:", Q)

print("Key: x * w_key")
K = x @ w_key
print("K:", K)

print("Value: x * w_value")
V = x @ w_value
print("V:", V)

Step 3: Matrix multiplication to obtain Q,K,V
Query: x * w_query
Q: [[1. 0. 2.]
 [2. 2. 2.]
 [2. 1. 3.]]
Key: x * w_key
K: [[0. 1. 1.]
 [4. 4. 0.]
 [2. 3. 1.]]
Value: x * w_value
V: [[1. 2. 3.]
 [2. 8. 0.]
 [2. 6. 3.]]


In [5]:
print("Step 4: Scaled Attention Scores")
# The usual divisor is the square root of the number of dimensions (3 here);
# this walkthrough uses 1, so the scores are the raw Q.K^T products.
k_d = 1
attention_scores = np.matmul(Q, K.T) / k_d
print(attention_scores)

Step 4: Scaled Attention Scores
[[ 2.  4.  4.]
 [ 4. 16. 12.]
 [ 4. 12. 10.]]


In [6]:
print("Step 5: Scaled softmax attention_scores for each vector")
# Normalise every row in a single call: axis=-1 applies softmax across each
# row (one row per query), so the code no longer hard-codes three rows.
# NOTE: this replaces the raw scores from Step 4 with probabilities, so
# re-running this cell on its own applies softmax a second time; re-run
# Step 4 first.
attention_scores = softmax(attention_scores, axis=-1)
for score_row in attention_scores:
    print(score_row)

Step 5: Scaled softmax attention_scores for each vector
[0.06337894 0.46831053 0.46831053]
[6.03366485e-06 9.82007865e-01 1.79861014e-02]
[2.95387223e-04 8.80536902e-01 1.19167711e-01]


In [7]:
print("Step 6: attention value obtained by score1/k_d * V")
for value_row in V:
    print(value_row)

# Weight each value row by the first query's (row 0) softmax score for the
# matching key. The previous `attention_scores[0].reshape(-1, 1)` assignment
# was overwritten on the very next line, so it has been dropped.
attention1 = attention_scores[0][0] * V[0]
print("Attention 1:", attention1)

attention2 = attention_scores[0][1] * V[1]
print("Attention 2:", attention2)

attention3 = attention_scores[0][2] * V[2]
print("Attention 3:", attention3)

Step 6: attention value obtained by score1/k_d * V
[1. 2. 3.]
[2. 8. 0.]
[2. 6. 3.]
Attention 1: [0.06337894 0.12675788 0.19013681]
Attention 2: [0.93662106 3.74648425 0.        ]
Attention 3: [0.93662106 2.80986319 1.40493159]


In [8]:
print("Step 7: summed the results to create the first line of the output matrix")
# Add the three weighted value rows from Step 6 element-wise, left to right.
attention_input1 = np.add(np.add(attention1, attention2), attention3)
print(attention_input1)

Step 7: summed the results to create the first line of the output matrix
[1.93662106 6.68310531 1.59506841]


In [9]:
print("Step 8: Step 1 to 7 for inputs 1 to 3")
# This is assuming that we had actually gone through the whole process for all 3
# We'll just take a random matrix of the correct dimensions in lieu
# A locally seeded generator keeps the placeholder identical on every
# Restart & Run All, without touching NumPy's global random state.
attention_head1 = np.random.default_rng(8).random((3, 64))
print(attention_head1)

Step 8: Step 1 to 7 for inputs 1 to 3
[[0.58751243 0.32876733 0.25570795 0.80796168 0.00177578 0.3998109
  0.34918701 0.83385879 0.34601251 0.40699334 0.5755632  0.74275413
  0.99623933 0.38535444 0.78002393 0.60278954 0.03993106 0.66794002
  0.20279803 0.74728954 0.53061567 0.1623918  0.60425018 0.36700845
  0.90030672 0.34287176 0.49342534 0.68111791 0.52227084 0.23022488
  0.60751024 0.29777488 0.19143023 0.86031908 0.95459437 0.34278289
  0.70680581 0.44989966 0.92446136 0.64225194 0.16512493 0.30836662
  0.18318507 0.65111996 0.10952488 0.99216561 0.4160243  0.35574822
  0.58277061 0.6588462  0.16634101 0.93591091 0.20831083 0.94459796
  0.83201022 0.91816635 0.85229539 0.55188509 0.87808955 0.25804807
  0.43139655 0.41685581 0.3131693  0.31489676]
 [0.0894747  0.22128897 0.30873792 0.02885298 0.02613985 0.36090353
  0.71915251 0.60197054 0.44946902 0.04960437 0.97914378 0.26214332
  0.21773916 0.05149666 0.3046916  0.1319246  0.24183341 0.687092
  0.24055382 0.27891963 0.37645041

In [10]:
print("Step 9: We assume we have trained the 8 heads of the attention sub-layer")
# Stand-ins for the eight head outputs, each a (3, 64) matrix of uniform
# random numbers. One seeded generator feeds all eight draws, so the values
# are reproducible on a fresh kernel; the generator expression is consumed in
# order, so z0h1 receives the first draw and z7h8 the last.
head_rng = np.random.default_rng(9)
z0h1, z1h2, z2h3, z3h4, z4h5, z5h6, z6h7, z7h8 = (
    head_rng.random((3, 64)) for _ in range(8)
)
print("shape of one head", z0h1.shape, "dimension of 8 heads", 64 * 8)

Step 9: We assume we have trained the 8 heads of the attention sub-layer
shape of one head (3, 64) dimension of 8 heads 512


In [11]:
print(
    "Step 10: Concatenation of heads 1 to 8 to obtain the original 8x64=512 output dimension of the model"
)
# Join the eight head matrices side by side (along the column axis), in
# head order 1..8.
all_heads = [z0h1, z1h2, z2h3, z3h4, z4h5, z5h6, z6h7, z7h8]
output_attention = np.concatenate(all_heads, axis=1)
print(output_attention)

Step 10: Concatenation of heads 1 to 8 to obtain the original 8x64=512 output dimension of the model
[[0.86927409 0.12836694 0.59222786 ... 0.62225058 0.76491867 0.38528535]
 [0.32556618 0.39026433 0.42130743 ... 0.59319731 0.0660897  0.04835302]
 [0.78720993 0.6180019  0.22714473 ... 0.32089113 0.53065514 0.78705635]]


In [None]:
def DotProductAttention(query, key, value, mask=None, scale=True):
    """Dot product self-attention.

    Computes softmax(query @ key^T / sqrt(d)) @ value, where the softmax is
    taken over the last axis (the key positions).

    Args:
        query (numpy.ndarray): array of query representations with shape (L_q by d)
        key (numpy.ndarray): array of key representations with shape (L_k by d)
        value (numpy.ndarray): array of value representations with shape (L_k by d) where L_v = L_k
        mask (numpy.ndarray or None): attention-mask, gates attention with shape
            (L_q by L_k). Truthy entries keep the score; falsy entries are
            replaced by -1e9 before the softmax. None (the default) disables
            masking.
        scale (bool): whether to scale the dot product of the query and transposed key

    Returns:
        numpy.ndarray: Self-attention array for q, k, v arrays. (L_q by d)

    Raises:
        AssertionError: if the last dimensions of query, key and value differ.
    """

    assert (
        query.shape[-1] == key.shape[-1] == value.shape[-1]
    ), "Embedding dimensions of q, k, v aren't all the same"

    # Depth used to scale down the dot product; 1 leaves the scores unscaled.
    depth = query.shape[-1] if scale else 1

    # Query-key dot products; swapping the last two axes transposes `key`
    # while leaving any leading axes in place.
    dots = np.matmul(query, np.swapaxes(key, -1, -2)) / np.sqrt(depth)

    # Masked-out positions get a very large negative score so that their
    # softmax weight is effectively zero.
    if mask is not None:
        dots = np.where(mask, dots, np.full_like(dots, -1e9))

    # Softmax via log-sum-exp: softmax = exp(dots - logsumexp(dots))
    #                                  = exp(dots) / sum(exp(dots)).
    log_normalizer = logsumexp(dots, axis=-1, keepdims=True)
    weights = np.exp(dots - log_normalizer)

    # Weighted sum of the value rows.
    attention = np.matmul(weights, value)

    return attention

In [None]:
def masked_dot_product_self_attention(q, k, v, scale=True):
    """Dot product self-attention restricted by a lower-triangular mask.

    Builds a boolean mask that is True on and below the main diagonal and
    passes it, together with the inputs, to DotProductAttention.

    Args:
        q (numpy.ndarray): queries.
        k (numpy.ndarray): keys.
        v (numpy.ndarray): values.
        scale (bool): forwarded unchanged to DotProductAttention.
    Returns:
        numpy.ndarray: masked dot product self attention tensor.
    """
    # The mask is square, sized by the second-to-last axis of the queries.
    seq_len = q.shape[-2]

    # Lower triangle (diagonal included) of an all-True square matrix, with a
    # leading axis of length 1 added so the final shape is (1, seq_len, seq_len).
    lower_triangle = np.tril(np.ones((seq_len, seq_len), dtype=np.bool_))
    causal_mask = lower_triangle[np.newaxis, ...]

    return DotProductAttention(q, k, v, causal_mask, scale=scale)