In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Set random seed for reproducibility
np.random.seed(3)

# Number of inputs (tokens)
N = 3
# Dimensionality of each input
D = 4

# Generate random inputs — list of (D, 1) column vectors
all_x = [np.random.normal(size=(D, 1)) for _ in range(N)]
print(all_x)

# Set random seed again for reproducibility of parameters
np.random.seed(0)

# Initialize weight matrices for query, key, value projections
omega_q = np.random.normal(size=(D, D))
omega_k = np.random.normal(size=(D, D))
omega_v = np.random.normal(size=(D, D))

# Initialize bias vectors for query, key, value projections
beta_q = np.random.normal(size=(D, 1))
beta_k = np.random.normal(size=(D, 1))
beta_v = np.random.normal(size=(D, 1))

# Compute queries, keys, and values for each input
all_queries, all_keys, all_values = [], [], []
for x in all_x:
    query = omega_q @ x + beta_q    # Linear transformation for query
    key = omega_k @ x + beta_k      # Linear transformation for key
    value = omega_v @ x + beta_v    # Linear transformation for value
    all_queries.append(query)
    all_keys.append(key)
    all_values.append(value)

# Softmax function to convert scores to probabilities
def softmax(items_in):
    e_x = np.exp(items_in - np.max(items_in))  # Subtract max for numerical stability
    return e_x / e_x.sum()                     # Normalize values to sum to 1

# Compute attention outputs individually
all_x_prime = []
for n in range(N):
    q_n = all_queries[n]                      # Query for nth output
    all_km_qn = [float(k.T @ q_n) for k in all_keys]  # Dot products of q_n with all keys
    attention = softmax(all_km_qn)            # Compute attention weights
    print(f"Attentions for output {n}:\n{attention}")

    # Weighted sum of values using attention weights (Equation 12.3)
    x_prime = sum(attention[m] * all_values[m] for m in range(N))
    all_x_prime.append(x_prime)

# Display results of manual self-attention
for i, x_p in enumerate(all_x_prime):
    print(f"x_prime_{i}_calculated:", x_p.T)

# Self-attention in matrix form (simplified batch computation)
def self_attention(X, omega_v, omega_q, omega_k, beta_v, beta_q, beta_k):
    Q = omega_q @ X + beta_q        # Compute all queries at once (D, N)
    K = omega_k @ X + beta_k        # Compute all keys at once (D, N)
    V = omega_v @ X + beta_v        # Compute all values at once (D, N)

    dot_products = Q.T @ K          # Dot product between queries and keys (N, N)
    attention = np.apply_along_axis(softmax, 1, dot_products)  # Softmax on each query row

    X_prime = (attention @ V.T).T   # Weight values by attention scores
    return X_prime

# Stack input vectors into a matrix (D, N)
X = np.hstack(all_x)

# Run matrix-based self-attention
X_prime = self_attention(X, omega_v, omega_q, omega_k, beta_v, beta_q, beta_k)
print("Matrix form self-attention result:\n", X_prime)

# Scaled dot-product self-attention
def scaled_dot_product_self_attention(X, omega_v, omega_q, omega_k, beta_v, beta_q, beta_k):
    Q = omega_q @ X + beta_q        # Compute queries
    K = omega_k @ X + beta_k        # Compute keys
    V = omega_v @ X + beta_v        # Compute values

    scale = np.sqrt(D)              # Scale factor (√D) for dot products
    dot_products = (Q.T @ K) / scale  # Scaled dot product
    attention = np.apply_along_axis(softmax, 1, dot_products)  # Softmax per query

    X_prime = (attention @ V.T).T   # Weighted sum of values
    return X_prime

# Run scaled dot-product self-attention
X_prime_scaled = scaled_dot_product_self_attention(X, omega_v, omega_q, omega_k, beta_v, beta_q, beta_k)
print("Scaled dot-product attention result:\n", X_prime_scaled)


[array([[ 1.78862847],
       [ 0.43650985],
       [ 0.09649747],
       [-1.8634927 ]]), array([[-0.2773882 ],
       [-0.35475898],
       [-0.08274148],
       [-0.62700068]]), array([[-0.04381817],
       [-0.47721803],
       [-1.31386475],
       [ 0.88462238]])]
Attentions for output 0:
[1.24326146e-13 9.98281489e-01 1.71851130e-03]
Attentions for output 1:
[2.79525306e-12 5.85506360e-03 9.94144936e-01]
Attentions for output 2:
[0.00505708 0.00654776 0.98839516]
x_prime_0_calculated: [[ 0.94744244 -0.24348429 -0.91310441 -0.44522983]]
x_prime_1_calculated: [[ 1.64201168 -0.08470004  4.02764044  2.18690791]]
x_prime_2_calculated: [[ 1.61949281 -0.06641533  3.96863308  2.15858316]]
Matrix form self-attention result:
 [[ 0.94744244  1.64201168  1.61949281]
 [-0.24348429 -0.08470004 -0.06641533]
 [-0.91310441  4.02764044  3.96863308]
 [-0.44522983  2.18690791  2.15858316]]
Scaled dot-product attention result:
 [[ 0.97411966  1.59622051  1.32638014]
 [-0.23738409 -0.09516106  0.1306

  all_km_qn = [float(k.T @ q_n) for k in all_keys]  # Dot products of q_n with all keys


# Multi-head Attention

In [2]:
import numpy as np
import matplotlib.pyplot as plt

# Set seed for reproducibility
np.random.seed(3)

# Number of inputs (tokens)
N = 6
# Dimensionality of each input vector
D = 8

# Input matrix X of shape (D, N)
X = np.random.normal(size=(D, N))
print("Input X:")
print(X)

# Number of attention heads
H = 2
# Dimensionality per head
H_D = D // H

# Set seed again for reproducibility of weights
np.random.seed(0)

# Parameters for Head 1
omega_q1 = np.random.normal(size=(H_D, D))
omega_k1 = np.random.normal(size=(H_D, D))
omega_v1 = np.random.normal(size=(H_D, D))
beta_q1 = np.random.normal(size=(H_D, 1))
beta_k1 = np.random.normal(size=(H_D, 1))
beta_v1 = np.random.normal(size=(H_D, 1))

# Parameters for Head 2
omega_q2 = np.random.normal(size=(H_D, D))
omega_k2 = np.random.normal(size=(H_D, D))
omega_v2 = np.random.normal(size=(H_D, D))
beta_q2 = np.random.normal(size=(H_D, 1))
beta_k2 = np.random.normal(size=(H_D, 1))
beta_v2 = np.random.normal(size=(H_D, 1))

# Final linear projection parameters
omega_c = np.random.normal(size=(D, D))

# Define softmax operation that works independently on each column
def softmax_cols(data_in):
  # Exponentiate all of the values
  exp_values = np.exp(data_in) ;
  # Sum over columns
  denom = np.sum(exp_values, axis = 0);
  # Compute softmax (numpy broadcasts denominator to all rows automatically)
  softmax = exp_values / denom
  # return the answer
  return softmax

# Multi-head scaled dot-product self-attention function
def multihead_scaled_self_attention(X,
                                     omega_v1, omega_q1, omega_k1, beta_v1, beta_q1, beta_k1,
                                     omega_v2, omega_q2, omega_k2, beta_v2, beta_q2, beta_k2,
                                     omega_c):
    D, N = X.shape
    H_D = D // 2  # Head dimension

    # Head 1
    Q1 = omega_q1 @ X + beta_q1
    K1 = omega_k1 @ X + beta_k1
    V1 = omega_v1 @ X + beta_v1

    scores1 = (Q1.T @ K1) / np.sqrt(H_D)
    weights1 = softmax_cols(scores1.T).T
    head1_output = V1 @ weights1

    # Head 2
    Q2 = omega_q2 @ X + beta_q2
    K2 = omega_k2 @ X + beta_k2
    V2 = omega_v2 @ X + beta_v2

    scores2 = (Q2.T @ K2) / np.sqrt(H_D)
    weights2 = softmax_cols(scores2.T).T
    head2_output = V2 @ weights2

    # Concatenate outputs from both heads
    concat_heads = np.vstack((head1_output, head2_output))

    # Final linear projection
    X_prime = omega_c @ concat_heads
    return X_prime

# Compute the output using multi-head self-attention
X_prime = multihead_scaled_self_attention(
    X, omega_v1, omega_q1, omega_k1, beta_v1, beta_q1, beta_k1,
    omega_v2, omega_q2, omega_k2, beta_v2, beta_q2, beta_k2,
    omega_c
)

# Display results
np.set_printoptions(precision=3, suppress=True)
print("\nYour answer:")
print(X_prime)

print("\nTrue values:")
print("[[-21.207  -5.373 -20.933  -9.179 -11.319 -17.812]")
print(" [ -1.995   7.906 -10.516   3.452   9.863  -7.24 ]")
print(" [  5.479   1.115   9.244   0.453   5.656   7.089]")
print(" [ -7.413  -7.416   0.363  -5.573  -6.736  -0.848]")
print(" [-11.261  -9.937  -4.848  -8.915 -13.378  -5.761]")
print(" [  3.548  10.036  -2.244   1.604  12.113  -2.557]")
print(" [  4.888  -5.814   2.407   3.228  -4.232   3.71 ]]")


Input X:
[[ 1.789  0.437  0.096 -1.863 -0.277 -0.355]
 [-0.083 -0.627 -0.044 -0.477 -1.314  0.885]
 [ 0.881  1.71   0.05  -0.405 -0.545 -1.546]
 [ 0.982 -1.101 -1.185 -0.206  1.486  0.237]
 [-1.024 -0.713  0.625 -0.161 -0.769 -0.23 ]
 [ 0.745  1.976 -1.244 -0.626 -0.804 -2.419]
 [-0.924 -1.024  1.124 -0.132 -1.623  0.647]
 [-0.356 -1.743 -0.597 -0.589 -0.874  0.03 ]]

Your answer:
[[ -6.116  -2.101  -4.916   1.423 -23.905   1.114]
 [ -0.292  11.13   -3.113   3.138   1.478  -1.398]
 [  3.321  -2.467  -5.165   9.575  19.153  -6.726]
 [ -0.572 -16.139  -6.007   3.42    2.422  -2.753]
 [ -3.825  -8.241  -1.443   0.117 -15.956   1.291]
 [  1.683   0.325  -4.154   0.296   3.734 -16.751]
 [ -1.102   7.947  10.164  -7.631  -5.134  17.329]
 [  3.313   8.716 -11.429  12.2    16.57  -19.903]]

True values:
[[-21.207  -5.373 -20.933  -9.179 -11.319 -17.812]
 [ -1.995   7.906 -10.516   3.452   9.863  -7.24 ]
 [  5.479   1.115   9.244   0.453   5.656   7.089]
 [ -7.413  -7.416   0.363  -5.573  -6.73