## Step 1: Setup and input embeddings

In [1]:
import numpy as np

def softmax(x, axis=-1):
    """Numerically stable softmax along one axis.

    Parameters
    ----------
    x : array_like
        Input scores.
    axis : int, optional
        Axis along which the values are normalised so that they sum to 1.
        Defaults to the last axis, i.e. row-wise for a 2-D score matrix.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    # Subtracting the per-slice maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing on large scores.
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e_x / np.sum(e_x, axis=axis, keepdims=True)

# Step 1: Toy input sentence -- one 4-dimensional embedding per word (3 words).
X = np.array(
    [
        [1.0, 0.0, 1.0, 0.0],  # row 0 -> "The"
        [0.0, 2.0, 0.0, 2.0],  # row 1 -> "cat"
        [1.0, 1.0, 1.0, 1.0],  # row 2 -> "sat"
    ]
)
print("Input X (word embeddings):\n", X)

Input X (word embeddings):
 [[1. 0. 1. 0.]
 [0. 2. 0. 2.]
 [1. 1. 1. 1.]]


## Step 2: Create Q, K, V matrices

In [4]:
# Random square weight matrices for Q, K, V. Their size is taken from X's
# embedding dimension (4 here) instead of being hard-coded, so the matrix
# products below line up for any embedding width.
np.random.seed(0)  # for consistent output
d_model = X.shape[1]
# Draw order matters: W_q, W_k, W_v consume the seeded stream in this order,
# so reordering these lines would change every value below.
W_q = np.random.rand(d_model, d_model)
W_k = np.random.rand(d_model, d_model)
W_v = np.random.rand(d_model, d_model)

print("Before Linear Transformation:")
print("\nWeight matrix W_q:\n", W_q)
print("\nWeight matrix W_k:\n", W_k)
print("\nWeight matrix W_v:\n", W_v)

# Linear transformations: each row of X (one word) is projected into its
# query, key and value vector.
Q = X @ W_q
K = X @ W_k
V = X @ W_v

print("\nAfter Linear Transformation:")
print("\nQuery Q:\n", Q)
print("\nKey K:\n", K)
print("\nValue V:\n", V)

Before Linear Transformation:

Weight matrix W_q:
 [[0.5488135  0.71518937 0.60276338 0.54488318]
 [0.4236548  0.64589411 0.43758721 0.891773  ]
 [0.96366276 0.38344152 0.79172504 0.52889492]
 [0.56804456 0.92559664 0.07103606 0.0871293 ]]

Weight matrix W_k:
 [[0.0202184  0.83261985 0.77815675 0.87001215]
 [0.97861834 0.79915856 0.46147936 0.78052918]
 [0.11827443 0.63992102 0.14335329 0.94466892]
 [0.52184832 0.41466194 0.26455561 0.77423369]]

Weight matrix W_v:
 [[0.45615033 0.56843395 0.0187898  0.6176355 ]
 [0.61209572 0.616934   0.94374808 0.6818203 ]
 [0.3595079  0.43703195 0.6976312  0.06022547]
 [0.66676672 0.67063787 0.21038256 0.1289263 ]]

After Linear Transformation:

Query Q:
 [[1.51247626 1.09863089 1.39448841 1.0737781 ]
 [1.98339872 3.1429815  1.01724654 1.9578046 ]
 [2.50417562 2.67012164 1.90311168 2.0526804 ]]

Key K:
 [[0.13849282 1.47254087 0.92151004 1.81468107]
 [3.00093333 2.42764101 1.45206995 3.10952573]
 [1.63895949 2.68636137 1.64754501 3.36944393]]

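Value V:
 [[0.81565823 1.0054659  0.716421   0.67786097]
 [2.55772488 2.57514373 2.30826128 1.62149319]
 [2.09452067 2.29303777 1.87055164 1.48860757]]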

## Step 3: Compute attention scores (Q·Kᵀ / √dk)

In [5]:
# Scaled dot-product scores: entry (i, j) is query i dotted with key j,
# divided by sqrt(dk).
dk = Q.shape[-1]  # 4
scores = np.matmul(Q, K.T) / np.sqrt(dk)
print("\nRaw attention scores (Q·Kᵀ / sqrt(dk)):\n", scores)


Raw attention scores (Q·Kᵀ / sqrt(dk)):
 [[ 2.53042292  6.28487859  5.67286222]
 [ 4.69652452 10.57351751  9.98328327]
 [ 4.87868518 11.57163735 10.66450385]]


## Step 4: Apply Softmax to get weights

In [6]:
# Turn each row of scores into a probability distribution over the keys.
attention_weights = softmax(scores)
print(
    "\nAttention Weights (after softmax):\n",
    attention_weights,
)


Attention Weights (after softmax):
 [[0.01495411 0.63870437 0.34634152]
 [0.00180039 0.64226049 0.35593913]
 [0.00088234 0.71178464 0.28733302]]


## Step 5: Multiply by V to get final output

In [7]:
# Each output row is the value vectors averaged with that word's attention weights.
output = np.matmul(attention_weights, V)
print("\nFinal Output (weighted sum of V):\n", output)


Final Output (weighted sum of V):
 [[2.37124698 2.45396559 2.13285971 1.56135821]
 [2.38971598 2.47190516 2.14959736 1.5724951 ]
 [2.42309392 2.49270038 2.1810883  1.58247816]]
