In [1]:
import numpy as np 
import math

# Toy problem size: sequence length L, query/key width d_k, value width d_v.
L = 4
d_k = d_v = 8

# Standard-normal query, key and value matrices, drawn in that order
# (one row per sequence position).
q, k, v = (np.random.randn(L, width) for width in (d_k, d_k, d_v))

In [2]:
# Show the randomly drawn query matrix (L rows, d_k columns).
print(q)

[[-1.58224471 -1.50960024  0.176527    0.79178155  0.30196811 -0.30161373
  -1.33142454  1.03520074]
 [ 0.3346742  -0.05008156  0.37999317 -0.28788086  1.2722994   1.05035558
   0.08524648  0.09474594]
 [-0.65639618 -0.52645628 -1.34077361  0.22798092 -0.81253418  0.52518873
  -1.56236281 -1.79479478]
 [ 1.04132585  1.64309208  0.36345151 -0.17791464  0.06417598  0.25562796
   1.47619578  0.35006814]]


In [3]:
# Show the randomly drawn key matrix (L rows, d_k columns).
print(k)

[[-0.1196121  -0.93118551 -0.21318391 -0.39354887 -0.218309    0.52355868
  -1.10654571  0.4264803 ]
 [-0.86357507  1.39708436  0.74549203  1.89053096 -0.52545343 -0.66203244
   0.10582103  0.23992747]
 [ 1.24510077  0.95180163 -0.11933885 -0.05873945 -1.04194888 -1.03681915
   0.99478999 -0.44955027]
 [ 0.69717297  1.42916853  0.60631936  0.75197382 -2.11714635 -0.72952369
  -1.19973049 -0.89404013]]


In [4]:
# Show the randomly drawn value matrix (L rows, d_v columns).
print(v)

[[ 0.57863886 -1.17212587 -0.90866264 -0.28582865 -2.03056514  0.60944632
  -0.08694745  0.24444667]
 [-1.29808847 -0.5149831  -0.50175086 -0.5585232  -0.22111698 -0.59729146
  -0.74315883  0.754882  ]
 [-0.18621314  1.64957764 -0.13101278  2.1102843   0.52262535  0.69187027
  -1.32194976 -0.13532298]
 [ 0.43067654 -1.28657597  0.87669984  1.22290297 -0.97511077  0.26949972
  -0.83446986  0.89761299]]


### SELF ATTENTION

In [5]:
# Raw (unscaled) attention scores: entry [i, j] is the dot product of
# query row i with key row j.
np.matmul(q, k.T)

array([[ 2.93667603,  1.03432348, -5.26624844, -2.30557739],
       [ 0.25713742, -1.95210081, -2.03189304, -3.47121365],
       [ 2.18058301, -1.25388203, -1.61702548,  2.96463129],
       [-3.02639752,  1.36812031,  3.80675355,  0.75445162]])

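Why divide by $\sqrt{d_k}$? Each score is a sum of $d_k$ products, so when the entries of `q` and `k` have roughly unit variance, the raw dot products have a variance of about $d_k$. Dividing by $\sqrt{d_k}$ brings the variance back to about 1, which keeps the softmax from saturating.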

In [7]:
# Compare the variance of q and k with the variance of their unscaled dot products.
q.var(), k.var(), np.matmul(q, k.T).var()

(0.8442261471544072, 0.8421034182769216, 6.540304155258422)

In [8]:
# Divide the scores by sqrt(d_k), then show their variance next to that of q and k.
scaled = np.matmul(q, k.T) / math.sqrt(d_k)
q.var(), k.var(), scaled.var()

(0.8442261471544072, 0.8421034182769216, 0.8175380194073028)

### Masking
- Masking ensures that a word doesn't get context from words generated in the future (i.e. from later positions).
- It is not required in the encoder, but it is required in the decoder.

In [45]:
# Lower-triangular matrix of ones: row i has a 1 in every column j <= i
# and a 0 in every later column.
mask = np.tril(np.ones((L,L)))
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [46]:
# Turn the 0/1 lower-triangular pattern into an additive mask: 0 where
# attention is allowed, -inf where it is blocked (exp(-inf) == 0 in softmax).
# The mask is rebuilt from L instead of being edited in place, so re-running
# this cell always gives the same result. np.inf is used because the np.infty
# alias was removed in NumPy 2.0.
mask = np.where(np.tril(np.ones((L, L))) == 1, 0.0, -np.inf)
mask


array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [44]:
# Add the mask to the scaled scores: entries above the diagonal become -inf,
# all other entries are unchanged.
scaled + mask

array([[ 1.03827177,        -inf,        -inf,        -inf],
       [ 0.09091181, -0.69017186,        -inf,        -inf],
       [ 0.77095252, -0.44331424, -0.57170484,        -inf],
       [-1.0699931 ,  0.48370357,  1.34589063,  0.26673893]])

### Softmax

In [12]:
def softmax(x):
    """Numerically stable softmax along the last axis of ``x``.

    For a 2-D score matrix this normalises each row so that it sums to 1.
    The row maximum is subtracted before exponentiating; this leaves the
    result unchanged mathematically but prevents ``exp`` from overflowing
    to ``inf`` (and the result turning into ``nan``) for large scores.
    Entries equal to ``-inf`` receive a weight of exactly 0.

    Args:
        x: Array of scores with at least one axis.

    Returns:
        Array of the same shape as ``x`` holding the softmax weights.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; a max over an empty axis would raise.
        return np.exp(x)
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=-1, keepdims=True)

In [13]:
# Softmax over the masked, scaled scores; the -inf entries get weight exactly 0.
attention = softmax(scaled + mask)

In [14]:
# Display the attention weights: each row sums to 1 and is zero above the diagonal.
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.68591362, 0.31408638, 0.        , 0.        ],
       [0.64181621, 0.19057298, 0.1676108 , 0.        ],
       [0.04822725, 0.22806276, 0.54012895, 0.18358104]])

In [16]:
# Each row of new_v is the attention-weighted combination of the rows of v.
new_v = np.matmul(attention, v)
new_v

array([[ 0.57863886, -1.17212587, -0.90866264, -0.28582865, -2.03056514,
         0.60944632, -0.08694745,  0.24444667],
       [-0.01081563, -0.96572628, -0.78085719, -0.37147829, -1.46224212,
         0.23042642, -0.2930545 ,  0.40476746],
       [ 0.09278788, -0.57394422, -0.70077373,  0.06381755, -1.2577909 ,
         0.39328984, -0.41900333,  0.27806836],
       [-0.28965454,  0.48081682, -0.06807131,  1.22316437, -0.04508388,
         0.31634618, -1.04089627,  0.27564233]])

In [17]:
# Original value matrix, for comparison with new_v.
v

array([[ 0.57863886, -1.17212587, -0.90866264, -0.28582865, -2.03056514,
         0.60944632, -0.08694745,  0.24444667],
       [-1.29808847, -0.5149831 , -0.50175086, -0.5585232 , -0.22111698,
        -0.59729146, -0.74315883,  0.754882  ],
       [-0.18621314,  1.64957764, -0.13101278,  2.1102843 ,  0.52262535,
         0.69187027, -1.32194976, -0.13532298],
       [ 0.43067654, -1.28657597,  0.87669984,  1.22290297, -0.97511077,
         0.26949972, -0.83446986,  0.89761299]])

## Final Code for Self Attention

In [52]:
import numpy as np
import math

def softmax(x):
    """Numerically stable softmax along the last axis of ``x``.

    The maximum along the last axis is subtracted before exponentiating.
    This does not change the mathematical result but prevents ``exp`` from
    overflowing to ``inf`` (which would make the output ``nan``) when the
    scores are large. Entries equal to ``-inf`` receive a weight of exactly 0.

    Args:
        x: Array of scores with at least one axis.

    Returns:
        Array of the same shape as ``x`` whose last axis sums to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; a max over an empty axis would raise.
        return np.exp(x)
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=-1, keepdims=True)

def scaled_dot_product_attention(q, k, v, mask=None):
    """Compute scaled dot-product attention: softmax(q k^T / sqrt(d_k)) v.

    Works on plain 2-D matrices as well as on inputs with leading batch or
    head axes; only the last two axes of ``k`` are transposed.

    Args:
        q: Queries; the last axis has length d_k.
        k: Keys; the last axis must have the same length as that of ``q``.
        v: Values, one row per key.
        mask: Optional additive mask that broadcasts against the score
            matrix (0 keeps a score, ``-inf`` removes it). ``None`` means
            no masking.

    Returns:
        A tuple ``(out, attention)`` where ``attention`` holds the softmax
        weights and ``out`` is ``attention`` matrix-multiplied with ``v``.
    """
    d_k = q.shape[-1]
    # Swap only the last two axes so leading batch/head axes stay in place
    # (``k.T`` would reverse every axis). A 1-D ``k`` has nothing to swap.
    keys_t = np.swapaxes(k, -1, -2) if np.ndim(k) >= 2 else k
    scaled = np.matmul(q, keys_t) / math.sqrt(d_k)
    if mask is not None:
        # ``scaled`` is a freshly allocated array, so adding in place
        # cannot modify any of the caller's inputs.
        scaled += mask
    attention = softmax(scaled)
    out = np.matmul(attention, v)
    return out, attention

def create_causal_mask(L):
    """Build an (L, L) additive look-ahead mask.

    Entries strictly above the main diagonal (column index greater than row
    index) are ``-inf``; the diagonal and everything below it are ``0``.
    """
    row_idx, col_idx = np.indices((L, L))
    return np.where(col_idx > row_idx, -np.inf, 0.0)

In [53]:
# Fix the global NumPy seed so q, k and v are the same on every run.
np.random.seed(42)

# Toy problem size: sequence length L, query/key width d_k, value width d_v.
L, d_k, d_v = 4, 8, 8

# Standard-normal query, key and value matrices, drawn in that order.
q, k, v = (np.random.randn(L, n_features) for n_features in (d_k, d_k, d_v))

# Additive look-ahead mask, passed to the masked attention call.
mask = create_causal_mask(L)

In [54]:
# Attention without a mask: no score is removed before the softmax.
print("=== Encoder Self-Attention ===")
# Note: this rebinds `attention`, replacing the value from the step-by-step section.
values, attention = scaled_dot_product_attention(q, k, v, mask=None)
print("Q:\n", q)
print("K:\n", k)
print("V:\n", v)
print("Output:\n", values)
print("Attention Weights:\n", attention)

=== Encoder Self-Attention ===
Q:
 [[ 0.49671415 -0.1382643   0.64768854  1.52302986 -0.23415337 -0.23413696
   1.57921282  0.76743473]
 [-0.46947439  0.54256004 -0.46341769 -0.46572975  0.24196227 -1.91328024
  -1.72491783 -0.56228753]
 [-1.01283112  0.31424733 -0.90802408 -1.4123037   1.46564877 -0.2257763
   0.0675282  -1.42474819]
 [-0.54438272  0.11092259 -1.15099358  0.37569802 -0.60063869 -0.29169375
  -0.60170661  1.85227818]]
K:
 [[-0.01349722 -1.05771093  0.82254491 -1.22084365  0.2088636  -1.95967012
  -1.32818605  0.19686124]
 [ 0.73846658  0.17136828 -0.11564828 -0.3011037  -1.47852199 -0.71984421
  -0.46063877  1.05712223]
 [ 0.34361829 -1.76304016  0.32408397 -0.38508228 -0.676922    0.61167629
   1.03099952  0.93128012]
 [-0.83921752 -0.30921238  0.33126343  0.97554513 -0.47917424 -0.18565898
  -1.10633497 -1.19620662]]
V:
 [[ 0.81252582  1.35624003 -0.07201012  1.0035329   0.36163603 -0.64511975
   0.36139561  1.53803657]
 [-0.03582604  1.56464366 -2.6197451   0.821902

In [55]:
# Attention with the causal mask: scores above the diagonal get -inf added
# before the softmax.
print("\n=== Decoder Self-Attention ===")
# Note: this overwrites `values` and `attention` from the unmasked run.
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)
print("Q:\n", q)
print("K:\n", k)
print("V:\n", v)
print("Mask:\n", mask)
print("Output:\n", values)
print("Attention Weights:\n", attention)


=== Decoder Self-Attention ===
Q:
 [[ 0.49671415 -0.1382643   0.64768854  1.52302986 -0.23415337 -0.23413696
   1.57921282  0.76743473]
 [-0.46947439  0.54256004 -0.46341769 -0.46572975  0.24196227 -1.91328024
  -1.72491783 -0.56228753]
 [-1.01283112  0.31424733 -0.90802408 -1.4123037   1.46564877 -0.2257763
   0.0675282  -1.42474819]
 [-0.54438272  0.11092259 -1.15099358  0.37569802 -0.60063869 -0.29169375
  -0.60170661  1.85227818]]
K:
 [[-0.01349722 -1.05771093  0.82254491 -1.22084365  0.2088636  -1.95967012
  -1.32818605  0.19686124]
 [ 0.73846658  0.17136828 -0.11564828 -0.3011037  -1.47852199 -0.71984421
  -0.46063877  1.05712223]
 [ 0.34361829 -1.76304016  0.32408397 -0.38508228 -0.676922    0.61167629
   1.03099952  0.93128012]
 [-0.83921752 -0.30921238  0.33126343  0.97554513 -0.47917424 -0.18565898
  -1.10633497 -1.19620662]]
V:
 [[ 0.81252582  1.35624003 -0.07201012  1.0035329   0.36163603 -0.64511975
   0.36139561  1.53803657]
 [-0.03582604  1.56464366 -2.6197451   0.82190