In [4]:
import numpy as np
import math

# Fixed seed so that Restart Kernel -> Run All reproduces the same Q, K and V.
SEED = 42

# L rows per matrix; d_k columns for q and k, d_v columns for v.
L, d_k, d_v = 4, 8, 8

# One generator feeds all three draws, so their order (q, k, v) is part of
# what makes the values reproducible.
rng = np.random.default_rng(SEED)
q = rng.standard_normal((L, d_k))
k = rng.standard_normal((L, d_k))
v = rng.standard_normal((L, d_v))

print(q.shape)
print(k.shape)
print(v.shape)


(4, 8)
(4, 8)
(4, 8)


In [5]:
# "\n" (not "/n") is the newline escape: it puts each matrix below its label.
print("Q\n", q)
print("K\n", k)
print("V\n", v)

Q/n [[ 0.37018019  1.58820785  0.10841023  1.75872964 -0.78062602 -0.13276253
  -0.48586673  1.26528191]
 [-0.06490368 -0.93170641  0.59197698  0.21619447  1.36746697  0.00781463
  -1.47406886 -0.35656257]
 [-0.97915837  0.5465012   1.20577806 -0.03296482  0.11283723  1.60763748
   1.22194233 -0.80513902]
 [ 2.5127741   0.28611076 -0.50603831  0.21274529  0.46184156  2.06385935
   0.12069828 -0.01222992]]
K/n [[ 2.40900181  0.03210469 -0.22234979 -1.1190993   0.2752563  -1.00672595
  -1.57083551  0.24656526]
 [ 0.01027102 -1.07927802 -1.20315626 -0.5162842  -0.31412207  0.71764306
  -0.48093523 -1.04747622]
 [-0.30842385 -1.03325731  0.72013653  0.53337223  1.31367327 -1.36281634
   0.22279147  0.12454122]
 [-1.32716978 -1.4550734  -1.31387967 -0.34978562  1.4126511   0.97434849
   0.31231876 -0.24241538]]
V/n [[-1.19586669 -0.82343375  0.23527511 -0.18165579  0.92762998  0.03509224
   0.75543657  0.06619457]
 [-0.60892549 -1.84701695  0.14457054  0.21850452 -0.19570522 -0.45026347
  -

## Self-attention

In [6]:
# Raw (unscaled) dot product of every query row with every key row.
q @ k.T

array([[-0.05556992, -3.69050099, -1.53429585, -5.25044555],
       [ 2.03630621,  0.83952405,  2.93726488,  2.15385597],
       [-6.27784354, -0.65964981, -1.28265629,  1.23420578],
       [ 3.79366867,  1.50682403, -3.50215465, -0.45673152]])

In [7]:
# Compare the spread of q and k with the spread of their unscaled dot products.
np.var(q), np.var(k), np.var(q @ k.T)

(0.8899243052098846, 0.8939430790771647, 8.100140263801912)

In [8]:
# Divide the raw scores by sqrt(d_k), then compare the variances again.
scaled = (q @ k.T) / math.sqrt(d_k)
np.var(q), np.var(k), np.var(scaled)

(0.8899243052098846, 0.8939430790771647, 1.012517532975239)

## Masking

In [9]:
# L x L matrix with ones on and below the main diagonal, zeros above it.
mask = np.tri(L)
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [10]:
# Build the additive mask: 0 where the lower-triangular pattern has a one,
# -inf where it has a zero.
# The mask is rebuilt from the triangular pattern rather than rewritten in
# place, so re-running this cell gives the same result; an in-place rewrite
# would turn the allowed 0 entries into -inf on a second run.
# np.inf is used because the np.infty alias was removed in NumPy 2.0.
mask = np.where(np.tril(np.ones((L, L))) == 1, 0.0, -np.inf)

In [12]:
mask  # additive mask: 0 on and below the diagonal, -inf above it

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [13]:
# Adding the mask leaves entries on/below the diagonal unchanged and turns
# every entry above the diagonal into -inf.
scaled + mask

array([[-0.01964693,        -inf,        -inf,        -inf],
       [ 0.71994297,  0.29681657,        -inf,        -inf],
       [-2.21955287, -0.23322143, -0.45348748,        -inf],
       [ 1.34126442,  0.53274275, -1.23819865, -0.16147898]])

## Softmax

In [14]:
def softmax(x):
    """Softmax along the last axis of ``x``.

    Parameters
    ----------
    x : array_like
        Scores with at least one dimension. Entries equal to ``-inf``
        receive a weight of exactly 0.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose slices along the last axis
        are non-negative and sum to 1. A slice made up entirely of ``-inf``
        has no valid distribution and yields ``nan``.
    """
    x = np.asarray(x)
    # Shift by the per-row maximum before exponentiating: the result is
    # mathematically unchanged, but exp() can no longer overflow to inf
    # (which would produce inf / inf = nan for large scores).
    row_max = np.max(x, axis=-1, keepdims=True)
    # A fully masked row has max == -inf; shifting by it would compute
    # (-inf) - (-inf). Leave such rows unshifted instead.
    row_max = np.where(np.isfinite(row_max), row_max, 0.0)
    exps = np.exp(x - row_max)
    # keepdims=True lets the row sums broadcast against inputs of any rank.
    return exps / np.sum(exps, axis=-1, keepdims=True)

In [18]:
# Normalize each row of the masked scores; -inf entries become weight 0.
attention = softmax(scaled + mask)

In [19]:
attention  # rows sum to 1; entries above the diagonal are 0

array([[1.        , 0.        , 0.        , 0.        ],
       [0.60423113, 0.39576887, 0.        , 0.        ],
       [0.07073864, 0.51559597, 0.41366539, 0.        ],
       [0.57344389, 0.25547854, 0.04347548, 0.12760208]])

In [21]:
# Each output row is a mix of the rows of v, weighted by one row of `attention`.
new_v = attention @ v
new_v  # attention-weighted v

array([[-1.19586669, -0.82343375,  0.23527511, -0.18165579,  0.92762998,
         0.03509224,  0.75543657,  0.06619457],
       [-0.96357363, -1.22853612,  0.19937706, -0.0232848 ,  0.48304887,
        -0.15699644,  0.30095486,  1.08441444],
       [-0.25729206, -1.08029866,  0.44087673,  0.52360644, -0.98915067,
        -0.17082774, -0.05892791,  1.99687336],
       [-0.74432445, -1.14755711,  0.19480704, -0.18024971,  0.31781461,
         0.0900487 ,  0.14530731,  0.70485052]])

In [23]:
v  # original v, shown for comparison with the attention-weighted result

array([[-1.19586669, -0.82343375,  0.23527511, -0.18165579,  0.92762998,
         0.03509224,  0.75543657,  0.06619457],
       [-0.60892549, -1.84701695,  0.14457054,  0.21850452, -0.19570522,
        -0.45026347, -0.39291474,  2.63895847],
       [ 0.34148726, -0.16857966,  0.84535402,  1.02449102, -2.3058858 ,
         0.14225003,  0.21809605,  1.52672791],
       [ 0.64386936, -1.53729279, -0.10812419, -1.38276547, -0.50062794,
         1.40102366, -1.5438127 , -0.57742663]])

In [25]:
def scaled_dot_product_attention(q,k,v,mask=None):
    """Compute softmax(q @ k^T / sqrt(d_k) + mask) @ v.

    Parameters
    ----------
    q, k : numpy.ndarray
        Query and key arrays of at least two dimensions with the same size
        along the last axis. ``d_k`` is taken from ``q.shape[-1]``.
    v : numpy.ndarray
        Value array with one row per key.
    mask : numpy.ndarray, optional
        Additive mask that is summed onto the scaled scores before the
        softmax (0 keeps a position, ``-inf`` removes it). ``None`` means
        no masking.

    Returns
    -------
    (out, attention)
        ``attention`` holds the softmax-normalized scores and ``out`` is
        ``attention @ v``.

    Notes
    -----
    Only the last two axes of ``k`` are swapped. ``k.T`` would reverse every
    axis, which is only correct for 2-D input. For 2-D input the result is
    identical to using ``k.T``. End-to-end use with leading batch axes also
    requires the notebook's ``softmax`` helper to normalize over the last
    axis with broadcasting.
    """
    d_k = q.shape[-1]
    keys_t = np.swapaxes(k, -1, -2)
    scaled = np.matmul(q, keys_t) / math.sqrt(d_k)

    if mask is not None:
        scaled = scaled + mask

    attention = softmax(scaled)
    out = np.matmul(attention, v)

    return out, attention
    

In [26]:
# Unmasked attention: every row of q may attend to every row of k.
values, attention = scaled_dot_product_attention(q, k, v, mask=None)
# "\n" (not "/n") is the newline escape: it puts each matrix below its label.
print("Q\n", q)
print("K\n", k)
print("New V\n", values)
print("Attention\n", attention)