In [15]:
import numpy as np
import random
import math

In [16]:
# Toy problem size: L is the input sequence length, d_k the width of each
# query/key row and d_v the width of each value row.
L, d_k, d_v = 4, 8, 8

# One standard-normal row per token. The matrices are drawn in the order
# q, k, v; no seed is set in this cell, so the numbers change on every run.
q, k, v = (np.random.randn(L, width) for width in (d_k, d_k, d_v))

In [17]:
# Sanity check: q and k should be (L, d_k) and v should be (L, d_v),
# i.e. one row per token in each matrix.
q.shape, k.shape, v.shape

((4, 8), (4, 8), (4, 8))

In [18]:
# Print each matrix under its label. print() joins its arguments with a
# space, which is why the first row of every array is shifted by one column.
print("Q\n", q)
print("K\n", k)
print("V\n", v)

Q
 [[ 0.53820015  1.80416676 -1.34248699 -0.36214475  0.1303005  -1.02177929
   0.03665372  0.17346662]
 [ 1.21134527 -0.01934549  0.6875044   1.50394237  0.65952213  0.56290001
   1.54096862 -1.43568218]
 [ 0.65182499 -2.3044433   1.09837747 -0.37850361 -0.8274594  -0.65013766
  -0.83589873  1.1313225 ]
 [-0.59179314  0.43646004  0.13665086 -0.42920879 -0.10133618 -1.24431013
   1.42459808 -0.95380103]]
K
 [[-5.73049578e-01 -4.59145700e-01  2.38540142e-03  4.18804306e-01
   9.85054998e-01 -6.20143800e-01  8.47371978e-02  2.77133270e-01]
 [ 7.14948872e-01  7.59209357e-04  6.13392066e-01 -6.83964359e-01
  -1.41002772e+00  8.63622898e-02  7.40096721e-01 -6.20817082e-01]
 [ 7.50317524e-01  9.75100509e-01  2.78897509e-01  1.08261182e+00
   8.84488268e-01  7.05650607e-01  1.54110192e+00 -4.08147413e-01]
 [ 1.05605551e-01 -1.60007998e+00  7.25271394e-01  2.09287590e-01
  -1.70832185e+00 -8.54128419e-01 -1.04649239e+00 -5.23076264e-01]]
V
 [[ 0.86999098  0.58758347 -0.41277314 -0.69313789  1.

## **Self-Attention**

In [19]:
# Raw (unscaled) scores: entry (i, j) is the dot product of query row i with
# key row j, so the result is an L x L matrix.
np.matmul(q, k.T)

array([[-0.47847838, -0.54215569,  0.77650372, -3.35839257],
       [-0.0204923 ,  1.40953271,  6.65126881, -1.49684276],
       [ 0.35942954,  1.18649407, -4.80202552,  6.72539732],
       [ 0.4875155 ,  1.63651529,  1.17206811, -0.50759164]])

In [20]:
# Variance of q, of k, and of the raw scores. Every score is a sum of d_k
# products, so its variance comes out several times larger than the inputs'.
q.var(), k.var(), np.matmul(q, k.T).var()

(np.float64(0.9789226622778145),
 np.float64(0.6614572848289667),
 np.float64(8.122363429546745))

In [21]:
# Divide the raw q.k scores by sqrt(d_k) to keep their spread comparable to
# that of q and k themselves.
scaled = (q @ k.T) / math.sqrt(d_k)

In [23]:
# Same comparison after scaling: dividing the scores by sqrt(d_k) divides
# their variance by d_k, bringing it back near the scale of q and k.
q.var(), k.var(), scaled.var()

(np.float64(0.9789226622778145),
 np.float64(0.6614572848289667),
 np.float64(1.0152954286933427))

## **Masking**

In [24]:
# L x L float matrix with ones on and below the main diagonal, zeros above:
# position i may look at positions 0..i only.
mask = np.tri(L)

In [25]:
# Lower-triangular 0/1 matrix: row i has ones in columns 0..i.
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [27]:
# Additive causal mask: 0 on and below the diagonal (attention allowed),
# -inf above it (attention blocked). It is built directly from L rather than
# by editing the existing array, so re-running this cell always yields the
# same mask instead of turning every entry into -inf on the second run.
mask = np.where(np.tril(np.ones((L, L), dtype=bool)), 0.0, -np.inf)

In [28]:
# Mask in additive form: 0 where attention is allowed, -inf where it is blocked.
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [29]:
# Additive masking: allowed positions keep their scaled score (+0), blocked
# positions become -inf so that exp() inside the softmax maps them to 0.
scaled + mask

array([[-0.16916765,        -inf,        -inf,        -inf],
       [-0.00724512,  0.49834507,        -inf,        -inf],
       [ 0.12707753,  0.419489  , -1.6977724 ,        -inf],
       [ 0.17236276,  0.57859553,  0.41438866, -0.17946074]])

## **Softmax**

In [30]:
def softmax(x):
  """Softmax over the last axis of ``x``.

  Every slice along the last axis is exponentiated and normalised so that it
  sums to 1. The slice maximum is subtracted before exponentiating: this does
  not change the mathematical result but prevents ``np.exp`` from overflowing
  on large scores. ``-inf`` entries (masked positions) come out as exactly 0.

  Args:
    x: array-like of scores with one or more dimensions.

  Returns:
    Float ndarray with the same shape as ``x``.

  Note:
    A slice made up entirely of ``-inf`` has no valid distribution and
    yields NaN.
  """
  x = np.asarray(x, dtype=float)
  # `initial` keeps the reduction defined even when the last axis is empty.
  peak = np.max(x, axis=-1, keepdims=True, initial=-np.inf)
  exps = np.exp(x - peak)
  # keepdims lets the row sums broadcast against any number of leading axes.
  return exps / np.sum(exps, axis=-1, keepdims=True)

In [31]:
# Row-wise softmax of the masked scores: each row becomes a set of weights
# over the positions that token is allowed to attend to.
attention = softmax(scaled + mask)

In [32]:
# Causal attention weights: zeros above the diagonal, each row summing to 1.
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.37622786, 0.62377214, 0.        , 0.        ],
       [0.3998567 , 0.53566967, 0.06447363, 0.        ],
       [0.22329486, 0.33519951, 0.28443907, 0.15706656]])

In [33]:
# Mix the value rows using the attention weights: one output row per token.
new_v = attention @ v

In [34]:
# Row i is the attention-weighted mix of the value rows visible to token i;
# token 0 can only see itself, so row 0 equals v[0].
new_v

array([[ 0.86999098,  0.58758347, -0.41277314, -0.69313789,  1.38996567,
         1.01964881, -1.13234405,  0.37598487],
       [ 0.18977865, -0.34442751, -0.14037793, -0.18295123,  0.7874703 ,
        -0.59528435, -1.27362247, -0.30733588],
       [ 0.0898837 , -0.20729   , -0.22078244, -0.13996652,  0.72450847,
        -0.46440904, -1.29878362, -0.30916575],
       [-0.59493714,  0.06442054, -0.60930312,  0.33475281,  0.17223258,
        -0.40565801, -0.98848595, -0.52901783]])

## **Scaled Dot Product Attention Mechanism**

In [37]:
def softmax(x):
  """Softmax over the last axis of ``x``.

  The slice maximum is subtracted before exponentiating so that large scores
  cannot overflow ``np.exp``; the result is mathematically unchanged.
  ``-inf`` entries (masked positions) come out as exactly 0, and a slice that
  is entirely ``-inf`` yields NaN.

  Args:
    x: array-like of scores with one or more dimensions.

  Returns:
    Float ndarray with the same shape as ``x``.
  """
  x = np.asarray(x, dtype=float)
  # `initial` keeps the reduction defined even when the last axis is empty.
  peak = np.max(x, axis=-1, keepdims=True, initial=-np.inf)
  exps = np.exp(x - peak)
  # keepdims lets the row sums broadcast against any number of leading axes.
  return exps / np.sum(exps, axis=-1, keepdims=True)

def scaled_dot_product_attention(q, k, v, mask = None):
  """Scaled dot-product attention: ``softmax(q @ k^T / sqrt(d_k) + mask) @ v``.

  Works on plain 2-D matrices and on inputs with extra leading axes
  (e.g. batch or head), which are broadcast by ``np.matmul``.

  Args:
    q: queries, shape (..., L_q, d_k).
    k: keys, shape (..., L_k, d_k).
    v: values, shape (..., L_k, d_v).
    mask: optional additive mask broadcastable to (..., L_q, L_k); 0 keeps a
      position and -inf blocks it. ``None`` applies no masking.

  Returns:
    Tuple ``(out, attention)``: ``out`` has shape (..., L_q, d_v) and
    ``attention`` has shape (..., L_q, L_k) with each row summing to 1.
  """
  d_k = q.shape[-1]
  # Swap only the last two axes so leading axes stay in place. For 2-D keys
  # this is exactly k.T; 1-D keys pass through unchanged, as k.T would.
  k_t = np.swapaxes(k, -1, -2) if np.ndim(k) >= 2 else k
  scaled = np.matmul(q, k_t) / math.sqrt(d_k)
  if mask is not None:
    scaled = scaled + mask
  attention = softmax(scaled)
  out = np.matmul(attention, v)
  return out, attention

In [38]:
# No mask: every query may attend to every key.
# Note that this rebinds `attention`, replacing the masked weights computed earlier.
values, attention = scaled_dot_product_attention(q, k, v, mask = None)

In [40]:
# Unmasked result: the attention matrix is dense (no entries forced to zero).
values, attention

(array([[-0.75756644,  0.2193663 , -0.6564718 ,  0.37083531,  0.087303  ,
         -0.30867658, -1.22182351, -0.57009652],
        [-1.64983562,  0.46097359, -0.90102168,  0.83703014, -0.54814834,
         -0.47925294, -1.58033824, -0.95046293],
        [-0.47608974,  0.1871123 , -1.16856143,  0.66281298,  0.03788909,
          0.06230742,  0.94350168, -0.2906599 ],
        [-0.59493714,  0.06442054, -0.60930312,  0.33475281,  0.17223258,
         -0.40565801, -0.98848595, -0.52901783]]),
 array([[0.25657806, 0.25086617, 0.39986865, 0.09268712],
        [0.07230754, 0.11988328, 0.76490541, 0.04290377],
        [0.08336553, 0.11168097, 0.01344201, 0.79151149],
        [0.22329486, 0.33519951, 0.28443907, 0.15706656]]))

In [43]:
# Same call with the additive causal mask (0 = keep, -inf = block).
values, attention = scaled_dot_product_attention(q, k, v, mask = mask)

In [44]:
# Masked result: should reproduce the step-by-step `attention` and `new_v`
# obtained in the Softmax section.
values, attention

(array([[ 0.86999098,  0.58758347, -0.41277314, -0.69313789,  1.38996567,
          1.01964881, -1.13234405,  0.37598487],
        [ 0.18977865, -0.34442751, -0.14037793, -0.18295123,  0.7874703 ,
         -0.59528435, -1.27362247, -0.30733588],
        [ 0.0898837 , -0.20729   , -0.22078244, -0.13996652,  0.72450847,
         -0.46440904, -1.29878362, -0.30916575],
        [-0.59493714,  0.06442054, -0.60930312,  0.33475281,  0.17223258,
         -0.40565801, -0.98848595, -0.52901783]]),
 array([[1.        , 0.        , 0.        , 0.        ],
        [0.37622786, 0.62377214, 0.        , 0.        ],
        [0.3998567 , 0.53566967, 0.06447363, 0.        ],
        [0.22329486, 0.33519951, 0.28443907, 0.15706656]]))