# Importing libraries

In [1]:
import numpy as np
import math

## Data 

### Attention in Transformres

`Q` - What I am Looking for: [sequence length * dk]

`k` - what I can offer: [sequence length * dk]

`v` - what I actually offer: [sequence length * dv]

`L` - Length of the input sequence.

In [2]:
L, d_k, d_v = 4,8,8
q = np.random.randn(L, d_k)
k = np.random.randn(L, d_k)
v = np.random.randn(L, d_v)

In [7]:
print(f"Q\n: {q}")
print(f"K\n: {k}")
print(f"V\n: {v}")

Q
: [[ 1.15903265 -0.03436588 -0.54091411 -0.04183514  1.0294876  -1.38443887
  -0.20643329  0.63079036]
 [ 0.28384774 -0.99402038  0.19379103  0.07773752 -0.54139472  0.61485768
  -0.66874001 -0.61946928]
 [-0.4069266   0.21494398 -1.1802294  -0.75233679  0.07718897 -1.41885616
   0.21152508  0.7129345 ]
 [ 0.1134854   0.8669609   0.64626725  0.42897377  0.56820766  0.60987589
  -0.0463874   0.50168931]]
K
: [[ 0.45504661 -0.32470325 -0.69371571 -0.29226108  0.61103539 -0.04622356
  -0.63832225 -0.2160977 ]
 [-0.61339223  0.52199931  0.0452046   0.16846916 -0.05370172  0.0053866
  -1.15041188 -2.96587082]
 [ 0.25472673 -0.12084354  0.78193255 -0.32265847  0.87753825  0.55725366
  -1.7554051   1.73041697]
 [ 0.94920364  0.30000482  0.87418704  1.02043214 -1.29928868  0.21313945
  -0.90408828 -0.19782976]]
V
: [[ 0.58005958  1.18831246  0.4010658  -1.55581048  1.41179241  0.5148482
  -0.17225128  0.41666077]
 [ 0.3933322   0.28408931 -0.87984952  0.59853989 -0.10313076 -0.00300281
   0.

## Self Attention

We need to every word to look at every single word if it's have a high affinity towards it or not.

In [9]:
np.matmul(q, k.T)

array([[ 1.61454566, -2.45648243,  1.4757651 , -0.99653726],
       [ 0.49627535,  1.9678468 ,  0.28837998,  1.78158179],
       [ 0.60732396, -2.18789169, -0.67030639, -2.85620385],
       [-0.5633614 , -0.97738461,  2.07910641,  0.70492234]])

In [10]:
# checking the variance of the data
q.var(), k.var(), np.matmul(q, k.T).var()

(0.4602380800685623, 0.802920543839752, 2.4398253816293427)

In [14]:
scaled = np.matmul(q, k.T) / math.sqrt(d_k)

In [15]:
scaled

array([[ 0.57082809, -0.86849769,  0.52176176, -0.35232913],
       [ 0.17545983,  0.69573891,  0.10195772,  0.62988428],
       [ 0.21472144, -0.77353652, -0.2369891 , -1.00982055],
       [-0.19917833, -0.34555764,  0.73507512,  0.24922768]])

In [16]:
q.var(), k.var(), scaled.var()

(0.4602380800685623, 0.802920543839752, 0.3049781727036678)

## Masking

* This is to ensure words don't get context from words generated in the future.

* Not required in the encoders, but required int he decoders.


In [17]:
mask = np.tril(np.ones((L,L)))
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [18]:
mask[mask == 0] = -np.infty
mask[mask == 1] = 0

In [19]:
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [20]:
scaled + mask

array([[ 0.57082809,        -inf,        -inf,        -inf],
       [ 0.17545983,  0.69573891,        -inf,        -inf],
       [ 0.21472144, -0.77353652, -0.2369891 ,        -inf],
       [-0.19917833, -0.34555764,  0.73507512,  0.24922768]])

## Softmax

* It is usually to convert a vector to a probability distrbution

In [21]:
def softmax(x):
    return (np.exp(x).T / np.sum(np.exp(x), axis=-1)).T

In [24]:
attention = softmax(scaled+mask)

In [25]:
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.37278698, 0.62721302, 0.        , 0.        ],
       [0.49781882, 0.18530039, 0.31688079, 0.        ],
       [0.16736523, 0.1445751 , 0.42599681, 0.26206287]])

In [27]:
new_v = np.matmul(attention, v)
new_v

array([[ 0.58005958,  1.18831246,  0.4010658 , -1.55581048,  1.41179241,
         0.5148482 , -0.17225128,  0.41666077],
       [ 0.46294173,  0.62117193, -0.40234097, -0.20457387,  0.46161287,
         0.19004531,  0.08436345, -0.1596276 ],
       [ 0.15605259,  0.70630258,  0.53436808, -0.62667827,  0.59117397,
         0.43796768, -0.49917541, -0.26127964],
       [-0.25972492,  0.44755252,  0.4701361 ,  0.18543861,  0.74603991,
         0.46306653, -0.84643746, -0.56355693]])

In [28]:
v

array([[ 0.58005958,  1.18831246,  0.4010658 , -1.55581048,  1.41179241,
         0.5148482 , -0.17225128,  0.41666077],
       [ 0.3933322 ,  0.28408931, -0.87984952,  0.59853989, -0.10313076,
        -0.00300281,  0.23688361, -0.50214727],
       [-0.64881368,  0.19596142,  1.57076871,  0.11652266, -0.29201105,
         0.57505215, -1.44319289, -1.18547145],
       [-0.52384454,  0.4736236 , -0.53012659,  1.18160556,  2.4767373 ,
         0.50507957, -0.90459399, -0.21249256]])

## Functions

In [29]:
def softmax(x):
    return (np.exp(x).T / np.sum(np.exp(x), axis=-1)).T

In [32]:
def scaled_dot_product_attention(q,k,v, mask=None):
    d_k = q.shape[-1]
    scaled = np.matmul(q, k.T) / math.sqrt(d_k)
    if mask is not None:
        scaled = scaled * mask
    attention = softmax(scaled)
    out = np.matmul(attention, v)
    return out, attention

In [33]:
values, attention = scaled_dot_product_attention(q,k,v,mask=None)
print("Q\n",q)
print("K\n",k) 
print("V\n", v)
print("New V\n", values)
print("Attention\n", attention)

Q
 [[ 1.15903265 -0.03436588 -0.54091411 -0.04183514  1.0294876  -1.38443887
  -0.20643329  0.63079036]
 [ 0.28384774 -0.99402038  0.19379103  0.07773752 -0.54139472  0.61485768
  -0.66874001 -0.61946928]
 [-0.4069266   0.21494398 -1.1802294  -0.75233679  0.07718897 -1.41885616
   0.21152508  0.7129345 ]
 [ 0.1134854   0.8669609   0.64626725  0.42897377  0.56820766  0.60987589
  -0.0463874   0.50168931]]
K
 [[ 0.45504661 -0.32470325 -0.69371571 -0.29226108  0.61103539 -0.04622356
  -0.63832225 -0.2160977 ]
 [-0.61339223  0.52199931  0.0452046   0.16846916 -0.05370172  0.0053866
  -1.15041188 -2.96587082]
 [ 0.25472673 -0.12084354  0.78193255 -0.32265847  0.87753825  0.55725366
  -1.7554051   1.73041697]
 [ 0.94920364  0.30000482  0.87418704  1.02043214 -1.29928868  0.21313945
  -0.90408828 -0.19782976]]
V
 [[ 0.58005958  1.18831246  0.4010658  -1.55581048  1.41179241  0.5148482
  -0.17225128  0.41666077]
 [ 0.3933322   0.28408931 -0.87984952  0.59853989 -0.10313076 -0.00300281
   0.236

In [34]:
values, attention = scaled_dot_product_attention(q,k,v,mask)
print("Q\n",q)
print("K\n",k) 
print("V\n", v)
print("New V\n", values)
print("Attention\n", attention)

Q
 [[ 1.15903265 -0.03436588 -0.54091411 -0.04183514  1.0294876  -1.38443887
  -0.20643329  0.63079036]
 [ 0.28384774 -0.99402038  0.19379103  0.07773752 -0.54139472  0.61485768
  -0.66874001 -0.61946928]
 [-0.4069266   0.21494398 -1.1802294  -0.75233679  0.07718897 -1.41885616
   0.21152508  0.7129345 ]
 [ 0.1134854   0.8669609   0.64626725  0.42897377  0.56820766  0.60987589
  -0.0463874   0.50168931]]
K
 [[ 0.45504661 -0.32470325 -0.69371571 -0.29226108  0.61103539 -0.04622356
  -0.63832225 -0.2160977 ]
 [-0.61339223  0.52199931  0.0452046   0.16846916 -0.05370172  0.0053866
  -1.15041188 -2.96587082]
 [ 0.25472673 -0.12084354  0.78193255 -0.32265847  0.87753825  0.55725366
  -1.7554051   1.73041697]
 [ 0.94920364  0.30000482  0.87418704  1.02043214 -1.29928868  0.21313945
  -0.90408828 -0.19782976]]
V
 [[ 0.58005958  1.18831246  0.4010658  -1.55581048  1.41179241  0.5148482
  -0.17225128  0.41666077]
 [ 0.3933322   0.28408931 -0.87984952  0.59853989 -0.10313076 -0.00300281
   0.236

  return (np.exp(x).T / np.sum(np.exp(x), axis=-1)).T
