### Transformers

In [1]:
import numpy as np
import pandas as pd
# np.random.seed(23)
np.set_printoptions(precision=4, suppress = True)
pd.options.display.float_format = '{:.4f}'.format


#### Dimension 

In [2]:
d_model = 4
d_ff = 5
d_k = 3
d_v = 3

#### Word embeddings

In [12]:
me_ = np.random.randn(d_model)
gusta_ = np.random.randn(d_model)
estudiar_ = np.random.rand(d_model)
inteligencia_ = np.random.rand(d_model)
artificial_ = np.random.rand(d_model)

In [13]:
print(me_)

[ 0.4417  1.1289 -1.8381 -0.9388]


In [14]:
print(gusta_)

[-0.2018  1.0454  0.5382  0.8121]


#### Positional embeddings

In [15]:
pos_1 = np.array([0.3, -0.9, -0.6, 0.8])
pos_2 = np.array([-.6, 0.8, -0.9, 0.15])
pos_3 = np.array([0.8, -0.5, -0.9, 0.5])
pos_4 = np.array([-0.95, 0.2, -0.4, -0.9])
pos_5 = np.array([0.95, 0.16, 0.2, -0.95])

In [16]:
me_embed_pos = me_ + pos_1
gusta_embed_pos = gusta_ + pos_2
estudiar_embed_pos = estudiar_ + pos_3
inteligencia_embed_pos = inteligencia_ + pos_4
artificial_embed_pos = artificial_ + pos_5

In [17]:
print(me_embed_pos)

[ 0.7417  0.2289 -2.4381 -0.1388]


In [18]:
print(gusta_embed_pos)

[-0.8018  1.8454 -0.3618  0.9621]


#### Projection matrices (or FC network)

In [19]:
W_q = np.random.randn(d_model, d_k) * 0.1
W_k = np.random.randn(d_model, d_k) * 0.1
W_v = np.random.randn(d_model, d_v) * 0.1

In [20]:
me_query = me_embed_pos @ W_q
me_key = me_embed_pos @ W_k
me_value = me_embed_pos @ W_v

gusta_query = gusta_embed_pos @ W_q
gusta_key = gusta_embed_pos @ W_k
gusta_value = gusta_embed_pos @ W_v

estudiar_query = estudiar_embed_pos @ W_q
estudiar_key = estudiar_embed_pos @ W_k
estudiar_value = estudiar_embed_pos @ W_v

inteligencia_query = inteligencia_embed_pos @ W_q
inteligencia_key = inteligencia_embed_pos @ W_k
inteligencia_value = inteligencia_embed_pos @ W_v

artificial_query = artificial_embed_pos @ W_q
artificial_key = artificial_embed_pos @ W_k
artificial_value = artificial_embed_pos @ W_v

#### Similarities

In [21]:
def softmax(x):
  x -= np.max(x, axis=1, keepdims=True)
  return np.exp(x)/np.exp(x).sum(axis=1, keepdims=True)

In [22]:
me_alpha_me = me_query @ me_key.T
me_alpha_gusta = me_query @ gusta_key.T
me_alpha_estudiar = me_query @ estudiar_key.T
me_alpha_inteligencia = me_query @ inteligencia_key.T
me_alpha_artificial = me_query @ artificial_key.T
me_alphas = softmax(([[me_alpha_me, me_alpha_gusta, 
                     me_alpha_estudiar, me_alpha_inteligencia,
                    me_alpha_artificial]]))


In [23]:
me_alpha_me

-0.002689607928836218

In [24]:
print(me_alphas)

[[0.2003 0.1993 0.199  0.2012 0.2002]]


In [25]:
me_attention = np.array([me_alphas[:,0]*me_value +
                         me_alphas[:, 1]*gusta_value +
                         me_alphas[:, 2]*estudiar_value +
                         me_alphas[:,3]*inteligencia_value +
                         me_alphas[:, 4]*artificial_value])

In [26]:
me_attention.shape

(1, 3)

In [27]:
print(me_attention)

[[ 0.1467  0.0717 -0.0538]]


In [21]:
gusta_alpha_me = gusta_query @ me_key.T
gusta_alpha_gusta = gusta_query @ gusta_key.T
gusta_alpha_estudiar = gusta_query @ estudiar_key.T
gusta_alpha_inteligencia = gusta_query @ inteligencia_key.T
gusta_alpha_artificial = gusta_query @ artificial_key.T
gusta_alphas = softmax(([[gusta_alpha_me, gusta_alpha_gusta,
                     gusta_alpha_estudiar, gusta_alpha_inteligencia,
                       gusta_alpha_artificial]]))

In [22]:
print(gusta_alphas)

[[0.1967 0.1976 0.1937 0.2065 0.2055]]


In [23]:
gusta_attention = np.array([gusta_alphas[...,0]*me_value +
                         gusta_alphas[...,1]*gusta_value +
                         gusta_alphas[..., 2]*estudiar_value +
                         gusta_alphas[...,3]*inteligencia_value +
                          gusta_alphas[...,4]*artificial_value])

In [24]:
print(gusta_attention)

[[-0.0767  0.0809  0.0647]]


### Matrices

In [25]:
X = np.array([[me_embed_pos],
              [gusta_embed_pos],
              [estudiar_embed_pos],
              [inteligencia_embed_pos],
              [artificial_embed_pos]]).reshape(-1, d_model)
print(X.shape)

(5, 4)


In [29]:
print(me_embed_pos)
print(gusta_embed_pos)
print(estudiar_embed_pos)

[ 0.967  -0.8742 -1.3776  1.7486]
[ 0.1017 -0.2511 -1.2675 -0.9875]
[ 1.4181 -0.0881 -0.8975  1.384 ]


In [26]:
print(X)

[[ 0.967  -0.8742 -1.3776  1.7486]
 [ 0.1017 -0.2511 -1.2675 -0.9875]
 [ 1.4181 -0.0881 -0.8975  1.384 ]
 [-0.0651  0.5004  0.1896  0.0784]
 [ 1.7951  0.2251  0.4947 -0.6621]]


In [30]:
Q = X @ W_q
K = X @ W_k
V = X @ W_v

In [31]:
print(me_query)
print(gusta_query)
print(estudiar_query)
print(inteligencia_query)
print(artificial_query)

[-0.1159 -0.0089 -0.0807]
[ 0.0539 -0.0729 -0.2006]
[-0.1092  0.0701 -0.1161]
[-0.0471 -0.0031  0.0788]
[ 0.1479  0.2473 -0.3784]


In [32]:
print(Q)

[[-0.1159 -0.0089 -0.0807]
 [ 0.0539 -0.0729 -0.2006]
 [-0.1092  0.0701 -0.1161]
 [-0.0471 -0.0031  0.0788]
 [ 0.1479  0.2473 -0.3784]]


In [None]:
Q.shape

In [33]:
print(me_key)
print(gusta_key)
print(estudiar_key)
print(inteligencia_key)
print(artificial_key)

[0.0779 0.0176 0.3061]
[ 0.0452 -0.0045  0.2849]
[ 0.0901 -0.035   0.4052]
[0.0125 0.0363 0.0416]
[ 0.0046 -0.2769  0.1759]


In [34]:
print(K)

[[ 0.0779  0.0176  0.3061]
 [ 0.0452 -0.0045  0.2849]
 [ 0.0901 -0.035   0.4052]
 [ 0.0125  0.0363  0.0416]
 [ 0.0046 -0.2769  0.1759]]


In [35]:
alphas=softmax(Q@K.T)

In [36]:
print(alphas)

[[0.1981 0.1992 0.1963 0.2039 0.2024]
 [0.1967 0.1976 0.1937 0.2065 0.2055]
 [0.1988 0.1997 0.1955 0.2067 0.1993]
 [0.2006 0.2006 0.2021 0.1971 0.1995]
 [0.1994 0.1989 0.1899 0.2192 0.1926]]


In [37]:
print(me_alphas)

[[0.1981 0.1992 0.1963 0.2039 0.2024]]


In [38]:
print(gusta_alphas)

[[0.1967 0.1976 0.1937 0.2065 0.2055]]


In [42]:
print(me_attention)

[[-0.078   0.0809  0.0652]]


In [41]:
print(gusta_attention)

[[-0.0767  0.0809  0.0647]]


In [39]:
attn_scores = alphas @ V

In [40]:
print(attn_scores)

[[-0.078   0.0809  0.0652]
 [-0.0767  0.0809  0.0647]
 [-0.0783  0.0803  0.0651]
 [-0.0804  0.0815  0.0663]
 [-0.0777  0.0784  0.064 ]]
