In [154]:
import numpy as np
import pandas as pd
# Fixed seed so every random draw below is reproducible on a fresh kernel.
SEED = 23
np.random.seed(SEED)

# Show four decimals everywhere; NumPy additionally avoids scientific notation.
np.set_printoptions(suppress=True, precision=4)
pd.set_option("display.float_format", "{:,.4f}".format)

# Dimensions

In [155]:
# d_model: length of each word / positional embedding vector.
# d_ff:    declared here; not used by any cell visible in this notebook.
d_model, d_ff = 4, 4
# d_k: width of the query/key projections; d_v: width of the value projection.
d_k, d_v = 3, 3

# Word embeddings

In [156]:
# One independent standard-normal vector of length d_model per word.
# The draws happen in sentence order, so the random stream is consumed
# exactly as five consecutive randn(d_model) calls would consume it.
me_, gusta_, estudiar_, inteligencia_, artificial_ = (
    np.random.randn(d_model) for _ in range(5)
)

In [157]:
# Show the random embedding drawn for the word "me" (before any positional vector is added).
print(me_)

[ 0.667   0.0258 -0.7776  0.9486]


## Positional embeddings

In [158]:
# Hand-picked positional vectors, one per sentence position (1..5),
# each with four components.
pos_1, pos_2, pos_3, pos_4, pos_5 = map(np.array, [
    [0.3, -0.9, -0.6, 0.8],
    [-0.6, 0.8, -0.9, 0.15],
    [0.8, -0.5, -0.9, 0.5],
    [-0.9, 0.2, -0.4, -0.9],
    [0.95, 0.16, 0.2, -0.95],
])

In [159]:
# Add each word's embedding to the positional vector of the slot it occupies
# in the sentence (word i is paired with pos_i).
(
    me_embed_pos,
    gusta_embed_pos,
    estudiar_embed_pos,
    inteligencia_embed_pos,
    artificial_embed_pos,
) = (
    word + position
    for word, position in zip(
        (me_, gusta_, estudiar_, inteligencia_, artificial_),
        (pos_1, pos_2, pos_3, pos_4, pos_5),
    )
)
print(me_embed_pos)

[ 0.967  -0.8742 -1.3776  1.7486]


# Projection matrix (or FC network)

In [160]:
# Small random projection matrices (scaled by 0.1). They are drawn in the
# order query, key, value so the seeded random stream stays the same.
W_q = 0.1 * np.random.randn(d_model, d_k)
W_k = 0.1 * np.random.randn(d_model, d_k)
W_v = 0.1 * np.random.randn(d_model, d_v)

# Project every position-aware embedding into its query, key and value vectors.
me_query, me_key, me_value = (
    me_embed_pos @ W for W in (W_q, W_k, W_v)
)
gusta_query, gusta_key, gusta_value = (
    gusta_embed_pos @ W for W in (W_q, W_k, W_v)
)
estudiar_query, estudiar_key, estudiar_value = (
    estudiar_embed_pos @ W for W in (W_q, W_k, W_v)
)
inteligencia_query, inteligencia_key, inteligencia_value = (
    inteligencia_embed_pos @ W for W in (W_q, W_k, W_v)
)
artificial_query, artificial_key, artificial_value = (
    artificial_embed_pos @ W for W in (W_q, W_k, W_v)
)



In [161]:
def softmax(x, axis=1):
  """Return the softmax of ``x`` along ``axis`` without modifying ``x``.

  Parameters
  ----------
  x : array_like
      Scores to normalise. Nested lists are accepted and converted to an
      array. With the default ``axis=1`` the input must have at least two
      dimensions, one row per distribution.
  axis : int, optional
      Axis along which the values are normalised. Defaults to 1 (row-wise).

  Returns
  -------
  numpy.ndarray
      Array with the same shape as ``x`` whose entries along ``axis`` are
      positive and sum to 1.
  """
  scores = np.asarray(x)
  # Subtract the per-slice maximum for numerical stability. The result is a
  # new array, so the caller's data is never overwritten in place.
  shifted = scores - np.max(scores, axis=axis, keepdims=True)
  # Exponentiate once and reuse for both numerator and denominator.
  exps = np.exp(shifted)
  return exps / exps.sum(axis=axis, keepdims=True)

## Palabra me

In [162]:
# Check the shape of the projected query for "me" (me_embed_pos @ W_q).
me_query.shape

(3,)

In [163]:
# Dot-product score of the "me" query against every word's key.
# The operands are 1-D vectors, so no transpose is needed.
(
    me_alpha_me,
    me_alpha_gusta,
    me_alpha_estudiar,
    me_alpha_inteligencia,
    me_alpha_artificial,
) = (
    me_query @ key
    for key in (me_key, gusta_key, estudiar_key, inteligencia_key, artificial_key)
)

# Normalise the five scores into a single row of attention weights.
me_alphas = softmax([[
    me_alpha_me,
    me_alpha_gusta,
    me_alpha_estudiar,
    me_alpha_inteligencia,
    me_alpha_artificial,
]])
print(me_alphas)

[[0.1984 0.203  0.1929 0.2045 0.2012]]


In [164]:
# First column of the one-row weight matrix: the weight "me" assigns to itself.
# Slicing with [:, 0] keeps it as a one-element array rather than a scalar.
me_alphas[:,0]

array([0.1984])

In [165]:
# Attention output for "me": every word's value vector weighted by the
# matching alpha, accumulated in sentence order and wrapped as a single row.
me_attention = np.array([
    sum(
        alpha * value
        for alpha, value in zip(
            me_alphas[0],
            (me_value, gusta_value, estudiar_value, inteligencia_value, artificial_value),
        )
    )
])

In [166]:
# Display the attention output for "me" (the alpha-weighted sum of the value vectors).
print(me_attention)

[[ 0.0066 -0.0286  0.0036]]


## Palabra gusta

In [167]:
# Dot-product score of the "gusta" query against every word's key.
(
    gusta_alpha_me,
    gusta_alpha_gusta,
    gusta_alpha_estudiar,
    gusta_alpha_inteligencia,
    gusta_alpha_artificial,
) = (
    gusta_query @ key
    for key in (me_key, gusta_key, estudiar_key, inteligencia_key, artificial_key)
)

# Normalise the five scores into a single row of attention weights.
gusta_alphas = softmax([[
    gusta_alpha_me,
    gusta_alpha_gusta,
    gusta_alpha_estudiar,
    gusta_alpha_inteligencia,
    gusta_alpha_artificial,
]])
print("alphas de gusta:", gusta_alphas)

# Attention output: value vectors weighted by their alphas, summed in order.
gusta_attention = np.array([
    sum(
        alpha * value
        for alpha, value in zip(
            gusta_alphas[0],
            (me_value, gusta_value, estudiar_value, inteligencia_value, artificial_value),
        )
    )
])
print("atencion de gusta:", gusta_attention)

alphas de gusta: [[0.177  0.194  0.2195 0.2076 0.2019]]
atencion de gusta: [[ 0.0062 -0.0306 -0.0024]]


## Palabra estudiar

In [168]:
# Dot-product score of the "estudiar" query against every word's key.
(
    estudiar_alpha_me,
    estudiar_alpha_gusta,
    estudiar_alpha_estudiar,
    estudiar_alpha_inteligencia,
    estudiar_alpha_artificial,
) = (
    estudiar_query @ key
    for key in (me_key, gusta_key, estudiar_key, inteligencia_key, artificial_key)
)

# Normalise the five scores into a single row of attention weights.
estudiar_alphas = softmax([[
    estudiar_alpha_me,
    estudiar_alpha_gusta,
    estudiar_alpha_estudiar,
    estudiar_alpha_inteligencia,
    estudiar_alpha_artificial,
]])
print("alphas de estudiar:", estudiar_alphas)

# Attention output: value vectors weighted by their alphas, summed in order.
estudiar_attention = np.array([
    sum(
        alpha * value
        for alpha, value in zip(
            estudiar_alphas[0],
            (me_value, gusta_value, estudiar_value, inteligencia_value, artificial_value),
        )
    )
])
print("atencion de estudiar:", estudiar_attention)

alphas de estudiar: [[0.1967 0.2003 0.1981 0.2016 0.2032]]
atencion de estudiar: [[ 0.0066 -0.0293  0.0018]]


## Palabra inteligencia

In [169]:
# Dot-product score of the "inteligencia" query against every word's key.
(
    inteligencia_alpha_me,
    inteligencia_alpha_gusta,
    inteligencia_alpha_estudiar,
    inteligencia_alpha_inteligencia,
    inteligencia_alpha_artificial,
) = (
    inteligencia_query @ key
    for key in (me_key, gusta_key, estudiar_key, inteligencia_key, artificial_key)
)

# Normalise the five scores into a single row of attention weights.
inteligencia_alphas = softmax([[
    inteligencia_alpha_me,
    inteligencia_alpha_gusta,
    inteligencia_alpha_estudiar,
    inteligencia_alpha_inteligencia,
    inteligencia_alpha_artificial,
]])
print("alphas de inteligencia:", inteligencia_alphas)

# Attention output: value vectors weighted by their alphas, summed in order.
inteligencia_attention = np.array([
    sum(
        alpha * value
        for alpha, value in zip(
            inteligencia_alphas[0],
            (me_value, gusta_value, estudiar_value, inteligencia_value, artificial_value),
        )
    )
])
print("atencion de inteligencia:", inteligencia_attention)

alphas de inteligencia: [[0.1662 0.1897 0.2319 0.2095 0.2027]]
atencion de inteligencia: [[ 0.006  -0.0315 -0.0049]]


## Palabra artificial

In [170]:
# Dot-product score of the "artificial" query against every word's key.
(
    artificial_alpha_me,
    artificial_alpha_gusta,
    artificial_alpha_estudiar,
    artificial_alpha_inteligencia,
    artificial_alpha_artificial,
) = (
    artificial_query @ key
    for key in (me_key, gusta_key, estudiar_key, inteligencia_key, artificial_key)
)

# Normalise the five scores into a single row of attention weights.
artificial_alphas = softmax([[
    artificial_alpha_me,
    artificial_alpha_gusta,
    artificial_alpha_estudiar,
    artificial_alpha_inteligencia,
    artificial_alpha_artificial,
]])
print("alphas de artificial:", artificial_alphas)

# Attention output: value vectors weighted by their alphas, summed in order.
artificial_attention = np.array([
    sum(
        alpha * value
        for alpha, value in zip(
            artificial_alphas[0],
            (me_value, gusta_value, estudiar_value, inteligencia_value, artificial_value),
        )
    )
])
print("atencion de artificial:", artificial_attention)

alphas de artificial: [[0.1894 0.1939 0.2182 0.1996 0.199 ]]
atencion de artificial: [[ 0.0068 -0.0313 -0.0067]]


## Utilizando matrices

In [171]:
# Stack the five position-aware embeddings as rows, in sentence order, giving
# one matrix with a row per word and d_model columns.
X = np.vstack([
    me_embed_pos,
    gusta_embed_pos,
    estudiar_embed_pos,
    inteligencia_embed_pos,
    artificial_embed_pos,
])
print("X:", X)

X: [[ 0.967  -0.8742 -1.3776  1.7486]
 [ 0.1017 -0.2511 -1.2675 -0.9875]
 [-0.5221  1.2723 -1.2475  1.1701]
 [-0.5777  0.2603 -1.4435 -1.9099]
 [ 1.3917  1.2889 -1.6381 -1.8888]]


In [172]:
# Project all words at once: row i of Q, K and V is the query, key and value
# vector of word i.
Q, K, V = (X @ W for W in (W_q, W_k, W_v))

In [173]:
# Show the three projected matrices, each starting on its own line.
print(f"Q: {Q}", f"K: {K}", f"V: {V}", sep="\n")

Q: [[-0.2856  0.1529  0.1914]
 [ 0.1156 -0.2958 -0.0378]
 [-0.0123 -0.0164 -0.1174]
 [ 0.2861 -0.5074 -0.1684]
 [ 0.3299 -0.2983 -0.1628]]
K: [[ 0.21    0.4861  0.0043]
 [-0.1448  0.0443 -0.0516]
 [-0.0906 -0.3696  0.0938]
 [-0.3261 -0.2561 -0.0451]
 [-0.2622 -0.1248 -0.1372]]
V: [[ 0.041  -0.0518 -0.2071]
 [ 0.004   0.0187  0.1207]
 [ 0.0223 -0.1137 -0.3795]
 [-0.006   0.0276  0.1633]
 [-0.0271 -0.0291  0.2981]]


In [174]:
# Per-word query vectors, one per line, for comparison with the rows of Q.
print(
    me_query,
    gusta_query,
    estudiar_query,
    inteligencia_query,
    artificial_query,
    sep="\n",
)

[-0.2856  0.1529  0.1914]
[ 0.1156 -0.2958 -0.0378]
[-0.0123 -0.0164 -0.1174]
[ 0.2861 -0.5074 -0.1684]
[ 0.3299 -0.2983 -0.1628]


In [175]:
# Q = X @ W_q: one row per word in X, d_k columns.
print(Q.shape)

(5, 3)


In [176]:
# K = X @ W_k: one row per word in X, d_k columns.
print(K.shape)

(5, 3)


In [178]:
# Entry (i, j) of Q·Kᵀ is the dot product of word i's query with word j's key;
# the row-wise softmax turns each row into attention weights.
# NOTE(review): the scores are not divided by sqrt(d_k) as in the usual scaled
# dot-product attention — presumably a deliberate simplification; confirm.
alphas = softmax(np.matmul(Q, K.T))
print("alphas:", alphas)

alphas: [[0.1984 0.203  0.1929 0.2045 0.2012]
 [0.177  0.194  0.2195 0.2076 0.2019]
 [0.1967 0.2003 0.1981 0.2016 0.2032]
 [0.1662 0.1897 0.2319 0.2095 0.2027]
 [0.1894 0.1939 0.2182 0.1996 0.199 ]]


In [180]:
# Row i is the attention output of word i: its alpha-weighted combination of
# the rows of V.
attention_scores = np.matmul(alphas, V)
print("atencion scores:", attention_scores)

atencion scores: [[ 0.0066 -0.0286  0.0036]
 [ 0.0062 -0.0306 -0.0024]
 [ 0.0066 -0.0293  0.0018]
 [ 0.006  -0.0315 -0.0049]
 [ 0.0068 -0.0313 -0.0067]]
