In [1]:
import torch

In [2]:
# Toy example sentence, one token per entry.
words = ["Dream", "big", "and", "work", "for", "it"]

# Embedding table: row i is the 3-dimensional vector for words[i].
inputs = torch.tensor([
    [ 0.42, -1.37,  2.15],   # Dream
    [ 1.08,  0.56, -0.91],   # big
    [-0.73,  1.94,  0.28],   # and
    [ 2.61, -0.44, -1.12],   # work
    [ 0.09,  0.87,  1.53],   # for
    [-1.68,  0.31, -0.26],   # it
])

In [3]:
# Euclidean (L2) length of each token embedding:
#   sqrt(sum of squared components), reduced over dim=1, i.e. across the
#   columns of every row, giving one magnitude per token.
# torch.linalg.vector_norm is used instead of torch.norm, which PyTorch
# documents as deprecated; the default order (2) yields the same L2 norm.
magnitudes = torch.linalg.vector_norm(inputs, dim=1)

# The " .4f" format spec reserves a column for the sign, so positive values
# are printed with a leading space.
for word, magnitude in zip(words, magnitudes):
    print(f"{word}: {magnitude.item(): .4f}")

Dream:  2.5838
big:  1.5192
and:  2.0916
work:  2.8740
for:  1.7624
it:  1.7280


In [4]:
# Use the 2nd token ("big", row index 1) as the query embedding.
query = inputs[1]

# Raw attention scores for that query: the dot product between the query and
# every token embedding (including the query token itself).
num_tokens = len(inputs)
attn_scores2 = torch.empty(num_tokens)  # uninitialized; every slot is filled below
for idx in range(num_tokens):
    attn_scores2[idx] = torch.dot(inputs[idx], query)

# Entry i indicates how strongly token i relates to the 2nd token.
print(attn_scores2)

tensor([-2.2701,  2.3081,  0.0432,  3.5916, -0.8079, -1.4042])


In [5]:
# Naive normalization: divide every score by the total so the entries sum to 1.
# Caveat: dot-product scores can be negative, so individual "weights" may fall
# outside [0, 1] and are not valid probabilities.
score_total = attn_scores2.sum()
attn_weights2temp = attn_scores2 / score_total

print("Attention weights for 2nd token: ", attn_weights2temp)
print("Sum: ", attn_weights2temp.sum())


Attention weights for 2nd token:  tensor([-1.5541,  1.5801,  0.0296,  2.4588, -0.5531, -0.9613])
Sum:  tensor(1.0000)


In [6]:
# Softmax normalization: exponentiate the scores and rescale so that every
# weight is positive and the weights sum to 1.
attn_weights2 = attn_scores2.softmax(dim=-1)

print("Attention weights for 2nd token with softmax: ", attn_weights2)
print("Sum: ", attn_weights2.sum())




Attention weights for 2nd token with softmax:  tensor([0.0021, 0.2087, 0.0217, 0.7532, 0.0093, 0.0051])
Sum:  tensor(1.)


In [7]:
# Pairwise attention scores for all tokens at once.
# Entry [i, j] is the dot product of token i with token j, so row i holds the
# scores of token i against every token (itself included).
attn_scores = torch.matmul(inputs, inputs.T)
print(attn_scores)

tensor([[ 6.6758, -2.2701, -2.3624, -0.7090,  2.1354, -1.6893],
        [-2.2701,  2.3081,  0.0432,  3.5916, -0.8079, -1.4042],
        [-2.3624,  0.0432,  4.3749, -3.0725,  2.0505,  1.7550],
        [-0.7090,  3.5916, -3.0725,  8.2601, -1.8615, -4.2300],
        [ 2.1354, -0.8079,  2.0505, -1.8615,  3.1059, -0.2793],
        [-1.6893, -1.4042,  1.7550, -4.2300, -0.2793,  2.9861]])


In [9]:
# Row-wise softmax over the score matrix: dim=-1 normalizes across the columns
# of each row, so each row sums to 1 (up to floating-point rounding).
attn_weights = attn_scores.softmax(dim=-1)
print(attn_weights)

tensor([[9.8837e-01, 1.2875e-04, 1.1740e-04, 6.1339e-04, 1.0545e-02, 2.3014e-04],
        [2.1438e-03, 2.0868e-01, 2.1669e-02, 7.5316e-01, 9.2515e-03, 5.0961e-03],
        [1.0002e-03, 1.1088e-02, 8.4348e-01, 4.9171e-04, 8.2528e-02, 6.1414e-02],
        [1.2608e-04, 9.2974e-03, 1.1863e-05, 9.9052e-01, 3.9821e-05, 3.7281e-06],
        [2.1194e-01, 1.1167e-02, 1.9469e-01, 3.8939e-03, 5.5936e-01, 1.8946e-02],
        [6.8917e-03, 9.1653e-03, 2.1586e-01, 5.4315e-04, 2.8228e-02, 7.3931e-01]])


In [10]:
# Context vectors: row i is the attention-weighted sum of all input embeddings,
# using the weights in row i of attn_weights (the token's own embedding included).
context_vecs = torch.matmul(attn_weights, inputs)
print(context_vecs)

tensor([[ 0.4173, -1.3448,  2.1403],
        [ 2.1685, -0.1658, -1.0099],
        [-0.6978,  1.7318,  0.3380],
        [ 2.5953, -0.4307, -1.1175],
        [-0.0124,  0.5844,  1.3466],
        [-1.3829,  0.6680, -0.0827]])
