In [23]:
import torch
import tiktoken

In [24]:
# Load tiktoken's 'gpt2' encoding; it is used below to encode and decode text.
tokenizer = tiktoken.get_encoding('gpt2')

In [25]:
# Sample sentence used to demonstrate the tokenizer round trip.
raw = (
    'So threatening had become the general aspect of affairs, '
    'that the king thought it prudent to send his son'
)

In [26]:
# Encode the sample sentence and show the resulting list of integer token ids.
enc = tokenizer.encode(raw)
print(enc)

[2396, 11123, 550, 1716, 262, 2276, 4843, 286, 9674, 11, 326, 262, 5822, 1807, 340, 34998, 284, 3758, 465, 3367]


In [44]:
# Decode the token ids back to text to confirm the round trip reproduces `raw`.
ret = tokenizer.decode(enc)
print(ret)

So threatening had become the general aspect of affairs, that the king thought it prudent to send his son


In [27]:
# Toy embedding table: one row per token id, one column per embedding dimension.
vocab_size = 4
output_dimension = 8

# Seed the global RNG so the randomly initialised weights -- and every attention
# value computed from them in the cells below -- are the same on each run.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dimension)
print(embedding_layer.weight)

# detach() yields the same values as a plain tensor, i.e. without the
# 'requires_grad=True' flag printed for the Parameter above.
inputs = embedding_layer.weight.detach()
print(inputs)

Parameter containing:
tensor([[-1.2333, -1.1896, -0.6725,  0.3895,  0.1615,  0.3366,  0.1482, -1.4482],
        [-1.9821,  2.6993, -1.2479, -0.3451, -1.8017, -0.0958,  0.2081, -0.1727],
        [ 0.1061,  1.3612,  0.2706, -0.7686, -0.2944,  1.5069,  0.5106,  0.5482],
        [-0.1095,  1.6537,  0.5615,  0.6544, -1.0156, -0.9757, -1.6549, -1.2158]],
       requires_grad=True)
tensor([[-1.2333, -1.1896, -0.6725,  0.3895,  0.1615,  0.3366,  0.1482, -1.4482],
        [-1.9821,  2.6993, -1.2479, -0.3451, -1.8017, -0.0958,  0.2081, -0.1727],
        [ 0.1061,  1.3612,  0.2706, -0.7686, -0.2944,  1.5069,  0.5106,  0.5482],
        [-0.1095,  1.6537,  0.5615,  0.6544, -1.0156, -0.9757, -1.6549, -1.2158]])


In [28]:
# Dimensions of the embedding matrix.
inputs.size()

torch.Size([4, 8])

In [29]:
# Show every embedding row at full float precision as a plain Python list.
for row_values in inputs.tolist():
    print(row_values)

[-1.2333179712295532, -1.1896461248397827, -0.6725375652313232, 0.3895379304885864, 0.16151940822601318, 0.3366181552410126, 0.14821209013462067, -1.4482097625732422]
[-1.9821252822875977, 2.6993346214294434, -1.2479064464569092, -0.345114141702652, -1.8016605377197266, -0.095818892121315, 0.20806346833705902, -0.17266066372394562]
[0.10607588291168213, 1.3611797094345093, 0.2705560624599457, -0.7686013579368591, -0.29441431164741516, 1.5068678855895996, 0.5106224417686462, 0.5482245087623596]
[-0.10945736616849899, 1.653702735900879, 0.5614665746688843, 0.6544092893600464, -1.0155706405639648, -0.9756858944892883, -1.6548634767532349, -1.2157646417617798]


In [30]:
# Two small example vectors for the dot-product demonstration below.
# torch.tensor() is the documented factory for building a tensor from data;
# for Python floats it produces the default floating-point dtype.
x = torch.tensor([1.1, 2.3])
y = torch.tensor([3.4, -2.1])

In [31]:
# Dot product computed by PyTorch ...
print(x.dot(y))

# ... and the same sum of element-wise products written out by hand.
print(1.1 * 3.4 + 2.3 * -2.1)

tensor(-1.0900)
-1.0899999999999999


In [32]:
# Use the embedding row at index 2 as the query vector.
query = inputs[2, :]
print(query)

tensor([ 0.1061,  1.3612,  0.2706, -0.7686, -0.2944,  1.5069,  0.5106,  0.5482])


In [33]:
# Score the query against every input row with a dot product.
for key in inputs:
    print(query.dot(key))

tensor(-2.4901)
tensor(3.7893)
tensor(5.4466)
tensor(-0.7944)


In [34]:
# Collect the query's dot-product score against each input row into one tensor.
att_scores2 = torch.zeros(inputs.size(0))
for idx, key in enumerate(inputs):
    att_scores2[idx] = torch.dot(query, key)

print(att_scores2)

tensor([-2.4901,  3.7893,  5.4466, -0.7944])


In [35]:
# Normalise the scores with softmax: exp(x) / exp(x).sum().
att_weights2 = att_scores2.softmax(dim=0)
print(att_weights2)

tensor([2.9958e-04, 1.5981e-01, 8.3826e-01, 1.6327e-03])


In [36]:
# Total of the softmax weights.
torch.sum(att_weights2)

tensor(1.)

In [37]:
# Context vector for the query: each input row scaled by its attention weight,
# accumulated row by row.
context_vector2 = torch.zeros(query.size())
for weight, row in zip(att_weights2, inputs):
    context_vector2 += weight * row
print(context_vector2)

tensor([-0.2284,  1.5747,  0.0281, -0.6983, -0.5363,  1.2463,  0.4586,  0.4295])


In [38]:
# Print the matrix followed by its transpose (rows and columns swapped).
print(inputs)
print(inputs.t())

tensor([[-1.2333, -1.1896, -0.6725,  0.3895,  0.1615,  0.3366,  0.1482, -1.4482],
        [-1.9821,  2.6993, -1.2479, -0.3451, -1.8017, -0.0958,  0.2081, -0.1727],
        [ 0.1061,  1.3612,  0.2706, -0.7686, -0.2944,  1.5069,  0.5106,  0.5482],
        [-0.1095,  1.6537,  0.5615,  0.6544, -1.0156, -0.9757, -1.6549, -1.2158]])
tensor([[-1.2333, -1.9821,  0.1061, -0.1095],
        [-1.1896,  2.6993,  1.3612,  1.6537],
        [-0.6725, -1.2479,  0.2706,  0.5615],
        [ 0.3895, -0.3451, -0.7686,  0.6544],
        [ 0.1615, -1.8017, -0.2944, -1.0156],
        [ 0.3366, -0.0958,  1.5069, -0.9757],
        [ 0.1482,  0.2081,  0.5106, -1.6549],
        [-1.4482, -0.1727,  0.5482, -1.2158]])


In [39]:
# All pairwise dot products between input rows in a single matrix product.
attention_scores = torch.matmul(inputs, inputs.T)
print(attention_scores)

tensor([[ 5.7991, -0.1042, -2.4901, -0.9321],
        [-0.1042, 16.2199,  3.7893,  5.5432],
        [-2.4901,  3.7893,  5.4466, -0.7944],
        [-0.9321,  5.5432, -0.7944,  9.6902]])


In [40]:
# Softmax over the last dimension, i.e. independently for each row of scores.
attention_weights = attention_scores.softmax(dim=-1)
print(attention_weights)

tensor([[9.9584e-01, 2.7192e-03, 2.5019e-04, 1.1882e-03],
        [8.1383e-08, 9.9997e-01, 3.9944e-06, 2.3075e-05],
        [2.9958e-04, 1.5981e-01, 8.3826e-01, 1.6327e-03],
        [2.3986e-05, 1.5564e-02, 2.7525e-05, 9.8438e-01]])


In [41]:
# Sum of the first row of attention weights.
print(torch.sum(attention_weights[0]))

tensor(1.0000)


In [42]:
# Context vectors for every row at once: attention weights times the inputs.
context_vectors = torch.matmul(attention_weights, inputs)
print(context_vectors)

tensor([[-1.2337, -1.1751, -0.6724,  0.3876,  0.1547,  0.3342,  0.1463, -1.4440],
        [-1.9821,  2.6993, -1.2479, -0.3451, -1.8016, -0.0958,  0.2080, -0.1727],
        [-0.2284,  1.5747,  0.0281, -0.6983, -0.5363,  1.2463,  0.4586,  0.4295],
        [-0.1386,  1.6699,  0.5333,  0.6388, -1.0278, -0.9619, -1.6258, -1.1995]])
