In [2]:
import tiktoken

# GPT-2 byte-pair-encoding tokenizer provided by tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")
text = "Hello, do you like tea <|endoftext|> In the sunlit terraces"

# Text -> token IDs; "<|endoftext|>" is explicitly permitted as a special token.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

# Token IDs -> text, to confirm the round trip.
strings = tokenizer.decode(integers)
print(strings)

[15496, 11, 466, 345, 588, 8887, 220, 50256, 554, 262, 4252, 18250, 8812, 2114]
Hello, do you like tea <|endoftext|> In the sunlit terraces


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once. Windows of ``max_length`` token IDs are then
    taken every ``stride`` tokens; the target of each window is the same
    window shifted one token to the right.

    Args:
        txt: Text passed to ``tokenizer.encode``.
        tokenizer: Object whose ``encode(txt)`` returns a sequence of token IDs.
        max_length: Number of token IDs per input/target window.
        stride: Step between the start positions of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        encoded = tokenizer.encode(txt)

        # A window starting at s needs tokens up to index s + max_length
        # (one extra for the shifted target), hence the exclusive upper bound.
        window_starts = range(0, len(encoded) - max_length, stride)

        self.input_ids = [
            torch.tensor(encoded[s:s + max_length]) for s in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[s + 1:s + max_length + 1]) for s in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        window, shifted_window = self.input_ids[idx], self.target_ids[idx]
        return window, shifted_window

In [4]:
def create_dataloader(txt, batch_size=4, max_length=256,
                      stride=128, shuffle=True, drop_last=True,
                      num_workers=0):
    """Build a ``DataLoader`` over GPT-2-tokenized sliding windows of ``txt``.

    Args:
        txt: Raw text to tokenize with the tiktoken "gpt2" encoding.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length in tokens, forwarded to ``GPTDataset``.
        stride: Step between window starts, forwarded to ``GPTDataset``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDataset`` built from ``txt``.
    """
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    windows = GPTDataset(txt, tiktoken.get_encoding("gpt2"), max_length, stride)
    return DataLoader(windows, **loader_options)

In [5]:
# Read the full training text into memory.
with open("the-verdict.txt", mode="r", encoding="utf-8") as f:
    txt = f.read()

# batch_size=1, max_length=4, stride=1, no shuffling: the first batch is the
# first four token IDs and their one-token-shifted targets.
dataloader = create_dataloader(
    txt,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [6]:
# Tiny embedding table: 6 token IDs, each mapped to a 3-dimensional vector.
vocab_size, output_dim = 6, 3

# Seed before construction so the randomly initialized weights are reproducible.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [80]:
# Six 3-dimensional input vectors, one per row. The values match the embedding
# weights printed earlier, as displayed to four decimals.
inputs = torch.tensor(
    [
        [0.3374, -0.1778, -0.1690],
        [0.9178, 1.5810, 1.3010],
        [1.2753, -0.2010, -0.1606],
        [-0.4015, 0.9666, -1.1481],
        [-1.1589, 0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096],
    ]
)

In [89]:
# Unnormalized attention scores for the second input (index 1):
# the dot product of that query vector with every input row.
query = inputs[1]
num_rows = inputs.shape[0]
attn_scores_2 = torch.empty(num_rows)
for row in range(num_rows):
    attn_scores_2[row] = torch.dot(inputs[row], query)
print(attn_scores_2)

tensor([-0.1913,  5.0345,  0.6437, -0.3340, -1.3706, -5.6814])


In [82]:
# A dot product spelled out by hand: multiply element-wise, then accumulate.
res = 0
for x_elem, q_elem in zip(inputs[0], query):
    res += x_elem * q_elem
print(res)

# The built-in gives the same value.
print(torch.dot(inputs[0], query))

tensor(-0.1913)
tensor(-0.1913)


In [49]:
# Simplest normalization: divide every score by the sum of all scores.
# The result sums to 1, but because some scores here are negative the
# individual "weights" can be negative or exceed 1.
score_total = attn_scores_2.sum()
attn_weights_2_tmp = attn_scores_2 / score_total
print(f"Attention weights: {attn_weights_2_tmp}")
print(f"Attention weights sum: {attn_weights_2_tmp.sum():.2f}")

Attention weights: tensor([ 0.1007, -2.6512, -0.3390,  0.1759,  0.7218,  2.9918])
Attention weights sum: 1.00


In [53]:
def softmax_naive(x, dim=0):
    """Compute the softmax of ``x`` along ``dim`` with explicit exp/sum arithmetic.

    The largest value along ``dim`` is subtracted before exponentiating. This
    leaves the result mathematically unchanged but keeps ``torch.exp`` from
    overflowing to ``inf`` (which would turn the division into ``nan``) when
    the scores are large.

    Args:
        x: Tensor of scores.
        dim: Dimension to normalize over. Defaults to 0, which matches the
            previous behavior of this function.

    Returns:
        Tensor shaped like ``x`` whose entries sum to 1 along ``dim``.
    """
    if x.numel() == 0:
        # Nothing to normalize; max() over an empty dimension would raise.
        return torch.exp(x)
    shifted = x - x.max(dim=dim, keepdim=True).values
    exp_x = torch.exp(shifted)  # computed once, reused for the denominator
    # keepdim=True lets the denominator broadcast correctly for any dim.
    return exp_x / exp_x.sum(dim=dim, keepdim=True)
# Evaluate once, then show the weights and confirm they sum to 1.
naive_weights = softmax_naive(attn_scores_2)
print(naive_weights)
print(naive_weights.sum())

tensor([5.2494e-03, 9.7646e-01, 1.2100e-02, 4.5514e-03, 1.6142e-03, 2.1667e-05])
tensor(1.0000)


In [90]:
# Same normalization using PyTorch's built-in softmax.
attn_weights_2 = attn_scores_2.softmax(dim=0)
print(attn_weights_2, attn_weights_2.sum(), sep="\n")

tensor([5.2494e-03, 9.7646e-01, 1.2100e-02, 4.5514e-03, 1.6142e-03, 2.1667e-05])
tensor(1.0000)


In [84]:
# Context vector for the second input: the attention-weighted sum of all
# input rows.
query = inputs[1]
context_vec_2 = torch.zeros(query.shape)
for weight, x_i in zip(attn_weights_2, inputs):
    context_vec_2 += weight * x_i
print(context_vec_2)


tensor([0.9096, 1.5453, 1.2613])


In [85]:
# Pairwise attention scores: entry [i, j] is the dot product of input row i
# with input row j. The matrix is sized from `inputs` rather than hard-coded,
# so the cell keeps working if the number of input rows changes.
num_tokens = inputs.shape[0]
attn_scores = torch.empty(num_tokens, num_tokens)
for i, x_i in enumerate(inputs):
    for j, y_j in enumerate(inputs):
        attn_scores[i, j] = torch.dot(x_i, y_j)
print(attn_scores)

tensor([[ 0.1740, -0.1913,  0.4932, -0.1133, -0.3422, -0.5804],
        [-0.1913,  5.0345,  0.6437, -0.3340, -1.3706, -5.6814],
        [ 0.4932,  0.6437,  1.6926, -0.5219, -1.4420, -3.2377],
        [-0.1133, -0.3340, -0.5219,  2.4137,  1.5050,  1.9999],
        [-0.3422, -1.3706, -1.4420,  1.5050,  1.8478,  3.9260],
        [-0.5804, -5.6814, -3.2377,  1.9999,  3.9260, 10.6686]])


In [67]:
# All pairwise dot products at once: inputs multiplied by its own transpose.
attn_scores = torch.matmul(inputs, inputs.transpose(0, 1))
print(attn_scores)

tensor([[ 0.1740, -0.1913,  0.4932, -0.1133, -0.3422, -0.5804],
        [-0.1913,  5.0345,  0.6437, -0.3340, -1.3706, -5.6814],
        [ 0.4932,  0.6437,  1.6926, -0.5219, -1.4420, -3.2377],
        [-0.1133, -0.3340, -0.5219,  2.4137,  1.5050,  1.9999],
        [-0.3422, -1.3706, -1.4420,  1.5050,  1.8478,  3.9260],
        [-0.5804, -5.6814, -3.2377,  1.9999,  3.9260, 10.6686]])


In [79]:
# Normalize each row of scores (last dimension) so it sums to 1.
attn_weights = attn_scores.softmax(dim=-1)
print(attn_weights, attn_weights.sum(dim=-1), sep="\n")

tensor([[2.0461e-01, 1.4200e-01, 2.8154e-01, 1.5352e-01, 1.2211e-01, 9.6223e-02],
        [5.2494e-03, 9.7646e-01, 1.2100e-02, 4.5514e-03, 1.6142e-03, 2.1667e-05],
        [1.6635e-01, 1.9338e-01, 5.5198e-01, 6.0279e-02, 2.4022e-02, 3.9877e-03],
        [3.5334e-02, 2.8337e-02, 2.3482e-02, 4.4222e-01, 1.7824e-01, 2.9239e-01],
        [1.1318e-02, 4.0470e-03, 3.7683e-03, 7.1774e-02, 1.0113e-01, 8.0797e-01],
        [1.3002e-05, 7.9193e-08, 9.1192e-07, 1.7165e-04, 1.1779e-03, 9.9864e-01]])
tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])


In [92]:
# Every context vector in one matrix product; row 1 should match the
# context_vec_2 value computed with the explicit loop.
all_context_vecs = torch.matmul(attn_weights, inputs)
print(all_context_vecs, context_vec_2, sep="\n")

tensor([[ 0.0820,  0.2441, -0.2841],
        [ 0.9096,  1.5453,  1.2613],
        [ 0.8742,  0.2282,  0.0448],
        [-1.1466,  0.2898, -1.0053],
        [-2.4283, -0.5283, -1.2824],
        [-2.8376, -0.7833, -1.4086]])
tensor([0.9096, 1.5453, 1.2613])
