In [27]:
import os
import urllib.request
import re


if not os.path.exists("the-verdict"):
    urllib.request.urlretrieve("https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/refs/heads/main/ch02/01_main-chapter-code/the-verdict.txt", 
                   "the-verdict")

In [28]:
with open("the-verdict", "r", encoding = "utf-8") as f:
    raw_text = f.read()

len(raw_text)

20479

In [29]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
preprocessed[:20]

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius',
 '--',
 'though',
 'a',
 'good',
 'fellow',
 'enough',
 '--',
 'so',
 'it',
 'was']

In [30]:
len(preprocessed)

4690

In [31]:
# tokens to token ids

all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
vocab_size

1130

In [32]:
vocab = {word: token for token, word in enumerate(all_words)}
vocab

{'!': 0,
 '"': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 ',': 5,
 '--': 6,
 '.': 7,
 ':': 8,
 ';': 9,
 '?': 10,
 'A': 11,
 'Ah': 12,
 'Among': 13,
 'And': 14,
 'Are': 15,
 'Arrt': 16,
 'As': 17,
 'At': 18,
 'Be': 19,
 'Begin': 20,
 'Burlington': 21,
 'But': 22,
 'By': 23,
 'Carlo': 24,
 'Chicago': 25,
 'Claude': 26,
 'Come': 27,
 'Croft': 28,
 'Destroyed': 29,
 'Devonshire': 30,
 'Don': 31,
 'Dubarry': 32,
 'Emperors': 33,
 'Florence': 34,
 'For': 35,
 'Gallery': 36,
 'Gideon': 37,
 'Gisburn': 38,
 'Gisburns': 39,
 'Grafton': 40,
 'Greek': 41,
 'Grindle': 42,
 'Grindles': 43,
 'HAD': 44,
 'Had': 45,
 'Hang': 46,
 'Has': 47,
 'He': 48,
 'Her': 49,
 'Hermia': 50,
 'His': 51,
 'How': 52,
 'I': 53,
 'If': 54,
 'In': 55,
 'It': 56,
 'Jack': 57,
 'Jove': 58,
 'Just': 59,
 'Lord': 60,
 'Made': 61,
 'Miss': 62,
 'Money': 63,
 'Monte': 64,
 'Moon-dancers': 65,
 'Mr': 66,
 'Mrs': 67,
 'My': 68,
 'Never': 69,
 'No': 70,
 'Now': 71,
 'Nutley': 72,
 'Of': 73,
 'Oh': 74,
 'On': 75,
 'Once': 76,
 'Only': 77,
 '

In [33]:
all_words = sorted(list(set(preprocessed)))
all_words.extend(["<|endoftext|>", "<|unk|>"])
vocab = {word: token for token, word in enumerate(all_words)}
vocab_size = len(all_words)
vocab_size

1132

In [34]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_2_int = vocab
        self.int_2_str = {token: word for word, token in vocab.items()}

    def encode(self, text: str):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [
            item if item in self.str_2_int
            else "<|unk|>"
            for item in preprocessed
            ]

        ids = [self.str_2_int[s] for s in preprocessed]
        return ids

    def decoder(self, ids: list):
        text = ' '.join([self.int_2_str[s] for s in ids])
        text = re.sub(r"\s+([,.?!\"'])", r"\1", text)
        return text

tokenozer_v1 = SimpleTokenizerV1(vocab)
text = "hello."
encoded = tokenozer_v1.encode(text)
decoded = tokenozer_v1.decoder(encoded)

print(f"text: {text}")
print(f"encoded: {encoded}")
print(f"decoded: {decoded}")

text: hello.
encoded: [1131, 7, 1131]
decoded: <|unk|>. <|unk|>


In [35]:
# tiktoken
import tiktoken

tokenizer_tik = tiktoken.get_encoding("gpt2")

In [43]:
vocab_size = tokenizer_tik.n_vocab
vocab_size

50257

In [36]:
h = tokenizer_tik.encode(
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.",
    allowed_special={"<|endoftext|>"}
)

decoded = tokenizer_tik.decode(h)
decoded

'Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.'

In [37]:
import torch
from torch.utils.data import DataLoader, Dataset

In [79]:
class GPT_Dataset_V1(Dataset):
    def __init__(self, text, tokenizer, max_length, stride):
        super().__init__()

        self.input_ids  = []
        self.target_ids = []

        self.token_ids = h = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        for i in range(0, len(self.token_ids) - max_length, stride):
            self.input_ids.append(
                torch.tensor(self.token_ids[i:i + max_length])
            )

            self.target_ids.append(
                torch.tensor(self.token_ids[i + 1: i + max_length + 1])
            )
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]
    
gpt_ds_v1 = GPT_Dataset_V1(raw_text, tokenizer_tik, 5, 10)

sample_inputs, sample_targets = next(iter(gpt_ds_v1))
print(tokenizer_tik.decode(sample_inputs.tolist()))
print(tokenizer_tik.decode(sample_targets.tolist()))


I HAD always thought
 HAD always thought Jack


In [80]:
def create_dataloader(text, batch_size = 2, max_length = 256, stride = 128, shuffle = True, drop_last = True, num_workers = 4):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPT_Dataset_V1(text, tokenizer, max_length, stride)
    # print(type(max_length), type(stride), type(text))
    
    dataloader = DataLoader(
        dataset, 
        batch_size = batch_size, 
        shuffle = shuffle, 
        drop_last = drop_last, 
        num_workers = num_workers)
    
    return dataloader

In [81]:
sample_inputs, sample_targets = next(iter(create_dataloader(raw_text, max_length = 10, num_workers = 0, batch_size = 8)))
sample_inputs

tensor([[  423,  4750,   326,  9074,    13,   402,   271, 10899,   373,  5527],
        [12036,   683,     0,  3226,  1781,   314,  4001,   284,   466,   262],
        [ 3347, 27846,   503,  2048,  4628, 24882,   379,   262,  8812,   558],
        [   11, 17728,   257,  8500,  4417,   284,   670,   319,   438, 15464],
        [   11,   508,   550, 18459,  1068,   284,  1577,   257, 23844,   286],
        [  616,   705, 23873,  2350,     6, 14707,   588,   257,  2156,   286],
        [10197,   832,   262, 46475,   286, 18113,   544,   338, 10953,   314],
        [  383,  8631,  3872,   373,    11,   314,  1422,   470,   760,   810]])

In [82]:
sample_targets

tensor([[ 4750,   326,  9074,    13,   402,   271, 10899,   373,  5527,    26],
        [  683,     0,  3226,  1781,   314,  4001,   284,   466,   262,  4286],
        [27846,   503,  2048,  4628, 24882,   379,   262,  8812,   558,   810],
        [17728,   257,  8500,  4417,   284,   670,   319,   438, 15464,    11],
        [  508,   550, 18459,  1068,   284,  1577,   257, 23844,   286,  7543],
        [  705, 23873,  2350,     6, 14707,   588,   257,  2156,   286,  4116],
        [  832,   262, 46475,   286, 18113,   544,   338, 10953,   314,  2936],
        [ 8631,  3872,   373,    11,   314,  1422,   470,   760,   810,   284]])

In [83]:
s = torch.nn.Embedding(5, 2)
s.weight

Parameter containing:
tensor([[ 0.6873, -0.5623],
        [ 2.0006, -0.6980],
        [ 0.6034,  0.0375],
        [-0.8091, -0.7755],
        [-0.4808, -0.2414]], requires_grad=True)

In [84]:
s(torch.tensor([2,3]))

tensor([[ 0.6034,  0.0375],
        [-0.8091, -0.7755]], grad_fn=<EmbeddingBackward0>)

In [85]:
# embeddings
batch_size = 8
max_length = 4

inputs, targets = next(iter(create_dataloader(raw_text, max_length = max_length, stride = max_length, num_workers = 0, batch_size = batch_size, shuffle=False)))
inputs

tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

In [89]:
inputs.shape

torch.Size([8, 4])

In [91]:
vocab_size = tokenizer_tik.n_vocab # 50257
output_dim = 256

embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
embedding_layer.weight

Parameter containing:
tensor([[-0.2217,  1.6671, -0.2987,  ..., -0.0138, -0.3276, -0.1640],
        [-0.9847, -0.3573,  0.4639,  ...,  0.3473,  0.5081, -0.8632],
        [ 0.0311, -0.9728, -2.3140,  ...,  0.4531, -0.6739,  0.6442],
        ...,
        [ 0.6457, -0.6678,  0.6428,  ..., -0.8375, -0.5307, -0.4777],
        [-1.1325, -0.3802,  0.0738,  ..., -0.6353, -2.2342, -0.1806],
        [ 1.2680,  0.5859, -1.1715,  ..., -0.8569,  1.8439,  0.4743]],
       requires_grad=True)

In [94]:
embedding_layer.weight.shape

torch.Size([50257, 256])

In [93]:
token_embedding = embedding_layer(inputs)
token_embedding.shape

torch.Size([8, 4, 256])

In [95]:
position_embedding_layer = torch.nn.Embedding(max_length, output_dim)
position_embedding_layer.weight.shape

torch.Size([4, 256])

In [97]:
pos_embedding = position_embedding_layer(torch.arange(max_length))
pos_embedding.shape

torch.Size([4, 256])

In [98]:
input_embedding = token_embedding + pos_embedding
input_embedding.shape

torch.Size([8, 4, 256])