In [36]:
with open("verdict.txt", "r", encoding="utf-8") as f :
    raw_text = f.read()

print("Total number of characters:", len(raw_text))
print(raw_text[:99])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [37]:
import re

text = "Hello, world. This, is a test."
result = re.split(r'(\s)', text)

print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [38]:
result = re.split(r'([,.]|\s)', text)

print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [39]:
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [40]:
text = "Hello, world. Is this-- a test?"

result = re.split(r'([,.:;?__!"()\']|--|\s)', text)

result = [item.strip() for item in result if item.strip()]

print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [41]:
preprocessed =  re.split(r'([,.:;?__!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]

print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [42]:
print(len(preprocessed))

4690


In [43]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)

1130


In [44]:
vocab = {token:integer for integer,token in enumerate(all_words)}

In [45]:
for i, item in enumerate(vocab.items()):
    print(i, item)
    if i >= 50:
        break

0 ('!', 0)
1 ('"', 1)
2 ("'", 2)
3 ('(', 3)
4 (')', 4)
5 (',', 5)
6 ('--', 6)
7 ('.', 7)
8 (':', 8)
9 (';', 9)
10 ('?', 10)
11 ('A', 11)
12 ('Ah', 12)
13 ('Among', 13)
14 ('And', 14)
15 ('Are', 15)
16 ('Arrt', 16)
17 ('As', 17)
18 ('At', 18)
19 ('Be', 19)
20 ('Begin', 20)
21 ('Burlington', 21)
22 ('But', 22)
23 ('By', 23)
24 ('Carlo', 24)
25 ('Chicago', 25)
26 ('Claude', 26)
27 ('Come', 27)
28 ('Croft', 28)
29 ('Destroyed', 29)
30 ('Devonshire', 30)
31 ('Don', 31)
32 ('Dubarry', 32)
33 ('Emperors', 33)
34 ('Florence', 34)
35 ('For', 35)
36 ('Gallery', 36)
37 ('Gideon', 37)
38 ('Gisburn', 38)
39 ('Gisburns', 39)
40 ('Grafton', 40)
41 ('Greek', 41)
42 ('Grindle', 42)
43 ('Grindles', 43)
44 ('HAD', 44)
45 ('Had', 45)
46 ('Hang', 46)
47 ('Has', 47)
48 ('He', 48)
49 ('Her', 49)
50 ('Hermia', 50)


In [46]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s, i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?__!"()\']|--|\s)', text)

        preprocessed = [item.strip() for item in preprocessed if item.strip()]

        ids = [self.str_to_int[s] for s in preprocessed]

        return ids 

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s([,.?!"()\'])', r'\1', text)

        return text

In [47]:
tokenizer = SimpleTokenizerV1(vocab)

text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [48]:
tokenizer.decode(ids)


'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [49]:
text = "Hello, do you like tea?"
print(tokenizer.encode(text))

KeyError: 'Hello'

In [50]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}

In [51]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [52]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int 
            else "<|unk|>" for item in preprocessed
        ]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

## Byte Pair Coding

In [18]:
!pip3 install tiktoken 

Collecting tiktoken
  Downloading tiktoken-0.12.0-cp311-cp311-win_amd64.whl.metadata (6.9 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2025.9.18-cp311-cp311-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/41.5 kB ? eta -:--:--
     --------- ------------------------------ 10.2/41.5 kB ? eta -:--:--
     -------------------------------------- 41.5/41.5 kB 665.7 kB/s eta 0:00:00
Collecting requests>=2.26.0 (from tiktoken)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting charset_normalizer<4,>=2 (from requests>=2.26.0->tiktoken)
  Downloading charset_normalizer-3.4.4-cp311-cp311-win_amd64.whl.metadata (38 kB)
Collecting idna<4,>=2.5 (from requests>=2.26.0->tiktoken)
  Downloading idna-3.11-py3-none-any.whl.metadata (8.4 kB)
Collecting urllib3<3,>=1.21.1 (from requests>=2.26.0->tiktoken)
  Using cached urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests>=2.26.0->tik


[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [53]:
import importlib
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.12.0


In [54]:
tokenizer = tiktoken.get_encoding("gpt2")

In [55]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)


integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"}
)

print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [56]:
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [57]:
integers = tokenizer.encode("Akwirw ier")
print(integers)

strings = tokenizer.decode(integers)
print(strings)

[33901, 86, 343, 86, 220, 959]
Akwirw ier


## Data Sampling with Windows

In [58]:
with open("verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [59]:
enc_sample = enc_text[50:]

In [60]:
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size + 1]


print(x)
print(y)

[290, 4920, 2241, 287]
[4920, 2241, 287, 257]


In [61]:
for i in range(1, context_size):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, "--->", desired)

[290] ---> 4920
[290, 4920] ---> 2241
[290, 4920, 2241] ---> 287


## Implementing Data Loader

In [29]:
!pip install torch torchvision torchaudio

Collecting torch
  Downloading torch-2.9.0-cp311-cp311-win_amd64.whl.metadata (30 kB)
Collecting torchvision
  Downloading torchvision-0.24.0-cp311-cp311-win_amd64.whl.metadata (5.9 kB)
Collecting torchaudio
  Downloading torchaudio-2.9.0-cp311-cp311-win_amd64.whl.metadata (6.9 kB)
Collecting filelock (from torch)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx>=2.5.1 (from torch)
  Using cached networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec>=0.8.5 (from torch)
  Using cached fsspec-2025.9.0-py3-none-any.whl.metadata (10 kB)
Collecting numpy (from torchvision)
  Downloading numpy-2.3.4-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.9 kB ? eta -:--:--
     -------------------------------------


[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizeer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

    token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

    for i in range(0, len(token_ids) - max_length, stride):
        input_chunk = token_ids[i:i+max_length]
        target_chunk = token_ids[i+1:i+max_length]
        self.input_ids.append(torch.tensor(input_chunk))
        self.target_ids.append(torch.tensor(target))

    def __len__(self):
        return len(self.input_ids)
    

    def __getitem(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

NameError: name 'txt' is not defined

In [31]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,stride=128,shuffle=True, drop_last=True,num_workers=0):


    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [32]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

FileNotFoundError: [Errno 2] No such file or directory: 'the-verdict.txt'

In [33]:
import torch
print("PyTorch version:", torch.__version__)
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

PyTorch version: 2.9.0+cpu


NameError: name 'GPTDatasetV1' is not defined

In [34]:
second_batch = next(data_iter)
print(second_batch)

NameError: name 'data_iter' is not defined

In [35]:
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

NameError: name 'GPTDatasetV1' is not defined

## Token Embeddings

In [63]:
input_ids = torch.tensor([2,3,5,1])

In [64]:
vocab_size = 6
output_dim = 3

torch.manual_seed(42)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [66]:
print(embedding_layer(torch.tensor([3])))

tensor([[-0.6866,  0.6105,  1.3347]], grad_fn=<EmbeddingBackward0>)


In [67]:
print(embedding_layer(input_ids))

tensor([[ 0.8008,  1.6806,  0.3559],
        [-0.6866,  0.6105,  1.3347],
        [ 0.8599, -0.3097, -0.3957],
        [ 0.4396, -0.7581,  1.0783]], grad_fn=<EmbeddingBackward0>)


## Positional Embeddings

In [68]:
vocab_size = 50527
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [69]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=8,
    stride=max_length, shuffle=False
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

NameError: name 'GPTDatasetV1' is not defined

In [70]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

NameError: name 'inputs' is not defined

In [71]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

NameError: name 'inputs' is not defined

In [72]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [73]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [74]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

NameError: name 'token_embeddings' is not defined