In [1]:
import os
import urllib.request

if not os.path.exists("the-verdict.txt"):
    url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/refs/heads/main/ch02/01_main-chapter-code/the-verdict.txt"
    file_path = "the-verdict.txt"
    urllib.request.urlretrieve(url, file_path)

In [2]:
with open("the-verdict.txt") as f:
    raw_text = f.read()

len(raw_text)

20479

In [3]:
import re

#Split the text into individual tokens using regular expressions
#Let's start of with a simple expression and text
# text = "Hello! This is a test."
# result = re.split(r'(\s)', text)
# print(result)

# result = re.split(r'[!.]|\s',text)
# result = [item for item in result if item.strip()]
# print(result)

#sophisticated regular expression with "the-verdict.txt"
new_text = "Hello, world. Is this -- a test?"
raw_result = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
raw_result = [item.strip() for item in raw_result if item.strip()]
processed = raw_result
len(processed)

4690

In [4]:
#Converting tokens into token IDs
all_words = sorted(set(processed))
all_words.extend(['<|endoftext|>', '<|unk|>'])
vocab_size = len(all_words)
print(f"The size of the vocab is: {vocab_size}")

vocab = {token: integer for integer, token in enumerate(all_words)}
print(vocab)

The size of the vocab is: 1132
{'!': 0, '"': 1, "'": 2, '(': 3, ')': 4, ',': 5, '--': 6, '.': 7, ':': 8, ';': 9, '?': 10, 'A': 11, 'Ah': 12, 'Among': 13, 'And': 14, 'Are': 15, 'Arrt': 16, 'As': 17, 'At': 18, 'Be': 19, 'Begin': 20, 'Burlington': 21, 'But': 22, 'By': 23, 'Carlo': 24, 'Chicago': 25, 'Claude': 26, 'Come': 27, 'Croft': 28, 'Destroyed': 29, 'Devonshire': 30, 'Don': 31, 'Dubarry': 32, 'Emperors': 33, 'Florence': 34, 'For': 35, 'Gallery': 36, 'Gideon': 37, 'Gisburn': 38, 'Gisburns': 39, 'Grafton': 40, 'Greek': 41, 'Grindle': 42, 'Grindles': 43, 'HAD': 44, 'Had': 45, 'Hang': 46, 'Has': 47, 'He': 48, 'Her': 49, 'Hermia': 50, 'His': 51, 'How': 52, 'I': 53, 'If': 54, 'In': 55, 'It': 56, 'Jack': 57, 'Jove': 58, 'Just': 59, 'Lord': 60, 'Made': 61, 'Miss': 62, 'Money': 63, 'Monte': 64, 'Moon-dancers': 65, 'Mr': 66, 'Mrs': 67, 'My': 68, 'Never': 69, 'No': 70, 'Now': 71, 'Nutley': 72, 'Of': 73, 'Oh': 74, 'On': 75, 'Once': 76, 'Only': 77, 'Or': 78, 'Perhaps': 79, 'Poor': 80, 'Profession

In [5]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self,text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self,ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])',r'\1',text)
        return text
    
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," 
            Mrs. Gisburn said with pardonable pride."""
text_with_unknow = "Hello, do you like tea?"
ids = tokenizer.encode(text)
print(f"The encoded ids for text without unknowns are: {ids}")
decoded_text = tokenizer.decode(ids)
print(f"The decoded text without unknows is: {decoded_text}")

#The encoded text with unknowns
ids = tokenizer.encode(text_with_unknow)
decoded_text = tokenizer.decode(ids)
print(f"The encoded text with unknows is: {ids}")
print(f"The decoded text with unknows are: {decoded_text}") #<|unk|>, do you like tea? ---> When decoded, we get the unk token which is not desirable, this is a shortcoming that 
#can be overcomed using byte pair encoding

The encoded ids for text without unknowns are: [1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
The decoded text without unknows is: " It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.
The encoded text with unknows is: [1131, 5, 355, 1126, 628, 975, 10]
The decoded text with unknows are: <|unk|>, do you like tea?


In [6]:
!pip install tiktoken
import tiktoken
#Tiktokenizer tokenizes using byte pair encoding

tokenizer = tiktoken.get_encoding("gpt2")
temp_ids = tokenizer.encode("Hello World")
temp_text = tokenizer.decode(temp_ids)

print(f"Encoding with tiktoken is: {temp_ids}")
print(f"Decoded text from tiktokenizer is: {temp_text}")

Encoding with tiktoken is: [15496, 2159]
Decoded text from tiktokenizer is: Hello World


In [7]:
#Data sampling with a sliding window i.e number of input tokens and output tokens according to the context window/max sequence length
#The LLM we are constructing, predicts the next token

with open("the-verdict.txt","r",encoding="utf-8") as f:
    raw_text = f.read()
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

enc_sample = enc_text[50:]

#Let's set the context size to 4, for better understanding
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:       {y}")

#Let's decode the encoded samples
for i in range(1,context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(f"{tokenizer.decode(context)}---->{tokenizer.decode([desired])}")
    break

5145
x: [290, 4920, 2241, 287]
y:       [4920, 2241, 287, 257]
 and----> established


In [8]:
#Dataset preparation using torch's dataset and dataloader
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):

    def __init__(self, text, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})

        for i in range(0, len(token_ids)-max_length, stride):
            input_chunk = token_ids[i:i+max_length]
            target_chunk = token_ids[i+1:i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self,idx):
        return self.input_ids[idx], self.target_ids[idx]

In [9]:
#Create dataloader
def create_dataloader_v1(txt, batch_size=2, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):

    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer,max_length,stride)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)
    return dataloader

with open("the-verdict.txt", "r",encoding="utf-8") as file:
    raw_text = file.read()

dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(f"The first batch is: {first_batch}")

second_batch = next(data_iter)
print(f"The second batch is: {second_batch}")

The first batch is: [tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
The second batch is: [tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [10]:
#Creating embeddings for tokens
vocab_size = tokenizer.n_vocab
output_dim = 256

embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(f"Embedding layer weights are: {embedding_layer.weight}")

#Get the embedding of token id - 3
token_emb_3 = embedding_layer(torch.tensor([3]))
print(f"The token embedding is: {token_emb_3} and it's shape is: {token_emb_3.shape}")

Embedding layer weights are: Parameter containing:
tensor([[-1.6264,  0.1688,  0.2665,  ..., -0.3120,  1.2720, -0.6315],
        [-0.0422,  0.3220, -0.1574,  ..., -0.7017, -0.5334,  0.6473],
        [-1.7121,  0.4959,  0.1773,  ..., -0.1976, -1.0407,  0.3001],
        ...,
        [ 0.5568,  0.3399,  0.0175,  ...,  1.4633, -0.4538,  0.4061],
        [ 0.0044,  0.0635,  0.0268,  ...,  0.1595,  0.3196,  0.2373],
        [ 2.3420,  1.2338, -0.6844,  ...,  0.5333, -1.6165, -0.2057]],
       requires_grad=True)
The token embedding is: tensor([[ 9.0089e-01,  8.5052e-03, -3.2654e-01, -2.7527e-01, -3.8364e-01,
         -9.3903e-01, -8.3717e-01,  1.3958e+00, -8.0950e-01, -1.1701e+00,
         -7.7724e-01,  4.5919e-01,  9.1886e-01,  1.6229e+00, -4.1428e-01,
          7.0946e-01,  1.1273e+00,  1.0048e+00, -8.7417e-01,  4.7475e-01,
          3.0436e-02, -1.0749e+00, -4.4085e-01,  1.6056e+00,  2.9862e-01,
          1.3872e+00,  3.5942e-01, -1.3618e+00,  1.2377e-01, -2.6175e+00,
         -5.3681e-01