# LLM from scratch

## Step 1: Creating Tokens

In [1]:
# Read the whole short story into memory as one string.
with open("the-verdict.txt", encoding="utf-8") as f:
    raw_text = f.read()

print("total number of character: ", len(raw_text))

# Preview the first 99 characters (the slice end index 99 is exclusive).
print(raw_text[:99])

total number of character:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


The goal is to tokenize all 20,479 characters in the document.
To split the text into individual tokens, we can use Python's `re` (regular expressions) library.

In [2]:
import re

text = "hello, world. This is a test"
result = re.split(r'(\s)', text)

print(result)

['hello,', ' ', 'world.', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test']


The result contains words and whitespace, but commas and full stops are still attached to the words; we need them as separate tokens.

In [3]:
result = re.split(r'([,.]|\s)', text)
print(result)

['hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test']


In [4]:
# remove the whitespaces 
result = [item for item in result if item.strip()]
print(result)

['hello', ',', 'world', '.', 'This', 'is', 'a', 'test']


Removing whitespace reduces the memory and computing requirements. However, whitespace can be useful for models that are sensitive to the exact structure of the text, e.g. Python code is sensitive to indentation.

In [5]:
# include other punctuation marks 
text = "hello, world. This is -- a test?"
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item for item in result if item.strip()]
print(result)

['hello', ',', 'world', '.', 'This', 'is', '--', 'a', 'test', '?']


Now we have a basic tokenizer working. Let's apply it to our document

In [6]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item for item in preprocessed if item.strip()]
print(preprocessed[:30])
print(len(preprocessed))

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']
4690


## Step 2: Convert Tokens into Token IDs

In [7]:
# build a vocabulary of unique tokens
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)

1130


In [8]:
# Map every unique token to an integer id (its position in the sorted
# token list), then show the first 51 (token, id) pairs.
vocab = {token: token_id for token_id, token in enumerate(all_words)}
for entry in list(vocab.items())[:51]:
    print(entry)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


### Implement the Tokenizer class in Python

Encode method takes text and gives ids

Decode method takes ids and gives text

In [9]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, ``--`` and the punctuation characters
    ``, . : ; ? _ ! " ( ) '``; every resulting token is looked up in the
    vocabulary passed to the constructor.
    """

    def __init__(self, vocab):
        """Store ``vocab`` (token string -> integer id) and build its inverse."""
        self.str_to_int = vocab
        self.int_to_str = {i: t for t, i in vocab.items()}

    def encode(self, text):
        """Return the list of token ids for ``text``.

        Raises:
            KeyError: if a token of ``text`` is not in the vocabulary.
        """
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # drop the empty and whitespace-only pieces produced by the split
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        # assign an id to each token t
        ids = [self.str_to_int[t] for t in preprocessed]
        return ids

    def decode(self, ids):
        """Return the text for ``ids``: tokens joined by single spaces, with
        the space in front of punctuation removed.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        # convert the ids to tokens
        text = " ".join([self.int_to_str[i] for i in ids])
        # Remove the space that the join put in front of punctuation.
        # ':' and ';' are listed because encode() splits them off as well.
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [10]:
# instantiate a new tokenizer object from the above class and tokenize a passage 
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know,"
            Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [11]:
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [12]:
# "Hello" is not in the vocabulary, so encode() raises KeyError: 'Hello'.
# The error is caught and printed so the failure is visible without
# stopping a "Run All" of the notebook.
text = "Hello, do you like tea?"
try:
    ids = tokenizer.encode(text)
    print(ids)
except KeyError as err:
    print("KeyError:", err)

Hello does not exist in the vocabulary so it throws the error. To prevent this we add special context tokens

### Adding Special Context Tokens

Modify the tokenizer to handle unknown words (those not in the vocabulary). Use the special unknown-word token `<|unk|>` and the end-of-text token `<|endoftext|>`.

In [13]:
# Rebuild the vocabulary with the two special tokens appended after the
# sorted text tokens, so they receive the two highest ids.
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]

vocab = {token: token_id for token_id, token in enumerate(all_tokens)}
len(vocab)

1132

In [14]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Text is split on whitespace, ``--`` and the punctuation characters
    ``, . : ; ? _ ! " ( ) '``. The vocabulary is expected to contain an
    ``"<|unk|>"`` entry, which stands in for any token it does not know.
    """

    def __init__(self, vocab):
        """Store ``vocab`` (token string -> integer id) and build its inverse."""
        self.str_to_int = vocab
        self.int_to_str = {i: t for t, i in vocab.items()}

    def encode(self, text):
        """Return the list of token ids for ``text``.

        Tokens missing from the vocabulary are encoded as ``<|unk|>``.

        Raises:
            KeyError: if an unknown token is met and the vocabulary has no
                ``"<|unk|>"`` entry.
        """
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # drop the empty and whitespace-only pieces produced by the split
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        # look up each token, falling back to the unknown-word token
        ids = [
            self.str_to_int[t if t in self.str_to_int else "<|unk|>"]
            for t in preprocessed
        ]
        return ids

    def decode(self, ids):
        """Return the text for ``ids``: tokens joined by single spaces, with
        the space in front of punctuation removed.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        # convert the ids to tokens
        text = " ".join([self.int_to_str[i] for i in ids])
        # Remove the space that the join put in front of punctuation.
        # ':' and ';' are listed because encode() splits them off as well.
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [15]:
tokenizer = SimpleTokenizerV2(vocab)
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [16]:
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]

In [17]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

### BYTE PAIR ENCODING
what GPT uses for tokenization

In [18]:
!pip3 install tiktoken



In [19]:
import importlib
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

In [20]:
# The two literals are joined with no separator, so the text reads
# "...the palace.of someunknownPlace." (no space after the full stop).
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace."
    + "of someunknownPlace."
)

# "<|endoftext|>" is explicitly allowed as a special token for this call.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 262, 20562, 13, 1659, 617, 34680, 27271, 13]


In [21]:
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.of someunknownPlace.


`someunknownPlace` is an unknown word, but it is encoded and decoded without errors because of how the BPE tokenizer works: it represents unknown words as sequences of subwords or individual characters.

### DATA SAMPLING WITH SLIDING WINDOWS

#### CREATING INPUT-TARGET PAIRS

In [22]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [23]:
# removing the first 50 tokens for demo purposes 
enc_sample = enc_text[50:]

Create the variables x and y, representing the input and the target respectively.

In [24]:
context_size = 4 # length of input

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print("x: ", x)
print("y: ", y)

x:  [290, 4920, 2241, 287]
y:  [4920, 2241, 287, 257]


By processing the inputs along with the targets, which are the inputs shifted by one position, we can create the next-word prediction tasks as follows:

In [25]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, " ----> ", desired )

[290]  ---->  4920
[290, 4920]  ---->  2241
[290, 4920, 2241]  ---->  287
[290, 4920, 2241, 287]  ---->  257


Everything to the left of the arrow is the input an LLM would receive, and the token ID to the right of the arrow is what the LLM is supposed to predict.

In [26]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(tokenizer.decode(context), " ----> ", tokenizer.decode([desired]) )

 and  ---->   established
 and established  ---->   himself
 and established himself  ---->   in
 and established himself in  ---->   a


#### IMPLEMENTING A DATA LOADER

Implement the above example using tensors. We collect the inputs in a tensor x, where each row represents one input context.

The second tensor y contains the corresponding prediction targets (next words) which are created by shifting the input by one position.

Step 1: tokenize the entire text

Step 2: use a sliding window to chunk the text into overlapping sequences of max_length

Step 3: return the total number of rows in the dataset

Step 4: return a single row from the dataset

In [27]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. A window of ``max_length`` token ids is then
    taken every ``stride`` positions; the target of each window is the same
    window shifted one position to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` with ``tokenizer`` and build all windows.

        ``tokenizer`` must offer ``encode(text, allowed_special=...)``.
        """
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop early enough that every window still has a token after its
        # last position, which the shifted target needs.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

Step 1: initialize the tokenizer

Step 2: create dataset

Step 3: `drop_last=True` drops the last batch if it is shorter than the specified batch_size, to prevent loss spikes during training

Step 4: set `num_workers`, the number of CPU processes to use for preprocessing

In [28]:
def create_dataloader_v1(
        txt, batch_size=4, max_length=256,
        stride=128, shuffle=True, drop_last=True,
        num_workers=0
):
    """Build a DataLoader of (input, target) windows over ``txt``.

    A GPTDatasetV1 is created from ``txt`` with tiktoken's "gpt2" encoding,
    ``max_length`` and ``stride``. The remaining arguments are passed
    unchanged to ``torch.utils.data.DataLoader``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [29]:
# test the dataloader with a batch size of 1 for an LLM with a context size of 4
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()


In [30]:
print("Pytorch version: ", torch.__version__)

Pytorch version:  2.3.0+cu118


In [31]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


The first_batch variable contains two tensors: the first stores the input token ids, and the second stores the target token ids

the max_length is 4, each of the tensors contains 4 token ids

the input size of 4 is small and this is for demo only 

In [32]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


The stride setting dictates the number of positions the inputs shift across batches, emulating a sliding window.

In [33]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=4, stride=4, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]]), tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])]


## Step 3: Create Token Embeddings

In [34]:
input_ids = torch.tensor([2, 3, 5, 1])

In [35]:
# convert the ids into embeddings 
vocab_size = 6
output_dim = 3 #vector dimension 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [36]:
# underlying weight matrix 
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [37]:
# apply the embedding layer to a token ID to obtain its embedding vector
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [38]:
# convert all the four token IDs into their vector embeddings 
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


### Positional Embeddings

Consider much more realistic and useful embedding sizes and encode the input tokens into a 256-dimensional vector representation.

In [39]:
vocab_size = 50257
output_dim = 256

token_emnedding_layer = torch.nn.Embedding(vocab_size, output_dim)

instantiate the data loader (data sampling with a sliding window) first:

In [40]:
max_length = 4 #token ids to look at and predict the next
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride = max_length, shuffle=False
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [41]:
print("Token IDs:\n", inputs)
print("\nInputs Shape", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs Shape torch.Size([8, 4])


The token IDs (the input) form an 8x4 tensor. We need to convert it into an 8x4x256 tensor of embeddings.

In [43]:
token_embeddings = token_emnedding_layer(inputs)
print(token_embeddings)
print(token_embeddings.shape)

tensor([[[ 0.4913,  1.1239,  1.4588,  ..., -0.3995, -1.8735, -0.1445],
         [ 0.4481,  0.2536, -0.2655,  ...,  0.4997, -1.1991, -1.1844],
         [-0.2507, -0.0546,  0.6687,  ...,  0.9618,  2.3737, -0.0528],
         [ 0.9457,  0.8657,  1.6191,  ..., -0.4544, -0.7460,  0.3483]],

        [[ 1.5460,  1.7368, -0.7848,  ..., -0.1004,  0.8584, -0.3421],
         [-1.8622, -0.1914, -0.3812,  ...,  1.1220, -0.3496,  0.6091],
         [ 1.9847, -0.6483, -0.1415,  ..., -0.3841, -0.9355,  1.4478],
         [ 0.9647,  1.2974, -1.6207,  ...,  1.1463,  1.5797,  0.3969]],

        [[-0.7713,  0.6572,  0.1663,  ..., -0.8044,  0.0542,  0.7426],
         [ 0.8046,  0.5047,  1.2922,  ...,  1.4648,  0.4097,  0.3205],
         [ 0.0795, -1.7636,  0.5750,  ...,  2.1823,  1.8231, -0.3635],
         [ 0.4267, -0.0647,  0.5686,  ..., -0.5209,  1.3065,  0.8473]],

        ...,

        [[-1.6156,  0.9610, -2.6437,  ..., -0.9645,  1.0888,  1.6383],
         [-0.3985, -0.9235, -1.3163,  ..., -1.1582, -1.13

Add positional embeddings

In [44]:
# One embedding vector per position in the context window; the layer is
# indexed with the position numbers 0 .. context_length - 1.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
position_ids = torch.arange(context_length)
pos_embeddings = pos_embedding_layer(position_ids)
print(pos_embeddings.shape)

torch.Size([4, 256])


Add the positional embeddings to the token embeddings

In [45]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])


## Data Processing Pipeline