# Load Data

In [None]:
# Mount Google Drive first: the text file lives under /content/drive, so on a
# fresh runtime the read below fails unless the drive is already mounted.
# Mounting an already-mounted drive is harmless (it only prints a notice).
from google.colab import drive

drive.mount("/content/drive")

file_path = "/content/drive/MyDrive/Colab Notebooks/Build LLM From Scratch/text_preprocessing/the-verdict.txt"

# Read the whole short story into memory as a single string.
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

print(f"Total number of characters: {len(raw_text)}")
# Preview the first 99 characters.
print(raw_text[:99])

Total number of charcters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


# Tokenization using Regex (Extra)

In [None]:
import re

In [None]:
# Split on punctuation, "--" and whitespace, keeping the delimiters as tokens.
# A hyphen placed between two characters inside [...] forms a range: the former
# "!-(" meant U+0021-U+0028, which matched '"' only by accident and also split
# on # $ % &. The double quote is now listed explicitly and no range is used.
# (The variable name is kept as-is because a later cell reads `tekenized`.)
tekenized = re.split(r'([,.:;?!"()_\'\s]|--)', raw_text)
# Drop the empty strings and whitespace-only items produced by the split.
tekenized = [item for item in tekenized if item.strip()]
print(len(tekenized))
print(tekenized[:30])

4690
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


# Converting tokens into token IDs

In [None]:
# Build a vocabulary that maps every token to a unique integer

all_words = sorted(set(tekenized))
print(all_words)
print(len(all_words))

['!', '"', "'", '(', ')', ',', '--', '.', ':', ';', '?', 'A', 'Ah', 'Among', 'And', 'Are', 'Arrt', 'As', 'At', 'Be', 'Begin', 'Burlington', 'But', 'By', 'Carlo', 'Chicago', 'Claude', 'Come', 'Croft', 'Destroyed', 'Devonshire', 'Don', 'Dubarry', 'Emperors', 'Florence', 'For', 'Gallery', 'Gideon', 'Gisburn', 'Gisburns', 'Grafton', 'Greek', 'Grindle', 'Grindles', 'HAD', 'Had', 'Hang', 'Has', 'He', 'Her', 'Hermia', 'His', 'How', 'I', 'If', 'In', 'It', 'Jack', 'Jove', 'Just', 'Lord', 'Made', 'Miss', 'Money', 'Monte', 'Moon-dancers', 'Mr', 'Mrs', 'My', 'Never', 'No', 'Now', 'Nutley', 'Of', 'Oh', 'On', 'Once', 'Only', 'Or', 'Perhaps', 'Poor', 'Professional', 'Renaissance', 'Rickham', 'Riviera', 'Rome', 'Russian', 'Sevres', 'She', 'Stroud', 'Strouds', 'Suddenly', 'That', 'The', 'Then', 'There', 'They', 'This', 'Those', 'Though', 'Thwing', 'Thwings', 'To', 'Usually', 'Venetian', 'Victor', 'Was', 'We', 'Well', 'What', 'When', 'Why', 'Yes', 'You', '_', 'a', 'abdication', 'able', 'about', 'above',

In [None]:
# Special tokens: a boundary marker between unrelated texts and a placeholder
# for words that are not in the vocabulary.
special_tokens = ["<|endoftext|>", "<|unk|>"]

# Rebuild the list instead of extending it in place, so that re-running this
# cell does not append the special tokens a second time (which would leave
# duplicate entries and shift the ids of the special tokens).
all_words = [token for token in all_words if token not in special_tokens] + special_tokens
print(len(all_words))

# Map each token to its position in the list.
# `token_id` is used instead of `id` to avoid shadowing the built-in.
vocab = {token: token_id for token_id, token in enumerate(all_words)}

# Preview the first 11 (token, id) pairs, numbered from 1.
for position, item in enumerate(list(vocab.items())[:11], start=1):
    print(position, item)

1132
1 ('!', 0)
2 ('"', 1)
3 ("'", 2)
4 ('(', 3)
5 (')', 4)
6 (',', 5)
7 ('--', 6)
8 ('.', 7)
9 (':', 8)
10 (';', 9)
11 ('?', 10)


In [None]:
# Look up both special tokens; a KeyError here means they are missing from the
# vocabulary. This is not the last expression of the cell, so the ids
# themselves are not displayed.
vocab["<|endoftext|>"], vocab["<|unk|>"]

# Show the last five (token, id) pairs of the vocabulary.
for entry in list(vocab.items())[-5:]:
    print(entry)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


# Building a simple tokenizer class

In [None]:
class SimpleTokenizer:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split on punctuation, double dashes and whitespace; every
    resulting token that is missing from the vocabulary is encoded as
    "<|unk|>".
    """

    # Delimiters that become separate tokens. The double quote is listed
    # explicitly: the former class contained "!-(", which the regex engine
    # reads as the character range U+0021-U+0028, so '"' matched only by
    # accident and # $ % & were split off as well.
    _SPLIT_PATTERN = re.compile(r'([,.:;?!"()_\'\s]|--)')
    # Whitespace that " ".join() leaves in front of these punctuation marks.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.?!"()\'])')

    def __init__(self, vocab: dict):
        """Store the vocabulary and build the reverse (id -> token) mapping.

        Args:
            vocab: Mapping from token string to integer id.
        """
        self.token_to_id = vocab
        self.id_to_token = {token_id: token for token, token_id in vocab.items()}

    def encode(self, txt: str) -> list:
        """Convert text into a list of token ids.

        Args:
            txt: Text to tokenize.

        Returns:
            One id per token, in order of appearance. Tokens absent from the
            vocabulary are replaced by the id of "<|unk|>".

        Raises:
            KeyError: If an unknown token is met and the vocabulary has no
                "<|unk|>" entry.
        """
        pieces = (piece.strip() for piece in self._SPLIT_PATTERN.split(txt))
        tokens = [
            piece if piece in self.token_to_id else "<|unk|>"
            for piece in pieces
            if piece  # skip empty strings and whitespace-only delimiters
        ]
        return [self.token_to_id[token] for token in tokens]

    def decode(self, ids: list) -> str:
        """Convert token ids back into text.

        Tokens are joined with single spaces and the whitespace in front of
        , . ? ! " ( ) ' is then removed, so the original spacing is only
        approximated (e.g. an opening quote keeps a space after it).

        Args:
            ids: Token ids to convert.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        txt = " ".join(self.id_to_token[token_id] for token_id in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r"\1", txt)

In [None]:
tokenizer = SimpleTokenizer(vocab)

In [None]:
txt = """"It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""
encoding = tokenizer.encode(txt)
print(encoding)
print(tokenizer.decode(encoding))

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [None]:
# try on a text not contained in the training set

new_txt = "Hey! Did you praise Allah today for his grants?"
print(tokenizer.encode(new_txt))

[1131, 0, 1131, 1126, 1131, 1131, 1131, 456, 549, 1131, 10]


In [None]:
txt1 = "O Allah! give me strength."
txt2 = "I have to be patient."
new_txt = " <|endoftext|> ".join((txt1, txt2))
print(new_txt)

O Allah! give me strength. <|endoftext|> I have to be patient.


In [None]:
new_txt_ids = tokenizer.encode(new_txt)

In [None]:
tokenizer.decode(new_txt_ids)

'<|unk|> <|unk|>! give me <|unk|>. <|endoftext|> I have to be patient.'

# BPE

In [None]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [None]:
from importlib.metadata import version
import tiktoken

In [None]:
print(version("tiktoken"))

0.8.0


In [None]:
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
txt = "I will apologize to my mother for my mistake. <|endoftext|> The king needs the queen to give him a massage."
ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

print(ids)

[40, 481, 16521, 284, 616, 2802, 329, 616, 7457, 13, 220, 50256, 383, 5822, 2476, 262, 16599, 284, 1577, 683, 257, 26900, 13]


In [None]:
print(tokenizer.decode(ids))

I will apologize to my mother for my mistake. <|endoftext|> The king needs the queen to give him a massage.


In [None]:
tokenizer.encode("Akwirw ier")

[33901, 86, 343, 86, 220, 959]

In [None]:
[tokenizer.decode([token]) for token in tokenizer.encode("Akwirw ier")]

['Ak', 'w', 'ir', 'w', ' ', 'ier']

In [None]:
tokenizer.decode(tokenizer.encode("Akwirw ier"))

'Akwirw ier'

# Data Sampling

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
file_path = "/content/drive/MyDrive/Colab Notebooks/Build LLM From Scratch/text_preprocessing/the-verdict.txt"

with open(file_path, "r", encoding = "utf-8") as f:
    txt = f.read()

enc_txt = tokenizer.encode(txt)
print(f"Encoded Text Size: {len(enc_txt)}")

Encoded Text Size: 5145


In [None]:
enc_sample = enc_txt[50:]

In [None]:
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [None]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    target = enc_sample[i]
    print(f"{context} ----> {target}")

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [None]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    target = enc_sample[i]
    print(tokenizer.decode(context), "----->", tokenizer.decode([target]))

 and ----->  established
 and established ----->  himself
 and established himself ----->  in
 and established himself in ----->  a


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. A window of ``max_length`` ids is then moved
    over the id sequence in steps of ``stride``; each window is an input and
    the same window shifted one position to the right is its target.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and pre-compute every input/target window.

        Args:
            txt: Text to tokenize.
            tokenizer: Object whose ``encode(txt)`` returns a sliceable
                sequence of token ids.
            max_length: Number of token ids per window.
            stride: Offset between the starts of two consecutive windows.
        """
        token_ids = tokenizer.encode(txt)

        # Stop early enough that the shifted target window is always complete.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start : start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1 : start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair stored at ``idx``."""
        inputs = self.input_ids[idx]
        targets = self.target_ids[idx]
        return inputs, targets

In [None]:
# The following function uses the GPTDatasetV1 to load the inputs in batches via PyTorch DataLoader

def create_dataloader_v1(
    txt,
    batch_size = 4,
    max_length = 256,
    stride = 128,
    shuffle = True,
    drop_last = True,
    num_workers = 0
):
    """Build a DataLoader of sliding-window (input, target) batches from text.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    GPTDatasetV1.

    Args:
        txt: Text to tokenize.
        batch_size: Passed to the DataLoader.
        max_length: Number of token ids per window (passed to GPTDatasetV1).
        stride: Offset between consecutive windows (passed to GPTDatasetV1).
        shuffle: Passed to the DataLoader.
        drop_last: Passed to the DataLoader.
        num_workers: Passed to the DataLoader.

    Returns:
        The configured DataLoader.
    """
    loader_options = dict(
        batch_size = batch_size,
        shuffle = shuffle,
        drop_last = drop_last,
        num_workers = num_workers,
    )

    bpe_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, bpe_tokenizer, max_length, stride)

    return DataLoader(windows, **loader_options)

In [None]:
file_path = "/content/drive/MyDrive/Colab Notebooks/Build LLM From Scratch/text_preprocessing/the-verdict.txt"

with open(file_path, "r", encoding="utf-8") as f:
    raw_txt = f.read()

In [None]:
dataloader_0 = create_dataloader_v1(
    txt = raw_txt,
    batch_size = 1,
    stride = 1,
    shuffle = False,
    max_length = 4
)

data_iter = iter(dataloader_0)

first_batch = next(data_iter)

print(first_batch)

second_batch = next(data_iter)

print(second_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [None]:
dataloader_1 = create_dataloader_v1(
    txt = raw_txt,
    batch_size = 1,
    max_length = 2,
    stride = 2,
    shuffle = False
)

data_iter = iter(dataloader_1)

first_batch = next(data_iter)
print(first_batch)

second_batch = next(data_iter)
print(second_batch)

[tensor([[ 40, 367]]), tensor([[ 367, 2885]])]
[tensor([[2885, 1464]]), tensor([[1464, 1807]])]


In [None]:
dataloader_2 = create_dataloader_v1(
    txt = raw_txt,
    batch_size = 1,
    max_length = 8,
    stride = 2,
    shuffle = False
)

data_iter = iter(dataloader_2)

first_batch = next(data_iter)
print(first_batch)

second_batch = next(data_iter)
print(second_batch)

third_batch = next(data_iter)
print(third_batch)

[tensor([[  40,  367, 2885, 1464, 1807, 3619,  402,  271]]), tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899]])]
[tensor([[ 2885,  1464,  1807,  3619,   402,   271, 10899,  2138]]), tensor([[ 1464,  1807,  3619,   402,   271, 10899,  2138,   257]])]
[tensor([[ 1807,  3619,   402,   271, 10899,  2138,   257,  7026]]), tensor([[ 3619,   402,   271, 10899,  2138,   257,  7026, 15632]])]


# PyTorch Dataset and DataLoader (Extra)

In [None]:
# Seed before sampling so the toy features are identical on every run;
# otherwise the random data is drawn before any seed has been set.
torch.manual_seed(123)

# Five training examples with two features each, plus their class labels.
X_train = torch.randn((5,2))
y_train = torch.tensor([0, 0, 0, 1, 1])

# Two test examples with two features each, plus their class labels.
X_test = torch.randn((2,2))
y_test = torch.tensor([0, 1])

In [None]:
X_train, y_train, X_test, y_test

(tensor([[-0.2846, -1.1679],
         [-1.8851, -1.2288],
         [-0.8017, -1.1892],
         [ 0.1706, -0.3006],
         [ 1.2895,  0.5622]]),
 tensor([0, 0, 0, 1, 1]),
 tensor([[-0.0728, -0.9722],
         [-0.6049, -0.4990]]),
 tensor([0, 1]))

In [None]:
from torch.utils.data import Dataset

In [None]:
class ToyDataset(Dataset):
    """Minimal Dataset that pairs each feature row with its label."""

    def __init__(self, X, y):
        """Keep references to the features ``X`` and the labels ``y``."""
        self.features, self.labels = X, y

    def __len__(self):
        """Return the number of examples, taken from the first dimension of the labels."""
        return self.labels.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx``."""
        feature_row = self.features[idx]
        label = self.labels[idx]
        return feature_row, label

In [None]:
train_ds = ToyDataset(X_train, y_train)
test_ds = ToyDataset(X_test, y_test)

In [None]:
print(len(train_ds))

5


In [None]:
from torch.utils.data import DataLoader
torch.manual_seed(123)

<torch._C.Generator at 0x7adec86d6d30>

In [None]:
train_loader = DataLoader(
    dataset = train_ds,
    batch_size = 2,
    shuffle = True
)

test_loader = DataLoader(
    dataset = test_ds,
    batch_size = 2,
    shuffle = False
)

In [None]:
for idx, (x,y) in enumerate(train_loader):
    print(f"Batch {idx+1}: {x}, {y}")

Batch 1: tensor([[ 0.1706, -0.3006],
        [-1.8851, -1.2288]]), tensor([1, 0])
Batch 2: tensor([[-0.2846, -1.1679],
        [-0.8017, -1.1892]]), tensor([0, 0])
Batch 3: tensor([[1.2895, 0.5622]]), tensor([1])


In [None]:
for idx, (x,y) in enumerate(test_loader):
    print(f"Batch {idx+1}: {x}, {y}")

Batch 1: tensor([[-0.0728, -0.9722],
        [-0.6049, -0.4990]]), tensor([0, 1])


In [None]:
train_loader = DataLoader(
    dataset = train_ds,
    shuffle = True,
    batch_size = 2,
    drop_last = True
)

In [None]:
for idx, (x,y) in enumerate(train_loader):
    print(f"Batch {idx+1}: {x} {y}")

Batch 1: tensor([[-0.8017, -1.1892],
        [-1.8851, -1.2288]]) tensor([0, 0])
Batch 2: tensor([[-0.2846, -1.1679],
        [ 0.1706, -0.3006]]) tensor([0, 1])


In [None]:
# dataloader sampling with a batch size greater than 1

dataloader = create_dataloader_v1(
    raw_text,
    batch_size = 8,
    max_length = 4,
    stride = 4,
    shuffle = False
)

In [None]:
data_iter = iter(dataloader)

inputs, targets = next(data_iter)

print(f"Inputs:\n{inputs}")
print(f"Targets:\n{targets}")


Inputs:
tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Targets:
tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


- Using a stride of 4 with a max_length of 4 ensures that the input windows (the rows of each batch) do not overlap.
- Overlaps could increase overfitting.


# Creating token embeddings

- Converting token ids into embedding vectors is the last text preparation step for LLM training.
> Reviewing the steps:
raw_text -> tokenized_text -> token_ids -> token embeddings
- Initially, we generate random values for the embedding vectors.
- These random values will be optimized during LLM training.
- Continuous vector representations (embeddings) are necessary for training a neural network.

In [None]:
# Toy illustration of an embedding lookup (kept small for simplicity):
#   - vocabulary of 6 tokens
#   - embedding vectors with 3 dimensions
vocab_size, emb_dim = 6, 3

# Fix the seed so the randomly initialised weights are reproducible,
# then instantiate the embedding layer with PyTorch.
torch.manual_seed(123)
emb_layer = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)

# Four example token ids to look up in the layer.
input_ids = torch.tensor([2, 3, 5, 1])

In [None]:
print(emb_layer)
print(emb_layer.weight)
print(emb_layer.weight.shape)

Embedding(6, 3)
Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)
torch.Size([6, 3])


- There is one row for each token in a vocabulary of 6 tokens.
- Each token embedding has 3 dimensions (columns).


In [None]:
print(emb_layer(torch.tensor([3])))


tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


- This corresponds to index 3, i.e. the fourth row of the weight matrix shown above (indices start at 0).
- This tells us that the embedding layer is just a lookup table.
- Retrieve a vector from the lookup table using a token id.
- This embedding layer approach is a more efficient version of the one-hot encoding and matrix multiplication approach.

In [None]:
# Try the embedding layer on the 4 input ids
emb_layer(input_ids)

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)

- Each id value in the input_ids corresponds to a row in the lookup table (the embedding weight matrix).

# Encoding words' positions

- Additional layer to add positional information for text tokens.
  - Has the same dimension as the previous layer
- The self-attention mechanism would have no clue about word positions or order without this layer.
- A token id in the previous embedding layer would always map to the same vector regardless of where it appeared in the input text.
- In principle, the deterministic, position-independent embedding of the token ID is good for reproducibility purposes. But, as we still need the positional information of a token in an input sequence and since the self-attention mechanism itself is position-agnostic, we have to inject this additional positional information into the model.
- Categories of position-aware embeddings:
  1. Relative positional embeddings
    - Answers/learns: "How far apart is the word from another?"
    - Generalizes well to inputs of varying lengths.
  2. Absolute positional embeddings
    - Answers/learns: "At which exact position does the word appear?"
    - Does not generalize well to inputs of varying lengths.
- The two categories ensure that the model gives accurate and context-aware predictions.
- The choice of the positional embedding layer type is dependent on your application and the nature of the data used.
- GPT models optimize the positional embeddings during the training process rather than keeping them fixed or predefined.

In [None]:
# Experiment with 256-dimensional vector representation
# Use the earlier vocabulary created by the BPE tokenizer

vocab_size = 50257
out_dim = 256

embed_layer = torch.nn.Embedding(vocab_size, out_dim)

- Sample data from the previous dataloader.
- Embed each token in each batch into a 256-dimensional vector.
- The shape of the resulting embedding tensor: 8 × 4 × 256.

In [None]:
max_length = 4

dataloader = create_dataloader_v1(
    raw_txt,
    batch_size = 8,
    max_length = max_length,
    stride = 4,
    shuffle = False
)

data_iter = iter(dataloader)

inputs, targets = next(data_iter)

print(f"Token IDs:\n{inputs}\nInputs shape: {inputs.shape}")

Token IDs:
tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Inputs shape: torch.Size([8, 4])


- Embed these token IDs into 256-dimensional vectors using the token embedding layer.

In [None]:
token_embeddings = embed_layer(inputs)
token_embeddings, token_embeddings.shape

(tensor([[[ 0.3045, -2.2610, -0.4094,  ..., -1.6522, -0.2868,  1.2438],
          [-0.2460,  0.2528,  0.2237,  ..., -0.7093, -0.6564, -1.2014],
          [ 0.4986,  0.1621,  0.6822,  ...,  1.1171,  0.2367,  0.2138],
          [-0.5436,  0.6707,  2.0537,  ..., -0.6127,  0.5454, -0.9175]],
 
         [[-0.4044,  0.3227,  1.0297,  ...,  0.0570,  0.7727, -0.0346],
          [-0.2779,  0.7304,  2.0882,  ...,  1.0014,  0.1509, -0.5499],
          [-1.4237,  0.5973, -1.3234,  ..., -0.4272,  1.0466, -0.1104],
          [-1.5506,  0.1693,  1.4931,  ..., -0.3387, -1.0787, -1.2689]],
 
         [[ 1.4963,  0.5346,  0.8751,  ...,  2.5655, -0.7914,  1.6417],
          [-0.5089, -1.4528, -0.4107,  ...,  0.7509,  0.4721,  0.3420],
          [ 0.0118, -1.0189,  0.1589,  ...,  0.5212, -0.2544,  0.9034],
          [-0.6605,  0.8879, -0.3432,  ...,  0.0128,  0.4897, -0.2956]],
 
         ...,
 
         [[ 1.2109, -1.1724, -1.9072,  ...,  0.6573, -2.3168,  2.6120],
          [ 0.4538,  1.3213,  0.0283,  

- For the absolute positional embedding approach, we add another embedding layer with the same size.

In [None]:
context_length = max_length

pos_layer = torch.nn.Embedding(context_length, out_dim)

In [None]:
pos_embeddings = pos_layer(torch.arange(context_length))
pos_embeddings, pos_embeddings.shape

(tensor([[-2.3757,  0.3616,  1.6211,  ..., -0.6813, -1.1042, -0.5608],
         [ 1.1531,  0.0796,  0.5718,  ...,  1.3845, -1.1687,  2.1513],
         [ 0.2339, -0.5470,  0.7052,  ...,  0.1407, -1.9278, -1.4534],
         [-0.2963, -0.2133,  0.5776,  ..., -0.0176,  0.8153, -1.4964]],
        grad_fn=<EmbeddingBackward0>),
 torch.Size([4, 256]))

- torch.arange(context_length) is a placeholder vector that contains the sequence of numbers 0, 1, ..., context_length - 1.
- The context_length represents the supported input size that the LLM receives.
- If the input text to the LLM is larger than the supported context length, it would be truncated.
- We add the two separate embeddings; the positional embeddings (4 × 256) are broadcast across the batch dimension of the token embeddings (8 × 4 × 256).

In [None]:
input_embeddings = token_embeddings + pos_embeddings
input_embeddings.shape

torch.Size([8, 4, 256])

- This is the final input that is ready to be passed to the transformer module of the LLM.

# Summary
- For the text data to be compatible with NN, it must be transformed into numerical values in a continuous vector space (embeddings).
- The first step is tokenization. Tokenization can be character level or word level. The tokens are then converted into token IDs.
- Some special tokens can be added to help the model handle some contexts like unknown words or boundary between unrelated texts.
- BPE is a well-known tokenization algorithm used by GPT-2 and GPT-3. It handles unknown words efficiently by breaking them into smaller subword units.
- Through the sliding window technique, we can get input-target pairs from the tokenized data to train the LLM.

- The embedding layers in LLMs work as a lookup operation. For each token id, there is a corresponding continuous embedding vector.
- A positional embedding layer is added at the end to give positional information.