<a href="https://colab.research.google.com/github/Nandika-A/LLM-from-scratch/blob/main/LLM_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparing the data

In [None]:
import os
import urllib.request

In [None]:
# Download the short story only when it is not already on disk. The existence
# check and the download target share one name so they cannot drift apart
# (the check previously looked for a misspelled file and never succeeded).
file_path = "the-verdict.txt"
if not os.path.exists(file_path):
  url = ("https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/refs/heads/main/ch02/01_main-chapter-code/the-verdict.txt")
  urllib.request.urlretrieve(url, file_path)


In [None]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
  raw_text = f.read()

In [None]:
raw_text

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [None]:
len(raw_text)

20479

In [None]:
import re

## Break down text into tokens

In [None]:
# Split on punctuation, "--" and whitespace. The pattern is a capturing group,
# so the delimiters themselves are kept as separate items in the result.
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
# Strip each piece and drop the ones that are empty or whitespace-only.
preprocessed = [item.strip() for item in preprocessed if item.strip()]

In [None]:
preprocessed[:10]

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius']

## Convert tokens into token Ids

In [None]:
all_words = sorted(set(preprocessed))

In [None]:
vocab_size = len(all_words)

In [None]:
print(vocab_size)

1130


In [None]:
vocab = {token:integer for integer,token in enumerate(all_words)}

In [None]:
len(vocab)

1130

Use this vocabulary to convert each word into an integer

Now encode the text into token Ids

In [None]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, on ``--`` and on the punctuation characters
    ``, . : ; ? _ ! " ( ) '``. Every resulting token must be a key of the
    vocabulary; there is no fallback for unknown tokens.
    """

    # Capturing group so that the delimiters are kept as tokens of their own.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace that " ".join() puts in front of punctuation. ":" and ";" are
    # included because encode() emits them as separate tokens as well.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        """Keep the token -> id mapping and build the inverse id -> token one.

        Args:
            vocab: dict mapping token strings to integer ids.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Raises:
            KeyError: if a token of ``text`` is not in the vocabulary.
        """
        pieces = self._SPLIT_PATTERN.split(text)
        # Drop the empty / whitespace-only pieces produced by the split.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces, then the space in front of
        punctuation is removed.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

This tokenizer only works for words that are in the vocabulary; any other word raises a `KeyError`.

In [None]:
tokenizer = SimpleTokenizerV1(vocab)

In [None]:
new_text = "This is good!"
ids = tokenizer.encode(new_text)
print(ids)

[97, 584, 500, 0]


In [None]:
tokenizer.decode(ids)

'This is good!'

## Special context tokens
Like end-of-text token

We also want the tokenizer not to fail when a word isn't in the vocabulary. To handle that, we extend the vocabulary with two special tokens: `<|endoftext|>` and `<|unk|>` (the stand-in for unknown words).

In [None]:
# Append the two special tokens after the sorted words, so they receive the
# two highest ids when the mapping is rebuilt below.
all_words.extend(["<|endoftext|>", "<|unk|>"])
# Rebuild the token -> id mapping over the extended list.
vocab = {token:integer for integer,token in enumerate(all_words)}

In [None]:
len(vocab)

1132

In [None]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Text is split on whitespace, on ``--`` and on the punctuation characters
    ``, . : ; ? _ ! " ( ) '``. The vocabulary is expected to contain the
    ``<|unk|>`` token, which replaces every token that is not a key of it.
    """

    # Capturing group so that the delimiters are kept as tokens of their own.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace that " ".join() puts in front of punctuation. ":" and ";" are
    # included because encode() emits them as separate tokens as well.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        """Keep the token -> id mapping and build the inverse id -> token one.

        Args:
            vocab: dict mapping token strings to integer ids.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Tokens missing from the vocabulary are encoded as ``<|unk|>``.

        Raises:
            KeyError: if an unknown token is met and the vocabulary has no
                ``<|unk|>`` entry.
        """
        pieces = self._SPLIT_PATTERN.split(text)
        # Drop the empty / whitespace-only pieces produced by the split.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        known = [
            token if token in self.str_to_int else "<|unk|>"
            for token in tokens
        ]
        return [self.str_to_int[token] for token in known]

    def decode(self, ids):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces, then the space in front of
        punctuation is removed.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [None]:
tokenizer = SimpleTokenizerV2(vocab)

In [None]:
text = "Hello World, This is good!"

In [None]:
ids = tokenizer.encode(text)

In [None]:
tokenizer.decode(ids)

'<|unk|> <|unk|>, This is good!'

## Byte pair encoding
An algorithm for handling unknown tokens.
It breaks longer words down into known subwords instead of substituting an unknown token. So one word may become several tokens, but encoding never fails.

In [None]:
import tiktoken

In [None]:
tiktoken.__version__

'0.9.0'

Tiktoken library has tokenizers of many models. Here we are using Gpt2. The actual library is in rust with a python API.

In [None]:
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
tokenizer.n_vocab

50257

In [None]:
ids = tokenizer.encode("hello world!")

In [None]:
tokenizer.decode(ids)

'hello world!'

<|endoftext|> token is added to denote the end of the document.

In [None]:
ids = tokenizer.encode("hello world! <|endoftext|> hi", allowed_special={"<|endoftext|>"})

## Data Sampling with sliding window

LLMs predict one token at a time. This lets them scale and train efficiently, because the text labels itself: the target for each position is simply the next token in the text.

In [None]:
context_size = 4

`context_size` is the number of tokens in the window that is passed to the LLM. The LLM can't receive all the tokens at once during training; in practice the context size is generally in the thousands.

In [None]:
ids = tokenizer.encode(raw_text)

In [None]:
ids[:30]

[40,
 367,
 2885,
 1464,
 1807,
 3619,
 402,
 271,
 10899,
 2138,
 257,
 7026,
 15632,
 438,
 2016,
 257,
 922,
 5891,
 1576,
 438,
 568,
 340,
 373,
 645,
 1049,
 5975,
 284,
 502,
 284,
 3285]

In [None]:
x = ids[:context_size]
y = ids[1:context_size+1]
print(x)
print(y)

[40, 367, 2885, 1464]
[367, 2885, 1464, 1807]


Three tokens overlap because the target is the input shifted by one position: each target token is what the LLM should predict next. If the input is 40, the LLM should predict 367, and so on.

## Create the dataset

In [None]:
import torch

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. Each input is a window of ``max_length``
    token ids, and its target is the same window shifted one token to the
    right. Consecutive windows start ``stride`` tokens apart.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and materialize every window as a tensor.

        Args:
            txt: the text to tokenize.
            tokenizer: object whose ``encode(txt, allowed_special=...)``
                returns a list of token ids.
            max_length: number of tokens per window.
            stride: distance between the starts of consecutive windows.
        """
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop early enough that the shifted target window is always full.
        window_starts = range(0, len(token_ids) - max_length, stride)

        # NOTE: all windows are kept in memory, which is not optimal when the
        # number of tokens is very large.
        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair of window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

We drop the last batch if its size is less than the batch size. This keeps all batches the same size, which helps stable training.

In [None]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of (input, target) windows over ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``.

    Args:
        txt: the text to turn into training windows.
        batch_size: forwarded to ``DataLoader``.
        max_length: window length, forwarded to ``GPTDatasetV1``.
        stride: window step, forwarded to ``GPTDatasetV1``.
        shuffle: forwarded to ``DataLoader``.
        drop_last: forwarded to ``DataLoader``.
        num_workers: forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

Max length is the context length

In [None]:
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

In [None]:
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]]), tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])]


In [None]:
inputs, targets = next(data_iter)
print(inputs)
print(targets)

tensor([[  287,   262,  6001,   286],
        [  465, 13476,    11,   339],
        [  550,  5710,   465, 12036],
        [   11,  6405,   257,  5527],
        [27075,    11,   290,  4920],
        [ 2241,   287,   257,  4489],
        [   64,   319,   262, 34686],
        [41976,    13,   357, 10915]])
tensor([[  262,  6001,   286,   465],
        [13476,    11,   339,   550],
        [ 5710,   465, 12036,    11],
        [ 6405,   257,  5527, 27075],
        [   11,   290,  4920,  2241],
        [  287,   257,  4489,    64],
        [  319,   262, 34686, 41976],
        [   13,   357, 10915,   314]])


## Create token embeddings
token_ids --> token_embeddings

In [None]:
input = torch.tensor([[ 3, 1,  4,  5]])

In [None]:
input2 = torch.tensor([[ 300, 1,  4,  5]])

In [None]:
input

tensor([[3, 1, 4, 5]])

In [None]:
vocab_size = 6
output_dim = 3
torch.manual_seed(42) # nn layer with random weights, so putting seeds to get same weights everytime
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [None]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 1.9269,  1.4873, -0.4974],
        [ 0.4396, -0.7581,  1.0783],
        [ 0.8008,  1.6806,  0.3559],
        [-0.6866,  0.6105,  1.3347],
        [-0.2316,  0.0418, -0.2516],
        [ 0.8599, -0.3097, -0.3957]], requires_grad=True)


Weights of Embedding layer, which are optimised later.

In [None]:
embedding_layer(torch.tensor([1]))

tensor([[ 0.4396, -0.7581,  1.0783]], grad_fn=<EmbeddingBackward0>)

In [None]:
#embedding_layer(input2)

Running the commented-out cell above raises `IndexError: index out of range in self`, because `input2` contains a value (300) that is not a valid index for the embedding layer, i.e. it is not smaller than its `vocab_size`.

Your embedding layer was initialized with vocab_size = 6, which means it can only accept input indices ranging from 0 to 5. The input input2 must contain at least one value outside of this range.

To fix this, you need to ensure that all values in input2 are between 0 and 5 (inclusive). If you intended to use the input tensor defined earlier, you can change input2 to input.

In [None]:
embedding_layer(input)

tensor([[[-0.6866,  0.6105,  1.3347],
         [ 0.4396, -0.7581,  1.0783],
         [-0.2316,  0.0418, -0.2516],
         [ 0.8599, -0.3097, -0.3957]]], grad_fn=<EmbeddingBackward0>)

### Encoding word positions

In [None]:
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [None]:
max_length = 16
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=max_length, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [None]:
inputs

tensor([[   40,   367,  2885,  1464,  1807,  3619,   402,   271, 10899,  2138,
           257,  7026, 15632,   438,  2016,   257]])

In [None]:
token_embeddings = token_embedding_layer(inputs)

In [None]:
token_embeddings.shape

torch.Size([1, 16, 256])

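For positional embeddings, GPT-2 uses the same kind of layer again: a second, separate embedding layer that is indexed by token position instead of by token id.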

Add each of token embeddings to positional embeddings, and it becomes input embeddings

In [None]:
context_length = max_length
pos_embeddings_layer = torch.nn.Embedding(context_length, output_dim)

In [None]:
size_of_each_batch = torch.arange(max_length)

In [None]:
pos_embeddings_layer.weight

Parameter containing:
tensor([[ 1.2192, -0.2741,  0.6823,  ..., -2.0313, -0.3160, -0.2499],
        [ 0.1600, -2.1962,  0.4126,  ..., -1.1532,  0.4579,  1.3812],
        [-0.1451,  0.5679,  0.1859,  ...,  0.2771, -1.2594,  1.3905],
        ...,
        [-1.7559,  0.0438,  1.1475,  ...,  0.4167,  1.1104,  0.4144],
        [-1.9071, -0.9007,  1.7004,  ..., -1.8818,  0.1716,  0.0144],
        [-0.1248, -1.0667,  0.4120,  ...,  0.9362,  0.9994,  0.1742]],
       requires_grad=True)

In [None]:
pos_embeddings = pos_embeddings_layer(size_of_each_batch)

In [None]:
pos_embeddings.shape

torch.Size([16, 256])

In [None]:
input_embeddings = token_embeddings + pos_embeddings

In [None]:
input_embeddings.shape

torch.Size([1, 16, 256])

# Attention

Self-attention lets the model use the context of the whole sentence while generating each word, instead of looking only at the previously generated token. It assigns attention scores to every word in the sentence for reference.

In [None]:
# input_embeddings = [torch.tensor(
#   [[0.43, 0.15, 0.89], # Your     (x^1)
#    [0.55, 0.87, 0.66], # journey  (x^2)
#    [0.57, 0.85, 0.64], # starts   (x^3)
#    [0.22, 0.58, 0.33], # with     (x^4)
#    [0.77, 0.25, 0.10], # one      (x^5)
#    [0.05, 0.80, 0.55]] # step     (x^6)
# )]

# output_dim = 3
# max_length = 6

In [None]:
input_embeddings[0]

tensor([[ 0.9929,  1.2730,  2.4816,  ..., -3.1044, -1.0006, -1.4009],
        [ 0.2024, -1.5256,  3.4631,  ...,  0.5621,  0.5827,  1.8916],
        [-0.2273,  0.8504,  0.1607,  ..., -1.0865, -0.4518,  0.8350],
        ...,
        [-2.0435,  0.4572,  1.5234,  ...,  0.7286,  2.2091,  1.1354],
        [-0.9065, -1.3995,  0.7922,  ..., -0.7733, -0.1588,  1.0131],
        [ 1.1009, -0.8636,  1.7284,  ...,  1.2134,  0.4801,  0.0901]],
       grad_fn=<SelectBackward0>)

In [None]:
input_embeddings[0].shape[0]

16

In [None]:
query = input_embeddings[0][1] # let's query the second word in the sentence, in first batch.

In [None]:
query

tensor([ 0.2024, -1.5256,  3.4631, -0.3572,  0.7412,  0.4286,  0.0370, -0.0280,
        -0.2736,  2.2205,  0.2307, -0.3646, -0.7828,  1.1415,  1.2583,  1.4813,
         0.0852,  1.4233, -3.9333,  0.9847,  0.5941,  0.5589,  1.8527,  1.1428,
         0.9194,  1.5592, -0.5299, -0.3945,  1.6160, -2.0992,  1.1277, -0.2103,
        -1.6615,  0.9536,  0.3687,  2.7760, -1.6139, -1.9557, -0.3317,  2.7021,
        -0.1604, -1.5577,  0.4850,  2.2246, -0.0147, -1.3917,  1.7022,  0.5634,
         0.3720, -0.1192,  1.5620,  0.4310, -0.1631,  1.1995,  0.2864, -0.7074,
        -2.3949,  2.2157, -0.0747, -1.4528,  1.2118, -0.3935, -3.1965,  1.2907,
        -1.2329, -1.1810,  0.0480,  0.7396, -0.6971,  0.2765, -0.9201,  2.2997,
        -0.2713,  0.9154,  3.1596,  0.7278, -0.8000,  0.7926, -0.2249,  0.5910,
         1.6051,  0.8777, -1.3312, -1.7427, -1.5775, -0.6021,  2.4569,  2.0505,
         2.7599,  0.8910,  0.1726, -1.1436, -0.5788,  2.5402, -0.1757, -1.3925,
         2.6024, -1.4237, -1.9743,  1.06

In [None]:
# One score per token of the first sequence; each entry is written by the loop
# below, so the uninitialized buffer from torch.empty is fine here.
attention_scores_2 = torch.empty(max_length)

# Score of token i = dot product of its embedding with the query embedding.
for i, x_i in enumerate(input_embeddings[0]):
  attention_scores_2[i] = torch.dot(x_i, query)

print(attention_scores_2)

tensor([-2.1417e+01,  5.2857e+02,  4.9593e+01,  8.3008e+00, -5.7123e+01,
        -1.4962e+01,  4.1251e+00,  1.4813e+01,  5.0935e-01, -2.5302e+01,
        -4.3514e+01, -3.6953e+01,  6.0200e+01, -2.4985e+00,  4.8889e+01,
        -1.2598e+01], grad_fn=<CopySlices>)


Attention scores need to be normalized for getting attention weights

In [None]:
attention_weights_2 = torch.softmax(attention_scores_2, dim=0)
print(attention_weights_2)

tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       grad_fn=<SoftmaxBackward0>)


In [None]:
# Accumulator for the weighted sum; it starts at zero because it is built
# up with `+=`.
context_vec_2 = torch.zeros(query.shape)

# Context vector = sum of all token embeddings of the first sequence, each
# scaled by its attention weight.
for i, x_i in enumerate(input_embeddings[0]):
  context_vec_2 += x_i * attention_weights_2[i]

print(context_vec_2)

tensor([ 0.2024, -1.5256,  3.4631, -0.3572,  0.7412,  0.4286,  0.0370, -0.0280,
        -0.2736,  2.2205,  0.2307, -0.3646, -0.7828,  1.1415,  1.2583,  1.4813,
         0.0852,  1.4233, -3.9333,  0.9847,  0.5941,  0.5589,  1.8527,  1.1428,
         0.9194,  1.5592, -0.5299, -0.3945,  1.6160, -2.0992,  1.1277, -0.2103,
        -1.6615,  0.9536,  0.3687,  2.7760, -1.6139, -1.9557, -0.3317,  2.7021,
        -0.1604, -1.5577,  0.4850,  2.2246, -0.0147, -1.3917,  1.7022,  0.5634,
         0.3720, -0.1192,  1.5620,  0.4310, -0.1631,  1.1995,  0.2864, -0.7074,
        -2.3949,  2.2157, -0.0747, -1.4528,  1.2118, -0.3935, -3.1965,  1.2907,
        -1.2329, -1.1810,  0.0480,  0.7396, -0.6971,  0.2765, -0.9201,  2.2997,
        -0.2713,  0.9154,  3.1596,  0.7278, -0.8000,  0.7926, -0.2249,  0.5910,
         1.6051,  0.8777, -1.3312, -1.7427, -1.5775, -0.6021,  2.4569,  2.0505,
         2.7599,  0.8910,  0.1726, -1.1436, -0.5788,  2.5402, -0.1757, -1.3925,
         2.6024, -1.4237, -1.9743,  1.06

## Simple self attention for all inputs without trainable weights

This is for 1st batch only

In [None]:
# Square score matrix with one row and one column per token position; each
# entry is written by the nested loop below.
attention_scores = torch.empty(max_length, max_length)
print(input_embeddings[0].shape)

# Entry (i, j) is the dot product between the embeddings of tokens i and j
# of the first sequence.
for i, x_i in enumerate(input_embeddings[0]):
  for j, x_j in enumerate(input_embeddings[0]):
    attention_scores[i, j] = x_i.dot(x_j)

print(attention_scores)
print(attention_scores.shape)

torch.Size([16, 256])
tensor([[ 4.7055e+02, -2.1417e+01,  7.9083e+00,  1.3565e+00,  1.6035e+00,
         -1.4744e+01,  5.5573e+01,  1.1527e+01, -7.0634e+01, -1.9790e+01,
         -1.5545e-01,  1.2248e+01,  3.1930e+01,  3.6030e+01,  1.4739e+01,
          1.1818e+01],
        [-2.1417e+01,  5.2857e+02,  4.9593e+01,  8.3008e+00, -5.7123e+01,
         -1.4962e+01,  4.1251e+00,  1.4813e+01,  5.0935e-01, -2.5302e+01,
         -4.3514e+01, -3.6953e+01,  6.0200e+01, -2.4985e+00,  4.8889e+01,
         -1.2598e+01],
        [ 7.9083e+00,  4.9593e+01,  5.5046e+02, -3.8664e+01, -2.4345e+00,
         -2.9529e+01,  9.1606e+00,  3.2558e+01, -6.2929e+01,  6.8157e+01,
          1.2723e+00, -3.2723e+01, -2.7900e+01,  3.8270e+01, -3.8404e+01,
         -3.2785e+01],
        [ 1.3565e+00,  8.3008e+00, -3.8664e+01,  4.4035e+02, -1.0155e+01,
          5.1974e+01,  2.9931e+01, -2.4154e-01,  9.8480e+00,  2.8886e+01,
          3.6532e+01, -5.5056e+01, -3.8941e+00, -1.2686e+01, -5.1219e+01,
         -7.3850e-01]

In [None]:
# Normalize along the last dimension so that each row (the scores of one query
# token against every token) sums to 1. Row j is what is later read as
# attention_weights[j][i] to weight token i.
attention_weights = torch.softmax(attention_scores, dim=-1)

In [None]:
# Start from zeros: the loop accumulates with `+=`, so the buffer must not
# contain the arbitrary values that torch.empty would leave in it.
context_vec = torch.zeros(max_length, output_dim)

# Context vector j = sum over all tokens i of (embedding i * weight of token i
# for query j).
for j in range(max_length):
  for i, x_i in enumerate(input_embeddings[0]):
    context_vec[j] += x_i * attention_weights[j][i]

print(context_vec.shape)

torch.Size([16, 256])


In [None]:
context_vec[1]

tensor([ 2.0243e-01, -1.5256e+00,  3.4631e+00, -3.5721e-01,  7.4116e-01,
         4.2861e-01,  3.7044e-02, -2.7969e-02, -2.7362e-01,  2.2205e+00,
         2.3067e-01, -3.6459e-01, -7.8278e-01,  1.1415e+00,  1.2583e+00,
         1.4813e+00,  8.5225e-02,  1.4233e+00, -3.9333e+00,  9.8467e-01,
         5.9412e-01,  5.5894e-01,  1.8527e+00,  1.1428e+00,  9.1944e-01,
         1.5592e+00, -5.2993e-01, -3.9450e-01,  1.6160e+00, -2.0992e+00,
         1.1277e+00, -2.1026e-01, -1.6615e+00,  9.5357e-01,  3.6869e-01,
         2.7760e+00, -1.6139e+00, -1.9557e+00, -3.3174e-01,  2.7021e+00,
        -1.6040e-01, -1.5577e+00,  4.8504e-01,  2.2246e+00, -1.4680e-02,
        -1.3917e+00,  1.7022e+00,  5.6339e-01,  3.7203e-01, -1.1917e-01,
         1.5620e+00,  4.3099e-01, -1.6315e-01,  1.1995e+00,  2.8641e-01,
        -7.0738e-01, -2.3949e+00,  2.2157e+00, -7.4739e-02, -1.4528e+00,
         1.2118e+00, -3.9352e-01, -3.1965e+00,  1.2907e+00, -1.2329e+00,
        -1.1810e+00,  4.8032e-02,  7.3961e-01, -6.9

## Implementing self-attention with trainable weights