<h1> Building LLMs from Scratch </h1>

<h2> Part 7: Tokenization </h2>
(in sync with lectures... Lectures 1-6 were all intuition and theory)

In [3]:
# Retreived the text and stores in raw_text

with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

print("Total number of characters:", len(raw_text))
print((raw_text[:99]))


Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [30]:
import re


text = "Hello, world. Is this-- is a test?"
result = re.split(r'(\s)', text) # splitting wherever there is a whitespace! \s is for spaces like \n is for new line

print(result)

['Hello,', ' ', 'world.', ' ', 'Is', ' ', 'this--', ' ', 'is', ' ', 'a', ' ', 'test?']


In [31]:
result = re.split(r'([,.:;?_!"()\']|--|\s)', text) # splitting at more stuff

result = [item for item in result if item.strip()]
print(result) # separated words and punctuations... whitespaces removed from array (fine for this, but what about where structure is important?? like code!!! whitespace is important!)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'is', 'a', 'test', '?']


In [34]:
# for whole text!!

preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item for item in preprocessed if item.strip()]
print(preprocessed[:30])
print("Total count:", len(preprocessed))

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']
Total count: 4690


<h3> Converting to token IDs </h3>

In [44]:
all_words = sorted(set(preprocessed)) # using set removes all repeated values!!
vocab_size = len(all_words)

print(all_words[:99])
print(vocab_size) # only unique words now!

['!', '"', "'", '(', ')', ',', '--', '.', ':', ';', '?', 'A', 'Ah', 'Among', 'And', 'Are', 'Arrt', 'As', 'At', 'Be', 'Begin', 'Burlington', 'But', 'By', 'Carlo', 'Chicago', 'Claude', 'Come', 'Croft', 'Destroyed', 'Devonshire', 'Don', 'Dubarry', 'Emperors', 'Florence', 'For', 'Gallery', 'Gideon', 'Gisburn', 'Gisburns', 'Grafton', 'Greek', 'Grindle', 'Grindles', 'HAD', 'Had', 'Hang', 'Has', 'He', 'Her', 'Hermia', 'His', 'How', 'I', 'If', 'In', 'It', 'Jack', 'Jove', 'Just', 'Lord', 'Made', 'Miss', 'Money', 'Monte', 'Moon-dancers', 'Mr', 'Mrs', 'My', 'Never', 'No', 'Now', 'Nutley', 'Of', 'Oh', 'On', 'Once', 'Only', 'Or', 'Perhaps', 'Poor', 'Professional', 'Renaissance', 'Rickham', 'Riviera', 'Rome', 'Russian', 'Sevres', 'She', 'Stroud', 'Strouds', 'Suddenly', 'That', 'The', 'Then', 'There', 'They', 'This', 'Those']
1130


In [61]:
# assigning the token values (ENCODING!)
vocab = {token:integer for integer, token in enumerate(all_words)} # enumerate takes all items and gives index value to each... mapping as dict!

for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


<h3> Tokenizer Class </h3>

In [90]:
class SimpleTokenizerV1:
     def __init__(self, vocab):
          self.str_to_int = vocab
          self.int_to_str = {i:s for s,i in vocab.items()}

     def encode(self, text): # converting strings to token IDs
          preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
          preprocessed = [item for item in preprocessed if item.strip()]

          IDs = [self.str_to_int[s] for s in preprocessed]
          return IDs
     
     def decode(self, IDs):
          text = " ".join([self.int_to_str[i] for i in IDs]) # .join puts in an array
          text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) # minor change, just removing any spaces created before punctuations
          text = re.sub(r'([\'"])\s+', r'\1', text) # removing spaces after single and double quotes
          return text

In [None]:
tokenizer = SimpleTokenizerV1(vocab) # vocab done earlier... dictionary of each words with a token
text = """"It's the last he painted, you know,"
Mrs. Gisburn said with pardonable pride.
"""

ids = tokenizer.encode(text)
print(ids)

# wont_work = tokenizer.encode("wassup") ----- this wont work because its not in our vocab
# print(wont_work)

text_again = tokenizer.decode(ids)
print(text_again)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
"It's the last he painted, you know,"Mrs. Gisburn said with pardonable pride.


<h3>Special Context Tokens</h3>
Handling unknown words...

<|unk|> for unknown words <br>
<|endoftext|> a token between unrelated texts (when using multiple data sources)


In [111]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens += ["<|endoftext|>", "<|unk|>"]
print(all_tokens[-5:])

vocab = {token:integer for integer, token in enumerate(all_tokens)}

print("New vocabulary size:", len(vocab)) # increased by 2!

['younger', 'your', 'yourself', '<|endoftext|>', '<|unk|>']
New vocabulary size: 1132


In [112]:
class SimpleTokenizerV2:
     def __init__(self, vocab):
          self.str_to_int = vocab
          self.int_to_str = {i:s for s,i in vocab.items()}

     def encode(self, text): # converting strings to token IDs
          preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
          preprocessed = [item for item in preprocessed if item.strip()]
          preprocessed = [
               item if item in self.str_to_int
               else "<|unk|>" for item in preprocessed
          ]

          IDs = [self.str_to_int[s] for s in preprocessed]
          return IDs
     
     def decode(self, IDs):
          text = " ".join([self.int_to_str[i] for i in IDs]) # .join puts in an array
          text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) # minor change, just removing any spaces created before punctuations
          text = re.sub(r'([\'"])\s+', r'\1', text) # removing spaces after single and double quotes
          return text
    

In [156]:
tokenizer = SimpleTokenizerV2(vocab) # using v2 that handles special cases!

## testing weird stuff!!
weird_text = "She had pardonable pride... <|endoftext|> Gisburn gave marshmellow tea!"
print(weird_text, "\n")

will_work = tokenizer.encode(weird_text) ## now it works!! gets assigned unknown id
print("IDs:", will_work, "\n") ## testing both unknown word and <|endoftext|>!!

weird_text_again = tokenizer.decode(will_work)
print(weird_text_again)

She had pardonable pride... <|endoftext|> Gisburn gave marshmellow tea! 

IDs: [88, 514, 754, 793, 7, 7, 7, 1130, 38, 484, 1131, 975, 0] 

She had pardonable pride... <|endoftext|> Gisburn gave <|unk|> tea!


<h4><i>Note:</i></h4>Other special tokens also exist... (but even OpenAI gpt's dont used these!) <br><br>
1) beginning of sequence (BOS) - start of new text source <br>
2) padding (PAD) => shorter text sources are padded so all sources have same size (as much as the largest)... Helps with parallel processing

<hr>
<h2>Part 8: The GPT Tokenizer!</h2>
<h3>Byte-Pair Encoding</h3>

In [None]:
import tiktoken # type: ignore
tokenizer = tiktoken.get_encoding("gpt2")

In [292]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace..."
)

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"}) #try <|endoftext|> specifically... the largest token ID... so just 50256 tokens!! less than the 200,000 words in actual dictionary
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 986]


In [293]:
text_again = tokenizer.decode(integers)
print(text_again)

# Didn't need a special token for unknown words!! Just broke down stuff and dealt with unknown text...

Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace...


<hr>
<h2>Part 9: Creating Input-Target Pairs</h2>
A sliding window approach!!

In [391]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print("Encoding length:", len(enc_text))

print(enc_text)
enc_sample = enc_text[50:] # for testing, more interesting (why?)

Encoding length: 5145
[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016, 257, 922, 5891, 1576, 438, 568, 340, 373, 645, 1049, 5975, 284, 502, 284, 3285, 326, 11, 287, 262, 6001, 286, 465, 13476, 11, 339, 550, 5710, 465, 12036, 11, 6405, 257, 5527, 27075, 11, 290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686, 41976, 13, 357, 10915, 314, 2138, 1807, 340, 561, 423, 587, 10598, 393, 28537, 2014, 198, 198, 1, 464, 6001, 286, 465, 13476, 1, 438, 5562, 373, 644, 262, 1466, 1444, 340, 13, 314, 460, 3285, 9074, 13, 46606, 536, 5469, 438, 14363, 938, 4842, 1650, 353, 438, 2934, 489, 3255, 465, 48422, 540, 450, 67, 3299, 13, 366, 5189, 1781, 340, 338, 1016, 284, 3758, 262, 1988, 286, 616, 4286, 705, 1014, 510, 26, 475, 314, 836, 470, 892, 286, 326, 11, 1770, 13, 8759, 2763, 438, 1169, 2994, 284, 943, 17034, 318, 477, 314, 892, 286, 526, 383, 1573, 11, 319, 9074, 13, 536, 5469, 338, 11914, 11, 33096, 663, 4808, 3808, 62, 355, 996, 484, 547, 12548, 287, 281, 13

In [392]:
context_size = 4 #takes the 4 words before it as input... (gpt3 takes 2048! gpt4-turbo takes 128,000!!!)

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print("x:", x)
print("y:", y)

# will finally create 2d version of multiple lines like this!!! (Tensors in PyTorch)


x: [290, 4920, 2241, 287]
y: [4920, 2241, 287, 257]


In [393]:
print(f"Input ----> Desired Output\n")
for i in range(1, context_size+1):
    input = enc_sample[:i]
    desired_output = enc_sample[i]
    print(f"{input} ----> {desired_output}")


Input ----> Desired Output

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [596]:
print(f"Input ----> Desired Output\n")

for i in range(1, context_size+1):
    input = enc_sample[:i]
    desired_output = enc_sample[i]
    print(tokenizer.decode(input), "---->", tokenizer.decode([desired_output]))

Input ----> Desired Output

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


<h3>Using Data Loader</h3>

In [597]:
%pip install torch
from torch.utils.data import Dataset, DataLoader # type: ignore

Note: you may need to restart the kernel to use updated packages.


In [598]:
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride): #max_length here is context size!!
        self.input_ids = []
        self.target_ids = []

        # tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Creating the 2d dataset!!
        for i in range (0, len(token_ids) - max_length, stride): # from 0th index all the way to end (-max_length to prevent out of bounds error, since we add later)... each loop increases by STRIDE
            input_chunk = token_ids[i: i + max_length]
            target_chunk = token_ids[i+1: i + max_length + 1]
            print(input_chunk)
            print(target_chunk)
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids) # tell us how many batches were created... EACH batch has 4 training processes within it!!! (1 ---> 2, then 1,2 ----> 3, then 1,2,3 -----> 4, etc...)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx] # gives us one batch based on index

In [599]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create the dataset (2d arrays with sliding window text)
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)


    # Creating dataloader with pytorch!!
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    ) # this Pytorch method uses __getitem__ from earlier and sets up the input-target pairs from tensors

    return dataloader

################# MEANINGS: (mostly hyperparameters) #################
# txt - the text we're training on
# batch_size - the number of rows (tensors) in the 2d array it goes through before updating params
# max_length - context size/window... the number of words used as input to output the next word
# stride - the number of tokens to move the window forward by after each step.
# drop_last - the LAST tensor might be less than the stride size, so last row will be shorter.. can lead to werid edge case... might as well just drop the last one.
# num_workers - more about the backend CPU and how many "workers" will be doing parallel processing at a time... higher means more processing power needed but faster! 

<h3>Testing the created Dataset and DataLoader classes</h3>

In [600]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [601]:
import torch
print("PyTorch version:", torch.__version__)
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

PyTorch version: 2.5.1
[40, 367, 2885, 1464]
[367, 2885, 1464, 1807]
[367, 2885, 1464, 1807]
[2885, 1464, 1807, 3619]
[2885, 1464, 1807, 3619]
[1464, 1807, 3619, 402]
[1464, 1807, 3619, 402]
[1807, 3619, 402, 271]
[1807, 3619, 402, 271]
[3619, 402, 271, 10899]
[3619, 402, 271, 10899]
[402, 271, 10899, 2138]
[402, 271, 10899, 2138]
[271, 10899, 2138, 257]
[271, 10899, 2138, 257]
[10899, 2138, 257, 7026]
[10899, 2138, 257, 7026]
[2138, 257, 7026, 15632]
[2138, 257, 7026, 15632]
[257, 7026, 15632, 438]
[257, 7026, 15632, 438]
[7026, 15632, 438, 2016]
[7026, 15632, 438, 2016]
[15632, 438, 2016, 257]
[15632, 438, 2016, 257]
[438, 2016, 257, 922]
[438, 2016, 257, 922]
[2016, 257, 922, 5891]
[2016, 257, 922, 5891]
[257, 922, 5891, 1576]
[257, 922, 5891, 1576]
[922, 5891, 1576, 438]
[922, 5891, 1576, 438]
[5891, 1576, 438, 568]
[5891, 1576, 438, 568]
[1576, 438, 568, 340]
[1576, 438, 568, 340]
[438, 568, 340, 373]
[438, 568, 340, 373]
[568, 340, 373, 645]
[568, 340, 373, 645]
[340, 373, 645, 1

In [602]:
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch) # gives input and target token id's!! (ALWAYS shifted by 1!!)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [603]:
second_batch = next(data_iter)
print(second_batch)

###### STRIDE!!! ######
# the number of tokens to move the window forward by, after each step. 
# if [1,2,3,4]----->[2,3,4,5] is FIRST batch and stride is 1 then next input-target pair is [2,3,4,5]----->[3,4,5,6] 
# BUT NOW IF stride=3 then next pair is directly [4,5,6,7]----->[5,6,7,8]... 
# entire window moved by 3 in next batch!! (input to output still always changes ONLY by 1!!)

# so to think about it, stride can at most be the same as max_length...
# next pair directly starts from the very next tokens completely after the last one used in previous input... more than that would lead to completely skipping tokens!!
# ex - [1,2,3,4] -----> [2,3,4,5] and THEN [5,6,7,8]----->[6,7,8,9]... ZERO overlap between input values
# larger stride leads to lesser computation... more efficient, less accurate

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


Note on Batch Sizes:
- number of pairs used for training before parameters are readjusted (1 means weights are changed after EVERY pair)
- batch size of 1 requires less memory, but captures allll the noise!!! (just like overfitting in linear regression with low learning rate!!)
- very large batch size would miss pattern entirely, (underfitting in linear regression with high learning rate!!)
- so need to find the right balance!! (batch size is also a <b><u>hyperparameter</u></b>!!!)

In [604]:
%%capture
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

In [605]:
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

# So the model will process this WHOLE batch below before making a parameters update... batch size!!
# Stride here is 4 equal to max_length, so no overlap (like when < max_length), but we also dont skip any tokens completely (like when > max_length)
# Again a hyperparameter!!! very small could lead to overfitting, very large leads to not using entire data and underfitting

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


To be clear... despite any possible type of hyperparameter settings...<br>
An input-target pair, is ALWAYS only offset by 1!!<br>
<b>Which means, at any point in one single process, the output is ALWAYS just 1 word</b> (based on an input of multiple words i.e. context size)

<h5><i>These input-target pairs from the dataloader will now be converted into vector embeddings for our LLM... <br>: )</i></h5>

<hr>
<h2>Part 10: Token Embeddings</h2>
They serve as input to training the LLM

<h3>We train a neural network to create vector embeddings!</h3>

Mini Demo of Word Embeddings usage.. Google News Word2Vec!

In [606]:
import gensim.downloader as api
model = api.load("word2vec-google-news-300") # pretrained vector embeddings based on google news content... 300 dimensions



In [607]:
word_vectors = model
print(word_vectors['computer'])

[ 1.07421875e-01 -2.01171875e-01  1.23046875e-01  2.11914062e-01
 -9.13085938e-02  2.16796875e-01 -1.31835938e-01  8.30078125e-02
  2.02148438e-01  4.78515625e-02  3.66210938e-02 -2.45361328e-02
  2.39257812e-02 -1.60156250e-01 -2.61230469e-02  9.71679688e-02
 -6.34765625e-02  1.84570312e-01  1.70898438e-01 -1.63085938e-01
 -1.09375000e-01  1.49414062e-01 -4.65393066e-04  9.61914062e-02
  1.68945312e-01  2.60925293e-03  8.93554688e-02  6.49414062e-02
  3.56445312e-02 -6.93359375e-02 -1.46484375e-01 -1.21093750e-01
 -2.27539062e-01  2.45361328e-02 -1.24511719e-01 -3.18359375e-01
 -2.20703125e-01  1.30859375e-01  3.66210938e-02 -3.63769531e-02
 -1.13281250e-01  1.95312500e-01  9.76562500e-02  1.26953125e-01
  6.59179688e-02  6.93359375e-02  1.02539062e-02  1.75781250e-01
 -1.68945312e-01  1.21307373e-03 -2.98828125e-01 -1.15234375e-01
  5.66406250e-02 -1.77734375e-01 -2.08984375e-01  1.76757812e-01
  2.38037109e-02 -2.57812500e-01 -4.46777344e-02  1.88476562e-01
  5.51757812e-02  5.02929

In [632]:
trying_queen = word_vectors.most_similar(positive=['king', 'woman'], negative=['man'], topn=10)
print(trying_queen)
print(word_vectors.similarity('woman', 'man')) # how similar are the words

[('queen', 0.7118192911148071), ('monarch', 0.6189674735069275), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321243286133), ('kings', 0.5236844420433044), ('Queen_Consort', 0.5235945582389832), ('queens', 0.5181134343147278), ('sultan', 0.5098593235015869), ('monarchy', 0.5087411403656006)]
0.76640123


<h3>Training a Token Embedding!</h3>
Trying to create an <b>Embedding Layer Weight Matrix</b><br><br>



| TOKEN ID | Weights (dimensions) |
|----------|-----------------------|
| 0        | [132, 156, 903.... 768 dimensions] |
| 1        | [374, 364, 937....] |
| 2        | [987, 126, 123....] |
| 3        | [456, 832, 072....] |
| 4        | [217, 753, 666....] |
...        | one vector for EACH token ID/word/token, total 50257

*numbers acc to gpt-2.. total 50257 * 768 = 38.5 million total weights!


1) All values intiially randomized
2) Slowly optimized during training process... typical neural network training... input is word/token, output is array with 768 values


<h3>Coding process (with 6 tokens * 3 dimensions)</h3>

In [None]:
vocab_size = 6
output_dim = 3

torch.manual_seed(123) # ensure same random weights!!

###### PyTorch syntax to create Embeddings layer neural network (nn)!!!
embedding_layer = torch.nn.Embedding(vocab_size, output_dim) # tokens, dimensions... #rows, #columns

print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [681]:
print(embedding_layer(torch.tensor([2]))) # specifically checking vectors for token ID 2,3,5... specifically tensor... "look up operation"

tensor([[ 1.2753, -0.2010, -0.1606]], grad_fn=<EmbeddingBackward0>)


<i>Note:</i>
This embedding layer is the same as a Neural Network linear Layer!!


X is one-hot-encoded matrix with each row for one token ID.. [2, 1, 3]<br>
[ 0 0 1 0 ]<br>
[ 0 0 0 1 ]<br>
[ 0 1 0 0 ]<br>

W uses each row as a the weights for one token ID... so each row is a vector of weights<br><br>
[ W01 - W11 - W21 - W31 ]<br>
[ W02 - W12 - W22 - W32 ]<br>
[ W03 - W13 - W23 - W33 ]<br>
[ W04 - W14 - W24 - W34 ]<br>

So, <b>Output = X . W<sup>T</sup></b> gives us exactly the embedding layer!!!<br>
Each row will simply have the weights of one token ID by index<br><br>
[ Token 0 Weights... ]<br>
[ Token 1 Weights... ]<br>
[ Token 2 Weights... ]<br>
[ Token 3 Weights... ]<br>

*but nn.Embedding() is much more efficient!! No unnecessary multiplications with 0 (from one-hot-encoding) in nn.Layer()



<h5>Soon, we'll give meaning to these vectors by training the Embedding layer and adjusting the weights...</h5>

<hr>
<h2> Part 11: Positional Embedding</h2>

UPTIL NOW...
1) Took sample text
2) Converted to tokens (basic: word-based tokenization.... main: byte-pair-encoding)
3) Created input-target pairs w/ sliding window (PyTorch, Dataloader)... Will later use to train the vector embeddings
4) Created embedding layers with random dimensions for each token's vector (to capture semantic relations based on how close vectors are)... Still need to train!!
5) ALMOST THEREEE... but how do we capture positional relationsssss???

"The cat sat on the mat" <br>
"On the mat the cat sat" <br>

^^ Positions aren't considered!!! The vectors would be the exact same for both sentences... positions arent being considered. not using the data to its entirety!

<h3> Types of Positional Embeddings</h3>

<b><u>Absolute</u></b><br><br>
A unique embedding is added for each position in inputs to convey location

(supposing 3 dimensional vectors... usually much bigger)<br>
If "cat" => [1,5,7]

Sentence 1 "cat" is at index 1 - <br>
Token Embedding => [1, 5, 7]<br>
Positional Embedding => [1.1, 1.2, 1.3] (always maintain same dimensions through decimals)<br>
<b>Final Input Embedding => [2.1, 6.2, 8.3]</b><br>
<br>
VS<br>
<br>
Sentence 2 "cat" is at index 4 - <br>
Token Embedding => [1, 5, 7] (same as before ofc)<br>
Positional Embedding => [4.1, 4.2, 4.3] (different!) <br>
<b>Final Input Embedding => [5.1, 9.2, 11.3]</b><br>
<br>
So input embeddings are different and also take location into consideration!<br>
<br>


<b><i>USUALLY, absolute embedding is good!! Also used by OpenAI and Google OG paper. <br>
But in real world use, if a token embedding is 12,357 or smth, adding just 1 or 2 will not help!!!<br>
Google had set a specific formula that gave value based on index... OpenAI would direclty just optimize the positonal embeddings as part of model training along with token embedding.</i></b>





<b><u>Relative</u></b><br><br>
Learns relation based on "how far apart" words are from each other instead of exact positions..
Advantage: generalizes better for sentences with varying lengths. An index of 1 vs 4 wont make a huge difference if it needs to be used for a response!! Works even if it hasnt seen such lengths during training.



When, fixed order of tokens is crucial such as sequence generations ---> Absolute<br>
For language modelling and long sequences (same phrase can appear at different places) -----> Relative

USUALLY, absolute embedding!

In [692]:
# Using large embedding layer now, more realistic

vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

print(token_embedding_layer.weight) # embedding layer
print("\n\n")
print(token_embedding_layer(torch.tensor([0]))) # looking up specific token id's/index

Parameter containing:
tensor([[ 1.1074, -0.1227,  0.3289,  ...,  0.6672,  0.0473, -0.4185],
        [ 1.2899, -2.3575,  1.2347,  ..., -0.4903, -1.2346, -1.0143],
        [ 0.3657,  0.5980, -1.1092,  ...,  1.4832, -2.2664,  1.1659],
        ...,
        [ 1.2279,  0.2958, -0.2228,  ..., -1.1340,  1.8262, -0.0661],
        [-1.4783,  2.0495,  0.0999,  ...,  0.0761, -2.3557,  0.5983],
        [-0.3327, -0.1217,  0.0314,  ..., -2.6933, -0.3726,  0.4369]],
       requires_grad=True)



tensor([[ 1.1074, -0.1227,  0.3289, -1.2714,  0.6395, -1.3258, -0.6123, -0.8492,
         -1.4207,  0.9257, -0.5703, -0.1819,  0.0127,  1.0420, -0.1920, -1.0450,
         -0.0462,  0.0226, -0.6988,  0.7842,  0.6454, -1.0759, -1.5956, -0.4089,
          1.3126,  0.8159,  0.8382, -0.5709, -0.7025,  0.8555, -0.1772,  1.8427,
          0.5782, -2.2339, -1.2024,  0.7760,  0.4634,  1.1657, -0.3650,  0.2147,
          1.3730,  1.4244, -0.0530, -0.6828, -0.7491, -1.4472,  1.5555,  1.8022,
          0.3205,  0.4074, -

In [None]:
%%capture
# Setting up dataloader! Input-target pairs with sliding window...
context_size = 4

dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=context_size, stride=context_size, shuffle=False
) 
# Weights update after every 8 pairs (8 rows)
# Context window is 4 input words to predict next output word.
# Stride is same as context_window, so there is no overlap of words. More efficient, maybe slightly less accurate  / underfit

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [None]:
print("TokenIDs:\n", inputs)
print("Targets:\n", targets)
print("Inputs/Targets shape:\n", inputs.shape) # batch size x dimensions ---> rows x columns

TokenIDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])
Inputs/Targets shape:
 torch.Size([8, 4])


EACH number (token ID) in the above matrices has a 256 dimension vector representation of it that we want to set up and refine...
<br><br>
[ (256) (256) (256) (256) ],<br>
[ (256) (256) (256) (256) ],<br>
[ (256) (256) (256) (256) ],<br>
[ (256) (256) (256) (256) ],<br>
....

3-d dataset with rows and columns, and each item is a 256-d array of its own...<br><br>
8 * 4 * 256 <br>
(batch_size * context_window * vector_embedding_dimensions)

In [712]:
token_embedding_test = token_embedding_layer(torch.tensor(([0]))) # vector for first tokenID=0
print(token_embedding_test)

tensor([[ 1.1074, -0.1227,  0.3289, -1.2714,  0.6395, -1.3258, -0.6123, -0.8492,
         -1.4207,  0.9257, -0.5703, -0.1819,  0.0127,  1.0420, -0.1920, -1.0450,
         -0.0462,  0.0226, -0.6988,  0.7842,  0.6454, -1.0759, -1.5956, -0.4089,
          1.3126,  0.8159,  0.8382, -0.5709, -0.7025,  0.8555, -0.1772,  1.8427,
          0.5782, -2.2339, -1.2024,  0.7760,  0.4634,  1.1657, -0.3650,  0.2147,
          1.3730,  1.4244, -0.0530, -0.6828, -0.7491, -1.4472,  1.5555,  1.8022,
          0.3205,  0.4074, -0.0992, -0.8802, -1.4398,  0.4970,  0.2363, -0.2623,
          1.0569,  0.9380, -1.1631,  0.3858, -0.5421,  0.3127, -0.2219,  2.0297,
         -0.3111, -1.7294,  1.1484,  0.4773, -1.3310,  0.8506,  1.2046, -0.1175,
         -1.6273,  1.9859,  0.8985, -0.9730,  1.3443, -0.7319, -0.0710, -0.4423,
          0.1323, -2.2388,  0.3642,  1.4139,  1.1964,  0.2696, -0.5761,  0.0887,
          1.7190, -0.4105, -2.0724,  1.3177,  1.2086, -0.3600,  0.2405,  0.8121,
         -0.3411,  0.0616, -

In [None]:
# For whole thing...
token_embeddings = token_embedding_layer(inputs) # inputs is ALREADY a tensor!!
print(token_embeddings.shape)
# each token id has been represented as a 256 dimensional vector!!

torch.Size([8, 4, 256])


<h3>Now, positional embedding layer</h3>

ONLY need to worry about 4 positions!!! Since context window being used is just 4 words. <br>
When using for trainnig, the positional embedding would only be from 1 - 4 index values
<br>
<br>
So Positional Embedding layer will be, <br>
4 * 256 (Possible positions * vector dimension, same as token embedding vector)

In [729]:
position_embedding_layer = torch.nn.Embedding(context_size, output_dim)


position_embeddings = position_embedding_layer(torch.arange(context_size)) # just sets each fron 0 to max_length!

print(position_embedding_layer.weight)

Parameter containing:
tensor([[-1.0311, -0.5184, -0.3531,  ...,  1.2322, -0.7271,  1.7539],
        [-0.8358,  0.4943, -0.5284,  ..., -0.2198,  0.3671, -0.6770],
        [ 0.3858, -0.1030, -1.5038,  ..., -0.8790, -1.3250, -0.5609],
        [-0.0180, -0.7589, -1.5611,  ...,  1.0646,  0.1947, -1.8180]],
       requires_grad=True)


<h3> Adding token embedding and positional embedding!! </h3>


[ (256) (256) (256) (256) ]<br>
[ (256) (256) (256) (256) ]<br>
[ (256) (256) (256) (256) ]<br>
[ (256) (256) (256) (256) ]<br>
...
(8 x 4 x 256)


&nbsp;&nbsp; +

[ (256) (256) (256) (256) ]<br>
(4 x 256)

<br><br>

Same 4 values of positional embeddings stay like that only forever.

It still just adds that row to EVERY single row in the above matrix!!! <br>Happens automatically through matrix broadcasting!

<b>Finally =====> Input Embeddings (8 x 4 x 256)</b>

<hr>
<h2>Part 12</h2>