<h1> Building LLMs from Scratch </h1>

<h2> Part 7: Tokenization </h2>
(in sync with lectures... Lectures 1-6 were all intuition and theory)

In [3]:
# Load the full short story from disk into a single string, raw_text.

with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Sanity check: overall size plus the first 99 characters of the story.
print("Total number of characters:", len(raw_text))
print(raw_text[:99])


Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [30]:
import re


text = "Hello, world. Is this-- is a test?"

# \s matches any whitespace character (space, tab, newline, ...). Wrapping it
# in a capture group makes re.split keep the separators in the result list.
whitespace_pattern = r'(\s)'
result = re.split(whitespace_pattern, text)

print(result)

['Hello,', ' ', 'world.', ' ', 'Is', ' ', 'this--', ' ', 'is', ' ', 'a', ' ', 'test?']


In [31]:
# Split on punctuation and the "--" dash as well as on whitespace.
# The capture group keeps every delimiter as its own list entry.
token_pattern = r'([,.:;?_!"()\']|--|\s)'
pieces = re.split(token_pattern, text)

# Drop entries that are empty or whitespace-only. That is fine for prose, but
# it would lose information where whitespace is significant (e.g. source code).
result = [piece for piece in pieces if piece.strip()]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'is', 'a', 'test', '?']


In [34]:
# Apply the same punctuation/whitespace tokenization to the entire story.

preprocessed = [
    token
    for token in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
    if token.strip()  # skip empty and whitespace-only pieces
]
print(preprocessed[:30])
print("Total count:", len(preprocessed))

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']
Total count: 4690


<h3> Converting to token IDs </h3>

In [44]:
# A set keeps a single copy of every token; sorting then gives a stable order.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)

print(all_words[:99])
print(vocab_size)  # number of unique tokens

['!', '"', "'", '(', ')', ',', '--', '.', ':', ';', '?', 'A', 'Ah', 'Among', 'And', 'Are', 'Arrt', 'As', 'At', 'Be', 'Begin', 'Burlington', 'But', 'By', 'Carlo', 'Chicago', 'Claude', 'Come', 'Croft', 'Destroyed', 'Devonshire', 'Don', 'Dubarry', 'Emperors', 'Florence', 'For', 'Gallery', 'Gideon', 'Gisburn', 'Gisburns', 'Grafton', 'Greek', 'Grindle', 'Grindles', 'HAD', 'Had', 'Hang', 'Has', 'He', 'Her', 'Hermia', 'His', 'How', 'I', 'If', 'In', 'It', 'Jack', 'Jove', 'Just', 'Lord', 'Made', 'Miss', 'Money', 'Monte', 'Moon-dancers', 'Mr', 'Mrs', 'My', 'Never', 'No', 'Now', 'Nutley', 'Of', 'Oh', 'On', 'Once', 'Only', 'Or', 'Perhaps', 'Poor', 'Professional', 'Renaissance', 'Rickham', 'Riviera', 'Rome', 'Russian', 'Sevres', 'She', 'Stroud', 'Strouds', 'Suddenly', 'That', 'The', 'Then', 'There', 'They', 'This', 'Those']
1130


In [61]:
# Build the vocabulary: map every unique token to an integer ID, where the ID
# is the token's position in the sorted all_words list.
vocab = dict(zip(all_words, range(len(all_words))))

# Preview up to the first 51 (token, ID) pairs.
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


<h3> Tokenizer Class </h3>

In [90]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> ID vocabulary.

    Text is split on whitespace, the "--" dash and a fixed set of punctuation
    characters. Every resulting token must already be in the vocabulary.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse (ID -> token) lookup.

        vocab: mapping from token strings to integer IDs.
        """
        self.str_to_int = vocab
        self.int_to_str = dict((idx, token) for token, idx in vocab.items())

    def encode(self, text):
        """Convert a string into a list of token IDs.

        Raises KeyError if any token is missing from the vocabulary.
        """
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Drop empty strings and whitespace-only pieces.
        tokens = [piece for piece in pieces if piece.strip()]
        return [self.str_to_int[token] for token in tokens]

    def decode(self, IDs):
        """Convert a list of token IDs back into a single string.

        Raises KeyError if any ID is missing from the vocabulary.
        """
        # str.join returns one string with a single space between tokens.
        joined = " ".join(self.int_to_str[token_id] for token_id in IDs)
        # Remove the whitespace that the join put in front of punctuation.
        no_space_before = re.sub(r'\s+([,.?!"()\'])', r'\1', joined)
        # Remove whitespace that follows a single or double quote.
        return re.sub(r'([\'"])\s+', r'\1', no_space_before)

In [None]:
tokenizer = SimpleTokenizerV1(vocab) # vocab was built in an earlier cell: a dict mapping each token to its integer ID
text = """"It's the last he painted, you know,"
Mrs. Gisburn said with pardonable pride.
"""

# Encode the sample text into a list of token IDs.
ids = tokenizer.encode(text)
print(ids)

# Left commented out on purpose: "wassup" is not in the vocabulary, so encode() would fail on it.
# wont_work = tokenizer.encode("wassup") ----- this wont work because its not in our vocab
# print(wont_work)

# Decode the IDs back into text to check the round trip.
text_again = tokenizer.decode(ids)
print(text_again)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
"It's the last he painted, you know,"Mrs. Gisburn said with pardonable pride.


<h3>Special Context Tokens</h3>
Handling unknown words...

<|unk|> for unknown words <br>
<|endoftext|> a token between unrelated texts (when using multiple data sources)


In [111]:
# Extend the vocabulary with two special tokens. They are appended after the
# sorted corpus tokens, so they end up at the end of the list.
all_tokens = sorted(set(preprocessed))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
print(all_tokens[-5:])

# Token -> ID mapping, where the ID is the token's position in all_tokens.
vocab = dict(zip(all_tokens, range(len(all_tokens))))

print("New vocabulary size:", len(vocab))  # two more than the corpus-only vocabulary

['younger', 'your', 'yourself', '<|endoftext|>', '<|unk|>']
New vocabulary size: 1132


In [112]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

    Text is split on whitespace, the "--" dash and a fixed set of punctuation
    characters. Tokens missing from the vocabulary are encoded with the ID of
    the "<|unk|>" entry.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse (ID -> token) lookup.

        vocab: mapping from token strings to integer IDs.
        """
        self.str_to_int = vocab
        self.int_to_str = dict((idx, token) for token, idx in vocab.items())

    def encode(self, text):
        """Convert a string into a list of token IDs.

        Raises KeyError if an unknown token is met and "<|unk|>" is not in
        the vocabulary.
        """
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Drop empty strings and whitespace-only pieces.
        tokens = [piece for piece in pieces if piece.strip()]
        known = self.str_to_int
        # Look up each token, substituting the unknown marker when needed.
        return [
            known[token if token in known else "<|unk|>"]
            for token in tokens
        ]

    def decode(self, IDs):
        """Convert a list of token IDs back into a single string.

        Raises KeyError if any ID is missing from the vocabulary.
        """
        # str.join returns one string with a single space between tokens.
        joined = " ".join(self.int_to_str[token_id] for token_id in IDs)
        # Remove the whitespace that the join put in front of punctuation.
        no_space_before = re.sub(r'\s+([,.?!"()\'])', r'\1', joined)
        # Remove whitespace that follows a single or double quote.
        return re.sub(r'([\'"])\s+', r'\1', no_space_before)
    

In [156]:
tokenizer = SimpleTokenizerV2(vocab) # V2 falls back to <|unk|> for tokens that are not in vocab

## Sample text containing a word missing from the vocabulary and the <|endoftext|> marker
weird_text = "She had pardonable pride... <|endoftext|> Gisburn gave marshmellow tea!"
print(weird_text, "\n")

will_work = tokenizer.encode(weird_text) ## no KeyError this time: the unknown word is given the <|unk|> ID
print("IDs:", will_work, "\n") ## shows the IDs for both the unknown word and <|endoftext|>

# Decode the IDs back to text; the unknown word comes back as <|unk|>.
weird_text_again = tokenizer.decode(will_work)
print(weird_text_again)

She had pardonable pride... <|endoftext|> Gisburn gave marshmellow tea! 

IDs: [88, 514, 754, 793, 7, 7, 7, 1130, 38, 484, 1131, 975, 0] 

She had pardonable pride... <|endoftext|> Gisburn gave <|unk|> tea!


<h4>Note:</h4>Other special tokens also exist... (but even OpenAI's GPT models don't use these)
1) beginning of sequence (BOS) - start of new text source
2) padding (PAD) => shorter text sources are padded so that all sources have the same length (that of the longest one)... This helps with parallel processing

<hr>
<h2>Part 8</h2>