# LLM from scratch

## Step 1: Creating Tokens

In [None]:
# Read the entire short story into a single string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as f:
    raw_text = f.read()

num_chars = len(raw_text)
print("total number of character: ", num_chars)

# Preview the start of the text: the slice [:99] yields the first 99 characters.
print(raw_text[:99])

total number of character:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


The goal is to tokenize all 20,479 characters of the document.
To split the text into individual tokens, we can use Python's `re` (regular expressions) module.

In [2]:
import re

# The capturing group makes the split keep its separators, so every
# whitespace character appears in the result as its own item.
whitespace = re.compile(r'(\s)')
text = "hello, world. This is a test"
result = whitespace.split(text)

print(result)

['hello,', ' ', 'world.', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test']


The result contains words and whitespace, but commas and full stops are still attached to the words; we need them as separate tokens.

In [3]:
# Split on commas and periods as well as whitespace. Where punctuation is
# directly followed by whitespace, an empty string appears between the two
# separators in the result.
punct_or_space = re.compile(r'([,.]|\s)')
result = punct_or_space.split(text)
print(result)

['hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test']


In [None]:
# Drop items that are empty or whitespace-only: str.strip returns ""
# (falsy) for those, so filter() discards them.
result = list(filter(str.strip, result))
print(result)

['hello', ',', 'world', '.', 'This', 'is', 'a', 'test']


Removing whitespace reduces memory and compute requirements. However, keeping whitespace can be useful for models that are sensitive to the exact structure of the text, e.g. Python code, which is sensitive to indentation.

In [7]:
# Widen the separators to more punctuation marks and the double dash.
text = "hello, world. This is -- a test?"
result = []
for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
    # Keep only pieces that still contain something once stripped.
    if piece.strip():
        result.append(piece)
print(result)

['hello', ',', 'world', '.', 'This', 'is', '--', 'a', 'test', '?']


Now we have a basic tokenizer working. Let's apply it to our document

In [9]:
# Tokenize the whole story: strip each piece once, then keep the non-empty ones.
stripped_pieces = map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', raw_text))
preprocessed = [piece for piece in stripped_pieces if piece]
print(preprocessed[:30])
print(len(preprocessed))

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']
4690


## Step 2: Convert Tokens into Token IDs

In [10]:
# The vocabulary entries are the distinct tokens, in sorted order.
unique_tokens = set(preprocessed)
all_words = sorted(unique_tokens)
vocab_size = len(all_words)

print(vocab_size)

1130


In [None]:
# Map each token to its position in the sorted list, then show the
# first 51 entries (ids 0 through 50).
vocab = {token: index for index, token in enumerate(all_words)}
for entry in list(vocab.items())[:51]:
    print(entry)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


### Implement the Tokenizer class in Python

Encode method takes text and gives ids

Decode method takes ids and gives text

In [18]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, the double dash and a fixed set of
    punctuation marks; every resulting token is looked up in the vocabulary.
    """

    # Compiled once when the class is created rather than on every call.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # ':' and ';' are included because encode() splits them off as separate
    # tokens; without them decode() would leave a stray space ("word ;").
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse lookup.

        Args:
            vocab: Mapping from token string to integer id.
        """
        self.str_to_int = vocab
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token ids.

        Args:
            text: The string to tokenize.

        Returns:
            A list with one integer id per token, in order of appearance.

        Raises:
            KeyError: If a token is not present in the vocabulary.
        """
        # Strip each piece once and drop the empty / whitespace-only ones.
        stripped = (piece.strip() for piece in self._SPLIT_PATTERN.split(text))
        return [self.str_to_int[token] for token in stripped if token]

    def decode(self, ids):
        """Convert a sequence of token ids back into text.

        Args:
            ids: Iterable of integer ids produced by encode().

        Returns:
            The tokens joined by single spaces, with the whitespace in front
            of punctuation marks removed.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[token_id] for token_id in ids)
        # Re-attach punctuation to the preceding token.
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [19]:
# Build a tokenizer from the vocabulary and encode a short passage.
# The passage is a multi-line literal, so the newline and the indentation
# of its second line are part of the text being tokenized.
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know,"
            Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [20]:
# Map the ids back to text: tokens are joined with single spaces and the
# space in front of punctuation marks is removed again.
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [21]:
# "Hello" has no entry in the vocabulary, so the dictionary lookup inside
# encode() raises KeyError: 'Hello'.
text = "Hello, do you like tea?"
ids = tokenizer.encode(text)
print(ids)

KeyError: 'Hello'

`Hello` does not exist in the vocabulary, so the lookup raises a `KeyError`. To handle such cases, we add special context tokens.

### Adding Special Context Tokens

Modify the tokenizer to handle unknown words (words not in the vocabulary). Use the special unknown-word token `<|unk|>` and the end-of-text token `<|endoftext|>`.

In [33]:
# Rebuild the sorted token list and append the two special tokens last,
# so they receive the two highest ids.
all_tokens = sorted(set(preprocessed))
all_tokens += ["<|endoftext|>", "<|unk|>"]

vocab = {token: index for index, token in enumerate(all_tokens)}
len(vocab)

1132

In [34]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

    Text is split on whitespace, the double dash and a fixed set of
    punctuation marks. Tokens missing from the vocabulary are encoded with
    the id of the "<|unk|>" entry instead of raising an error.
    """

    # Compiled once when the class is created rather than on every call.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # ':' and ';' are included because encode() splits them off as separate
    # tokens; without them decode() would leave a stray space ("word ;").
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse lookup.

        Args:
            vocab: Mapping from token string to integer id. It is expected to
                contain a "<|unk|>" entry; that entry is only looked up when
                an unknown token is actually encountered.
        """
        self.str_to_int = vocab
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token ids.

        Args:
            text: The string to tokenize.

        Returns:
            A list with one integer id per token, in order of appearance.
            Tokens not present in the vocabulary get the "<|unk|>" id.

        Raises:
            KeyError: If an unknown token is met and the vocabulary has no
                "<|unk|>" entry.
        """
        str_to_int = self.str_to_int
        # Strip each piece once and drop the empty / whitespace-only ones.
        stripped = (piece.strip() for piece in self._SPLIT_PATTERN.split(text))
        return [
            # The change compared to V1: fall back to the unknown token.
            str_to_int[token] if token in str_to_int else str_to_int["<|unk|>"]
            for token in stripped
            if token
        ]

    def decode(self, ids):
        """Convert a sequence of token ids back into text.

        Args:
            ids: Iterable of integer ids produced by encode().

        Returns:
            The tokens joined by single spaces, with the whitespace in front
            of punctuation marks removed.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[token_id] for token_id in ids)
        # Re-attach punctuation to the preceding token.
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [35]:
# Switch to the V2 tokenizer and build a sample made of two independent
# texts separated by the <|endoftext|> marker.
tokenizer = SimpleTokenizerV2(vocab)
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [36]:
# Words missing from the vocabulary are encoded with the <|unk|> id
# instead of raising a KeyError.
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]

In [37]:
# Round trip: unknown words come back as <|unk|>, not as their original text.
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

### BYTE PAIR ENCODING
Byte pair encoding (BPE) is what GPT uses for tokenization.