In [31]:
# Store the book in a variable raw_text
with open("books_text/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
with open("books_text/sorcerers-stone.txt", "r", encoding="utf-8") as f:
    hp1_sorcerers_stone_text = f.read()
print(raw_text[:50])
print(hp1_sorcerers_stone_text[:50])

I HAD always thought Jack Gisburn rather a cheap g
THE BOY WHO LIVED

Mr. and Mrs. Dursley, of number


In [32]:
import re

# Use regex to get split by punctuation
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
# Get rid of the white spaces. White spaces are not needed for this use case
preprocessed = [item.strip() for item in preprocessed if item.strip()]

hp1_preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', hp1_sorcerers_stone_text)
hp1_preprocessed = [item.strip() for item in hp1_preprocessed if item.strip()]

print(preprocessed[:50])
print(hp1_preprocessed[:50])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself']
['THE', 'BOY', 'WHO', 'LIVED', 'Mr', '.', 'and', 'Mrs', '.', 'Dursley', ',', 'of', 'number', 'four', ',', 'Privet', 'Drive', ',', 'were', 'proud', 'to', 'say', 'that', 'they', 'were', 'perfectly', 'normal', ',', 'thank', 'you', 'very', 'much', '.', 'They', 'were', 'the', 'last', 'people', 'you’d', 'expect', 'to', 'be', 'involved', 'in', 'anything', 'strange', 'or', 'mysterious', ',', 'because']


In [4]:
# Create the Token IDs
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)

1130


In [10]:
vocab = {token: integer for integer,token in enumerate(all_words)}
for token,i in vocab.items():
    print(f'ID is {i} and item is {token}')
    if i == 10:
        break


ID is 0 and item is !
ID is 1 and item is "
ID is 2 and item is '
ID is 3 and item is (
ID is 4 and item is )
ID is 5 and item is ,
ID is 6 and item is --
ID is 7 and item is .
ID is 8 and item is :
ID is 9 and item is ;
ID is 10 and item is ?


In [None]:
# Create Tokenizer class
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i: item for item,i in vocab.items()}
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
                                
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

# Test out the Tokenizer class, and the encode method
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)


[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [None]:
# Test out the decode method
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [None]:
# If tokens are not properly mapped, an error will be thrown since in this case, 
# Hello was not in the pretrained data
text = "Hello, do you like tea?"
print(tokenizer.encode(text))

KeyError: 'Hello'

In [33]:
# <|unk|> and <|endoftext|> can be used when,
# Use <|unk|> when tokenizer encounters an unknown word (isn't in vocab)
# Use <|endoftext|> to signifiy the end of a text 
# Ex. When we finish reading article 1, we put <|endoftext|>
# At the start of article 2, and so on
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

all_tokens_hp1 = sorted(list(set(hp1_preprocessed)))
all_tokens_hp1.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token: i for i,token in enumerate(all_tokens)}
vocab_hp1 = {token: i for i,token in enumerate(all_tokens_hp1)}

print(len(vocab))
print(len(vocab_hp1))

1132
7388


In [35]:
# As you can see, the two special tokens are present at the bottom
for i,item in enumerate(list(vocab.items())[-5:]):
    print(f'({item}, {i})')
print("--------")
for i,item in enumerate(list(vocab_hp1.items())[-5:]):
    print(f'({item}, {i})')

(('younger', 1127), 0)
(('your', 1128), 1)
(('yourself', 1129), 2)
(('<|endoftext|>', 1130), 3)
(('<|unk|>', 1131), 4)
--------
(('”', 7383), 0)
(('•k', 7384), 1)
(('■”', 7385), 2)
(('<|endoftext|>', 7386), 3)
(('<|unk|>', 7387), 4)


In [42]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i: token for token,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)           
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        # If token is not present in our current vocab, replace it by an unknown token
        preprocessed = [
            token if token in self.str_to_int 
            else "<|unk|>" for token in preprocessed
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

tokenizer = SimpleTokenizerV2(vocab)
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace"
# Append the <|endoftext|> at the end of text 1 and beginning
# of text 2
text = " <|endoftext|> ".join((text1,text2))
print(text)
        

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace


In [43]:
hp1_tokenizer = SimpleTokenizerV2(vocab_hp1)
text1 = "Percy Jackson and the Lightning Thief"
text2 = "Hello. It's me. I was wondering if after all..."

text_hp = " <|endoftext|> ".join((text1,text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace


In [44]:
# 1131 is the token ID of "<|unk|>" and 1130 is the 
# token ID of <|endoftext|>
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131]

In [45]:
hp1_tokenizer.encode(text_hp)

[849,
 7387,
 1380,
 6013,
 7387,
 1105,
 7386,
 7387,
 7,
 587,
 2,
 5058,
 4058,
 7,
 568,
 6452,
 6641,
 3604,
 1328,
 1354,
 7,
 7,
 7]

In [46]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>'

In [47]:
hp1_tokenizer.decode(hp1_tokenizer.encode(text_hp))

"Percy <|unk|> and the <|unk|> Thief <|endoftext|> <|unk|>. It' s me. I was wondering if after all..."

In [None]:
# From comparing the decoded tokenized text above with 
# The original input text, we know that the training set 
# did not contain the words "Hello" and "palace"

# Could use other tokens
# [BOS] (beginning of sequence) Marks the start of a text/sequence
# [EOS] (end of sequence): Positioned at the end of a text
# and is useful for concatenating multiple unrelated texts.
# [PAD] (padding): When training LLMs with a batch size larger
# than one, the batch might contain texts of varying lengths. To ensure that 
# all the texts have the smae length the shorter texts
# are padded using [PAD] token, up to length of the longest text in batch

# Tokenizer used for GPT models does not need any of these 
# tokens mentioned but only uses an <|endoftext|> for simplicity

# byte pair enconding tokenizer breaks down words 
# into subword units for tokenizer with GPT models

#### BYTE PAIR ENCODING

In [48]:
# Use tiktoken github
! pip3 install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.11.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.7 kB)
Downloading tiktoken-0.11.0-cp312-cp312-macosx_11_0_arm64.whl (996 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m996.7/996.7 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.11.0


In [49]:
import importlib
import tiktoken

print(f'tiktoken version: {importlib.metadata.version("tiktoken")}')

tiktoken version: 0.11.0


In [None]:
# Instantiate BPE tokenizer from tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
hp1_tokenizer = tiktoken.get_encoding("gpt2")

In [51]:
# Usage of tokenizer is similar to SimpleTokenzizerV2
# We implemented previously via an encode method
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunknownPlace"
)
encoded_mapping = tokenizer.encode(text, allowed_special = {"<|endoftext|>"})
print(encoded_mapping)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271]


In [52]:
# Can then convert token IDs back into text using the decode
# method. Similar to the SimpleTokenizerV2
# Tokenizer is able to encode words that look wrong, like 
# someunknownPlace

tokens = tokenizer.decode(encoded_mapping)
print(tokens)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace


In [None]:
# Two advantages so far.
# 1. Reduces the amount of tokens we have
# 2. Can deal with unknown words


### Can make two noteworthy observations based on the token IDs and decoded text above:

#### 1. <|endoftext|> token is assigned to relatively large token ID (50256)
#### 2. BPE tokenizer above encodes and decodes unknown words, can handle any unknown word.

#### Algorithm underlying BPE breaks down words that aren't in its predefined vocab into smaller subwords. 
#### This enables it to handle out-of-vocabulary words.
#### Because of BPE algorithm, if the tokenizer encounters an unfamilar word during tokenization, it can represent it as a sequence of subwords or characters

In [None]:
# Proof that it works, even with gibberish
encoded_mapping = tokenizer.encode("Akwirq ier")
print(encoded_mapping)

tokens = tokenizer.decode(encoded_mapping)
print(tokens)

[33901, 86, 343, 80, 220, 959]
Akwirq ier


In [None]:
# One more example with words that include apostrophes
text = "tames, breaks, lames, fames, dont's, can't, shouldn't, can't, won't, against, lust, must, rust, fuss, shan't"
encoded_mapping = tokenizer.encode(text)
print(encoded_mapping)

tokens = tokenizer.decode(encoded_mapping)
print(tokens)

[83, 1047, 11, 9457, 11, 300, 1047, 11, 277, 1047, 11, 17666, 338, 11, 460, 470, 11, 6584, 470, 11, 460, 470, 11, 1839, 470, 11, 1028, 11, 22279, 11, 1276, 11, 17000, 11, 34297, 11, 427, 272, 470]
tames, breaks, lames, fames, dont's, can't, shouldn't, can't, won't, against, lust, must, rust, fuss, shan't
