In [1]:
import os
import urllib.request
import re

In [2]:
# Fetch the short story used as the sample corpus, unless a local copy already
# exists (so re-running the notebook does not download it again).
# The filename is defined once and reused for both the existence check and the
# download target, and it is bound whether or not the download branch runs.
file_path = "the-verdict.txt"
if not os.path.exists(file_path):
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    urllib.request.urlretrieve(url, file_path)

In [3]:
# Read the whole story into memory as a single string
with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Report the length of the text and preview its first 99 characters
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


#### Main goal: create a vocabulary, i.e., a dictionary mapping each token to a token ID
Steps:
<ul>
    <li>Split the text into tokens using a regex</li>
    <li>Create a mapping from token to token ID for the whole text -> vocabulary</li>
</ul>

In [4]:
# Demonstrate the splitting rule on a short sample sentence
text = "Hello, world. Is this-- a test?"

# The capture group makes re.split keep the delimiters (punctuation, "--",
# whitespace) as separate pieces; stripping each piece and filtering out the
# empty strings leaves only the real tokens.
result = list(filter(None, map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', text))))
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [5]:
# Tokenize the entire story with the same splitting rule as the example
stripped_pieces = (piece.strip() for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text))
# Keep only the non-empty pieces (whitespace-only fragments strip down to "")
tokens = [piece for piece in stripped_pieces if piece]
print(tokens[:10])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius']


In [6]:
# Give every distinct token an id equal to its position in sorted order
unique_tokens = sorted(set(tokens))
vocab = dict(zip(unique_tokens, range(len(unique_tokens))))

In [7]:
# Vocabulary size: one entry per distinct token
len(vocab)

1130

In [8]:
# Sanity check on the container type used for the vocabulary
type(vocab)

dict

In [16]:
# Show the last five (token, id) pairs of the vocabulary.
# The loop index was never used, so iterate over the pairs directly.
for item in list(vocab.items())[-5:]:
    print(item)

('yet', 1125)
('you', 1126)
('younger', 1127)
('your', 1128)
('yourself', 1129)


In [None]:
# Rebuild the vocabulary with two special tokens appended after the sorted
# corpus tokens, so they receive the two highest ids
all_tokens = sorted(set(tokens)) + ["<|endoftext|>", "<|unk|>"]
vocab = dict((token, integer) for integer, token in enumerate(all_tokens))

In [18]:
# Show the last five (token, id) pairs; the special tokens should appear at the end.
# The loop index was never used, so iterate over the pairs directly.
for item in list(vocab.items())[-5:]:
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [10]:
# SimpleTokenizerV1 comes from the project-local `tokenizer` module (not shown here)
from tokenizer import SimpleTokenizerV1

# Build the tokenizer on top of the current vocabulary
tokenizer_obj = SimpleTokenizerV1(vocab=vocab)

# Sample passage. The triple-quoted literal spans two lines, so the string
# contains a newline followed by the second line's leading spaces.
text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""

# Encode the passage; the next cell displays the result.
# NOTE(review): presumably a list of integer token ids — confirm against tokenizer.py
ids = tokenizer_obj.encode(text)

In [11]:
# Display the value returned by encode() for the sample passage
ids

[1,
 56,
 2,
 850,
 988,
 602,
 533,
 746,
 5,
 1126,
 596,
 5,
 1,
 67,
 7,
 38,
 851,
 1108,
 754,
 793,
 7]

In [4]:
# "Hello" and "palace" do not occur in the vocabulary, so they get mapped to <|unk|>
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
# Put the end-of-text marker between the two independent texts
text = text1 + " <|endoftext|> " + text2
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [20]:
# SimpleTokenizerV2 comes from the project-local `tokenizer` module (not shown here)
from tokenizer import SimpleTokenizerV2
# Uses the vocabulary that was extended with <|endoftext|> and <|unk|>
tokenizer = SimpleTokenizerV2(vocab)

In [21]:
# Encode the joined sample text and show the resulting token ids
encoded_ids = tokenizer.encode(text)
print(encoded_ids)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]


In [22]:
# Round trip: encode the sample text, then decode the ids back to text
round_trip_ids = tokenizer.encode(text)
print(tokenizer.decode(round_trip_ids))

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [1]:
import tiktoken 

In [2]:
# Load tiktoken's "gpt2" encoding.
# NOTE(review): this rebinds `tokenizer`, which an earlier cell bound to a
# SimpleTokenizerV2 instance; every later cell uses the tiktoken encoding.
tokenizer = tiktoken.get_encoding("gpt2")

In [5]:
# Encode the sample text, explicitly permitting <|endoftext|> as a special token
permitted_special = {"<|endoftext|>"}
integers = tokenizer.encode(text, allowed_special=permitted_special)
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 262, 20562, 13]


In [7]:
# Decode the token ids back into text and print it for comparison with the input
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [9]:
# Encode a made-up string to see which token ids it is broken into
made_up_ids = tokenizer.encode("Akwirw ier")
print(made_up_ids)

[33901, 86, 343, 86, 220, 959]


In [10]:
# Round trip the made-up string: encode it, then decode the ids back to text
made_up_round_trip_ids = tokenizer.encode("Akwirw ier")
print(tokenizer.decode(made_up_round_trip_ids))

Akwirw ier
