In [1]:
from importlib.metadata import version

print("torch version: ", version("torch"))
print("tiktoken version: ", version("tiktoken"))


torch version:  2.5.1
tiktoken version:  0.9.0


In [2]:
file_path = './data/Jane-Austen-Pride_and_Prejudice-pg1342.txt'

with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

print("Total number of charactoer: ", len(raw_text))
print(raw_text[:299])

Total number of charactoer:  748126
﻿The Project Gutenberg eBook of Pride and Prejudice
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg Lic


In [3]:
import re

text = 'Hello, world. This, is a test.'
result = re.split(r'(\s)', text)

print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [4]:
result = re.split(r'([,.]|\s)', text)
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [5]:
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [6]:
text = "Hello, world. Is this-- a test?"
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [7]:
preprocessed = re.split(r'([,.;:?_!"()\"]|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))


153971


In [8]:
print( preprocessed[:45])

['\ufeffThe', 'Project', 'Gutenberg', 'eBook', 'of', 'Pride', 'and', 'Prejudice', 'This', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'United', 'States', 'and', 'most', 'other', 'parts', 'of', 'the', 'world', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', '.', 'You', 'may', 'copy', 'it', ',', 'give', 'it']


In [9]:
# Create a Vocabulary
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

8254


In [10]:
vocab = {token:integer for integer, token in enumerate(all_words)}
for i, item in enumerate(vocab.items()):
    print(item)
    if i > 50:
        break

('!', 0)
('#1342]', 1)
('$1', 2)
('$5', 3)
('&', 4)
('(', 5)
(')', 6)
('***', 7)
('*/', 8)
(',', 9)
('-', 10)
('--', 11)
('.', 12)
('/*', 13)
('//www', 14)
('000', 15)
('1', 16)
('10', 17)
('108', 18)
('113', 19)
('118', 20)
('12', 21)
('132', 22)
('139', 23)
('143', 24)
('146', 25)
('148', 26)
('15', 27)
('1500', 28)
('154', 29)
('156', 30)
('15th', 31)
('161', 32)
('166', 33)
('168', 34)
('175', 35)
('177', 36)
('1796', 37)
('18', 38)
('181', 39)
('1813', 40)
('189', 41)
('1894', 42)
('1894]', 43)
('18th', 44)
('194', 45)
('198', 46)
('1998', 47)
('2', 48)
('20%', 49)
('200', 50)
('2001', 51)


In [11]:
import os

class SimpleTokenizerV1:
    def __init__(self, vocablary):    # vocabulary
        self.str_to_int = vocablary                                         #A
        self.int_to_str = {i:s for s,i in vocablary.items()}                #B

    def encode(self, text):                                                 #C
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):                                                  #D
        return ' '.join([self.int_to_str[i] for i in ids])   
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)                     #E
        return text

tokenizer = SimpleTokenizerV1(vocab)
text = "but I think she would have made a pretty good journey even in a black one."
print(text)
ids = tokenizer.encode(text)
print(ids)

but I think she would have made a pretty good journey even in a black one.
[1827, 501, 7173, 6571, 7860, 3928, 4752, 1033, 5720, 3761, 4494, 3206, 4193, 1033, 1703, 5235, 12]


In [12]:
print(tokenizer.decode(ids))

but I think she would have made a pretty good journey even in a black one .


In [13]:
# Add the special context token
all_tokens = sorted(list(set(preprocessed)))
print(" All number of the all_tokens : ", len(all_tokens))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer, token in enumerate(all_tokens)}
print(len(vocab.items()))

 All number of the all_tokens :  8254
8256


In [14]:
for i, item in enumerate(list(vocab.items())[-30:]):
    print(item)

('“this', 8226)
('“though', 8227)
('“to', 8228)
('“undoubtedly', 8229)
('“very', 8230)
('“was', 8231)
('“we', 8232)
('“were', 8233)
('“what', 8234)
('“when', 8235)
('“where', 8236)
('“whether', 8237)
('“which', 8238)
('“while', 8239)
('“who', 8240)
('“why', 8241)
('“will', 8242)
('“without', 8243)
('“yes', 8244)
('“you', 8245)
('“your', 8246)
('“‘After', 8247)
('“‘I', 8248)
('“‘When', 8249)
('“’Tis', 8250)
('”', 8251)
('•', 8252)
('\ufeffThe', 8253)
('<|endoftext|>', 8254)
('<|unk|>', 8255)


In [26]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s, i in vocab.items()}

    def encode(self, text):
        tokens = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        tokens = [token.strip() for token in tokens if token.strip()]
        #print(" Tokens = ", tokens)
        preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in tokens]
        ids = [self.str_to_int[s] for s in preprocessed]
        #print(" idx = ", ids)
        return ids

    def decode(self, token_ids):
        text = " ".join([self.int_to_str[i] for i in ids])                   #B
        text = re.sub(r'\s+([,.;:?!"()\'])', r'\1', text)
        return text

text1 = "Hello, do you like tea?"
text2 = "but I overhaul she would have made a pretty good journey even in a pink one."
text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> but I overhaul she would have made a pretty good journey even in a pink one.


In [27]:
tokenizer = SimpleTokenizerV2(vocab)
print(tokenizer.encode(text))

 Tokens =  ['Hello', ',', 'do', 'you', 'like', 'tea', '?', '<|endoftext|>', 'but', 'I', 'overhaul', 'she', 'would', 'have', 'made', 'a', 'pretty', 'good', 'journey', 'even', 'in', 'a', 'pink', 'one', '.']
 idx =  [8255, 9, 2894, 7890, 4656, 7098, 142, 8254, 1827, 501, 8255, 6571, 7860, 3928, 4752, 1033, 5720, 3761, 4494, 3206, 4193, 1033, 8255, 5235, 12]
[8255, 9, 2894, 7890, 4656, 7098, 142, 8254, 1827, 501, 8255, 6571, 7860, 3928, 4752, 1033, 5720, 3761, 4494, 3206, 4193, 1033, 8255, 5235, 12]


In [32]:
print("text = ", text)
encoded = tokenizer.encode(text)
print(" Encoding = ", encoded)
print(" length = ", len(encoded))

decoded = tokenizer.decode(tokenizer.encode(text)) 
print(" Decoded = ", decoded)
print(" decoded length = ", len(decoded))


text =  Hello, do you like tea? <|endoftext|> but I overhaul she would have made a pretty good journey even in a pink one.
 Tokens =  ['Hello', ',', 'do', 'you', 'like', 'tea', '?', '<|endoftext|>', 'but', 'I', 'overhaul', 'she', 'would', 'have', 'made', 'a', 'pretty', 'good', 'journey', 'even', 'in', 'a', 'pink', 'one', '.']
 idx =  [8255, 9, 2894, 7890, 4656, 7098, 142, 8254, 1827, 501, 8255, 6571, 7860, 3928, 4752, 1033, 5720, 3761, 4494, 3206, 4193, 1033, 8255, 5235, 12]
 Encoding =  [8255, 9, 2894, 7890, 4656, 7098, 142, 8254, 1827, 501, 8255, 6571, 7860, 3928, 4752, 1033, 5720, 3761, 4494, 3206, 4193, 1033, 8255, 5235, 12]
 length =  25
 Tokens =  ['Hello', ',', 'do', 'you', 'like', 'tea', '?', '<|endoftext|>', 'but', 'I', 'overhaul', 'she', 'would', 'have', 'made', 'a', 'pretty', 'good', 'journey', 'even', 'in', 'a', 'pink', 'one', '.']
 idx =  [8255, 9, 2894, 7890, 4656, 7098, 142, 8254, 1827, 501, 8255, 6571, 7860, 3928, 4752, 1033, 5720, 3761, 4494, 3206, 4193, 1033, 8255, 52