In [103]:
class BasicTokenizer:
    """Minimal byte-level byte-pair-encoding (BPE) tokenizer.

    Text is first turned into its UTF-8 bytes (token ids 0..255). Training
    repeatedly replaces the most frequent adjacent pair of ids with a new id
    (256, 257, ...) and records each replacement in ``merges``.

    Attributes:
        load: Text read from the most recent training file, or None.
        merges: Mapping ``(id, id) -> new_id`` in the order the merges were learned.
        processed: Unused placeholder, kept so existing attribute access still works.
        num_merges: Number of merges requested by the last ``train`` call.
        vocab: Mapping ``id -> bytes`` used by ``decode``.
    """

    def __init__(self):
        self.load = None
        self.merges = {}
        self.processed = None
        self.num_merges = 0
        # Start with the 256 single-byte tokens so that an untrained tokenizer
        # can already decode what encode() produces.
        self.vocab = self._build_vocab()

    def _build_vocab(self):
        """Return the id -> bytes table implied by the current ``merges``."""
        vocab = {idx: bytes([idx]) for idx in range(256)}
        # dicts keep insertion order, so both parents of a merged token are
        # always present by the time the merged token is built.
        for (p0, p1), idx in self.merges.items():
            vocab[idx] = vocab[p0] + vocab[p1]
        return vocab

    def train(self, text, vocab_size=276, verbose=False):
        """Learn merges from a text file.

        Args:
            text: Path of a UTF-8 text file to train on (a path, not the text itself).
            vocab_size: Target vocabulary size; ``vocab_size - 256`` merges are
                attempted. Values below 256 result in no merges.
            verbose: If True, print the largest byte value, the most frequent
                pair, and every merge as it is learned.

        Training stops early when no adjacent pair is left to merge.
        """
        self.load = None

        # The context manager guarantees the file handle is closed.
        with open(text, "r", encoding="utf8") as f:
            self.load = f.read()

        # Raw UTF-8 bytes as a list of integers in range 0..255.
        tokens = list(self.load.encode("utf-8"))

        if verbose and tokens:
            print(max(tokens))
            stats = self.get_stats(tokens)
            if stats:
                print("top_pair: ", max(stats, key=stats.get))

        self.num_merges = max(0, vocab_size - 256)
        ids = list(tokens)  # real copy so the original list is left untouched

        # Reset the merge table every time the tokenizer is trained.
        self.merges = {}  # (int, int) -> int
        for i in range(self.num_merges):
            stats = self.get_stats(ids)
            if not stats:
                break  # fewer than two tokens left: nothing more to merge
            pair = max(stats, key=stats.get)
            idx = 256 + i
            if verbose:
                print(f"merging {pair} into a new token {idx}")
            ids = self.merge(ids, pair, idx)
            self.merges[pair] = idx

        self.vocab = self._build_vocab()

    def encode(self, text):
        """Given a string, return its token ids as a list of integers."""
        tokens = list(text.encode("utf-8"))
        while len(tokens) >= 2:
            stats = self.get_stats(tokens)
            # Pick the pair that was learned earliest (lowest merged id);
            # pairs that were never learned rank as infinity.
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            if pair not in self.merges:
                break  # nothing else can be merged
            tokens = self.merge(tokens, pair, self.merges[pair])
        return tokens

    def decode(self, ids):
        """Given a list of token ids, return the decoded Python string.

        Byte sequences that are not valid UTF-8 are rendered with the Unicode
        replacement character instead of raising.
        """
        tokens = b"".join(self.vocab[idx] for idx in ids)
        return tokens.decode("utf-8", errors="replace")

    def get_stats(self, ids):
        """Count how often each adjacent pair of ids occurs in ``ids``."""
        counts = {}
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def merge(self, ids, pair, idx):
        """Return a new list where each left-to-right occurrence of ``pair`` is replaced by ``idx``."""
        newids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
                newids.append(idx)
                i += 2
            else:
                newids.append(ids[i])
                i += 1
        return newids


In [104]:
# Create a tokenizer instance; it has no merges until train() is called.
btoken = BasicTokenizer()

In [105]:
# Alternative call with an explicit vocab_size (left disabled):
# btoken.train("taylorswift.txt",25)
# Train on the file with the default vocab_size (276) and print every merge.
btoken.train("taylorswift.txt", verbose=True)

226
top_pair:  (101, 32)
merging (101, 32) into a new token 256
merging (44, 32) into a new token 257
merging (100, 32) into a new token 258
merging (46, 32) into a new token 259
merging (114, 32) into a new token 260
merging (50, 48) into a new token 261
merging (115, 32) into a new token 262
merging (105, 110) into a new token 263
merging (111, 110) into a new token 264
merging (114, 105) into a new token 265
merging (116, 32) into a new token 266
merging (116, 104) into a new token 267
merging (101, 258) into a new token 268
merging (257, 261) into a new token 269
merging (97, 110) into a new token 270
merging (97, 114) into a new token 271
merging (101, 260) into a new token 272
merging (121, 32) into a new token 273
merging (97, 108) into a new token 274
merging (267, 256) into a new token 275


In [106]:
# Round-trip a short string through encode -> decode and print the result.
print(btoken.decode(btoken.encode("hello world")))

hello world


We can see that "hello world" works with this tokenizer: if we encode it (starting from its UTF-8 bytes) and decode it back, we get the original "hello world".

However, I am going to test three more chunks of text to see whether it holds up.

In [107]:
# Sample passages used below to check that encode -> decode returns the input unchanged.
# ch1: a long multi-paragraph passage; it starts mid-word.
ch1 = """upporter of the arts. A benefactor of the Nashville Songwriters Hall of Fame,[610] Swift has donated $75,000 to Nashville's Hendersonville High School to help refurbish the school auditorium,[611] $4 million to build a new education center at the Country Music Hall of Fame and Museum in Nashville,[612] $60,000 to the music departments of six US colleges,[613] and $100,000 to the Nashville Symphony.[614] Also a promoter of children's literacy, she has donated money and books to schools around the country.[615][616] In 2007, Swift partnered with the Tennessee Association of Chiefs of Police to launch a campaign to protect children from online predators.[617] She has donated items to several charities for auction, including the UNICEF Tap Project and MusiCares.[618] As recipient of the Academy of Country Music's Entertainer of the Year in 2011, Swift donated $25,000 to St. Jude Children's Research Hospital, Tennessee.[619] In 2012, Swift participated in the Stand Up to Cancer telethon, performing the charity single "Ronan", which she wrote in memory of a four-year-old boy who died of neuroblastoma.[620] She has also donated $100,000 to the V Foundation for Cancer Research[621] and $50,000 to the Children's Hospital of Philadelphia.[622] Swift has encouraged young people to volunteer in their local communities as part of Global Youth Service Day.[623]
Swift donated to fellow singer-songwriter Kesha to help with her legal battles against Dr. Luke and to actress Mariska Hargitay's Joyful Heart Foundation.[593][624] During the COVID-19 pandemic, Swift donated to the World Health Organization and Feeding America,[625] and supported independent record stores.[626][627] Swift performed "Soon You'll Get Better" on the One World: Together At Home television special, a benefit concert curated by Lady Gaga for Global Citizen to raise funds for the World Health Organization's COVID-19 Solidarity Response Fund.[628] In 2018 and 2021, Swift donated to the Rape, Abuse & Incest National Network in honor of Sexual Assault Awareness and Prevention Month.[593][629] She has made donations to her fans several times for their medical or academic expenses.[630] In December 2023, Swift attended Ramy Youssef's fundraiser for the Gaza Strip.[631]
Discography"""

# ch2: a short two-line passage containing quotes and dates.
ch2 = """ard Professor Critiques Taylor Swift's New Poems". Cosmopolitan. Retrieved December 21, 2021.
 Sheffield, Rob (October 13, 2023). "Taylor Swift's 'Era"""

# ch3: a passage that both starts and ends with a newline.
ch3 = """
Eras Tour' Debut Slays (And Could Break All-Time Touring Record)". Pollstar. March 18, 2023. Archived from the original on March 20, 2023. Retrieved June 30, 2023.
 Aramesh, Waiss David (March 18, 2023). "Taylor Swift's The Eras Tour Is a 3-Hour Career-Spanning Victory Lap". Rolling Stone. OCLC 1787396. Archived from the original on March 18, 2023. Retrieved June 30, 2023.
 Gambles, Sarah (July 23, 2023). "The ubiquitous power of Taylor Swift". Deseret News. Retrieved July 23, 2023.
 McCormick, Neil (March 18, 2023). "Taylor Swift:
"""



In [110]:
# True when encode -> decode reproduces the input string exactly.
btoken.decode(btoken.encode("hello world")) == "hello world"

True

In [118]:
# One boolean per sample passage: does encode -> decode give back the same text?
tuple(btoken.decode(btoken.encode(chunk)) == chunk for chunk in (ch1, ch2, ch3))

(True, True, True)

We can see that the decoded text matches the original in all three cases, which shows that the tokenizer round-trips correctly.

In [123]:
# Encode the same text with and without a trailing space to compare the resulting token ids.
btoken.encode("Tour' "), btoken.encode("Tour'")

([84, 111, 117, 114, 39, 32], [84, 111, 117, 114, 39])

Step 2

Convert your BasicTokenizer into a RegexTokenizer, which takes a regex pattern and splits the text exactly as GPT-4 would. Process the parts separately as before, then concatenate the results. Retrain your tokenizer and compare the results before and after. You should see that you will now have no tokens that go across categories (numbers, letters, punctuation, more than one whitespace). Use the GPT-4 pattern:

In [125]:
# Split pattern given by the exercise ("the GPT-4 pattern"). Its alternatives, in order:
# contractions ('s, 'll, ...), letter runs with an optional leading non-letter,
# digit runs of 1-3, punctuation runs, and several whitespace cases.
# It is compiled with the third-party `regex` module in the next cell.
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""

In [126]:
import regex as re
# Compile the split pattern once (note: `re` here is the `regex` package, not stdlib re)
# and show how it breaks a sample sentence into pieces.
gpt2pat = re.compile(GPT4_SPLIT_PATTERN)
print(gpt2pat.findall("Hello! How've you been?"))

['Hello', '!', ' How', "'ve", ' you', ' been', '?']


In [None]:
# Same pattern string as the earlier GPT4_SPLIT_PATTERN cell, redefined in this cell.
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
class RegexTokenizer:
    """Byte-level BPE tokenizer that first splits text with a regex pattern.

    The text is cut into pieces with ``GPT4_SPLIT_PATTERN`` and byte-pair
    merges are learned and applied inside each piece only, so no token ever
    spans two pieces (e.g. a letter followed by a space that starts the next
    word). The per-piece results are concatenated.

    Attributes:
        load: Text read from the most recent training file, or None.
        merges: Mapping ``(id, id) -> new_id`` in the order the merges were learned.
        processed: Unused placeholder, kept so existing attribute access still works.
        num_merges: Number of merges requested by the last ``train`` call.
        vocab: Mapping ``id -> bytes`` used by ``decode``.
        GPT4_SPLIT_PATTERN: The regex used to split text into pieces.
    """

    def __init__(self):
        self.load = None
        self.merges = {}
        self.processed = None
        self.num_merges = 0
        self.GPT4_SPLIT_PATTERN =\
        r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
        # Start with the 256 single-byte tokens so that an untrained tokenizer
        # can already decode what encode() produces.
        self.vocab = self._build_vocab()

    def _build_vocab(self):
        """Return the id -> bytes table implied by the current ``merges``."""
        vocab = {idx: bytes([idx]) for idx in range(256)}
        for (p0, p1), idx in self.merges.items():
            vocab[idx] = vocab[p0] + vocab[p1]
        return vocab

    def _split(self, text):
        """Split ``text`` into the list of pieces matched by the split pattern."""
        # The pattern uses \p{...} classes, which need the third-party `regex`
        # package (the notebook imports it as `re`). Imported here so the class
        # does not depend on a module-level alias.
        import regex
        return regex.findall(self.GPT4_SPLIT_PATTERN, text)

    def _chunk_stats(self, chunks):
        """Count adjacent pairs over all chunks, never pairing ids across chunks."""
        counts = {}
        for chunk_ids in chunks:
            self.get_stats(chunk_ids, counts)
        return counts

    def _encode_chunk(self, tokens):
        """Apply the learned merges to the byte ids of a single piece."""
        while len(tokens) >= 2:
            stats = self.get_stats(tokens)
            # Pick the pair that was learned earliest (lowest merged id);
            # pairs that were never learned rank as infinity.
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            if pair not in self.merges:
                break  # nothing else can be merged
            tokens = self.merge(tokens, pair, self.merges[pair])
        return tokens

    def train(self, text, vocab_size=276, verbose=False):
        """Learn merges from a text file, one regex piece at a time.

        Args:
            text: Path of a UTF-8 text file to train on (a path, not the text itself).
            vocab_size: Target vocabulary size; ``vocab_size - 256`` merges are
                attempted. Values below 256 result in no merges.
            verbose: If True, print the largest byte value, the most frequent
                within-piece pair, and every merge as it is learned.

        Training stops early when no piece has an adjacent pair left to merge.
        """
        self.load = None

        # The context manager guarantees the file handle is closed.
        with open(text, "r", encoding="utf8") as f:
            self.load = f.read()

        # One list of byte ids (0..255) per regex piece.
        chunks = [list(piece.encode("utf-8")) for piece in self._split(self.load)]

        if verbose:
            raw_bytes = self.load.encode("utf-8")
            if raw_bytes:
                print(max(raw_bytes))
            stats = self._chunk_stats(chunks)
            if stats:
                print("top_pair: ", max(stats, key=stats.get))

        self.num_merges = max(0, vocab_size - 256)

        # Reset the merge table every time the tokenizer is trained.
        self.merges = {}  # (int, int) -> int
        for i in range(self.num_merges):
            stats = self._chunk_stats(chunks)
            if not stats:
                break  # every piece is down to a single token
            pair = max(stats, key=stats.get)
            idx = 256 + i
            if verbose:
                print(f"merging {pair} into a new token {idx}")
            chunks = [self.merge(chunk_ids, pair, idx) for chunk_ids in chunks]
            self.merges[pair] = idx

        self.vocab = self._build_vocab()

    def encode(self, text):
        """Given a string, return its token ids as a list of integers.

        The string is split with the regex pattern, each piece is encoded on
        its own, and the results are concatenated in order.
        """
        ids = []
        for piece in self._split(text):
            ids.extend(self._encode_chunk(list(piece.encode("utf-8"))))
        return ids

    def decode(self, ids):
        """Given a list of token ids, return the decoded Python string.

        Byte sequences that are not valid UTF-8 are rendered with the Unicode
        replacement character instead of raising.
        """
        tokens = b"".join(self.vocab[idx] for idx in ids)
        return tokens.decode("utf-8", errors="replace")

    def get_stats(self, ids, counts=None):
        """Count how often each adjacent pair of ids occurs in ``ids``.

        If ``counts`` is given, it is updated in place and returned, which lets
        callers accumulate counts over several lists.
        """
        counts = {} if counts is None else counts
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def merge(self, ids, pair, idx):
        """Return a new list where each left-to-right occurrence of ``pair`` is replaced by ``idx``."""
        newids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
                newids.append(idx)
                i += 2
            else:
                newids.append(ids[i])
                i += 1
        return newids
