In [1]:
# Read the entire short story into memory as a single string.
with open("the-verdict.txt", "r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Report how long the text is and preview its first 99 characters.
total_chars = len(raw_text)
preview = raw_text[:99]
print("Total number of characters:", total_chars)
print(preview)

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [2]:
import re

text = "Hello, world. This is a test."
# Split on single space characters only; punctuation stays glued to the
# neighbouring word and the spaces themselves are discarded.
single_space = re.compile(r' ')
result = single_space.split(text)
print(result)

['Hello,', 'world.', 'This', 'is', 'a', 'test.']


In [3]:
import re

text = "Hello, world. This is a test."
# The capturing group makes re.split keep every whitespace separator as its
# own element of the result instead of dropping it.
whitespace = re.compile(r'(\s)')
result = whitespace.split(text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test.']


The following splits on whitespace characters (\s) and, because the pattern is wrapped in a capturing group, keeps each whitespace separator as an item in the resulting list:

(I believe this is important because an LLM may need to see the spacing in order to learn how sentences are structured.)

In [4]:
import re

text = "Hello, world. This is a test."
# Split at each whitespace character; the parentheses in the pattern keep the
# matched whitespace in the output list.
result = re.split(pattern=r'(\s)', string=text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test.']


Let's split on punctuation as well as spaces:

In [5]:
import re

text = "Hello, world. This is a test."
# Separators are a comma, a full stop, or any whitespace character. Each one
# is captured, so it appears in the output; adjacent separators leave empty
# strings between them.
separators = re.compile(r'([,.]|\s)')
result = separators.split(text)
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


Now let's remove the empty strings and whitespace-only items from the itemised list:

In [6]:
import re

text = "Hello, world. This is a test."
pieces = re.split(r'([,.]|\s)', text)

# Keep only pieces that still contain something after stripping whitespace,
# which removes both the empty strings and the bare-space separators.
result = list(filter(str.strip, pieces))
print(result)

['Hello', ',', 'world', '.', 'This', 'is', 'a', 'test', '.']


Removing whitespace tokens reduces compute and memory requirements. However, whitespace may need to be kept when training a model that has to learn the exact sentence structure of the text.

Now let's modify the pattern to cover more kinds of punctuation:

In [7]:
import re

text = "Hello, world! Is this-- a test?"
# Separators: one of the listed punctuation characters, a double dash, or a
# whitespace character. No bare `-` is placed inside the character class: an
# unescaped hyphen between two characters defines a range (e.g. `!-"`), which
# only matches the intended characters by coincidence of their code points.
# Double dashes are handled by the explicit `--` alternative instead.
result = re.split(r'([.,:?_!"()\']|--|\s)', text)

# Discard empty strings and whitespace-only pieces.
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '!', 'Is', 'this', '--', 'a', 'test', '?']


Going back to the verdict text, the list comprehension in the next cell works as follows:

1.	Iterate over each element (item) in preprocessed.
2.	Apply strip() to item: for each item, item.strip() removes any leading and trailing whitespace from the string. For example:
	•	"   hello   " becomes "hello"
3.	Check if item.strip() is non-empty: if item.strip() results in an empty string (which means the original string was either empty or consisted only of whitespace), that item is excluded from the new list.
4.	Include the stripped version of item in the new list: if the condition `if item.strip()` is truthy (i.e., item.strip() is not an empty string), then item.strip() is included in the new list.
5.	Build the new list: the items that pass the check are collected, in their original order, into the new preprocessed list.

In [8]:
# Load the story text.
with open("the-verdict.txt", "r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Split on punctuation, double dashes and whitespace (separators are kept),
# then strip every piece and keep only the non-empty ones.
splitter = re.compile(r'([,.:?_!"()\']|--|\s)')
preprocessed = [
    piece for piece in map(str.strip, splitter.split(raw_text)) if piece
]

# Show the token count and the first 30 tokens.
print(len(preprocessed))
print(preprocessed[:30])

4669
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


Building a sorted (alphabetical) list of the unique tokens:

In [9]:
# Load the story text.
with open("the-verdict.txt", "r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Tokenise: split on punctuation, double dashes and whitespace, strip each
# piece, and skip anything that ends up empty.
preprocessed = []
for piece in re.split(r'([,.:?_!"()\']|--|\s)', raw_text):
    stripped = piece.strip()
    if stripped:
        preprocessed.append(stripped)

# De-duplicate the tokens, sort them, and report how many distinct ones exist.
unique_tokens = set(preprocessed)
all_words = sorted(unique_tokens)
vocab_size = len(all_words)
print(vocab_size)

1143


The set() function is used to convert the list preprocessed into a set. A set is a collection of unique elements, meaning it automatically removes any duplicate entries.
•	So, if the preprocessed list contains repeated words or items, they will be eliminated in the resulting set.

In [10]:
# Load the story text.
with open("the-verdict.txt", "r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Tokenise exactly as before: split, strip, drop empty pieces.
pieces = re.split(r'([,.:?_!"()\']|--|\s)', raw_text)
preprocessed = [piece for piece in map(str.strip, pieces) if piece]

# Map each distinct token to its position in the sorted token list.
all_words = sorted(set(preprocessed))
vocab = dict(zip(all_words, range(len(all_words))))

# Preview the first 52 (token, id) pairs of the vocabulary.
for entry in list(vocab.items())[:52]:
    print(entry)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Carlo;', 25)
('Chicago', 26)
('Claude', 27)
('Come', 28)
('Croft', 29)
('Destroyed', 30)
('Devonshire', 31)
('Don', 32)
('Dubarry', 33)
('Emperors', 34)
('Florence', 35)
('For', 36)
('Gallery', 37)
('Gideon', 38)
('Gisburn', 39)
('Gisburns', 40)
('Grafton', 41)
('Greek', 42)
('Grindle', 43)
('Grindles', 44)
('HAD', 45)
('Had', 46)
('Hang', 47)
('Has', 48)
('He', 49)
('Her', 50)
('Hermia', 51)


We have now turned the itemised list of tokens into a dictionary that maps each token to an integer ID.

I'm now going to use these notes to test a simple text tokenizer:

In [13]:
import re
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split on a set of punctuation characters, double dashes and
    whitespace. Every resulting token must already be a key of the
    vocabulary; looking up an unknown token raises ``KeyError``.
    """

    def __init__(self, vocab):
        """Store ``vocab`` (token -> id) and build the inverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict(
            (token_id, token) for token, token_id in vocab.items()
        )

    def encode(self, text):
        """Return the list of token ids for ``text``.

        Raises:
            KeyError: if a token of ``text`` is not in the vocabulary.
        """
        ids = []
        for piece in re.split(r'([,.:?_!"()\']|--|\s)', text):
            token = piece.strip()
            # Empty and whitespace-only pieces are not tokens.
            if token:
                ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Turn a sequence of token ids back into a single string.

        Tokens are joined with single spaces, then the whitespace in front
        of the listed punctuation characters is removed.
        """
        tokens = (self.int_to_str[token_id] for token_id in ids)
        spaced = " ".join(tokens)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)
# Build a tokenizer from `vocab` and encode a two-line sample passage.
tokenizer = SimpleTokenizerV1(vocab)
# The literal opens with `"""` immediately followed by a `"` that belongs to
# the text itself; the line break and indentation are part of the string too.
text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
ids= tokenizer.encode(text)
print(ids)

[1, 57, 2, 861, 999, 610, 538, 754, 5, 1139, 603, 5, 1, 68, 7, 39, 862, 1121, 764, 803, 7]


In [14]:
# Convert the token ids back into text to inspect the round trip.
decoded_text = tokenizer.decode(ids)
print(decoded_text)

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


Now creating a new text and seeing how it handles this:

In [15]:
# Try to encode a sentence containing a word that is not in the vocabulary.
text = "Hello, do you like Tea?"
unseen_ids = tokenizer.encode(text)
print(unseen_ids)

KeyError: 'Hello'

This raised an error because "Hello" never appears in the verdict text, which highlights the need for large and diverse training sets.

In [18]:
# Start from the sorted unique tokens and append the two special tokens so
# that they receive the highest ids.
all_tokens = sorted(set(preprocessed))
all_tokens += ["<|endoftext|>", "<|unk|>"]

# Assign each token its index in the list.
vocab = {}
for index, token in enumerate(all_tokens):
    vocab[token] = index

print(len(vocab))

1145


Printing the last 5 entries of the updated vocab dictionary:

In [19]:
# Show the final five (token, id) pairs of the vocabulary.
tail_entries = list(vocab.items())[-5:]
for entry in tail_entries:
    print(entry)

('younger', 1140)
('your', 1141)
('yourself', 1142)
('<|endoftext|>', 1143)
('<|unk|>', 1144)


str_to_int is the "string to integer" mapping: it stores the dictionary that converts string tokens into integer IDs.

In [20]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Text is split on a set of punctuation characters, double dashes and
    whitespace. Tokens that are not keys of the vocabulary are replaced by
    the ``"<|unk|>"`` token before being converted to ids.
    """

    def __init__(self, vocab):
        """Store ``vocab`` (token -> id) and build the inverse id -> token map.

        Args:
            vocab: dict mapping token strings to integer ids. It must
                contain the key ``"<|unk|>"`` for ``encode`` to handle
                unknown tokens.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Return the list of token ids for ``text``.

        Raises:
            KeyError: if ``text`` contains an unknown token and the
                vocabulary has no ``"<|unk|>"`` entry.
        """
        # `\s` (single backslash in the raw string) matches whitespace; the
        # capturing group keeps punctuation separators as tokens.
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Strip each piece and drop empty / whitespace-only ones.
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        # Substitute the unknown-token marker for anything not in the vocab.
        preprocessed = [
            item if item in self.str_to_int else "<|unk|>"
            for item in preprocessed
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        """Turn a sequence of token ids back into a single string.

        Tokens are joined with single spaces, then the whitespace in front
        of the listed punctuation characters is removed.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text
    
