In [None]:
# Read the whole story into a single string. The path is relative to the
# current working directory.
with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Quick sanity check: overall size and the first 99 characters.
character_count = len(raw_text)
print("Total number of characters:", character_count)
print(raw_text[:99])

In [None]:
import re

text = "Hello, world. This is a test."

# Split on every single space character. The pattern has no capturing group,
# so the separators are dropped and punctuation stays glued to the words.
space_pattern = re.compile(r' ')
result = space_pattern.split(text)
print(result)

In [None]:
import re

text = "Hello, world. This is a test."

# Wrapping \s in a capturing group makes re.split keep each whitespace
# separator as its own list element.
whitespace_pattern = re.compile(r'(\s)')
result = whitespace_pattern.split(text)
print(result)

The following splits on whitespace characters (`\s`) and, because the pattern is wrapped in a capturing group, keeps each whitespace character as its own item in the resulting list:

(I believe this matters because an LLM may need the spacing information to understand how sentences are structured.)

In [None]:
import re

# Sample sentence to tokenise.
text = "Hello, world. This is a test."

# The capturing group around \s keeps every whitespace separator in the output.
result = re.split(pattern=r'(\s)', string=text)
print(result)

Let's split on punctuation (commas and periods) as well as whitespace:

In [None]:
import re

text = "Hello, world. This is a test."

# Separators are a comma, a period, or any whitespace character. They are
# captured, so they appear in the output. Two adjacent separators (", ")
# produce an empty string between them.
punct_or_space = r'([,.]|\s)'
result = re.split(punct_or_space, text)
print(result)

Now let's remove the empty and whitespace-only items from the list:

In [None]:
import re

text = "Hello, world. This is a test."

# Split on commas, periods and whitespace, keeping the separators.
pieces = re.split(r'([,.]|\s)', text)

# str.strip returns "" (falsy) for empty or whitespace-only pieces, so
# filter() keeps only the pieces that carry visible characters.
result = list(filter(str.strip, pieces))
print(result)

Removing whitespace tokens reduces compute and memory requirements. However, whitespace may need to be kept when training a model that must learn the exact layout or sentence structure of the text.

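Now let's extend the pattern to cover more punctuation (colons, question marks, exclamation marks, quotes, parentheses, underscores and double dashes):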

In [None]:
import re

text = "Hello, world! Is this-- a test?"

# Separators: one punctuation character from the class, a double dash, or any
# whitespace character. The character class deliberately contains no hyphen:
# a hyphen between two characters (e.g. `!-"`) is read as a *range*, not as a
# literal "-". Double dashes are handled by the explicit `--` alternative.
pieces = re.split(r'([.,:?_!"()\']|--|\s)', text)

# Drop empty strings and whitespace-only pieces.
result = [item for item in pieces if item.strip()]
print(result)

Going back to the verdict text. The list comprehension in the next cell works as follows:

1.	Iterate over each element (`item`) in `preprocessed`.
2.	Apply `strip()` to `item`: `item.strip()` removes any leading and trailing whitespace from the string. For example:
	•	"   hello   " becomes "hello"
3.	Check whether `item.strip()` is non-empty: if it is an empty string (meaning the original string was either empty or consisted only of whitespace), that item is excluded from the new list.
4.	Include the stripped version of `item` in the new list: if the condition `if item.strip()` evaluates to True (i.e. `item.strip()` is not an empty string), then `item.strip()` is included in the new list.
5.	Build the new list: the kept items are collected into a new list, which is assigned back to `preprocessed`.

In [None]:
# Load the story text.
with open("the-verdict.txt", "r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Split on punctuation, double dashes and whitespace, keeping the separators.
pieces = re.split(r'([,.:?_!"()\']|--|\s)', raw_text)

# Strip every piece and keep only the ones that are non-empty afterwards.
preprocessed = [token for token in map(str.strip, pieces) if token]

print(len(preprocessed))
print(preprocessed[:30])

Building an alphabetically sorted list of the unique tokens:

In [None]:
# Load the story text.
with open("the-verdict.txt", "r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Tokenise: split on punctuation, double dashes and whitespace, then discard
# pieces that are empty once stripped.
preprocessed = []
for piece in re.split(r'([,.:?_!"()\']|--|\s)', raw_text):
    stripped_piece = piece.strip()
    if stripped_piece:
        preprocessed.append(stripped_piece)

# De-duplicate the tokens, then sort them.
unique_tokens = set(preprocessed)
all_words = sorted(unique_tokens)

vocab_size = len(all_words)
print(vocab_size)

The `set()` function converts the list `preprocessed` into a set. A set is a collection of unique elements, so it automatically removes any duplicate entries.
•	So, if the `preprocessed` list contains repeated words or items, they are eliminated in the resulting set; `sorted()` then turns the set back into an ordered list.

In [None]:
# Load the story text.
with open("the-verdict.txt", "r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Tokenise and drop empty / whitespace-only pieces.
pieces = re.split(r'([,.:?_!"()\']|--|\s)', raw_text)
preprocessed = [token for token in map(str.strip, pieces) if token]

# Unique tokens in sorted order; a token's position becomes its integer id.
all_words = sorted(set(preprocessed))
vocab = dict(zip(all_words, range(len(all_words))))

# Preview the first 52 (token, id) pairs of the vocabulary.
for entry in list(vocab.items())[:52]:
    print(entry)

We have now turned the sorted list of unique tokens into a dictionary that maps each token to an integer ID.

I'm now going to use these notes to test the simple text tokenizer:

In [None]:
import main
# Instantiate the tokenizer class provided by the `main` module, using the
# `vocab` mapping as its vocabulary.
tokenizer = main.SimpleTokenizerV1(vocab)

text = """It's the last he painted, you know, Mrs. Gisburn said with pardonable pride."""

# Convert the sample sentence to token ids and show them.
ids = tokenizer.encode(text)
print(ids)

This one didn't work, likely because of the imported module (`import main`).

In [None]:
import main
# Instantiate the tokenizer class provided by the `main` module.
tokenizer = main.SimpleTokenizerV1(vocab)

text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""

# Round trip: text -> token ids -> text.
ids = tokenizer.encode(text)
decoded_text = tokenizer.decode(ids)
print(decoded_text)

In [None]:
import re
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token-to-id vocabulary.

    Text is split on whitespace, the double dash ``--`` and a fixed set of
    punctuation characters. Every non-empty piece must be a key of the
    vocabulary; there is no fallback for unknown tokens.
    """

    # Separators used when encoding. The capturing group keeps punctuation
    # and "--" as tokens of their own.
    _SPLIT_PATTERN = re.compile(r'([,.:?_!"()\']|--|\s)')
    # Whitespace that directly precedes one of these punctuation characters;
    # removed again when decoding.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.?!"()\'])')

    def __init__(self, vocab):
        """Store the vocabulary and build its inverse.

        Args:
            vocab: Mapping from token string to integer id. It is stored
                as-is (not copied).
        """
        self.str_to_int = vocab
        # Inverse lookup used by decode(): id -> token string.
        self.int_to_str = dict(
            (token_id, token) for token, token_id in vocab.items()
        )

    def encode(self, text):
        """Return the list of token ids for ``text``.

        Raises:
            KeyError: If a token of ``text`` is not in the vocabulary.
        """
        tokens = []
        for piece in self._SPLIT_PATTERN.split(text):
            token = piece.strip()
            if token:  # skip empty and whitespace-only pieces
                tokens.append(token)

        token_ids = []
        for token in tokens:
            token_ids.append(self.str_to_int[token])
        return token_ids

    def decode(self, ids):
        """Return the text for ``ids``: tokens joined by single spaces.

        Whitespace in front of ``, . ? ! " ( ) '`` is removed after joining.

        Raises:
            KeyError: If an id is not in the vocabulary.
        """
        tokens = [self.int_to_str[token_id] for token_id in ids]
        spaced = " ".join(tokens)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', spaced)
# Instantiate the SimpleTokenizerV1 class defined in this cell; `vocab` is the
# token -> id dictionary built in an earlier cell.
tokenizer = SimpleTokenizerV1(vocab)
# Sample passage. The literal spans two lines, so it contains a newline and
# leading spaces, which encode() discards as whitespace-only pieces.
text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
# Convert the passage to token ids and show them.
ids= tokenizer.encode(text)
print(ids)

In [None]:
# Map the token ids back to text and show the reconstructed sentence.
decoded_text = tokenizer.decode(ids)
print(decoded_text)