## 2.2 Tokenizing text

In [1]:
import os
import urllib.request

# Fetch the short story only when no local copy exists, so re-running this
# cell does not hit the network again. The path is defined once and used for
# both the existence check and the download target so the two cannot drift.
file_path = 'the-verdict.txt'
url = 'https://raw.githubusercontent.com/GlebTanaka/LLMs-from-scratch/refs/heads/main/ch02/01_main-chapter-code/the-verdict.txt'

if not os.path.exists(file_path):
    urllib.request.urlretrieve(url, file_path)

In [7]:
# Load the complete file into memory as a single string
with open('the-verdict.txt', 'r', encoding='utf-8') as verdict_file:
    the_verdict_text = verdict_file.read()

# Report the length of the text and preview its first 99 characters
char_count = len(the_verdict_text)
print(f"Total number of characters in the file: {char_count}")
print(the_verdict_text[:99])

Total number of characters in the file: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [4]:
import re

# Sentence used to try out a first, very simple regex tokenizer
sample_sentence = "Hello, world! This is a simple example of tokenization (using regex)."

# Lower-case the sentence, then collect every maximal run of word characters
# (\w covers letters, digits and underscores); punctuation and whitespace
# never match and are therefore dropped.
lowercased = sample_sentence.lower()
tokens = [match.group() for match in re.finditer(r'\w+', lowercased)]

print("Original text:", sample_sentence)
print("\nTokens:", tokens)
print("Number of tokens:", len(tokens))

Original text: Hello, world! This is a simple example of tokenization (using regex).

Tokens: ['hello', 'world', 'this', 'is', 'a', 'simple', 'example', 'of', 'tokenization', 'using', 'regex']
Number of tokens: 11


In [5]:
# A few example texts with different patterns
texts = [
    "Hello world",                    # Simple space-separated
    "Hello, world!",                  # With punctuation
    "Is this--a test?",              # With double dash
    "Word. Another word... Final"     # With multiple dots
]

# Try different patterns
patterns = [
    r'\w+',                          # Just words
    r'[A-Za-z]+',                    # Only letters
    r'([,.:;?_!"()\']|--|\s)',      # more complex pattern
    r'\S+'                           # Non-whitespace chunks
]


def tokenize_with_pattern(pattern, text):
    """Tokenize ``text`` with ``pattern``, choosing split or findall mode.

    A pattern that contains at least one capturing group is treated as a
    delimiter pattern: the text is split on it, the captured delimiters are
    kept as tokens, and pieces that are empty after stripping whitespace are
    dropped. A pattern without capturing groups is treated as a token
    pattern and every match is returned via ``re.findall``.

    Args:
        pattern: Regular expression source string.
        text: The string to tokenize.

    Returns:
        A list of token strings.
    """
    # Ask the compiled pattern how many capturing groups it has instead of
    # guessing from its first character: a group may appear anywhere in the
    # pattern, and a leading "(?:" is not a capturing group at all.
    if re.compile(pattern).groups:
        # re.split yields None for a capturing group that did not take part
        # in a match; treat that like an empty piece.
        pieces = ((piece or '').strip() for piece in re.split(pattern, text))
        return [piece for piece in pieces if piece]
    return re.findall(pattern, text)


# Test each pattern on each text
for text in texts:
    print(f"\nOriginal text: '{text}'")
    for pattern in patterns:
        print(f"\nPattern '{pattern}':")
        tokens = tokenize_with_pattern(pattern, text)
        print(f"Tokens: {tokens}")


Original text: 'Hello world'

Pattern '\w+':
Tokens: ['Hello', 'world']

Pattern '[A-Za-z]+':
Tokens: ['Hello', 'world']

Pattern '([,.:;?_!"()\']|--|\s)':
Tokens: ['Hello', 'world']

Pattern '\S+':
Tokens: ['Hello', 'world']

Original text: 'Hello, world!'

Pattern '\w+':
Tokens: ['Hello', 'world']

Pattern '[A-Za-z]+':
Tokens: ['Hello', 'world']

Pattern '([,.:;?_!"()\']|--|\s)':
Tokens: ['Hello', ',', 'world', '!']

Pattern '\S+':
Tokens: ['Hello,', 'world!']

Original text: 'Is this--a test?'

Pattern '\w+':
Tokens: ['Is', 'this', 'a', 'test']

Pattern '[A-Za-z]+':
Tokens: ['Is', 'this', 'a', 'test']

Pattern '([,.:;?_!"()\']|--|\s)':
Tokens: ['Is', 'this', '--', 'a', 'test', '?']

Pattern '\S+':
Tokens: ['Is', 'this--a', 'test?']

Original text: 'Word. Another word... Final'

Pattern '\w+':
Tokens: ['Word', 'Another', 'word', 'Final']

Pattern '[A-Za-z]+':
Tokens: ['Word', 'Another', 'word', 'Final']

Pattern '([,.:;?_!"()\']|--|\s)':
Tokens: ['Word', '.', 'Another', 'word', '.',

In [8]:
# Delimiter pattern: one punctuation character from the listed set, a double
# dash, or a single whitespace character. The capturing group makes re.split
# return the delimiters alongside the text between them.
pattern = r'([,.:;?_!"()\']|--|\s)'

# Split the story, trim surrounding whitespace from every piece, and discard
# the pieces that are empty after trimming (whitespace delimiters and the
# empty strings between adjacent delimiters).
tokens = list(filter(None, map(str.strip, re.split(pattern, the_verdict_text))))

print("\nTokens:", tokens[:30])
print("Number of tokens:", len(tokens))



Tokens: ['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']
Number of tokens: 4690


## 2.3 Converting tokens into token IDs

In [14]:
# Vocabulary: the distinct tokens in sorted order
vocab = sorted(set(tokens))

print("Vocabulary (unique tokens):", vocab[:5])
print("Vocabulary size:", len(vocab))

# Each vocabulary entry gets its position in the sorted list as its ID
token_to_id = dict(zip(vocab, range(len(vocab))))

# Replace every token of the text by its ID
token_ids = [token_to_id[tok] for tok in tokens]

# Show the first five vocabulary entries with their IDs
print("\nFirst 5 token-to-ID mappings:")
for token, id_num in list(token_to_id.items())[:5]:
    print(f"'{token}' -> {id_num}")

# Show how the beginning of the text looks as IDs
print("\nFirst 10 tokens as IDs:", token_ids[:10])

Vocabulary (unique tokens): ['!', '"', "'", '(', ')']
Vocabulary size: 1130

First 5 token-to-ID mappings:
'!' -> 0
'"' -> 1
''' -> 2
'(' -> 3
')' -> 4

First 10 tokens as IDs: [53, 44, 149, 1003, 57, 38, 818, 115, 256, 486]


In [15]:
class SimpleTokenizerV1:
    """Regex-based tokenizer that maps text to integer IDs via a fixed vocabulary.

    Text is split on whitespace, on a fixed set of punctuation characters and
    on the double dash; punctuation and ``--`` become tokens of their own,
    whitespace is discarded.

    Attributes:
        str_to_int: The token -> ID mapping passed to the constructor
            (stored by reference, not copied).
        int_to_str: The inverse ID -> token mapping built from it.
    """

    # Delimiters encode() splits on. The capturing group makes the split keep
    # the delimiters so that punctuation survives as separate tokens.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')

    # Whitespace in front of a punctuation mark that decode() re-attaches to
    # the preceding token. ':' and ';' are included because encode() splits
    # them off as well; without them "a: b" would decode to "a : b".
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        """Store the vocabulary and build its inverse.

        Args:
            vocab: Mapping from token string to integer ID. It must provide
                ``items()`` and item lookup.
        """
        self.str_to_int = vocab  # token -> ID
        self.int_to_str = {i: s for s, i in vocab.items()}  # ID -> token

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Args:
            text: The string to tokenize.

        Returns:
            A list with the ID of every token, in order of appearance.

        Raises:
            KeyError: If a token is missing from the vocabulary (when the
                vocabulary is a plain ``dict``).
        """
        # Trim each piece and skip the ones that are empty afterwards
        # (whitespace delimiters, empty strings between adjacent delimiters).
        pieces = (item.strip() for item in self._SPLIT_PATTERN.split(text))
        return [self.str_to_int[s] for s in pieces if s]

    def decode(self, ids):
        """Convert a sequence of token IDs back into a string.

        Tokens are joined with single spaces, then the whitespace in front of
        each of ``, . : ; ? ! " ( ) '`` is removed. The result is therefore
        not guaranteed to equal the text that was encoded: original
        whitespace is not restored, and quotes, apostrophes and parentheses
        are always attached to the token on their left.

        Args:
            ids: Iterable of token IDs.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: If an ID has no entry in ``int_to_str``.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [17]:
# Example usage on a three-entry toy vocabulary.
# NOTE: this rebinds `vocab`, which the vocabulary-building cell bound to the
# sorted list of tokens, to a small dict.
vocab = {'hello': 0, 'world': 1, '!': 2}
tokenizer = SimpleTokenizerV1(vocab)

# Encoding
ids = tokenizer.encode("hello world!")  # Returns: [0, 1, 2]
assert ids == [0, 1, 2]

# Decoding: feed back the IDs produced by encode() rather than a hard-coded
# list, so this cell exercises the full text -> IDs -> text round trip.
text = tokenizer.decode(ids)  # Returns: "hello world!"
print(text)

hello world!


In [18]:
# Two-line sample passage; the line break and the indentation of the second
# line are part of the string.
text_section = """"It's the last he painted, you know,"
           Mrs. Gisburn said with pardonable pride."""

# Tokenizer backed by the token-to-ID mapping built in the vocabulary cell
tokenizer_2 = SimpleTokenizerV1(token_to_id)

# Encode the passage and show the resulting IDs
ids = tokenizer_2.encode(text_section)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [19]:
# Map the token IDs held in `ids` back to a string. As the last expression of
# the cell, the decoded text is displayed as the cell output.
tokenizer_2.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [20]:
# Round trip: text -> token IDs -> text.
# Step 1 turns the passage into a list of token IDs, step 2 turns those IDs
# back into a string. Comparing the displayed result with `text_section`
# shows what the tokenizer preserves and what it changes: the line break and
# indentation collapse to a single space, and the spacing around the quotes
# and the apostrophe differs from the input.
roundtrip_ids = tokenizer_2.encode(text_section)
tokenizer_2.decode(roundtrip_ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'