## 2.2 Tokenizing text

In [1]:
import os
import urllib.request

# Local filename and remote source of the sample text. Both names are bound
# unconditionally so they exist whether or not the download branch runs.
file_path = 'the-verdict.txt'
url = 'https://raw.githubusercontent.com/GlebTanaka/LLMs-from-scratch/refs/heads/main/ch02/01_main-chapter-code/the-verdict.txt'

# Download only when there is no local copy yet, so re-running the cell
# does not hit the network again.
if not os.path.exists(file_path):
    urllib.request.urlretrieve(url, file_path)

In [7]:
# Load the whole text into memory as a single string; the file is closed as
# soon as the read finishes.
with open('the-verdict.txt', mode='r', encoding='utf-8') as verdict_file:
    the_verdict_text = verdict_file.read()

# Report the length in characters and preview the first 99 of them.
char_count = len(the_verdict_text)
print(f"Total number of characters in the file: {char_count}")
print(the_verdict_text[:99])

Total number of characters in the file: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [4]:
import re

# Short sentence with commas, an exclamation mark and parentheses to try the
# simplest tokenizer on.
sample_sentence = "Hello, world! This is a simple example of tokenization (using regex)."

# \w+ grabs each run of word characters (letters, digits, underscores), so
# punctuation and whitespace are dropped entirely. The text is lower-cased
# first so the resulting tokens are case-insensitive.
word_pattern = re.compile(r'\w+')
lowered_sentence = sample_sentence.lower()
tokens = word_pattern.findall(lowered_sentence)

print("Original text:", sample_sentence)
print("\nTokens:", tokens)
print("Number of tokens:", len(tokens))

Original text: Hello, world! This is a simple example of tokenization (using regex).

Tokens: ['hello', 'world', 'this', 'is', 'a', 'simple', 'example', 'of', 'tokenization', 'using', 'regex']
Number of tokens: 11


In [5]:
# Inputs that each stress a different punctuation situation
texts = [
    "Hello world",                    # words separated by a single space
    "Hello, world!",                  # comma and exclamation mark
    "Is this--a test?",              # double dash between words
    "Word. Another word... Final"     # single dot and a run of dots
]

# Candidate regular expressions to compare side by side
patterns = [
    r'\w+',                          # runs of word characters
    r'[A-Za-z]+',                    # runs of ASCII letters
    r'([,.:;?_!"()\']|--|\s)',      # delimiter pattern (punctuation, --, whitespace)
    r'\S+'                           # runs of non-whitespace
]

# Run every pattern against every text and show the resulting tokens
for text in texts:
    print(f"\nOriginal text: '{text}'")
    for pattern in patterns:
        print(f"\nPattern '{pattern}':")
        if not pattern.startswith('('):
            # No leading group: the matches themselves are the tokens
            tokens = re.findall(pattern, text)
        else:
            # Leading capturing group: the pattern describes delimiters, so
            # split on it. re.split keeps the captured delimiters in the
            # result; strip each piece and drop the ones that end up empty
            # (whitespace delimiters and empty strings between matches).
            stripped_pieces = (piece.strip() for piece in re.split(pattern, text))
            tokens = [piece for piece in stripped_pieces if piece]
        print(f"Tokens: {tokens}")


Original text: 'Hello world'

Pattern '\w+':
Tokens: ['Hello', 'world']

Pattern '[A-Za-z]+':
Tokens: ['Hello', 'world']

Pattern '([,.:;?_!"()\']|--|\s)':
Tokens: ['Hello', 'world']

Pattern '\S+':
Tokens: ['Hello', 'world']

Original text: 'Hello, world!'

Pattern '\w+':
Tokens: ['Hello', 'world']

Pattern '[A-Za-z]+':
Tokens: ['Hello', 'world']

Pattern '([,.:;?_!"()\']|--|\s)':
Tokens: ['Hello', ',', 'world', '!']

Pattern '\S+':
Tokens: ['Hello,', 'world!']

Original text: 'Is this--a test?'

Pattern '\w+':
Tokens: ['Is', 'this', 'a', 'test']

Pattern '[A-Za-z]+':
Tokens: ['Is', 'this', 'a', 'test']

Pattern '([,.:;?_!"()\']|--|\s)':
Tokens: ['Is', 'this', '--', 'a', 'test', '?']

Pattern '\S+':
Tokens: ['Is', 'this--a', 'test?']

Original text: 'Word. Another word... Final'

Pattern '\w+':
Tokens: ['Word', 'Another', 'word', 'Final']

Pattern '[A-Za-z]+':
Tokens: ['Word', 'Another', 'word', 'Final']

Pattern '([,.:;?_!"()\']|--|\s)':
Tokens: ['Word', '.', 'Another', 'word', '.',

In [8]:
# Delimiter pattern: common punctuation marks, a double dash, or a single
# whitespace character. The capturing group makes re.split return the
# delimiters alongside the text between them.
pattern = r'([,.:;?_!"()\']|--|\s)'

# Split the full text, trim each piece, and keep only the non-empty ones so
# whitespace delimiters do not show up as tokens.
stripped_pieces = (piece.strip() for piece in re.split(pattern, the_verdict_text))
tokens = [piece for piece in stripped_pieces if piece]

print("\nTokens:", tokens[:30])
print("Number of tokens:", len(tokens))



Tokens: ['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']
Number of tokens: 4690
