# Tokenization

- With custom regexp and NLTK. 
- With default NLTK tokenizer.


In [5]:
import nltk
import re
from nltk.tokenize import regexp_tokenize

# Sample realistic text from a news headline
text = "Dr. O'Connor's Tesla Model Y costs $49,999.99! Meanwhile, U.K. inflation hits 5.4% in Jan."

# Define a verbose regex pattern for complex tokenization
pattern = r'''(?x)   # Enable verbose mode
    (?:[A-Z]\.)+            # Abbreviations like U.S.A., U.K.
  | \$?\d+(?:,\d{3})*(?:\.\d+)?%?  # Currency and percentages ($49,999.99, 5.4%)
  | \w+(?:[-']\w+)*         # Words with optional hyphens/apostrophes (O'Connor, Model-Y)
  | \.\.\.                 # Ellipsis (...)
  | [][.,;"’?():_‘-]       # Separate punctuation tokens
'''

# Tokenize the text
tokens = regexp_tokenize(text, pattern)

# Print the tokenized output
print(tokens)


['Dr', '.', "O'Connor's", 'Tesla', 'Model', 'Y', 'costs', '$49,999.99', 'Meanwhile', ',', 'U.K.', 'inflation', 'hits', '5.4%', 'in', 'Jan', '.']


In [6]:

# Download the 'punkt' and 'punkt_tab' resources needed for tokenization
#nltk.download('punkt')
#nltk.download('punkt_tab')

tokens = nltk.word_tokenize(text)
print(tokens)


['Dr.', "O'Connor", "'s", 'Tesla', 'Model', 'Y', 'costs', '$', '49,999.99', '!', 'Meanwhile', ',', 'U.K.', 'inflation', 'hits', '5.4', '%', 'in', 'Jan', '.']


# BPE subword tokenization

Example using sentencepiece

In [7]:
#!pip install sentencepiece
import sentencepiece as spm

# Step 1: Create a training corpus
corpus = """ 
newer lower lowest slowest newest 
the newer cars are slower than the older ones
she bought the newest model of the phone
"""

# Save the corpus to a file
with open("corpus.txt", "w", encoding="utf-8") as f:
    f.write(corpus)

# Step 2: Train a BPE tokenizer
spm.SentencePieceTrainer.Train('--input=corpus.txt --model_prefix=bpe --vocab_size=50 --model_type=bpe')

# Step 3: Load the trained model
sp = spm.SentencePieceProcessor(model_file='bpe.model')

# Step 4: Tokenize a new sentence using the trained BPE tokenizer
test_sentence = "She drives the newest and slowest car."
tokens = sp.encode_as_pieces(test_sentence)

# Print the BPE tokens
print("BPE Tokens:", tokens)

BPE Tokens: ['▁', 'S', 'he', '▁', 'd', 'r', 'iv', 'e', 's', '▁the', '▁newest', '▁', 'an', 'd', '▁slo', 'west', '▁c', 'ar', '.']


In [8]:
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('wordnet')

# Sample text
text = "Dr. Smith is an expert. He works at U.K. Tech Inc. His email is dr.smith@example.com. Running, runs, and ran are similar words. He paid $45.55 on 01/02/2024."
print(text)

# 1️ Case Folding (Lowercasing)
print("\n--- Case Folding ---")
pure_lower = text.lower()  # Pure Python
print(pure_lower)

# 2️ Removing Special Characters and Numbers
print("\n--- Removing Special Characters ---")
pure_clean = re.sub(r'[^a-zA-Z\s]', '', pure_lower)  # Pure Python
print(pure_clean)

# 3️ Tokenization
print("\n--- Tokenization ---")
pure_tokens = pure_clean.split()  # Pure Python
nltk_tokens = nltk.word_tokenize(pure_clean)  # Using NLTK
print("Pure Python:", pure_tokens)
print("NLTK:", nltk_tokens)

# 4️ Stemming
print("\n--- Stemming ---")
stemmer = PorterStemmer()
pure_stemmed = [word.rstrip("ing") for word in pure_tokens]  # Simple Pure Python Example
nltk_stemmed = [stemmer.stem(word) for word in nltk_tokens]  # Using NLTK
print("Pure Python:", pure_stemmed)
print("NLTK:", nltk_stemmed)

# 5️ Lemmatization
print("\n--- Lemmatization ---")
lemmatizer = WordNetLemmatizer()
nltk_lemmatized = [lemmatizer.lemmatize(word, pos="v") for word in nltk_tokens]  # NLTK has proper lemmatization
print("NLTK:", nltk_lemmatized)

# 6️ Sentence Segmentation
print("\n--- Sentence Segmentation ---")
pure_sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)  # Pure Python Regex Approach
nltk_sentences = sent_tokenize(text)  # Using NLTK
print("Pure Python:", pure_sentences)
print("NLTK:", nltk_sentences)



Dr. Smith is an expert. He works at U.K. Tech Inc. His email is dr.smith@example.com. Running, runs, and ran are similar words. He paid $45.55 on 01/02/2024.

--- Case Folding ---
dr. smith is an expert. he works at u.k. tech inc. his email is dr.smith@example.com. running, runs, and ran are similar words. he paid $45.55 on 01/02/2024.

--- Removing Special Characters ---
dr smith is an expert he works at uk tech inc his email is drsmithexamplecom running runs and ran are similar words he paid  on 

--- Tokenization ---
Pure Python: ['dr', 'smith', 'is', 'an', 'expert', 'he', 'works', 'at', 'uk', 'tech', 'inc', 'his', 'email', 'is', 'drsmithexamplecom', 'running', 'runs', 'and', 'ran', 'are', 'similar', 'words', 'he', 'paid', 'on']
NLTK: ['dr', 'smith', 'is', 'an', 'expert', 'he', 'works', 'at', 'uk', 'tech', 'inc', 'his', 'email', 'is', 'drsmithexamplecom', 'running', 'runs', 'and', 'ran', 'are', 'similar', 'words', 'he', 'paid', 'on']

--- Stemming ---
Pure Python: ['dr', 'smith', 'i

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\luiss\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\luiss\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
