#                                                           TOKENIZATION TECHNIQUES

## Word Tokenization
✅ Splits text into words based on spaces and punctuation.

In [1]:
import spacy

# Sample sentence and the small English spaCy pipeline used to process it.
text = "Tokenization is an important NLP task!"
nlp = spacy.load("en_core_web_sm")

# Run the pipeline, then keep only the surface text of each token;
# punctuation such as "!" comes out as a token of its own.
doc = nlp(text)
words = [tok.text for tok in doc]
print("Word Tokenization:", words)



Word Tokenization: ['Tokenization', 'is', 'an', 'important', 'NLP', 'task', '!']


## Whitespace Tokenization
✅ Splits text on whitespace only, so punctuation stays attached to the neighbouring word (e.g. `task!`).

In [2]:
text = "Tokenization is an important NLP task!"

# Cut the sentence wherever whitespace occurs (sep=None is the default
# whitespace mode); punctuation is not separated from its word.
tokens = text.split(sep=None)
print("Whitespace Tokenization:", tokens)

Whitespace Tokenization: ['Tokenization', 'is', 'an', 'important', 'NLP', 'task!']


## Sentence Tokenization
✅ Splits text into sentences rather than words.

In [3]:
import spacy

# Four-sentence passage, written as adjacent literals for readability;
# the resulting string is a single line with one space between sentences.
text = (
    "The sun was setting over the quiet village, casting a golden glow over the rooftops. "
    "Birds chirped as they returned to their nests, and a gentle breeze rustled through the trees. "
    "Children laughed and played in the narrow streets, while elders sat outside their homes, sharing stories from the past. "
    "It was a peaceful evening, a moment of calm before the night arrived."
)
nlp = spacy.load("en_core_web_sm")

# Let spaCy segment the passage and keep the raw text of every sentence span.
doc = nlp(text)
sentences = [span.text for span in doc.sents]
print("Sentence Tokenization:", sentences)

Sentence Tokenization: ['The sun was setting over the quiet village, casting a golden glow over the rooftops.', 'Birds chirped as they returned to their nests, and a gentle breeze rustled through the trees.', 'Children laughed and played in the narrow streets, while elders sat outside their homes, sharing stories from the past.', 'It was a peaceful evening, a moment of calm before the night arrived.']


## Rule-Based Tokenization
✅ Uses custom rules for tokenization.

In [4]:
import re

text = "Tokenization-is-important! Let's do it."

# Custom rule: every single hyphen or single space ends the current token.
# Other punctuation ("!", ".", "'") is left inside the tokens.
tokens = re.compile(r"[- ]").split(text)
print("Rule-Based Tokenization:", tokens)

Rule-Based Tokenization: ['Tokenization', 'is', 'important!', "Let's", 'do', 'it.']


## Character Tokenization
✅ Splits text into individual characters.

In [5]:
text = "Machine Learning is a subset of AI"

# Unpack the string into a list: one entry per character, spaces included.
characters = [*text]
print("Character Tokenization:", characters)

Character Tokenization: ['M', 'a', 'c', 'h', 'i', 'n', 'e', ' ', 'L', 'e', 'a', 'r', 'n', 'i', 'n', 'g', ' ', 'i', 's', ' ', 'a', ' ', 's', 'u', 'b', 's', 'e', 't', ' ', 'o', 'f', ' ', 'A', 'I']


## Byte-Pair Encoding (BPE) Tokenization
✅ Uses subword tokenization.

In [6]:
from transformers import GPT2Tokenizer

# Sample sentence to encode
text = "Tokenization is important."

# GPT-2 is used here as an example of a pre-trained BPE tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Break the sentence into subword pieces; in the printed tokens a leading
# "Ġ" appears on pieces that followed a space in the input.
tokens = tokenizer.tokenize(text)
print("Byte-Pair Encoding Tokenization (Pre-trained):", tokens)

Byte-Pair Encoding Tokenization (Pre-trained): ['Token', 'ization', 'Ġis', 'Ġimportant', '.']


## Morphological Tokenization
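✅ Splits words into morphemes (prefix, root, suffix). The example below only approximates this: it reduces each word to its lemma with `lemminflect`, treating every word as a noun.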

In [7]:
# Install lemminflect via the explicit %pip magic. The bare `pip install ...`
# form only works while IPython automagic is on and no variable is named `pip`.
%pip install lemminflect

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [8]:
from lemminflect import getLemma

text = "Machine Learning is a subset of AI"

# Lemma lookup for each whitespace-separated word, with every word tagged
# as a noun. getLemma returns a tuple per word, so the result is a list of
# tuples rather than a list of prefix/root/suffix pieces.
morphemes = [getLemma(piece, upos="NOUN") for piece in text.split()]
print("Morphological Tokenization:", morphemes)

Morphological Tokenization: [('Machine',), ('Learning',), ('is',), ('a',), ('subset',), ('of',), ('AI',)]


## Punctuation Tokenization
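✅ Splits text into alternating runs of word characters and non-word characters, so punctuation (together with any adjacent whitespace) becomes its own token.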

In [9]:
import re

text = "Tokenization, is important! Right?"

# Walk the text as alternating runs of word characters (\w+) and
# non-word characters (\W+). Whitespace counts as non-word, so it is kept
# together with adjacent punctuation (", ", "! ") or on its own (" ").
tokens = [match.group() for match in re.finditer(r"\w+|\W+", text)]
print("Punctuation Tokenization:", tokens)

Punctuation Tokenization: ['Tokenization', ', ', 'is', ' ', 'important', '! ', 'Right', '?']


## N-Gram Tokenization
✅ Generates n-grams from text.

In [10]:
from nltk import ngrams

text = "Tokenization is an important NLP task!"

# N-gram tokenization: slide a window of n whitespace-separated words over
# the text. With n = 3 this produces tri-grams.
n = 3
tokens = [*ngrams(text.split(), n)]
print(f"{n}-Gram Tokenization:", tokens)

3-Gram Tokenization: [('Tokenization', 'is', 'an'), ('is', 'an', 'important'), ('an', 'important', 'NLP'), ('important', 'NLP', 'task!')]


## Hybrid Tokenization
✅ Combines multiple tokenization techniques.

In [11]:
import re

text = "Tokenization is an important NLP task!"
# Hybrid tokenization: spaCy word tokens followed by every run of non-word
# characters found by the regex. Relies on `nlp` loaded in an earlier cell.
# NOTE: r"\W+" also matches whitespace, so the combined list carries the
# spaces between words and repeats the final "!" that spaCy already emitted.
words = [tok.text for tok in nlp(text)]
punctuation_tokens = [match.group() for match in re.finditer(r"\W+", text)]
hybrid_tokens = [*words, *punctuation_tokens]
print("Hybrid Tokenization:", hybrid_tokens)

Hybrid Tokenization: ['Tokenization', 'is', 'an', 'important', 'NLP', 'task', '!', ' ', ' ', ' ', ' ', ' ', '!']
