Use TensorFlow Datasets’ SubwordTextEncoder to build a subword-level tokenizer, then encode and decode sentences for NLP tasks like translation or classification.

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [3]:
# Sample sentences: a deliberately tiny in-memory corpus used to build the
# subword vocabulary.
corpus = [
    "TensorFlow is an end-to-end open-source platform for machine learning.",
    "Natural Language Processing is a fascinating field.",
    "Tokenization is the first step in NLP pipelines.",
    "Subword tokenization helps with rare words.",
]

In [5]:
# Build a SubwordTextEncoder whose vocabulary is learned from `corpus`.
# `target_vocab_size` is a target rather than a guarantee: the vocabulary
# that is actually built can be smaller, so inspect `tokenizer.vocab_size`
# instead of assuming 1000.
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    corpus_generator=corpus,
    target_vocab_size=1000,
)

In [6]:
# Report how many entries the learned vocabulary actually contains.
print(f"Subword vocabulary size: {tokenizer.vocab_size}")

Subword vocabulary size: 288


In [7]:
# Encode a test sentence into a list of integer subword ids, then decode
# those ids back into text.
test_sentence = "Subword tokenization is powerful for text models."
encoded = tokenizer.encode(test_sentence)
decoded = tokenizer.decode(encoded)

# The point of this cell is that encoding is lossless, so check it instead of
# relying on a visual comparison of printed output: a fresh
# Restart & Run All now fails loudly if the round trip ever changes the text.
assert decoded == test_sentence, (
    "encode/decode round trip did not reproduce the original sentence"
)

In [8]:
# Show the input text, its integer encoding and the decoded text, one per
# line, so the round trip can be compared at a glance.
print(
    f"Original sentence: {test_sentence}",
    f"Encoded sentence: {encoded}",
    f"Decoded sentence: {decoded}",
    sep="\n",
)

Original sentence: Subword tokenization is powerful for text models.
Encoded sentence: [27, 4, 1, 144, 143, 151, 133, 146, 134, 149, 140, 64, 17, 148, 133, 152, 148, 64, 141, 143, 132, 133, 140, 147, 78]
Decoded sentence: Subword tokenization is powerful for text models.


In [9]:
# Decode every id on its own to reveal which text piece each id stands for.
# The heading and the list are printed on separate lines.
print(
    "\nSubword Tokens:",
    [tokenizer.decode([token_id]) for token_id in encoded],
    sep="\n",
)


Subword Tokens:
['Subword ', 'tokenization ', 'is ', 'p', 'o', 'w', 'e', 'r', 'f', 'u', 'l', ' ', 'for ', 't', 'e', 'x', 't', ' ', 'm', 'o', 'd', 'e', 'l', 's', '.']


In [None]:
# Types of Tokenization
#
# Type                   | Example                                            | Use
# -----------------------|----------------------------------------------------|------------------------
# Word Tokenization      | "NLP is fun" → ["NLP", "is", "fun"]                | Basic models
# Character Tokenization | "fun" → ["f", "u", "n"]                            | Language modeling
# Subword Tokenization   | "tokenization" → ["token", "ization"]              | Used in BERT, T5, GPT
# Sentence Tokenization  | "Hello. How are you?" → ["Hello.", "How are you?"] | Paragraph understanding