In [34]:
import nltk
# Fetch the 'punkt_tab' resource into the local nltk_data directory; when it is
# already present the call only reports that the package is up-to-date.
# NOTE(review): presumably this is the data sent_tokenize/word_tokenize rely on — confirm against the NLTK docs.
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize, word_tokenize


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [35]:
# Sample passage shared by the tokenizer demos: it mixes em dashes, an ampersand,
# emoji, an ellipsis, parentheses, a contraction and one embedded newline.
text = (
    "Time travel—an enigmatic concept—intrigues scientists, philosophers, & dreamers alike! "
    "🚀🕰️ Imagine zipping through spacetime, \n"
    "witnessing the birth of galaxies or altering history "
    "(but wait… don't mess with the past! ⚠️⏳)."
)

# Sentence tokenization
Splits text into sentences rather than words.


In [36]:
# Split the sample passage into sentences; the trailing bare name is what the cell displays.
sentences = sent_tokenize(text)
sentences

['Time travel—an enigmatic concept—intrigues scientists, philosophers, & dreamers alike!',
 "🚀🕰️ Imagine zipping through spacetime, \nwitnessing the birth of galaxies or altering history (but wait… don't mess with the past!",
 '⚠️⏳).']

# Word tokenization
Splits text into word and punctuation tokens.

In [37]:
# Split the sample passage into word-level tokens; the trailing bare name is what the cell displays.
word_tokens = word_tokenize(text)
word_tokens

['Time',
 'travel—an',
 'enigmatic',
 'concept—intrigues',
 'scientists',
 ',',
 'philosophers',
 ',',
 '&',
 'dreamers',
 'alike',
 '!',
 '🚀🕰️',
 'Imagine',
 'zipping',
 'through',
 'spacetime',
 ',',
 'witnessing',
 'the',
 'birth',
 'of',
 'galaxies',
 'or',
 'altering',
 'history',
 '(',
 'but',
 'wait…',
 'do',
 "n't",
 'mess',
 'with',
 'the',
 'past',
 '!',
 '⚠️⏳',
 ')',
 '.']

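# Punctuation-based Tokenizer
Uses a regular expression to keep runs of word characters (including apostrophes) and the punctuation marks `. , ! ? ;` as tokens; every other character (dashes, `&`, parentheses, the ellipsis, emoji) is discarded.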

In [38]:
import re

# A token is either a run of word characters/apostrophes or a single one of . , ! ? ;
# Anything that matches neither alternative is skipped.
token_pattern = re.compile(r"[\w']+|[.,!?;]")

tokens = token_pattern.findall(text)
print(tokens)

['Time', 'travel', 'an', 'enigmatic', 'concept', 'intrigues', 'scientists', ',', 'philosophers', ',', 'dreamers', 'alike', '!', 'Imagine', 'zipping', 'through', 'spacetime', ',', 'witnessing', 'the', 'birth', 'of', 'galaxies', 'or', 'altering', 'history', 'but', 'wait', "don't", 'mess', 'with', 'the', 'past', '!', '.']


# Treebank Word Tokenizer

The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.

This tokenizer performs the following steps:

- split standard contractions, e.g. `don't` -> `do n't` and `they'll` -> `they 'll`

- treat most punctuation characters as separate tokens

- split off commas and single quotes when they are followed by whitespace

- separate periods that appear at the end of a line

In [39]:
from nltk.tokenize import TreebankWordTokenizer

# Two short samples that each contain a contraction and some punctuation.
s = "They'll save and invest more."
s1 = "hi, my name can't hello,"

# One tokenizer instance is enough for both samples.
treebank = TreebankWordTokenizer()
for sample in (s, s1):
    print(treebank.tokenize(sample))


['They', "'ll", 'save', 'and', 'invest', 'more', '.']
['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']


# Tweet Tokenizer

NLTK's `TweetTokenizer` class splits informal, social-media-style text into tokens. In the output below it separates dashes, the ellipsis and emoji into their own tokens and keeps the contraction "don't" as a single token.

In [40]:

# TweetTokenizer is a class in nltk.tokenize
from nltk.tokenize import TweetTokenizer

# Build a tokenizer with its default settings, then run it over the sample passage.
tk = TweetTokenizer()
tweet_tokens = tk.tokenize(text)
print(tweet_tokens)


['Time', 'travel', '—', 'an', 'enigmatic', 'concept', '—', 'intrigues', 'scientists', ',', 'philosophers', ',', '&', 'dreamers', 'alike', '!', '🚀', '🕰', '️', 'Imagine', 'zipping', 'through', 'spacetime', ',', 'witnessing', 'the', 'birth', 'of', 'galaxies', 'or', 'altering', 'history', '(', 'but', 'wait', '…', "don't", 'mess', 'with', 'the', 'past', '!', '⚠', '️', '⏳', ')', '.']


# Multi-Word Expression Tokenizer

Merges predefined multi-word expressions into single tokens (for example, `New York` becomes `New_York`).

In [47]:
from nltk.tokenize import MWETokenizer

# Word sequences that should be merged into one token each.
multiword_expressions = [('New', 'York'), ('machine', 'learning')]
tokenizer = MWETokenizer(multiword_expressions)

# NOTE: this rebinds `text`; cells that run after this one tokenize this sentence.
text = "I live in New York and love machine learning ."

# The tokenizer is given a list of tokens, so split on whitespace first.
whitespace_tokens = text.split()
tokens = tokenizer.tokenize(whitespace_tokens)
print(tokens)

['I', 'live', 'in', 'New_York', 'and', 'love', 'machine_learning', '.']


# TextBlob

Uses the TextBlob library for simple word tokenization; note that punctuation is not included in the resulting word list.

In [42]:
from textblob import TextBlob


In [43]:
# Wrap the current value of `text` in a TextBlob.
blob_object = TextBlob(text)

# `.words` exposes the blob's word-level tokens; print them under a label.
word_list = blob_object.words
print(" Word Tokenize :\n", word_list)

 Word Tokenize :
 ['I', 'live', 'in', 'New', 'York', 'and', 'love', 'machine', 'learning']


# spaCy Tokenizer

Tokenizer from the spaCy library, optimized for speed and accuracy.

In [44]:
# spaCy provides the tokenizer used in this cell.
import spacy

# Create a blank English ("en") language object; calling it on a string
# returns a document that can be iterated token by token.
# NOTE(review): presumably a blank pipeline carries only the tokenizer and no trained components — confirm.
nlp = spacy.blank("en")

doc = nlp(text)

# Print each token of the document on its own line.
for token in doc:
    print(token)



I
live
in
New
York
and
love
machine
learning
.


# Gensim word tokenizer

Uses `gensim.utils.tokenize()` to split the text into word tokens; punctuation is not included in the output.

In [45]:
from gensim.utils import tokenize

# Gather every token produced by tokenize() into a list so it can be printed.
tokens = [tok for tok in tokenize(text)]
print(tokens)

['I', 'live', 'in', 'New', 'York', 'and', 'love', 'machine', 'learning']


# Tokenization with Keras

Keras provides `text_to_word_sequence()` for simple word tokenization in deep learning preprocessing; as the output below shows, it lowercases the text and drops punctuation.

In [46]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

# Convert the current value of `text` into a list of word tokens.
tokens = text_to_word_sequence(text)

# Print the tokens after a fixed label.
label = "Tokenized Words:"
print(label, tokens)


Tokenized Words: ['i', 'live', 'in', 'new', 'york', 'and', 'love', 'machine', 'learning']
