In [8]:
import nltk
from nltk.tokenize import (word_tokenize, sent_tokenize, RegexpTokenizer, TweetTokenizer, MWETokenizer, TreebankWordTokenizer)
from textblob import TextBlob
import spacy
from gensim.utils import tokenize
from tensorflow.keras.preprocessing.text import text_to_word_sequence

In [15]:
# Install required packages
import os
os.system("pip install spacy gensim tensorflow")
nltk.download('punkt')
nltk.download('punkt_tab')
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [16]:
# Paragraph with special characters, emojis, negations, and punctuations
text = ("The environment 🌍💚 doesn't deserve such ignorance! Natural disasters, "
        "like floods, hurricanes, and wildfires, are becoming common—let's act "
        "NOW. Please don't delay action; it's time for change! 🌱🌦️")

print("Original Text:\n", text)


Original Text:
 The environment 🌍💚 doesn't deserve such ignorance! Natural disasters, like floods, hurricanes, and wildfires, are becoming common—let's act NOW. Please don't delay action; it's time for change! 🌱🌦️


In [17]:
print("\nTokenizations:")


Tokenizations:


1. Word Tokenization

Insight: This tokenization method breaks text into individual words while preserving punctuation as separate tokens.

---


Applications: Suitable for basic tasks such as counting word frequencies, text preprocessing for sentiment analysis, and feature extraction for classification tasks.

In [19]:
# a. Word Tokenization
print("\nWord Tokenization (nltk.word_tokenize):")
print(word_tokenize(text))



Word Tokenization (nltk.word_tokenize):
['The', 'environment', '🌍💚', 'does', "n't", 'deserve', 'such', 'ignorance', '!', 'Natural', 'disasters', ',', 'like', 'floods', ',', 'hurricanes', ',', 'and', 'wildfires', ',', 'are', 'becoming', 'common—let', "'s", 'act', 'NOW', '.', 'Please', 'do', "n't", 'delay', 'action', ';', 'it', "'s", 'time', 'for', 'change', '!', '🌱🌦️']


2. Sentence Tokenization (nltk.sent_tokenize)

Insight: It splits text into meaningful sentences based on punctuation and grammar rules.

---


Applications: Useful for text summarization, document parsing, and chatbot responses where understanding sentence boundaries is crucial.

In [20]:
# b. Sentence Tokenization
print("\nSentence Tokenization (nltk.sent_tokenize):")
print(sent_tokenize(text))


Sentence Tokenization (nltk.sent_tokenize):
["The environment 🌍💚 doesn't deserve such ignorance!", "Natural disasters, like floods, hurricanes, and wildfires, are becoming common—let's act NOW.", "Please don't delay action; it's time for change!", '🌱🌦️']


3. Punctuation-based Tokenizer (RegexpTokenizer)

Insight: Removes punctuation and focuses on capturing clean words.

---


Applications: Ideal for clean text analysis when punctuation is not needed, such as spam filtering or keyword extraction.


In [21]:
# c. Punctuation-based Tokenization
print("\nPunctuation-based Tokenization (RegexpTokenizer):")
regex_tokenizer = RegexpTokenizer(r'\w+')
print(regex_tokenizer.tokenize(text))


Punctuation-based Tokenization (RegexpTokenizer):
['The', 'environment', 'doesn', 't', 'deserve', 'such', 'ignorance', 'Natural', 'disasters', 'like', 'floods', 'hurricanes', 'and', 'wildfires', 'are', 'becoming', 'common', 'let', 's', 'act', 'NOW', 'Please', 'don', 't', 'delay', 'action', 'it', 's', 'time', 'for', 'change']


4. Treebank Word Tokenizer (nltk.TreebankWordTokenizer)

Insight: Handles contractions and special cases like hyphenated words.

---


Applications: Suitable for formal text processing in fields like legal document analysis or news article parsing.

In [22]:
# d. Treebank Word Tokenizer
print("\nTreebank Word Tokenization:")
treebank_tokenizer = TreebankWordTokenizer()
print(treebank_tokenizer.tokenize(text))



Treebank Word Tokenization:
['The', 'environment', '🌍💚', 'does', "n't", 'deserve', 'such', 'ignorance', '!', 'Natural', 'disasters', ',', 'like', 'floods', ',', 'hurricanes', ',', 'and', 'wildfires', ',', 'are', 'becoming', 'common—let', "'s", 'act', 'NOW.', 'Please', 'do', "n't", 'delay', 'action', ';', 'it', "'s", 'time', 'for', 'change', '!', '🌱🌦️']


5. Tweet Tokenizer (nltk.TweetTokenizer)

Insight: Efficiently handles social media-specific text, including emojis, hashtags, and URLs.

---


Applications: Best suited for mining social media platforms like Twitter for sentiment analysis, trend detection, and brand monitoring.

In [23]:
# e. Tweet Tokenizer
print("\nTweet Tokenization (nltk.TweetTokenizer):")
tweet_tokenizer = TweetTokenizer()
print(tweet_tokenizer.tokenize(text))


Tweet Tokenization (nltk.TweetTokenizer):
['The', 'environment', '🌍', '💚', "doesn't", 'deserve', 'such', 'ignorance', '!', 'Natural', 'disasters', ',', 'like', 'floods', ',', 'hurricanes', ',', 'and', 'wildfires', ',', 'are', 'becoming', 'common', '—', "let's", 'act', 'NOW', '.', 'Please', "don't", 'delay', 'action', ';', "it's", 'time', 'for', 'change', '!', '🌱', '🌦', '️']


6. Multi-Word Expression (MWE) Tokenizer (nltk.MWETokenizer)

Insight: Recognizes predefined multi-word expressions as single tokens.

---


Applications: Useful for domain-specific tasks like entity recognition, phrase detection, and financial market analysis.

In [24]:
# f. Multi-Word Expression Tokenizer
print("\nMulti-Word Expression Tokenization:")
mwe_tokenizer = MWETokenizer([("Natural", "disasters"), ("time", "for")])
print(mwe_tokenizer.tokenize(text.split()))



Multi-Word Expression Tokenization:
['The', 'environment', '🌍💚', "doesn't", 'deserve', 'such', 'ignorance!', 'Natural', 'disasters,', 'like', 'floods,', 'hurricanes,', 'and', 'wildfires,', 'are', 'becoming', "common—let's", 'act', 'NOW.', 'Please', "don't", 'delay', 'action;', "it's", 'time_for', 'change!', '🌱🌦️']


7. TextBlob Word Tokenizer

Insight: Simplifies tokenization while offering integration with additional NLP features like part-of-speech tagging.

---


Applications: Suitable for rapid prototyping of text classification and sentiment analysis projects.


In [25]:
# g. TextBlob Word Tokenizer
print("\nTextBlob Tokenization:")
blob = TextBlob(text)
print(blob.words)



TextBlob Tokenization:
['The', 'environment', '🌍💚', 'does', "n't", 'deserve', 'such', 'ignorance', 'Natural', 'disasters', 'like', 'floods', 'hurricanes', 'and', 'wildfires', 'are', 'becoming', 'common—let', "'s", 'act', 'NOW', 'Please', 'do', "n't", 'delay', 'action', 'it', "'s", 'time', 'for', 'change', '🌱🌦️']


8. spaCy Tokenizer

Insight: A highly optimized tokenizer for large-scale NLP tasks, capable of recognizing complex linguistic structures.

---


Applications: Ideal for creating NLP pipelines, such as named entity recognition, dependency parsing, and text classification.

In [26]:
# h. spaCy Tokenizer
print("\nspaCy Tokenization:")
doc = nlp(text)
print([token.text for token in doc])


spaCy Tokenization:
['The', 'environment', '🌍', '💚', 'does', "n't", 'deserve', 'such', 'ignorance', '!', 'Natural', 'disasters', ',', 'like', 'floods', ',', 'hurricanes', ',', 'and', 'wildfires', ',', 'are', 'becoming', 'common', '—', 'let', "'s", 'act', 'NOW', '.', 'Please', 'do', "n't", 'delay', 'action', ';', 'it', "'s", 'time', 'for', 'change', '!', '🌱', '🌦', '️']


9. Gensim Word Tokenizer

Insight: Focuses on generating tokens for text input in topic modeling and word embedding tasks.

---


Applications: Best for training word embeddings and building models for topic classification and similarity detection.

In [27]:
# i. Gensim Word Tokenizer
print("\nGensim Tokenization:")
print(list(tokenize(text)))



Gensim Tokenization:
['The', 'environment', 'doesn', 't', 'deserve', 'such', 'ignorance', 'Natural', 'disasters', 'like', 'floods', 'hurricanes', 'and', 'wildfires', 'are', 'becoming', 'common', 'let', 's', 'act', 'NOW', 'Please', 'don', 't', 'delay', 'action', 'it', 's', 'time', 'for', 'change']


10. Keras Tokenization (text_to_word_sequence)

Insight: Designed for preprocessing text in deep learning pipelines by converting text into word sequences.

---


Applications: Essential for preparing input text for neural networks, such as LSTMs or CNNs, for tasks like text generation and sentiment analysis.

In [28]:
# j. Tokenization with Keras
print("\nKeras Tokenization:")
print(text_to_word_sequence(text))


Keras Tokenization:
['the', 'environment', '🌍💚', "doesn't", 'deserve', 'such', 'ignorance', 'natural', 'disasters', 'like', 'floods', 'hurricanes', 'and', 'wildfires', 'are', 'becoming', "common—let's", 'act', 'now', 'please', "don't", 'delay', 'action', "it's", 'time', 'for', 'change', '🌱🌦️']
