**Perform tokenization (Whitespace, Punctuation-based, Treebank, Tweet, MWE) using the NLTK
library. Use the Porter stemmer and the Snowball stemmer for stemming. Use any technique for
lemmatization.**

In [None]:
import nltk
from nltk.tokenize import (word_tokenize, sent_tokenize, TreebankWordTokenizer,
                            MWETokenizer, TweetTokenizer, WhitespaceTokenizer)
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import string

In [None]:
!pip install nltk
import nltk

# Download required datasets (including 'punkt_tab')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab') # Download the missing data package




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Two-sentence sample input shared by every tokenizer, stemmer and lemmatizer below.
# Adjacent string literals are concatenated by Python into a single string.
text = (
    "NLTK is a leading platform for building Python programs to work with human language data! "
    "It's widely used for NLP."
)



In [None]:
# Tokenizer setup: one tokenizer per splitting strategy.
whitespace_tokenizer = WhitespaceTokenizer()  # splits on whitespace only, so "data!" stays one token

# Punctuation-based tokenization: wordpunct_tokenize separates runs of word
# characters from runs of punctuation. word_tokenize is NOT used here because it
# is NLTK's Treebank-style tokenizer (it keeps clitics such as "'s" together),
# so labelling it "punctuation-based" would be misleading.
punctuation_tokenizer = nltk.tokenize.wordpunct_tokenize

tweet_tokenizer = TweetTokenizer()
# Merges the listed word sequences into single tokens (joined with "_"),
# e.g. 'human', 'language' -> 'human_language'.
mwe_tokenizer = MWETokenizer([('human', 'language'), ('widely', 'used')])

In [None]:
# Run each configured tokenizer over the sample text.
# Object-style tokenizers expose .tokenize():
tokens_whitespace = whitespace_tokenizer.tokenize(text)
tokens_tweet = tweet_tokenizer.tokenize(text)

# punctuation_tokenizer is a plain callable, so it is invoked directly:
tokens_punctuation = punctuation_tokenizer(text)

# The MWE tokenizer is given the word_tokenize output (a token list) rather
# than the raw string:
tokens_mwe = mwe_tokenizer.tokenize(word_tokenize(text))

In [None]:
# Stemming: reduce every token to a crude root form with two different stemmers.
porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer("english")

# Both stemmers are mapped over the same token list, so the two result lists
# line up position by position for easy comparison.
porter_stems = list(map(porter_stemmer.stem, tokens_punctuation))
snowball_stems = list(map(snowball_stemmer.stem, tokens_punctuation))


In [None]:
# Lemmatization with WordNet. No part-of-speech tag is passed, so the
# lemmatizer's default POS is applied to every token.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, tokens_punctuation))


In [None]:
# Output results: one labelled line per tokenization / normalization technique.
print("Whitespace Tokenization:", tokens_whitespace)
print("Punctuation-based Tokenization:", tokens_punctuation)
# The task statement also asks for Treebank tokenization, and
# TreebankWordTokenizer is imported but was never exercised. It is computed
# inline here so this cell only depends on names that already exist.
print("Treebank Tokenization:", TreebankWordTokenizer().tokenize(text))

print("Tweet Tokenization:", tokens_tweet)
print("MWE Tokenization:", tokens_mwe)
print("Porter Stemming:", porter_stems)
print("Snowball Stemming:", snowball_stems)
print("Lemmatization:", lemmatized_words)

Whitespace Tokenization: ['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', 'to', 'work', 'with', 'human', 'language', 'data!', "It's", 'widely', 'used', 'for', 'NLP.']
Punctuation-based Tokenization: ['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', 'to', 'work', 'with', 'human', 'language', 'data', '!', 'It', "'s", 'widely', 'used', 'for', 'NLP', '.']
Tweet Tokenization: ['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', 'to', 'work', 'with', 'human', 'language', 'data', '!', "It's", 'widely', 'used', 'for', 'NLP', '.']
MWE Tokenization: ['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', 'to', 'work', 'with', 'human_language', 'data', '!', 'It', "'s", 'widely_used', 'for', 'NLP', '.']
Porter Stemming: ['nltk', 'is', 'a', 'lead', 'platform', 'for', 'build', 'python', 'program', 'to', 'work', 'with', 'human', 'languag', 'data', '!', 'it', "'s", 'wide', 'u