In [5]:
import nltk
# One-time setup: uncomment the next line to fetch NLTK's data packages.
# NOTE(review): 'all' downloads every NLTK resource; the tokenizers used in this
# notebook presumably only need the Punkt sentence models — confirm and narrow this.
#nltk.download('all')
from nltk.tokenize import sent_tokenize

In [19]:
# Sample paragraph packed with tokenizer edge cases: contractions, possessives,
# an em dash, a decimal number, a hyphenated word, an e-mail address, an ellipsis,
# currency/percent signs and a dotted abbreviation.
corpus = """ Hello! I'm testing tokenization. Let's see how it works.
NLTK's tokenizer is quite powerful—it's useful for NLP tasks.
Python 3.9 is great, isn't it? What about e-mail addresses like test@example.com?
Hey... check this out: $50, 100%, and U.S.A. are tricky cases. """

In [20]:
# print() is used on purpose: it renders the embedded line breaks, whereas a bare
# `corpus` expression would display the string's repr with "\n" escapes.
print(corpus)

 Hello! I'm testing tokenization. Let's see how it works.
NLTK's tokenizer is quite powerful—it's useful for NLP tasks.
Python 3.9 is great, isn't it? What about e-mail addresses like test@example.com?
Hey... check this out: $50, 100%, and U.S.A. are tricky cases. 


# Tokenization 
## Paragraph --> sentences

In [21]:
# Split the paragraph into a list of sentence strings and display it.
documents = sent_tokenize(corpus)
documents

[' Hello!',
 "I'm testing tokenization.",
 "Let's see how it works.",
 "NLTK's tokenizer is quite powerful—it's useful for NLP tasks.",
 "Python 3.9 is great, isn't it?",
 'What about e-mail addresses like test@example.com?',
 'Hey... check this out: $50, 100%, and U.S.A. are tricky cases.']

In [22]:
# Confirm the container type returned by sent_tokenize (a plain Python list).
type(documents)

list

In [23]:
# Show one sentence per line; each loop item is a single sentence string.
for sentence in documents:
    print(sentence)

 Hello!
I'm testing tokenization.
Let's see how it works.
NLTK's tokenizer is quite powerful—it's useful for NLP tasks.
Python 3.9 is great, isn't it?
What about e-mail addresses like test@example.com?
Hey... check this out: $50, 100%, and U.S.A. are tricky cases.


## Paragraph --> words

In [None]:
from nltk.tokenize import word_tokenize #Splits text into words, handling punctuation, contractions, and special cases intelligently.

In [25]:
# Tokenize the whole paragraph at once. In the output, contractions are split into
# clitics ("'m", "'s", "n't") while '3.9', 'e-mail' and 'U.S.A.' stay whole; the em dash
# in 'powerful—it' is not separated, and the e-mail address is broken at '@'.
word_tokenize(corpus)

['Hello',
 '!',
 'I',
 "'m",
 'testing',
 'tokenization',
 '.',
 'Let',
 "'s",
 'see',
 'how',
 'it',
 'works',
 '.',
 'NLTK',
 "'s",
 'tokenizer',
 'is',
 'quite',
 'powerful—it',
 "'s",
 'useful',
 'for',
 'NLP',
 'tasks',
 '.',
 'Python',
 '3.9',
 'is',
 'great',
 ',',
 'is',
 "n't",
 'it',
 '?',
 'What',
 'about',
 'e-mail',
 'addresses',
 'like',
 'test',
 '@',
 'example.com',
 '?',
 'Hey',
 '...',
 'check',
 'this',
 'out',
 ':',
 '$',
 '50',
 ',',
 '100',
 '%',
 ',',
 'and',
 'U.S.A.',
 'are',
 'tricky',
 'cases',
 '.']

In [26]:
# Word-tokenize each sentence separately; every printed list holds one sentence's tokens.
for sentence in documents:
    sentence_tokens = word_tokenize(sentence)
    print(sentence_tokens)

['Hello', '!']
['I', "'m", 'testing', 'tokenization', '.']
['Let', "'s", 'see', 'how', 'it', 'works', '.']
['NLTK', "'s", 'tokenizer', 'is', 'quite', 'powerful—it', "'s", 'useful', 'for', 'NLP', 'tasks', '.']
['Python', '3.9', 'is', 'great', ',', 'is', "n't", 'it', '?']
['What', 'about', 'e-mail', 'addresses', 'like', 'test', '@', 'example.com', '?']
['Hey', '...', 'check', 'this', 'out', ':', '$', '50', ',', '100', '%', ',', 'and', 'U.S.A.', 'are', 'tricky', 'cases', '.']


In [None]:
# wordpunct_tokenize splits text into runs of word characters and runs of punctuation
# (NLTK documents the pattern as \w+|[^\w\s]+). It breaks at every switch between the
# two, so contractions, decimals and dotted abbreviations are pulled apart, while
# adjacent punctuation characters stay together in a single token.
from nltk.tokenize import wordpunct_tokenize

In [28]:
# Same paragraph through wordpunct_tokenize for comparison with word_tokenize:
# in the output "I'm" becomes 'I', "'", 'm'; '3.9' becomes '3', '.', '9';
# 'U.S.A.' is split letter by letter; '...' and '%,' each remain one token.
wordpunct_tokenize(corpus)

['Hello',
 '!',
 'I',
 "'",
 'm',
 'testing',
 'tokenization',
 '.',
 'Let',
 "'",
 's',
 'see',
 'how',
 'it',
 'works',
 '.',
 'NLTK',
 "'",
 's',
 'tokenizer',
 'is',
 'quite',
 'powerful',
 '—',
 'it',
 "'",
 's',
 'useful',
 'for',
 'NLP',
 'tasks',
 '.',
 'Python',
 '3',
 '.',
 '9',
 'is',
 'great',
 ',',
 'isn',
 "'",
 't',
 'it',
 '?',
 'What',
 'about',
 'e',
 '-',
 'mail',
 'addresses',
 'like',
 'test',
 '@',
 'example',
 '.',
 'com',
 '?',
 'Hey',
 '...',
 'check',
 'this',
 'out',
 ':',
 '$',
 '50',
 ',',
 '100',
 '%,',
 'and',
 'U',
 '.',
 'S',
 '.',
 'A',
 '.',
 'are',
 'tricky',
 'cases',
 '.']

In [32]:
from nltk.tokenize import TreebankWordDetokenizer #The TreebankWordDetokenizer in NLTK is used to reconstruct sentences from tokenized words while ensuring proper spacing and punctuation.

In [34]:
# NOTE(review): despite its name, `tokenizer` holds a *de*tokenizer — it joins tokens
# back into text. The name is kept because a later cell refers to it.
tokenizer = TreebankWordDetokenizer()

In [41]:
# Flat list of word-level tokens for the entire paragraph (input for detokenization).
token = word_tokenize(corpus)
token

['Hello',
 '!',
 'I',
 "'m",
 'testing',
 'tokenization',
 '.',
 'Let',
 "'s",
 'see',
 'how',
 'it',
 'works',
 '.',
 'NLTK',
 "'s",
 'tokenizer',
 'is',
 'quite',
 'powerful—it',
 "'s",
 'useful',
 'for',
 'NLP',
 'tasks',
 '.',
 'Python',
 '3.9',
 'is',
 'great',
 ',',
 'is',
 "n't",
 'it',
 '?',
 'What',
 'about',
 'e-mail',
 'addresses',
 'like',
 'test',
 '@',
 'example.com',
 '?',
 'Hey',
 '...',
 'check',
 'this',
 'out',
 ':',
 '$',
 '50',
 ',',
 '100',
 '%',
 ',',
 'and',
 'U.S.A.',
 'are',
 'tricky',
 'cases',
 '.']

In [42]:
# Rebuild a single string from the flat token list.
# The result is not identical to `corpus`: sentence-internal periods keep a space in
# front of them ("tokenization ."), the e-mail address stays split ("test @ example.com"),
# "Hey... check" loses its space, and the original line breaks are gone.
# NOTE(review): the stray " ." looks like a consequence of detokenizing the whole
# paragraph as one token list — the detokenizer appears to re-attach only the final
# period. Detokenizing sentence by sentence may give cleaner text — confirm.
detokenized_text = tokenizer.detokenize(token)
detokenized_text

"Hello! I'm testing tokenization . Let's see how it works . NLTK's tokenizer is quite powerful—it's useful for NLP tasks . Python 3.9 is great, isn't it? What about e-mail addresses like test @ example.com? Hey...check this out: $50, 100%, and U.S.A. are tricky cases."