In [2]:
# Small three-sentence sample text used by the tokenizer cells below.
# It contains a "!." sequence and a possessive "'s", and ends with a newline.
corpus = (
    "Ram like to drink the apple juice.\n"
    "Shaym like to drink the mango juice!.\n"
    "Rani like to drink the orange's juice.\n"
)

In [20]:
# print() renders the embedded newlines, so each sentence appears on its own line.
print(corpus)

Ram like to drink the apple juice.
Shaym like to drink the mango juice!.
Rani like to drink the orange's juice.



# Tokenization
## Paragraph to sentence

In [21]:
# Paragraph -> sentences: split the corpus string into a list of sentence strings.
# NOTE(review): sent_tokenize relies on NLTK's Punkt sentence model data; a fresh
# environment may need a one-time nltk.download(...) first — confirm the resource name
# for the installed NLTK version.
from nltk.tokenize import sent_tokenize
docs = sent_tokenize(corpus)

In [22]:
# Display the list of sentences produced by sent_tokenize.
docs    

['Ram like to drink the apple juice.',
 'Shaym like to drink the mango juice!.',
 "Rani like to drink the orange's juice."]

## Paragraph to words
## Sentences to words

In [23]:
# Paragraph -> words: tokenize the whole corpus string in a single call.
# In the output, punctuation marks and the possessive "'s" come out as separate tokens.
from nltk.tokenize import word_tokenize
word_tokenize(corpus)

['Ram',
 'like',
 'to',
 'drink',
 'the',
 'apple',
 'juice',
 '.',
 'Shaym',
 'like',
 'to',
 'drink',
 'the',
 'mango',
 'juice',
 '!',
 '.',
 'Rani',
 'like',
 'to',
 'drink',
 'the',
 'orange',
 "'s",
 'juice',
 '.']

In [24]:
# Sentences -> words: tokenize each sentence separately and print one token list per line.
for sentence in docs:
    tokens = word_tokenize(sentence)
    print(tokens)

['Ram', 'like', 'to', 'drink', 'the', 'apple', 'juice', '.']
['Shaym', 'like', 'to', 'drink', 'the', 'mango', 'juice', '!', '.']
['Rani', 'like', 'to', 'drink', 'the', 'orange', "'s", 'juice', '.']


# Using TreebankWordTokenizer

In [None]:
# Tokenize the corpus with TreebankWordTokenizer.
# Note that it is NOT better at punctuation than word_tokenize on this input: the
# tokenizer is applied to the whole multi-sentence string, and the output below keeps
# the first sentence's period attached ('juice.'), whereas word_tokenize(corpus)
# returned 'juice' and '.' as separate tokens.
# NOTE(review): the NLTK docs describe this tokenizer as expecting text that has already
# been split into sentences — consider tokenizing each item of `docs` instead; confirm.
from nltk.tokenize import TreebankWordTokenizer 
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(corpus)

['Ram',
 'like',
 'to',
 'drink',
 'the',
 'apple',
 'juice.',
 'Shaym',
 'like',
 'to',
 'drink',
 'the',
 'mango',
 'juice',
 '!',
 '.',
 'Rani',
 'like',
 'to',
 'drink',
 'the',
 'orange',
 "'s",
 'juice',
 '.']

# ToktokTokenizer

In [None]:
# ToktokTokenizer: a simple, fast, punctuation-aware tokenizer from NLTK.
# As the output below shows, it splits off quotes, dashes, commas, colons and the curly
# apostrophe inside contractions (I’ll -> 'I', '’', 'll'), but leaves periods attached
# to the preceding word ('p.m.', 'then.', 'etc.').
# NOTE(review): the NLTK docs describe it as expecting one sentence per line with only
# the final period tokenized — confirm before using it on multi-sentence text.

from nltk.tokenize import ToktokTokenizer
tok = ToktokTokenizer()
tok.tokenize("""
He said, “I’ll be there at 5:30 p.m.—don’t wait!”
Can’t believe she replied, “Cool… see you then.”
They’ve planned everything—flights, hotels, etc.
Isn’t it amazing? Let’s hope it all works out!
""")


['He',
 'said',
 ',',
 '“',
 'I',
 '’',
 'll',
 'be',
 'there',
 'at',
 '5',
 ':',
 '30',
 'p.m.',
 '—',
 'don',
 '’',
 't',
 'wait',
 '!',
 '”',
 'Can',
 '’',
 't',
 'believe',
 'she',
 'replied',
 ',',
 '“',
 'Cool',
 '…',
 'see',
 'you',
 'then.',
 '”',
 'They',
 '’',
 've',
 'planned',
 'everything',
 '—',
 'flights',
 ',',
 'hotels',
 ',',
 'etc.',
 'Isn',
 '’',
 't',
 'it',
 'amazing',
 '?',
 'Let',
 '’',
 's',
 'hope',
 'it',
 'all',
 'works',
 'out',
 '!']

# TweetTokenizer

In [12]:
# TweetTokenizer targets tweets and other informal text: as the output below shows, it
# keeps hashtags, @mentions, URLs and emoji as single tokens.
# The instance is bound to `tweet_tokenizer`, not `str`: assigning to `str` would shadow
# Python's built-in string type for every cell executed afterwards.
# If an earlier version of this cell already ran in the current kernel, restart the
# kernel (or run `del str`) to restore the built-in.
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()
tweet_tokenizer.tokenize("""
Just launched our new NLP model! 🚀
It’s faster, smarter, & open-source.
Check it out here: https://nlp.ai/demo
Thoughts? Drop a comment below! @OpenAI #AI #NLP #MachineLearning
""")


['Just',
 'launched',
 'our',
 'new',
 'NLP',
 'model',
 '!',
 '🚀',
 'It',
 '’',
 's',
 'faster',
 ',',
 'smarter',
 ',',
 '&',
 'open-source',
 '.',
 'Check',
 'it',
 'out',
 'here',
 ':',
 'https://nlp.ai/demo',
 'Thoughts',
 '?',
 'Drop',
 'a',
 'comment',
 'below',
 '!',
 '@OpenAI',
 '#AI',
 '#NLP',
 '#MachineLearning']

In [10]:
# Syllable-level tokenization with LegalitySyllableTokenizer.
# The tokenizer is built from a word list (here the NLTK `words` corpus) and then applied
# to every word token of the sentence, giving one list of syllables per token.
# NOTE(review): per the NLTK docs it syllabifies using the Legality Principle with onset
# maximization, based on the supplied word list — confirm.

from nltk.corpus import words
from nltk.tokenize import LegalitySyllableTokenizer, word_tokenize

text = "This is a wonderful sentence."
text_Words = word_tokenize(text)
LP = LegalitySyllableTokenizer(words.words())  # `LP` is used again by a later cell
list(map(LP.tokenize, text_Words))

[['This'], ['is'], ['a'], ['won', 'der', 'ful'], ['sen', 'ten', 'ce'], ['.']]

In [None]:
# Peek at the first 10 entries of the NLTK `words` corpus, i.e. the word list that was
# passed to LegalitySyllableTokenizer.
words.words()[:10]

['A',
 'a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'Aani',
 'aardvark',
 'aardwolf',
 'Aaron']

In [17]:
# Printing the tokenizer shows only the default object repr (class path plus a memory
# address that changes between runs); it carries no tokenization result.
print(LP)

<nltk.tokenize.legality_principle.LegalitySyllableTokenizer object at 0x0000029A1B5EC170>
