In [1]:
import nltk

In [35]:
# Sample text for the tokenizer demos: repeated punctuation, a contraction,
# an em dash, quoted speech and an ellipsis.
# Each line is its own literal so that the trailing space before the newline
# (present on the first three lines) is visible and cannot be stripped by an
# editor.
corpus = (
    "Wow!!! Is this really happening? \n"
    "I can't believe it—amazing, right? \n"
    'He said: "You must be joking!" \n'
    "Haha, unbelievable...\n"
)
print(corpus)

Wow!!! Is this really happening? 
I can't believe it—amazing, right? 
He said: "You must be joking!" 
Haha, unbelievable...



### `-> Sentence tokenizer`

- divides the whole corpus into sentences based on <b>`punctuation marks`</b>

In [36]:
# Split the corpus into sentences.
# NOTE: sent_tokenize relies on NLTK's Punkt sentence model; if it is not
# installed, fetch it once with nltk.download('punkt') (or 'punkt_tab' on
# newer NLTK releases).
from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(corpus)

# Print each sentence next to its zero-based position in the result list.
for position, sent_text in enumerate(sentences):
    print(f'{position} -> {sent_text}')

0 -> Wow!!!
1 -> Is this really happening?
2 -> I can't believe it—amazing, right?
3 -> He said: "You must be joking!"
4 -> Haha, unbelievable...


### `-> Word tokenizer`

- divides the whole corpus into individual tokens (words and punctuation) based on <b>`spaces and punctuation marks`</b>

In [37]:
# Split the corpus into word-level tokens.
# In the recorded output each '!' is its own token, the contraction "can't"
# comes out as 'ca' + "n't", double quotes are rewritten as `` and '', and
# the em dash stays attached ('it—amazing').
from nltk.tokenize import word_tokenize

words = word_tokenize(corpus)

# Print each token next to its zero-based position in the result list.
for position, token in enumerate(words):
    print(f'{position} -> {token}')

0 -> Wow
1 -> !
2 -> !
3 -> !
4 -> Is
5 -> this
6 -> really
7 -> happening
8 -> ?
9 -> I
10 -> ca
11 -> n't
12 -> believe
13 -> it—amazing
14 -> ,
15 -> right
16 -> ?
17 -> He
18 -> said
19 -> :
20 -> ``
21 -> You
22 -> must
23 -> be
24 -> joking
25 -> !
26 -> ''
27 -> Haha
28 -> ,
29 -> unbelievable
30 -> ...


### `-> Word punct tokenizer`

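- divides the corpus with a simple regular expression into runs of word characters and runs of punctuation, so <b>`every punctuation mark`</b> (including the apostrophe in contractions) is split away from the words; unlike the smarter tokenizers, it has no special handling for <b>`abbreviations`</b>, <b>`initials`</b>, and other complex cases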

In [38]:
# Split the corpus into alternating runs of word characters and punctuation.
# In the recorded output the apostrophe is separated ("can't" -> 'can', "'",
# 't'), the em dash becomes its own token, and adjacent punctuation stays
# together ('!!!', '!"').
from nltk.tokenize import wordpunct_tokenize

wordspunct = wordpunct_tokenize(corpus)

# Print each token next to its zero-based position in the result list.
for position, token in enumerate(wordspunct):
    print(f'{position} -> {token}')

0 -> Wow
1 -> !!!
2 -> Is
3 -> this
4 -> really
5 -> happening
6 -> ?
7 -> I
8 -> can
9 -> '
10 -> t
11 -> believe
12 -> it
13 -> —
14 -> amazing
15 -> ,
16 -> right
17 -> ?
18 -> He
19 -> said
20 -> :
21 -> "
22 -> You
23 -> must
24 -> be
25 -> joking
26 -> !"
27 -> Haha
28 -> ,
29 -> unbelievable
30 -> ...


In [39]:
# Tokenize the corpus with the Penn Treebank word tokenizer.
# Unlike the function-style tokenizers, this one is a class: build an
# instance first, then call its tokenize() method on the text.
# For this corpus the recorded output is the same token list that
# word_tokenize produced (e.g. 'ca' + "n't", `` and '' for double quotes).
from nltk.tokenize import TreebankWordTokenizer

tbwt = TreebankWordTokenizer()
wordstreebank = tbwt.tokenize(corpus)

# Print each token next to its zero-based position in the result list.
for position, token in enumerate(wordstreebank):
    print(f'{position} -> {token}')

0 -> Wow
1 -> !
2 -> !
3 -> !
4 -> Is
5 -> this
6 -> really
7 -> happening
8 -> ?
9 -> I
10 -> ca
11 -> n't
12 -> believe
13 -> it—amazing
14 -> ,
15 -> right
16 -> ?
17 -> He
18 -> said
19 -> :
20 -> ``
21 -> You
22 -> must
23 -> be
24 -> joking
25 -> !
26 -> ''
27 -> Haha
28 -> ,
29 -> unbelievable
30 -> ...
