In [1]:
corpus = """Hello everyone! I am Sowmya.
I am an AI's enthusiast.
I am currently learning NLP.
NLP stands for Natural Language Processing."""

print(corpus)

Hello everyone! I am Sowmya.
I am an AI's enthusiast.
I am currently learning NLP.
NLP stands for Natural Language Processing.


In [9]:
import nltk
nltk.download('punkt')

In [2]:
from nltk.tokenize import sent_tokenize
documents = sent_tokenize(corpus)
documents

['Hello everyone!',
 'I am Sowmya.',
 "I am an AI's enthusiast.",
 'I am currently learning NLP.',
 'NLP stands for Natural Language Processing.']

In [3]:
from nltk.tokenize import word_tokenize
words = [word_tokenize(document) for document in documents]
words

[['Hello', 'everyone', '!'],
 ['I', 'am', 'Sowmya', '.'],
 ['I', 'am', 'an', 'AI', "'s", 'enthusiast', '.'],
 ['I', 'am', 'currently', 'learning', 'NLP', '.'],
 ['NLP', 'stands', 'for', 'Natural', 'Language', 'Processing', '.']]

In [4]:
from nltk.tokenize import wordpunct_tokenize
words_punct = [wordpunct_tokenize(document) for document in documents]
words_punct

[['Hello', 'everyone', '!'],
 ['I', 'am', 'Sowmya', '.'],
 ['I', 'am', 'an', 'AI', "'", 's', 'enthusiast', '.'],
 ['I', 'am', 'currently', 'learning', 'NLP', '.'],
 ['NLP', 'stands', 'for', 'Natural', 'Language', 'Processing', '.']]

In [5]:
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
words_treebank = tokenizer.tokenize(corpus) 
words_treebank

['Hello',
 'everyone',
 '!',
 'I',
 'am',
 'Sowmya.',
 'I',
 'am',
 'an',
 'AI',
 "'s",
 'enthusiast.',
 'I',
 'am',
 'currently',
 'learning',
 'NLP.',
 'NLP',
 'stands',
 'for',
 'Natural',
 'Language',
 'Processing',
 '.']

In [6]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
words_regexp = [tokenizer.tokenize(document) for document in documents]
words_regexp

[['Hello', 'everyone'],
 ['I', 'am', 'Sowmya'],
 ['I', 'am', 'an', 'AI', 's', 'enthusiast'],
 ['I', 'am', 'currently', 'learning', 'NLP'],
 ['NLP', 'stands', 'for', 'Natural', 'Language', 'Processing']]

In [8]:
from nltk.tokenize import MWETokenizer
tokenizer = MWETokenizer([('Natural', 'Language', 'Processing')])
words_mwe = [tokenizer.tokenize(word) for word in words]
words_mwe

[['Hello', 'everyone', '!'],
 ['I', 'am', 'Sowmya', '.'],
 ['I', 'am', 'an', 'AI', "'s", 'enthusiast', '.'],
 ['I', 'am', 'currently', 'learning', 'NLP', '.'],
 ['NLP', 'stands', 'for', 'Natural_Language_Processing', '.']]

In [9]:
from nltk.tokenize import WhitespaceTokenizer
tokenizer = WhitespaceTokenizer()
words_whitespace = [tokenizer.tokenize(document) for document in documents]
words_whitespace

[['Hello', 'everyone!'],
 ['I', 'am', 'Sowmya.'],
 ['I', 'am', 'an', "AI's", 'enthusiast.'],
 ['I', 'am', 'currently', 'learning', 'NLP.'],
 ['NLP', 'stands', 'for', 'Natural', 'Language', 'Processing.']]

In [13]:
import pandas as pd
tokenizer_df = pd.DataFrame()

tokenizer_df['Original_words'] = words
tokenizer_df['WordPunct'] = words_punct
tokenizer_df['RegExp'] = words_regexp
tokenizer_df['mwe'] = words_mwe
tokenizer_df['Whitespace'] = words_whitespace

tokenizer_df

Unnamed: 0,Original_words,WordPunct,RegExp,mwe,Whitespace
0,"[Hello, everyone, !]","[Hello, everyone, !]","[Hello, everyone]","[Hello, everyone, !]","[Hello, everyone!]"
1,"[I, am, Sowmya, .]","[I, am, Sowmya, .]","[I, am, Sowmya]","[I, am, Sowmya, .]","[I, am, Sowmya.]"
2,"[I, am, an, AI, 's, enthusiast, .]","[I, am, an, AI, ', s, enthusiast, .]","[I, am, an, AI, s, enthusiast]","[I, am, an, AI, 's, enthusiast, .]","[I, am, an, AI's, enthusiast.]"
3,"[I, am, currently, learning, NLP, .]","[I, am, currently, learning, NLP, .]","[I, am, currently, learning, NLP]","[I, am, currently, learning, NLP, .]","[I, am, currently, learning, NLP.]"
4,"[NLP, stands, for, Natural, Language, Processi...","[NLP, stands, for, Natural, Language, Processi...","[NLP, stands, for, Natural, Language, Processing]","[NLP, stands, for, Natural_Language_Processing...","[NLP, stands, for, Natural, Language, Processi..."
