In [1]:
#The Natural Language Toolkit (NLTK) defines a basic infrastructure that can be used
#to build NLP programs in Python. It provides:
#• Basic classes for representing data relevant to natural language processing.
#• Standard interfaces for performing tasks, such as tokenization, tagging, and parsing.
#• Standard implementations for each task, which can be combined to solve complex
#problems.
#This tutorial introduces NLTK, with an emphasis on tokens and tokenization.


In [17]:
# Avoid `from nltk.token import *`: a wildcard import brings in many other
# names and can cause naming conflicts.

from nltk.tokenize import word_tokenize  # importing just the needed name works as well

In [19]:
# Sample text shared by the following cells. The embedded newlines, the double
# space after "York." and the seven-space indent before "two" are all part of
# the string itself.
s = (
    "Good muffins cost $3.88\nin New York.  Please buy me\n"
    "       two of them.\n\nThanks."
)
word_tokenize(s)

['Good',
 'muffins',
 'cost',
 '$',
 '3.88',
 'in',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 '.',
 'Thanks',
 '.']

In [20]:
# word_tokenize (used in the previous cell) requires the Punkt sentence
# tokenization models to be installed. NLTK also provides a simpler,
# regular-expression based tokenizer, which splits text on whitespace
# and punctuation:

from nltk.tokenize import wordpunct_tokenize
wordpunct_tokenize(s)

['Good',
 'muffins',
 'cost',
 '$',
 '3',
 '.',
 '88',
 'in',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 '.',
 'Thanks',
 '.']

In [21]:
# We can also operate at the level of sentences, using the sentence
# tokenizer directly as follows:
from nltk.tokenize import sent_tokenize, word_tokenize

# Each returned sentence keeps its internal whitespace (including newlines).
sent_tokenize(s)

['Good muffins cost $3.88\nin New York.',
 'Please buy me\n       two of them.',
 'Thanks.']

In [22]:
# Split the text into sentences, then tokenize each sentence into words.
[word_tokenize(sentence) for sentence in sent_tokenize(s)]

[['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
 ['Please', 'buy', 'me', 'two', 'of', 'them', '.'],
 ['Thanks', '.']]

In [29]:
# NLTK tokenizers can produce token-spans, represented as tuples of integers
# having the same semantics as string slices, to support efficient comparison
# of tokenizers. (These methods are implemented as generators.)

from nltk.tokenize import WhitespaceTokenizer

# list() consumes the generator so that all (start, end) spans are displayed;
# each span slices the original string, e.g. s[0:4] is 'Good'.
list(WhitespaceTokenizer().span_tokenize(s))

[(0, 4),
 (5, 12),
 (13, 17),
 (18, 23),
 (24, 26),
 (27, 30),
 (31, 36),
 (38, 44),
 (45, 48),
 (49, 51),
 (59, 62),
 (63, 65),
 (66, 71),
 (73, 80)]

In [27]:
# Tokenization of words

from nltk.tokenize import word_tokenize
text = "God is Great! I won a lottery."
# Punctuation marks ('!' and '.') come out as separate tokens.
print(word_tokenize(text))

['God', 'is', 'Great', '!', 'I', 'won', 'a', 'lottery', '.']


In [30]:
# Tokenization of sentences

from nltk.tokenize import sent_tokenize
text = "God is Great! I won a lottery."
# The text is split after '!' into two sentences.
print(sent_tokenize(text))


['God is Great!', 'I won a lottery.']


In [34]:
# POS (Part-Of-Speech) tagging & chunking with NLTK. A POS tagger reads text in a language and assigns
# a part-of-speech tag to each word.

# Steps:
#   1. Tokenize the text (e.g. with word_tokenize).
#   2. Apply pos_tag to the tokens from step 1, i.e. nltk.pos_tag(tokenized_text).

# Example result: [('Everything', 'NN'), ('to', 'TO'), ('permit', 'VB'), ('us', 'PRP')]
# Penn Treebank tags: NN = noun (singular), TO = "to", VB = verb (base form),
# PRP = personal pronoun (prepositions are tagged IN).

from nltk import pos_tag
from nltk import RegexpParser
# Note: this cell tokenizes with str.split() (whitespace only) rather than word_tokenize.
text ="learn php from IIM and make study easy".split()
print("After Split:",text)


After Split: ['learn', 'php', 'from', 'IIM', 'and', 'make', 'study', 'easy']
