In [1]:
#WordPunctTokenizer
from nltk.tokenize import WordPunctTokenizer
sentence_data = "The constitution declares India a sovereign, socialist, secular, democratic republic, assuring its citizens justice, equality and liberty, and endeavours to promote fraternity. The original 1950 constitution is preserved in a helium-filled case at the Parliament House in New Delhi."

tokens=WordPunctTokenizer()
output=tokens.tokenize(sentence_data)
print(output)
print(len(output))

['The', 'constitution', 'declares', 'India', 'a', 'sovereign', ',', 'socialist', ',', 'secular', ',', 'democratic', 'republic', ',', 'assuring', 'its', 'citizens', 'justice', ',', 'equality', 'and', 'liberty', ',', 'and', 'endeavours', 'to', 'promote', 'fraternity', '.', 'The', 'original', '1950', 'constitution', 'is', 'preserved', 'in', 'a', 'helium', '-', 'filled', 'case', 'at', 'the', 'Parliament', 'House', 'in', 'New', 'Delhi', '.']
49


In [2]:
# Whitespace Tokenizer
from nltk.tokenize import WhitespaceTokenizer
sentence_data = "The constitution declares India a sovereign, socialist, secular, democratic republic, assuring its citizens justice, equality and liberty, and endeavours to promote fraternity. The original 1950 constitution is preserved in a helium-filled case at the Parliament House in New Delhi."

tokens=WhitespaceTokenizer()
output=tokens.tokenize(sentence_data)
print(output)
print(len(output))

['The', 'constitution', 'declares', 'India', 'a', 'sovereign,', 'socialist,', 'secular,', 'democratic', 'republic,', 'assuring', 'its', 'citizens', 'justice,', 'equality', 'and', 'liberty,', 'and', 'endeavours', 'to', 'promote', 'fraternity.', 'The', 'original', '1950', 'constitution', 'is', 'preserved', 'in', 'a', 'helium-filled', 'case', 'at', 'the', 'Parliament', 'House', 'in', 'New', 'Delhi.']
39


In [3]:
#TreebankWordTokenizer
from nltk.tokenize import TreebankWordTokenizer
sentence_data = "The constitution declares India a sovereign, socialist, secular, democratic republic, assuring its citizens justice, equality and liberty, and endeavours to promote fraternity. The original 1950 constitution is preserved in a helium-filled case at the Parliament House in New Delhi."

tokens=TreebankWordTokenizer()
output=tokens.tokenize(sentence_data)
print(output)
print(len(output))

['The', 'constitution', 'declares', 'India', 'a', 'sovereign', ',', 'socialist', ',', 'secular', ',', 'democratic', 'republic', ',', 'assuring', 'its', 'citizens', 'justice', ',', 'equality', 'and', 'liberty', ',', 'and', 'endeavours', 'to', 'promote', 'fraternity.', 'The', 'original', '1950', 'constitution', 'is', 'preserved', 'in', 'a', 'helium-filled', 'case', 'at', 'the', 'Parliament', 'House', 'in', 'New', 'Delhi', '.']
46


In [9]:
# Stemming
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
paragraph ="""A constitution is an aggregate of fundamental principles or established precedents that constitute the legal basis of a polity, organisation or other type of entity and commonly determine how that entity is to be governed."""

sentences = nltk.sent_tokenize(paragraph)
print(len(sentences))

for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
    sentences[i] = ' '.join(words) 

print(sentences) ;

1
[u'A constitut aggreg fundament principl establish preced constitut legal basi politi , organis type entiti commonli determin entiti govern .']


In [10]:
# Lemmitizing
from nltk.stem import WordNetLemmatizer
paragraph ="""A constitution is an aggregate of fundamental principles or established precedents that constitute the legal basis of a polity, organisation or other type of entity and commonly determine how that entity is to be governed."""

lemmatizer = WordNetLemmatizer()
for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    sentences[i] = ' '.join(words) 

print(sentences) ;

[u'A constitut aggreg fundament principl establish preced constitut legal basi politi , organis type entiti commonli determin entiti govern .']


In [27]:
# DIFFERENT TYPES OF TOKENIZERS

#1 Tweet Tokenizer
from nltk.tokenize import TweetTokenizer
sentence_data = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
tokens=TweetTokenizer()
output=tokens.tokenize(sentence_data)
print(output)
print(len(output))

#2 Multi-Word Expression Tokenizer
from nltk.tokenize import MWETokenizer
tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')])
tokenizer.add_mwe(('in', 'spite', 'of'))
s1 = tokenizer.tokenize('This is a test in spite'.split())
print(s1)
s2 = tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split())
print(s2)

#3 SExprTokenizer
from nltk.tokenize import SExprTokenizer 
tk = SExprTokenizer() 
s3 = "( a * ( b + c ))ab( a-c )"
s = tk.tokenize(s3) 
print(s) 

#4 TabTokenizer() method from nltk 
from nltk.tokenize import TabTokenizer 
tk = TabTokenizer() 
s5 = "Testing\tPython\t.$$&* \nis\t Testing2"
s6 = tk.tokenize(s5) 
print(s6) 

# ConditionalFreqDist()
from nltk.probability import ConditionalFreqDist 
from nltk.tokenize import word_tokenize 
tk = ConditionalFreqDist() 
s4 = "Testing Python Testing"
     
for word in word_tokenize(s4): 
   condition = len(word) 
   tk[condition][word] += 1
     
print(tk) 

[u'This', u'is', u'a', u'cooool', u'#dummysmiley', u':', u':-)', u':-P', u'<3', u'and', u'some', u'arrows', u'<', u'>', u'->', u'<--']
16
['This', 'is', 'a', 'test', 'in', 'spite']
['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of']
['( a * ( b + c ))', 'ab', '( a-c )']
[u'Testing', u'Python', u'.$$&* \nis', u' Testing2']
<ConditionalFreqDist with 2 conditions>
