In [1]:
import nltk
#nltk.download()
from nltk import sent_tokenize

In [2]:
###################### Sentence Tokenization ####################################

# Sample paragraph of three sentences; input for sent_tokenize in the next cell.
text = "Success is not final. Failure is not fatal. It is the courage to continue that counts."



In [3]:
# Split the paragraph into sentences; each list element is one sentence,
# as the printed output below shows.
sentence_tokens = sent_tokenize(text)
print(sentence_tokens)

['Success is not final.', 'Failure is not fatal.', 'It is the courage to continue that counts.']


In [4]:
###################### Word Tokenization #########################################

In [5]:
from nltk.tokenize import word_tokenize


In [6]:
# Test sentence containing apostrophes and a trailing '!'.
# NOTE(review): "split's" is presumably deliberate, to exercise apostrophe handling - confirm.
sentence = "Let's see how the tokenizer split's this!"



In [7]:
# Tokenize into words. Per the output below, "Let's" is split into 'Let' and
# "'s", and the final '!' becomes a token of its own.
word_tokens = word_tokenize(sentence)
print(word_tokens)



['Let', "'s", 'see', 'how', 'the', 'tokenizer', 'split', "'s", 'this', '!']


In [8]:
############### Other Tokenizers ##################################################


In [9]:
from nltk.tokenize import TreebankWordTokenizer, WordPunctTokenizer, WhitespaceTokenizer
# One instance of each alternative tokenizer, reused by the cells that follow.
tree_tokenizer = TreebankWordTokenizer()
word_punct_tokenizer = WordPunctTokenizer()
# NOTE(review): white_space_tokenizer is created here but never used in the visible cells.
white_space_tokenizer = WhitespaceTokenizer()

In [10]:
#### Treebank Tokenizer #####
# Same sentence as before; the printed tokens match the word_tokenize output above.
word_tokens = tree_tokenizer.tokenize(sentence)
print(word_tokens)

['Let', "'s", 'see', 'how', 'the', 'tokenizer', 'split', "'s", 'this', '!']


In [11]:
#### Punctuation Tokenizer #####
# Here the apostrophe is emitted as a token of its own ("'", 's' instead of
# "'s"), as the output below shows.
word_tokens = word_punct_tokenizer.tokenize(sentence)
print(word_tokens)

['Let', "'", 's', 'see', 'how', 'the', 'tokenizer', 'split', "'", 's', 'this', '!']


In [12]:
################################Stemming ####################################


In [13]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer


In [14]:
#####################  Porter Stemmer ########################################
porter_stemmer = PorterStemmer()
print(porter_stemmer.stem('lying'))
print(porter_stemmer.stem('lies'))
print(porter_stemmer.stem('lied'))

lie
lie
lie


In [15]:
################### Lancaster Stemmer #########################################
# Stem the same three inflections of "lie" with the Lancaster algorithm,
# printing one stem per line.
lancaster_stemmer = LancasterStemmer()
print("\n".join(lancaster_stemmer.stem(form) for form in ("lying", "lies", "lied")))



lying
lie
lied


In [16]:
################### Snowball Stemmer ##########################################
# Stem the same three inflections of "lie" with the English Snowball stemmer,
# printing one stem per line.
snowball_stemmer = SnowballStemmer('english')
print("\n".join(snowball_stemmer.stem(form) for form in ("lying", "lies", "lied")))

lie
lie
lie


In [17]:
#############################  Lemmatization  ############################
#########   WordNet Lemmatizer ###################

from nltk.stem import WordNetLemmatizer

In [18]:
# Called without a pos argument, lemmatize() treats the word as a noun
# (NLTK's documented default), which is why "running" comes back unchanged.
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("running"))

running


In [19]:
###### Lemmatization in different forms  ###################

def lemmatize(word):
    """Print the WordNet lemma of ``word`` under four part-of-speech tags.

    The word is lemmatized as a verb ("v"), noun ("n"), adverb ("r") and
    adjective ("a"), in that order, and each result is printed on its own
    labelled line. A new WordNetLemmatizer is created on every call.

    Args:
        word: The word to lemmatize.

    Returns:
        None. The results are only printed.
    """
    wordnet = WordNetLemmatizer()
    # (line prefix, POS tag) pairs, in the order they are printed.
    prefixed_tags = (
        ("verb form: ", "v"),
        ("noun form: ", "n"),
        ("adverb form: ", "r"),
        ("adjective form: ", "a"),
    )
    for prefix, tag in prefixed_tags:
        print(prefix + wordnet.lemmatize(word, pos=tag))



In [20]:
# Per the output below, only the noun reading changes the word ("ears" -> "ear").
lemmatize("ears")

verb form: ears
noun form: ear
adverb form: ears
adjective form: ears


In [21]:
# Per the output below, only the verb reading changes the word ("running" -> "run").
lemmatize("running")

verb form: run
noun form: running
adverb form: running
adjective form: running


In [22]:
#################   Stemming and Lemmatization Comparison #######################

In [23]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [24]:
# Stemmer and lemmatizer instances used by the comparison cells below.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [25]:


# Porter-stem three inflections of "deactivate", printing one stem per line.
print("\n".join(
    stemmer.stem(form)
    for form in ("deactivating", "deactivated", "deactivates")
))



deactiv
deactiv
deactiv


In [26]:
# Lemmatize (as verbs) the same three inflections that the stemming cell
# above uses, so the two techniques are compared word for word.
print(lemmatizer.lemmatize("deactivating", pos="v"))
print(lemmatizer.lemmatize("deactivated", pos="v"))
print(lemmatizer.lemmatize("deactivates", pos="v"))

deactivate
deactivate
deactivate


In [27]:
# Stem a handful of unrelated words, one result per line. A stem need not be
# a real word (see 'purpl' in the output below).
print("\n".join(
    stemmer.stem(token)
    for token in ("stones", "speaking", "bedroom", "jokes", "lisa", "purple")
))

stone
speak
bedroom
joke
lisa
purpl


In [28]:
# Lemmatize the same words that were stemmed above (no pos argument), one
# result per line, for comparison with the stemmer's output.
print("\n".join(
    lemmatizer.lemmatize(token)
    for token in ("stones", "speaking", "bedroom", "jokes", "lisa", "purple")
))


stone
speaking
bedroom
joke
lisa
purple


In [29]:
#################### POS Tagging ########################


In [30]:
from nltk import word_tokenize, pos_tag


In [31]:
# Input sentence for the POS-tagging example; rebinds the `sentence` used earlier.
sentence = "The hardest choices require the strongest wills"


In [32]:
# pos_tag works on a list of tokens, so split the sentence into words first.
# Note: this rebinds `sentence_tokens`, which earlier held sentence strings.
sentence_tokens = word_tokenize(sentence)
print(sentence_tokens)

['The', 'hardest', 'choices', 'require', 'the', 'strongest', 'wills']


In [33]:
# Bare expression so the notebook displays the resulting (token, tag) pairs.
pos_tag(sentence_tokens)

[('The', 'DT'),
 ('hardest', 'JJS'),
 ('choices', 'NNS'),
 ('require', 'VBP'),
 ('the', 'DT'),
 ('strongest', 'JJS'),
 ('wills', 'NNS')]

In [34]:
#################### Chunking / Shallow parsing ######################

In [35]:
from nltk import pos_tag, word_tokenize, RegexpParser

In [36]:
# Input sentence for the chunking example.
# NOTE(review): "visious" is presumably a misspelling of "vicious". It is left
# unchanged so the recorded outputs below still match; fix and re-run together.
sentence = "the big visious dog barked at the small feeble cat"
# Alternative input, currently disabled:
# sentence = "the little yellow hard tight dog barked at the cat"

In [37]:
#Define your grammar using regular expressions
# One rule, NP: an optional DT-tagged token, then any number of JJ-tagged
# tokens, then one NN-tagged token.
grammar = ('''NP: {<DT>?<JJ>*<NN>} # NP''')

In [38]:
# Build the chunk parser from the grammar, then POS-tag the sentence so the
# parser has (token, tag) pairs to match against.
chunkParser = RegexpParser(grammar)
tagged = pos_tag(word_tokenize(sentence))
# Bare expression so the notebook displays the tagged tokens.
tagged

[('the', 'DT'),
 ('big', 'JJ'),
 ('visious', 'JJ'),
 ('dog', 'NN'),
 ('barked', 'VBD'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('small', 'JJ'),
 ('feeble', 'JJ'),
 ('cat', 'NN')]

In [39]:
# Apply the NP grammar to the tagged tokens; the result is a tree of chunks.
tree = chunkParser.parse(tagged)

In [40]:
# Print every subtree. Per the output below, the root S comes first, followed
# by each NP chunk it contains.
for subtree in tree.subtrees():
    print(subtree)

(S
  (NP the/DT big/JJ visious/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT small/JJ feeble/JJ cat/NN))
(NP the/DT big/JJ visious/JJ dog/NN)
(NP the/DT small/JJ feeble/JJ cat/NN)


In [41]:
# NOTE(review): presumably opens a separate GUI window to render the tree;
# confirm this works in headless or remote kernels before relying on it.
tree.draw()

In [42]:
#######################  STOP WORDS REMOVAL  ##########################


In [43]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [44]:
# List NLTK's English stop words.
# NOTE(review): the section title mentions removal, but the visible cells only
# print the list; word_tokenize is imported above and not used in this section.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [45]:
###################   Named Entity Recognition ##########################

In [46]:
from nltk import word_tokenize, pos_tag, ne_chunk

In [47]:
# First NER example. Per the output below, it yields PERSON, ORGANIZATION and
# GPE chunks.
sentence = "Mark who works at Yahoo and John who works at Google decided to meet at New York City"

In [48]:
# Pipeline: tokenize -> POS-tag -> named-entity chunk, then print the tree.
print(
    ne_chunk(
        pos_tag(
            word_tokenize(sentence)
        )
    )
)

(S
  (PERSON Mark/NNP)
  who/WP
  works/VBZ
  at/IN
  (ORGANIZATION Yahoo/NNP)
  and/CC
  (PERSON John/NNP)
  who/WP
  works/VBZ
  at/IN
  (ORGANIZATION Google/NNP)
  decided/VBD
  to/TO
  meet/VB
  at/IN
  (GPE New/NNP York/NNP City/NNP))


In [49]:
# Second NER example. The trailing backslash continues the string literal
# onto the next line, so the value is a single line of text.
sentence = "The Avengers began as a group of extraordinary individuals who were assembled to defeat \
Loki and his chitauri army in New York City. "

In [50]:
# Pipeline: tokenize -> POS-tag -> named-entity chunk, then print the tree.
print(
    ne_chunk(
        pos_tag(
            word_tokenize(sentence)
        )
    )
)

(S
  The/DT
  (ORGANIZATION Avengers/NNP)
  began/VBD
  as/IN
  a/DT
  group/NN
  of/IN
  extraordinary/JJ
  individuals/NNS
  who/WP
  were/VBD
  assembled/VBN
  to/TO
  defeat/VB
  (PERSON Loki/NNP)
  and/CC
  his/PRP$
  chitauri/NN
  army/NN
  in/IN
  (GPE New/NNP York/NNP City/NNP)
  ./.)
