# Introduction To Natural Language Processing with NLTK

## Tokenization

In [1]:
from nltk import sent_tokenize, word_tokenize

### 01. Sentence Tokenization

In [2]:
text = "Success is not final. Failure is not fatal. It is the courage to continue that counts."
print(text)

Success is not final. Failure is not fatal. It is the courage to continue that counts.


In [3]:
sentence_tokens = sent_tokenize(text)
print(sentence_tokens)

['Success is not final.', 'Failure is not fatal.', 'It is the courage to continue that counts.']


In [4]:
for sentence in sentence_tokens:
    print(sentence)

Success is not final.
Failure is not fatal.
It is the courage to continue that counts.


### 02. Word Tokenization

In [5]:
from nltk.tokenize import word_tokenize

In [6]:
sentence = "Let's see how the tokenizer split's this!"

In [7]:
# Default NLTK word tokenizer. For this sentence the contraction "Let's" comes
# out as two tokens ('Let', "'s") and the trailing '!' becomes its own token.
word_tokens = word_tokenize(sentence)
print(word_tokens)

['Let', "'s", 'see', 'how', 'the', 'tokenizer', 'split', "'s", 'this', '!']


In [8]:
from nltk.tokenize import TreebankWordTokenizer, WordPunctTokenizer, WhitespaceTokenizer
# One instance of each tokenizer so the same sentence can be run through all
# three and the token lists compared.
# Treebank: for this sentence it yields the same tokens as word_tokenize.
tree_tokenizer = TreebankWordTokenizer()
# WordPunct: emits each apostrophe as a separate token ("'", 's').
word_punct_tokenizer = WordPunctTokenizer()
# Whitespace: punctuation stays attached to the word ("Let's", 'this!').
white_space_tokenizer = WhitespaceTokenizer()

In [9]:
word_tokens = tree_tokenizer.tokenize(sentence)
print(word_tokens)

['Let', "'s", 'see', 'how', 'the', 'tokenizer', 'split', "'s", 'this', '!']


In [10]:
word_tokens = word_punct_tokenizer.tokenize(sentence)
print(word_tokens)

['Let', "'", 's', 'see', 'how', 'the', 'tokenizer', 'split', "'", 's', 'this', '!']


In [11]:
word_tokens = white_space_tokenizer.tokenize(sentence)
print(word_tokens)

["Let's", 'see', 'how', 'the', 'tokenizer', "split's", 'this!']


## Stemming and Lemmatization

### 01. Stemming

In [12]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

In [13]:
# Porter stemmer applied to three inflections of "lie".
porter_stemmer = PorterStemmer()
for inflected in ('lying', 'lies', 'lied'):
    print(porter_stemmer.stem(inflected))

lie
lie
lie


In [14]:
# Lancaster stemmer applied to the same three inflections of "lie".
lancaster_stemmer = LancasterStemmer()
for inflected in ('lying', 'lies', 'lied'):
    print(lancaster_stemmer.stem(inflected))

lying
lie
lied


In [15]:
# Snowball (English) stemmer applied to the same three inflections of "lie".
snowball_stemmer = SnowballStemmer('english')
for inflected in ('lying', 'lies', 'lied'):
    print(snowball_stemmer.stem(inflected))

lie
lie
lie


### 02. Lemmatization

In [16]:
from nltk.stem import WordNetLemmatizer

In [17]:
# No part of speech is passed here, so the lemmatizer uses its default POS
# (noun, per the NLTK documentation) and "running" comes back unchanged.
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("running"))

running


In [18]:
def lemmatize(word):
    """Print the WordNet lemma of ``word`` under each part-of-speech tag.

    The word is lemmatized four times -- as a verb ("v"), a noun ("n"),
    an adverb ("r") and an adjective ("a") -- and every result is printed
    on its own labelled line, in that order.

    Args:
        word: The word to lemmatize.

    Returns:
        None. The results are only printed.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    # (printed prefix, WordNet POS tag) pairs, in display order.
    labelled_tags = (
        ("verb form: ", "v"),
        ("noun form: ", "n"),
        ("adverb form: ", "r"),
        ("adjective form: ", "a"),
    )
    for prefix, tag in labelled_tags:
        print(prefix + wordnet_lemmatizer.lemmatize(word, pos=tag))

In [19]:
lemmatize("ears")

verb form: ears
noun form: ear
adverb form: ears
adjective form: ears


In [20]:
lemmatize("running")

verb form: run
noun form: running
adverb form: running
adjective form: running


### 03. Stemming vs Lemmatization 

In [21]:
# Stemmer/lemmatizer pair used by the side-by-side comparison cells that follow.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [22]:
# Stem three inflections of "deactivate"; all of them collapse to one stem.
for inflected in ("deactivating", "deactivated", "deactivates"):
    print(stemmer.stem(inflected))

deactiv
deactiv
deactiv


In [23]:
# Lemmatize the same three inflections that the stemmer cell uses, tagged as
# verbs, so the two approaches can be compared word for word.
print(lemmatizer.lemmatize("deactivating", pos="v"))
print(lemmatizer.lemmatize("deactivated", pos="v"))
print(lemmatizer.lemmatize("deactivates", pos="v"))

deactivate
deactivate
deactivate


In [24]:
# Porter stems for a mixed bag of words. A stem does not have to be a real
# word (e.g. 'purple' -> 'purpl').
for sample in ('stones', 'speaking', 'bedroom', 'jokes', 'lisa', 'purple'):
    print(stemmer.stem(sample))

stone
speak
bedroom
joke
lisa
purpl


In [25]:
# The same words through the WordNet lemmatizer with no POS given: plurals are
# reduced ('stones' -> 'stone') while 'speaking' and 'purple' are left as-is.
for sample in ('stones', 'speaking', 'bedroom', 'jokes', 'lisa', 'purple'):
    print(lemmatizer.lemmatize(sample))

stone
speaking
bedroom
joke
lisa
purple


## POS Tagging

In [26]:
from nltk import pos_tag

In [27]:
sentence = "The hardest choices require the strongest wills"

In [28]:
sentence_tokens = word_tokenize(sentence)
print(sentence_tokens)

['The', 'hardest', 'choices', 'require', 'the', 'strongest', 'wills']


In [29]:
pos_tag(sentence_tokens)

[('The', 'DT'),
 ('hardest', 'JJS'),
 ('choices', 'NNS'),
 ('require', 'VBP'),
 ('the', 'DT'),
 ('strongest', 'JJS'),
 ('wills', 'NNS')]

In [30]:
# Import under an alias: a bare `from nltk import help` rebinds the name `help`
# and hides Python's built-in help() for the rest of the session.
from nltk import help as nltk_help

# Print the Penn Treebank description of the possessive-pronoun tag.
nltk_help.upenn_tagset("PRP$")

PRP$: pronoun, possessive
    her his mine my our ours their thy your


## Stop Word Removal

In [31]:
from nltk.corpus import stopwords

In [32]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [33]:
sentence = "Success is not final. Failure is not fatal. It is the courage to continue that counts."

In [34]:
word_tokens = word_tokenize(sentence)
print(word_tokens)

['Success', 'is', 'not', 'final', '.', 'Failure', 'is', 'not', 'fatal', '.', 'It', 'is', 'the', 'courage', 'to', 'continue', 'that', 'counts', '.']


In [35]:
# Build the stop-word set once, so each membership test is a set lookup instead
# of a fresh stopwords.words() call (and a list scan) for every token.
english_stopwords = set(stopwords.words('english'))

# Keep the tokens that are not stop words, preserving their order. Matching is
# case-sensitive against the lowercase stop-word list, so a capitalised token
# such as 'It' is kept.
clean_tokens = [token for token in word_tokens if token not in english_stopwords]

In [36]:
print(clean_tokens)

['Success', 'final', '.', 'Failure', 'fatal', '.', 'It', 'courage', 'continue', 'counts', '.']


## Named Entity Recognition

In [37]:
from nltk import word_tokenize, pos_tag, ne_chunk

In [38]:
sentence = "Mark and John will meet at Google HQ on 25-10-2019 at 9.30PM"

# NER pipeline, one stage per line: tokenize -> POS-tag -> chunk named entities.
ner_tokens = word_tokenize(sentence)
ner_tagged = pos_tag(ner_tokens)
ner_tree = ne_chunk(ner_tagged)
print(ner_tree)

(S
  (PERSON Mark/NNP)
  and/CC
  (PERSON John/NNP)
  will/MD
  meet/VB
  at/IN
  (ORGANIZATION Google/NNP)
  HQ/NNP
  on/IN
  25-10-2019/JJ
  at/IN
  9.30PM/CD)


## Interface to WordNet

In [39]:
from nltk.corpus import wordnet
# synsets() returns the WordNet senses for the word; index 0 picks the first
# one, whose definition and usage examples are printed.
syn = wordnet.synsets("life")
print(syn[0].definition())
print(syn[0].examples())

a characteristic state or mode of living
['social life', 'city life', 'real life']


In [40]:
# First-sense definition of "old".
syn = wordnet.synsets("old")
print(syn[0].definition())

# First-sense definition of "Python" -- WordNet lists the snake, not the language.
syn = wordnet.synsets("Python")
print(syn[0].definition())

past times (especially in the phrase `in days of old')
large Old World boas


In [41]:
# Gather every lemma name from every synset of 'small'. Duplicates are kept:
# the same name shows up under several synsets.
synonyms = [
    lemma.name()
    for synset in wordnet.synsets('small')
    for lemma in synset.lemmas()
]
print(synonyms)

['small', 'small', 'small', 'little', 'minor', 'modest', 'small', 'small-scale', 'pocket-size', 'pocket-sized', 'little', 'small', 'small', 'humble', 'low', 'lowly', 'modest', 'small', 'little', 'minuscule', 'small', 'little', 'small', 'small', 'modest', 'small', 'belittled', 'diminished', 'small', 'small']


In [42]:
# For every lemma of every synset of "big", record the name of its first
# antonym when it has one.
antonyms = []
for synset in wordnet.synsets("big"):
    for lemma in synset.lemmas():
        # Look the antonyms up once and reuse the result for the check and the append.
        opposites = lemma.antonyms()
        if opposites:
            antonyms.append(opposites[0].name())
print(antonyms)

['small', 'little', 'small']
