# Introduction To Natural Language Processing with NLTK

## Tokenization

In [1]:
from nltk import sent_tokenize, word_tokenize

### 01. Sentence Tokenization

In [2]:
# Three-sentence sample paragraph used by the sentence-tokenization cells.
# Adjacent string literals are concatenated by Python, so the value is one string.
text = (
    "Success is not final. "
    "Failure is not fatal. "
    "It is the courage to continue that counts."
)
print(text)

Success is not final. Failure is not fatal. It is the courage to continue that counts.


In [3]:
# Split the paragraph into a list of sentence strings with NLTK's sent_tokenize.
sentence_tokens = sent_tokenize(text)
print(sentence_tokens)

['Success is not final.', 'Failure is not fatal.', 'It is the courage to continue that counts.']


In [4]:
# Show each tokenized sentence on its own line.
for sent in sentence_tokens:
    print(sent)

Success is not final.
Failure is not fatal.
It is the courage to continue that counts.


### 02. Word Tokenization

In [5]:
from nltk.tokenize import word_tokenize

In [6]:
# Sample sentence containing apostrophes and a trailing "!"; the word-tokenizer
# cells that follow all tokenize this same string so their outputs can be compared.
sentence = "Let's see how the tokenizer split's this!"

In [7]:
# Tokenize with NLTK's word_tokenize. For this sentence the printed result shows
# the "'s" suffixes and the final "!" split off as separate tokens.
word_tokens = word_tokenize(sentence)
print(word_tokens)

['Let', "'s", 'see', 'how', 'the', 'tokenizer', 'split', "'s", 'this', '!']


In [8]:
from nltk.tokenize import TreebankWordTokenizer, WordPunctTokenizer, WhitespaceTokenizer
# Instantiate one tokenizer of each kind so the following cells can compare
# how they split the same sentence.
tree_tokenizer, word_punct_tokenizer, white_space_tokenizer = (
    TreebankWordTokenizer(),
    WordPunctTokenizer(),
    WhitespaceTokenizer(),
)

In [9]:
# Tokenize the sample sentence with TreebankWordTokenizer. Note that this
# overwrites the notebook-level `word_tokens` produced by the previous cell.
word_tokens = tree_tokenizer.tokenize(sentence)
print(word_tokens)

['Let', "'s", 'see', 'how', 'the', 'tokenizer', 'split', "'s", 'this', '!']


In [10]:
# Tokenize the sample sentence with WordPunctTokenizer. In the printed result
# each apostrophe is its own token, separate from the following "s".
# This again overwrites the notebook-level `word_tokens`.
word_tokens = word_punct_tokenizer.tokenize(sentence)
print(word_tokens)

['Let', "'", 's', 'see', 'how', 'the', 'tokenizer', 'split', "'", 's', 'this', '!']


In [11]:
# Tokenize the sample sentence with WhitespaceTokenizer. In the printed result
# punctuation stays attached to its word ("Let's", "this!").
# NOTE: this is the last assignment to `word_tokens` in the notebook, so the
# stop-word removal cell further down operates on these whitespace tokens.
word_tokens = white_space_tokenizer.tokenize(sentence)
print(word_tokens)

["Let's", 'see', 'how', 'the', 'tokenizer', "split's", 'this!']


## Stemming and Lemmatization

### 01. Stemming

In [12]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

In [13]:
# Stem three inflected forms of the same verb with the Porter stemmer.
porter_stemmer = PorterStemmer()
for word in ('lying', 'lies', 'lied'):
    print(porter_stemmer.stem(word))

lie
lie
lie


In [14]:
# Stem the same three inflected forms with the Lancaster stemmer for comparison.
lancaster_stemmer = LancasterStemmer()
for word in ('lying', 'lies', 'lied'):
    print(lancaster_stemmer.stem(word))

lying
lie
lied


In [15]:
# Stem the same three inflected forms with the English Snowball stemmer.
snowball_stemmer = SnowballStemmer('english')
for word in ('lying', 'lies', 'lied'):
    print(snowball_stemmer.stem(word))

lie
lie
lie


### 02. Lemmatization

In [16]:
from nltk.stem import WordNetLemmatizer

In [17]:
# Lemmatize the same word under each part-of-speech code in turn
# ("v" verb, "n" noun, "r" adverb, "a" adjective) to show that the result
# depends on the `pos` argument.
lemmatizer = WordNetLemmatizer()
for wordnet_pos in ("v", "n", "r", "a"):
    print(lemmatizer.lemmatize('cars', pos=wordnet_pos))

cars
car
cars
cars


### 03. Stemming vs Lemmatization 

In [18]:
# Run the same words through a stemmer and then a lemmatizer so the two
# normalisation strategies can be compared side by side in the output.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
sample_words = ['stones', 'speaking', 'bedroom', 'jokes', 'lisa', 'purple']

# Stemmed forms first ...
for word in sample_words:
    print(stemmer.stem(word))
print('----------------------')
# ... then the lemmatized forms (lemmatize() is called without a `pos` argument).
for word in sample_words:
    print(lemmatizer.lemmatize(word))

stone
speak
bedroom
joke
lisa
purpl
----------------------
stone
speaking
bedroom
joke
lisa
purple


## POS Tagging

In [19]:
from nltk import pos_tag

In [20]:
# Tokenize the sentence, then attach a part-of-speech tag to every token.
pos_input_tokens = word_tokenize("I'm learning NLP")
tagged_tokens = pos_tag(pos_input_tokens)
print(tagged_tokens)

[('I', 'PRP'), ("'m", 'VBP'), ('learning', 'VBG'), ('NLP', 'NNP')]


## Stop Word Removal

In [21]:
from nltk.corpus import stopwords
# Display NLTK's English stop-word list.
stop_word_list = stopwords.words('english')
print(stop_word_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [22]:
# Remove English stop words from `word_tokens`, preserving token order.
#
# `word_tokens` holds whatever the most recently executed tokenizer cell
# assigned to it; the stop-word list is built once here instead of calling
# stopwords.words('english') again for every token, and a set makes each
# membership test O(1) instead of a scan over the whole list.
english_stopwords = set(stopwords.words('english'))

# Matching is exact and case-sensitive, so tokens such as "Let's" or "this!"
# are kept unless that precise string is in the stop-word list.
clean_tokens = [token for token in word_tokens if token not in english_stopwords]

In [23]:
# Show the tokens that remain after stop-word filtering.
print(clean_tokens)

["Let's", 'see', 'tokenizer', "split's", 'this!']


## Named Entity Recognition

In [24]:
from nltk import word_tokenize, pos_tag, ne_chunk

In [25]:
# Named-entity chunking pipeline: tokenize -> POS-tag -> ne_chunk.
sentence = "Mark and John are working at Google."
tagged_sentence = pos_tag(word_tokenize(sentence))
print(ne_chunk(tagged_sentence))

(S
  (PERSON Mark/NNP)
  and/CC
  (PERSON John/NNP)
  are/VBP
  working/VBG
  at/IN
  (ORGANIZATION Google/NNP)
  ./.)


## Interface to WordNet

In [26]:
from nltk.corpus import wordnet
# Look up "life" in WordNet and show the definition and usage examples of
# the first synset returned.
syn = wordnet.synsets("life")
first_sense = syn[0]
print(first_sense.definition())
print(first_sense.examples())

a characteristic state or mode of living
['social life', 'city life', 'real life']


In [27]:
# Print the definition of the first synset for each query word.
# `syn` is reassigned on every pass, so it ends up holding the synsets of
# the last word in the tuple.
for query in ("old", "Python"):
    syn = wordnet.synsets(query)
    print(syn[0].definition())

past times (especially in the phrase `in days of old')
large Old World boas


In [28]:
# Collect the lemma names of every synset of "small", in synset order.
# Duplicates are kept on purpose: the same lemma name can appear in
# several synsets.
synonyms = [
    lemma.name()
    for synset in wordnet.synsets('small')
    for lemma in synset.lemmas()
]
print(synonyms)

['small', 'small', 'small', 'little', 'minor', 'modest', 'small', 'small-scale', 'pocket-size', 'pocket-sized', 'little', 'small', 'small', 'humble', 'low', 'lowly', 'modest', 'small', 'little', 'minuscule', 'small', 'little', 'small', 'small', 'modest', 'small', 'belittled', 'diminished', 'small', 'small']


In [29]:
# Collect one antonym name for every lemma of "big" that has antonyms.
# Only the first antonym of each lemma is recorded, and duplicates are kept.
antonyms = []
for synset in wordnet.synsets("big"):
    for lemma in synset.lemmas():
        # Look the antonyms up once per lemma and reuse the result.
        opposites = lemma.antonyms()
        if opposites:
            antonyms.append(opposites[0].name())
print(antonyms)

['small', 'little', 'small']
