# Introduction to Natural Language Processing with NLTK

## Tokenization

In [1]:
from nltk import sent_tokenize, word_tokenize

### 01. Sentence Tokenization

In [2]:
# Sample paragraph that the next cell passes to sent_tokenize().
text = """Simply, Natural Language Processing (NLP) helps computers (machines) to "read and understand" text or speech, by simulating the human's ability to understand languages. NLP is a sub-field of Artificial Intelligence, which also comprises of computation linguistics, computer science and statistics. Even though NLP is not comparatively as popular as Machine Learning, Deep Learning etc. it is as important and useful as them."""
print(text)

Simply, Natural Language Processing (NLP) helps computers (machines) to "read and understand" text or speech, by simulating the human's ability to understand languages. NLP is a sub-field of Artificial Intelligence, which also comprises of computation linguistics, computer science and statistics. Even though NLP is not comparatively as popular as Machine Learning, Deep Learning etc. it is as important and useful as them.


In [3]:
# Split the paragraph into a list of sentence strings.
# NOTE: in the recorded output the final sentence comes back as two pieces,
# because the period of the abbreviation "etc." is treated as a sentence end.
sentence_tokens = sent_tokenize(text)
print(sentence_tokens)

['Simply, Natural Language Processing (NLP) helps computers (machines) to "read and understand" text or speech, by simulating the human\'s ability to understand languages.', 'NLP is a sub-field of Artificial Intelligence, which also comprises of computation linguistics, computer science and statistics.', 'Even though NLP is not comparatively as popular as Machine Learning, Deep Learning etc.', 'it is as important and useful as them.']


In [4]:
# Print one sentence per line, which is easier to read than the raw list repr.
for sentence in sentence_tokens:
    print(sentence)

Simply, Natural Language Processing (NLP) helps computers (machines) to "read and understand" text or speech, by simulating the human's ability to understand languages.
NLP is a sub-field of Artificial Intelligence, which also comprises of computation linguistics, computer science and statistics.
Even though NLP is not comparatively as popular as Machine Learning, Deep Learning etc.
it is as important and useful as them.


### 02. Word Tokenization

In [5]:
# A single sentence to be split into word tokens in the next cell.
sentence = """NLP is a sub-field of Artificial Intelligence, which also comprises of computation linguistics, computer science and statistics."""
print(sentence)

NLP is a sub-field of Artificial Intelligence, which also comprises of computation linguistics, computer science and statistics.


In [6]:
# Split the sentence into word tokens. In the recorded output the punctuation
# marks (',' and '.') are separate tokens and the hyphenated 'sub-field' stays whole.
# `word_tokens` is reused later by the stop-word removal cell.
word_tokens = word_tokenize(sentence)
print(word_tokens)

['NLP', 'is', 'a', 'sub-field', 'of', 'Artificial', 'Intelligence', ',', 'which', 'also', 'comprises', 'of', 'computation', 'linguistics', ',', 'computer', 'science', 'and', 'statistics', '.']


## Stemming and Lemmatization

### 01. Stemming

In [7]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

In [8]:
# Stem an irregular past-tense form; the recorded output shows that the
# Porter stemmer returns 'thought' unchanged.
porter = PorterStemmer()
print(porter.stem('thought'))

thought


### 02. Lemmatization

In [9]:
from nltk.stem import WordNetLemmatizer

In [10]:
# Lemmatize the same word with pos="v" (verb); the recorded output is the
# base form 'think'.
lemmatizer = WordNetLemmatizer()
thought_lemma = lemmatizer.lemmatize('thought', pos="v")
print(thought_lemma)

think


### 03. Stemming vs Lemmatization 

In [11]:
# Run one shared word list through both normalizers so the two columns of
# output can be compared line by line.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
sample_words = ['stones', 'speaking', 'bedroom', 'jokes', 'lisa', 'purple']

# Stemmer results first (the recorded output shows 'purple' clipped to 'purpl').
for word in sample_words:
    print(stemmer.stem(word))

print('----------------------')

# lemmatize() is called without a pos argument here; in the recorded output
# 'speaking' is returned unchanged while the plurals lose their 's'.
for word in sample_words:
    print(lemmatizer.lemmatize(word))

stone
speak
bedroom
joke
lisa
purpl
----------------------
stone
speaking
bedroom
joke
lisa
purple


## POS Tagging

In [12]:
from nltk import pos_tag

In [13]:
# Tokenize first, then attach a part-of-speech tag to every token.
# The result is a list of (token, tag) pairs.
tagged_tokens = pos_tag(word_tokenize("I'm learning NLP"))
print(tagged_tokens)

[('I', 'PRP'), ("'m", 'VBP'), ('learning', 'VBG'), ('NLP', 'NNP')]


## Stop Word Removal

In [14]:
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [15]:
# Build the stop-word set once: the corpus list is read a single time and
# each membership test is O(1) instead of a scan over the whole list.
english_stopwords = set(stopwords.words('english'))

# Keep every token that is not a stop word, preserving the original order.
# Tokens are lowercased for the comparison because every entry in the printed
# stop-word list is lowercase, so a capitalised token such as 'The' would
# otherwise slip through.
clean_tokens = [
    token for token in word_tokens
    if token.lower() not in english_stopwords
]

In [16]:
# Show the tokens that remain after stop-word filtering.
print(clean_tokens)

['NLP', 'sub-field', 'Artificial', 'Intelligence', ',', 'also', 'comprises', 'computation', 'linguistics', ',', 'computer', 'science', 'statistics', '.']


## Named Entity Recognition

In [17]:
from nltk import word_tokenize, pos_tag, ne_chunk

In [18]:
# Named-entity chunking is a three-step pipeline: tokenize, POS-tag, then chunk.
# The printed tree labels 'Mark' and 'John' as PERSON and 'Google' as ORGANIZATION.
sentence = "Mark and John are working at Google."
ner_tokens = word_tokenize(sentence)
ner_tagged = pos_tag(ner_tokens)
print(ne_chunk(ner_tagged))

(S
  (PERSON Mark/NNP)
  and/CC
  (PERSON John/NNP)
  are/VBP
  working/VBG
  at/IN
  (ORGANIZATION Google/NNP)
  ./.)


## Interface to WordNet

In [19]:
from nltk.corpus import wordnet
# Look up "life" and show the definition and usage examples of the first
# synset that WordNet returns for it.
first_life_synset = wordnet.synsets("life")[0]
print(first_life_synset.definition())
print(first_life_synset.examples())

a characteristic state or mode of living
['social life', 'city life', 'real life']


In [20]:
# Print the definition of the first WordNet synset for each query word.
# The first hit is not necessarily the everyday meaning: for "Python" the
# recorded output is the snake sense.
for query_word in ("old", "Python"):
    first_synset = wordnet.synsets(query_word)[0]
    print(first_synset.definition())

past times (especially in the phrase `in days of old')
large Old World boas


In [21]:
# Collect the lemma names of every synset of 'small', in synset order.
# Duplicates are kept on purpose: a name appears once for each synset that
# contains it (see the repeated 'small' entries in the output).
synonyms = [
    lemma.name()
    for synset in wordnet.synsets('small')
    for lemma in synset.lemmas()
]
print(synonyms)

['small', 'small', 'small', 'little', 'minor', 'modest', 'small', 'small-scale', 'pocket-size', 'pocket-sized', 'little', 'small', 'small', 'humble', 'low', 'lowly', 'modest', 'small', 'little', 'minuscule', 'small', 'little', 'small', 'small', 'modest', 'small', 'belittled', 'diminished', 'small', 'small']


In [22]:
# Collect the first antonym of every lemma of "big" that has one.
antonyms = []
for synset in wordnet.synsets("big"):
    for lemma in synset.lemmas():
        # Fetch the antonym list once and reuse it for both the emptiness
        # check and the lookup.
        lemma_antonyms = lemma.antonyms()
        if lemma_antonyms:
            # Only the first antonym of each lemma is recorded.
            antonyms.append(lemma_antonyms[0].name())
print(antonyms)

['small', 'little', 'small']
