# Simple NLP demo using NLTK
NLTK is a toolkit built for working with NLP in Python. It provides various text-processing libraries along with many test datasets.<br><br>
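Install nltk: pip install nltk. The examples below also need NLTK data packages (tokenizer models, the stop-word list, WordNet, the POS tagger and the NE chunker); fetch them once with nltk.download(). <br><br>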
Blog: https://becominghuman.ai/nlp-for-beginners-using-nltk-f58ec22005cd

# Convert text to lower Case
Converting the text to lower case is often necessary because string comparison is case-sensitive: without it, "NLTK" and "nltk" would be treated as different words.

In [1]:
import nltk

In [2]:
# Lower-case the sample sentence so that later comparisons ignore case.
text = "This is a Demo Text for NLP using NLTK. Full form of NLTK is Natural Language Toolkit"
lower_text = str.lower(text)
print(lower_text)

this is a demo text for nlp using nltk. full form of nltk is natural language toolkit


# word tokenize
Tokenize the text to get its tokens, i.e. break the sentences into individual words and punctuation marks.

In [4]:
# Break the sample text into word-level tokens; punctuation such as the
# full stop comes back as a token of its own.
text = "This is a Demo Text for NLP using NLTK. Full form of NLTK is Natural Language Toolkit"
word_tokens = nltk.word_tokenize(text, language="english")
print(word_tokens)

['This', 'is', 'a', 'Demo', 'Text', 'for', 'NLP', 'using', 'NLTK', '.', 'Full', 'form', 'of', 'NLTK', 'is', 'Natural', 'Language', 'Toolkit']


# sentence tokenize
Tokenize the text into sentences when it contains more than one sentence, i.e. break it into a list of sentences.



In [6]:
# Split the sample text into a list with one string per sentence.
text = "This is a Demo Text for NLP using NLTK. Full form of NLTK is Natural Language Toolkit"
sent_token = nltk.sent_tokenize(text, language="english")
print(sent_token)


['This is a Demo Text for NLP using NLTK.', 'Full form of NLTK is Natural Language Toolkit']


# stop words removal
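Remove uninformative words (stop words such as "is", "the" and "a") from the sentences using NLTK's stop-word list, as they carry little information. Note that the match is case-sensitive, so a capitalised token such as "This" is kept unless the text is lower-cased first.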

In [8]:
import nltk
from nltk.corpus import stopwords

# `stopword` stays a list, as before; the set is built once so that every
# membership test below is a hash lookup rather than a scan of the whole list.
stopword = stopwords.words('english')
stopword_set = set(stopword)

text = 'This is a Demo Text for NLP using NLTK. Full form of NLTK is Natural Language Toolkit'
word_tokens = nltk.word_tokenize(text)

# NOTE: the comparison is case-sensitive, so the capitalised token 'This'
# is kept in the result. Lower-case the tokens first if it should be dropped.
removing_stopwords = [word for word in word_tokens if word not in stopword_set]
print(removing_stopwords)

['This', 'Demo', 'Text', 'NLP', 'using', 'NLTK', '.', 'Full', 'form', 'NLTK', 'Natural', 'Language', 'Toolkit']


# lemma
Lemmatize the text to reduce each word to its dictionary (root) form, e.g. "functions" becomes "function" and "dogs" becomes "dog".

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer looks each word up in the WordNet dictionary and returns
# its lemma; it is a lemmatizer, not a Porter-style suffix-stripping stemmer.
wordnet_lemmatizer = WordNetLemmatizer()

text = "the dogs are barking outside. Are the cats in the garden?"
word_tokens = nltk.word_tokenize(text)

# lemmatize() is called without a part-of-speech argument, so every token is
# treated as a noun (the default). Plural nouns are reduced ('dogs' -> 'dog')
# while verb forms such as 'are' and 'barking' come back unchanged.
lemmatized_word = [wordnet_lemmatizer.lemmatize(word) for word in word_tokens]
print(lemmatized_word)


['the', 'dog', 'are', 'barking', 'outside', '.', 'Are', 'the', 'cat', 'in', 'the', 'garden', '?']


# Stemming
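Stemming is the process of reducing inflected (or sometimes derived) words to their word stem, base or root form. The stem does not have to be a real word, e.g. "natural" becomes "natur" and "language" becomes "languag".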

In [11]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# The English Snowball stemmer implements "Porter2", a revision of the
# original Porter stemming algorithm.
snowball_stemmer = SnowballStemmer('english')

text = "This is a Demo Text for NLP using NLTK. Full form of NLTK is Natural Language Toolkit"
word_tokens = nltk.word_tokenize(text)

# The stemmer strips suffixes by rule and lower-cases its result, so the
# stems need not be dictionary words ('Natural' -> 'natur').
stemmed_word = [snowball_stemmer.stem(word) for word in word_tokens]
print(stemmed_word)

['this', 'is', 'a', 'demo', 'text', 'for', 'nlp', 'use', 'nltk', '.', 'full', 'form', 'of', 'nltk', 'is', 'natur', 'languag', 'toolkit']


# Get word frequency
Count how often each word occurs using NLTK's FreqDist class.

In [13]:
import nltk
from nltk import FreqDist

# The text is lower-cased before tokenising so that "NLTK" and "nltk" are
# counted as the same word.
text = "This is a Demo Text for NLP using NLTK. Full form of NLTK is Natural Language Toolkit"
word = nltk.word_tokenize(str.lower(text))

# Tally the tokens and show the five most frequent ones with their counts.
freq = FreqDist(word)
print(freq.most_common(5))


[('is', 2), ('nltk', 2), ('this', 1), ('a', 1), ('demo', 1)]


# POS (Part of Speech) tags
POS tagging labels each word with its part of speech, e.g. whether it is a noun, a verb, an adjective, etc.

In [14]:
import nltk

# Tokenise the sentence, then attach a part-of-speech tag to every token.
# The result is a list of (token, tag) pairs.
text = "the dogs are barking outside."
word = nltk.word_tokenize(text, language="english")
pos_tag = nltk.pos_tag(word)
print(pos_tag)

[('the', 'DT'), ('dogs', 'NNS'), ('are', 'VBP'), ('barking', 'VBG'), ('outside', 'IN'), ('.', '.')]


# NER
NER (Named Entity Recognition) is the process of finding named entities in text, such as people, organisations and places, and labelling them with their type.



In [15]:
import nltk

# Pipeline: tokenise -> POS-tag -> chunk named entities.
text = "who is Barrack Obama"
word = nltk.word_tokenize(text)
pos_tag = nltk.pos_tag(word)
print(pos_tag)

# ne_chunk returns a tree: ordinary tokens stay as (token, tag) pairs at the
# top level, while each recognised entity is wrapped in its own nltk.Tree.
chunk = nltk.ne_chunk(pos_tag)
print(chunk)

# Keep only the entity subtrees and join each one's tokens into one string.
NE = [
    " ".join(token for token, _tag in node)
    for node in chunk
    if isinstance(node, nltk.Tree)
]
print(NE)

[('who', 'WP'), ('is', 'VBZ'), ('Barrack', 'NNP'), ('Obama', 'NNP')]
(S who/WP is/VBZ (PERSON Barrack/NNP Obama/NNP))
['Barrack Obama']
