In [14]:
#! pip install nltk

In [15]:
#! pip show nltk

### Import Libraries

In [8]:
import nltk
import re # Regular expression
# nltk.download('punkt') # used for Tokenization
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

from nltk.corpus import stopwords # used for stopwords
from nltk.stem.porter import PorterStemmer # used for stemming
from nltk.stem.wordnet import WordNetLemmatizer # Used for Lemmatization
from nltk.tokenize import sent_tokenize, word_tokenize


### Input text

In [43]:
# Sample input: two short English sentences.
text = (
    "Natural Language Processing is an exciting area. "
    "Huge budget have been allocated for this."
)

### Tokenization

In [20]:
# Sentence-level split first, then a word-level split of the same text.
sentences = sent_tokenize(text)
print(sentences)
tokens = word_tokenize(text)
print(tokens)

['Natural Language Processing is an exciting area.', 'Huge budget have been allocated for this.']
['Natural', 'Language', 'Processing', 'is', 'an', 'exciting', 'area', '.', 'Huge', 'budget', 'have', 'been', 'allocated', 'for', 'this', '.']


### Lower case conversion and punctuation removal

In [44]:
# Lowercase the text, then replace every non-alphanumeric character with a space.
lowered = text.lower()
text = re.sub(r"[^a-zA-Z0-9]", " ", lowered)
# Splitting on whitespace yields the individual word tokens.
words = text.split()
print(words)

['natural', 'language', 'processing', 'is', 'an', 'exciting', 'area', 'huge', 'budget', 'have', 'been', 'allocated', 'for', 'this']


### Stop word removal

In [45]:
# Display NLTK's English stop word list.
english_stopwords = stopwords.words("english")
print(english_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [47]:
# Load the stop word list once, outside the loop: the original re-evaluated
# stopwords.words("english") for every token. A set also makes each
# membership test constant-time instead of a linear scan.
stop_words = set(stopwords.words("english"))

# Keep only the tokens that are not English stop words, preserving order.
final_words = []
for w in words:
    if w not in stop_words:
        final_words.append(w)

In [53]:
# Drop English stop words from `words`. The stop word list is loaded once
# into a set rather than being rebuilt for every token in the comprehension.
stop_words = set(stopwords.words("english"))
words = [w for w in words if w not in stop_words]

In [55]:
# Display the current contents of `words`.
print(words)

['natural', 'language', 'processing', 'exciting', 'area', 'huge', 'budget', 'allocated']


### Stemming

In [68]:
# Porter stemmer demo on a single word; the bare last expression is what
# the notebook echoes as the cell result.
stemmer = PorterStemmer()
sample_word = 'giving'
stemmer.stem(sample_word)

'give'

In [66]:
# Build one PorterStemmer and reuse it for every word; the original
# constructed a fresh stemmer object per token inside the comprehension.
porter = PorterStemmer()
stemmed = [porter.stem(w) for w in words]
print(stemmed)

['natur', 'languag', 'process', 'excit', 'area', 'huge', 'budget', 'alloc']


### Lemmatization

In [72]:
# Build one WordNetLemmatizer and reuse it for every word; the original
# constructed a new lemmatizer object per token inside the comprehension.
lemmatizer = WordNetLemmatizer()
# lemmatize() is called without a part-of-speech argument, so NLTK treats
# each word as a noun by default; verb forms such as 'allocated' therefore
# come back unchanged.
lemmed = [lemmatizer.lemmatize(w) for w in words]
print(lemmed)

['natural', 'language', 'processing', 'exciting', 'area', 'huge', 'budget', 'allocated']


### Executing in a single cell

In [75]:
# Full preprocessing pipeline in one cell:
# normalise -> tokenize -> remove stop words -> lemmatize.
text = "Natural Language Processing is an exciting area. Huge budget have been allocated for this."
text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())  # Remove special characters and lowercase
words = text.split()  # Whitespace tokenization

# Load the stop word list once into a set instead of once per token.
stop_words = set(stopwords.words("english"))
stopText = [w for w in words if w not in stop_words]  # Stop word removal

# Reuse a single lemmatizer instance for all tokens.
lemmatizer = WordNetLemmatizer()
finalWords = [lemmatizer.lemmatize(w) for w in stopText]  # Lemmatization

# The original cell computed finalWords but never displayed it, although a
# result list is recorded beneath the cell; print it so a re-run shows it.
print(finalWords)

['natural', 'language', 'processing', 'exciting', 'area', 'huge', 'budget', 'allocated']


### Parts of speech

In [77]:
#nltk.download('averaged_perceptron_tagger')

In [80]:
text = "Natural Language Processing is an exciting area. Huge budget have been allocated for this."

# Load the stop word list once; the original rebuilt it for every token of
# every sentence.
stop_words = set(stopwords.words("english"))

# Tag each sentence separately after dropping its stop words.
tokenized = sent_tokenize(text)
for sentence in tokenized:
    wordList = word_tokenize(sentence)
    # NLTK's stop word list is all lowercase while these tokens keep their
    # original casing, so compare on the lowercased token. Otherwise a
    # capitalised stop word (e.g. a sentence-initial "This") is not removed.
    wordList = [w for w in wordList if w.lower() not in stop_words]
    tagged = nltk.pos_tag(wordList)
    print(tagged)


[('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('exciting', 'VBG'), ('area', 'NN'), ('.', '.')]
[('Huge', 'NNP'), ('budget', 'NN'), ('allocated', 'VBD'), ('.', '.')]


In [79]:
CC coordinating conjunction 
CD cardinal number 
DT determiner 
EX existential there (like: “there is” … think of it like “there exists”) 
FW foreign word 
IN preposition/subordinating conjunction 
JJ adjective – ‘big’ 
JJR adjective, comparative – ‘bigger’ 
JJS adjective, superlative – ‘biggest’ 
LS list marker – ‘1)’ 
MD modal – could, will 
NN noun, singular – ‘desk’ 
NNS noun plural – ‘desks’ 
NNP proper noun, singular – ‘Harrison’ 
NNPS proper noun, plural – ‘Americans’ 
PDT predeterminer – ‘all the kids’ 
POS possessive ending – parent’s 
PRP personal pronoun –  I, he, she 
PRP$ possessive pronoun – my, his, hers 
RB adverb – very, silently 
RBR adverb, comparative – better 
RBS adverb, superlative – best 
RP particle – give up 
TO to – go ‘to’ the store 
UH interjection – errrrrrrrm 
VB verb, base form – take 
VBD verb, past tense – took 
VBG verb, gerund/present participle – taking 
VBN verb, past participle – taken 
VBP verb, singular present, non-3rd person – take 
VBZ verb, 3rd person sing. present – takes 
WDT wh-determiner – which 
WP wh-pronoun – who, what 
WP$ possessive wh-pronoun, eg- whose 
WRB wh-adverb, eg- where, when

['Natural Language Processing is an exciting area.',
 'Huge budget have been allocated for this.']