In [1]:
import nltk

In [2]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [3]:
ps.stem('walking')

'walk'

In [4]:
ps.stem('walks')

'walk'

In [5]:
ps.stem('walked')

'walk'

In [6]:
ps.stem("bosses")

'boss'

In [7]:
ps.stem('ran')   #should have given run, as run is the root node. This can be done easily with lemmatization.

'ran'

In [8]:
ps.stem('running')

'run'

In [10]:
sentence =' Lemmatization  is much more sophisticated than stemming'.split()  #tokenized based on words
sentence


['Lemmatization', 'is', 'much', 'more', 'sophisticated', 'than', 'stemming']

In [12]:
for token in sentence:
  print(ps.stem(token), end=' ')

lemmat is much more sophist than stem 

In [13]:
ps.stem('unnecessary')

'unnecessari'

In [14]:
ps.stem('berry')   #it is making the word into something illogical.

'berri'

In [17]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download("wordnet")


[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [19]:
wln = WordNetLemmatizer()
wln.lemmatize('walking')   #same prob, no noun has 'ing' suffix, since it has noun as default, it doesn't know the root of this word, we need POS tagging.

'walking'

In [20]:
wln.lemmatize('walking',pos=wordnet.VERB)  #now it knows this verb, and treat as such.

'walk'

In [21]:
wln.lemmatize('mice')  #it knows ttherules that mice is the plural of mouse and root word is mouse

'mouse'

In [23]:
wln.lemmatize('ran', pos=wordnet.VERB)

'run'

In [24]:
ps.stem('was')

'wa'

In [26]:
wln.lemmatize('was', pos=wordnet.VERB)  #this is actually working correctly, is or was root is be.

'be'

In [27]:
wln.lemmatize('better')

'better'

In [28]:
wln.lemmatize('better', pos=wordnet.ADJ)  #pos = adjective, root is good.

'good'

## This manual work, we need to automate the POS tagging of a word.

In [37]:
def getwordpos(treebank_tag):    #automating
  if treebank_tag.startswith('J'):
    return wordnet.ADJ
  elif treebank_tag.startswith('N'):
    return wordnet.NOUN
  elif treebank_tag.startswith('V'):
    return wordnet.VERB
  elif treebank_tag.startswith('R'):
    return wordnet.ADV
  else:
    return wordnet.NOUN


In [32]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [34]:
sentence = 'Donald Trump has a devoted following'.split()
sentence

['Donald', 'Trump', 'has', 'a', 'devoted', 'following']

In [35]:
words_and_tags = nltk.pos_tag(sentence)
words_and_tags

[('Donald', 'NNP'),
 ('Trump', 'NNP'),
 ('has', 'VBZ'),
 ('a', 'DT'),
 ('devoted', 'VBN'),
 ('following', 'NN')]

In [38]:
for word,tag in words_and_tags:
  lemmat= wln.lemmatize(word,pos=getwordpos(tag))
  print(lemmat,end=" ")    #note following has not been reducced since its a noun.

Donald Trump have a devote following 

In [39]:
s = "The cat was following the bird as it flew by".split()
s

['The', 'cat', 'was', 'following', 'the', 'bird', 'as', 'it', 'flew', 'by']

In [41]:
wordtag = nltk.pos_tag(s)
wordtag    #notice 'following' is now verb in this context

[('The', 'DT'),
 ('cat', 'NN'),
 ('was', 'VBD'),
 ('following', 'VBG'),
 ('the', 'DT'),
 ('bird', 'NN'),
 ('as', 'IN'),
 ('it', 'PRP'),
 ('flew', 'VBD'),
 ('by', 'IN')]

In [42]:
for word,tag in wordtag:
  print(wln.lemmatize(word,pos=getwordpos(tag)),end=" ")


The cat be follow the bird a it fly by 