# Differentiating between stemming and lemmatizing words

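Let us first try to understand the difference between the two: stemming chops word endings off with heuristic rules and may produce strings that are not real words, whereas lemmatization maps each word to its dictionary form (its lemma).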

In [None]:
import nltk

# Fetch the WordNet corpus; the WordNet lemmatizer and the synset lookups further down read from it.
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## Stemming

In [None]:
from nltk.stem import PorterStemmer

# Four inflected forms of one verb; the stem of each is printed on its own line.
e_words = ["wait", "waiting", "waited", "waits"]
ps = PorterStemmer()
for stem in map(ps.stem, e_words):
    print(stem)

wait
wait
wait
wait


In [None]:
# nltk.word_tokenize() needs the Punkt tokenizer models. Fetch them here so this
# cell also runs on a fresh kernel: the notebook's other 'punkt' download only
# happens in a later cell. quiet=True keeps the cell output limited to the stems.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead - confirm
# against the installed version.
nltk.download('punkt', quiet=True)

sentence = "leaves are falling from trees because of autumn"
ps = PorterStemmer()

# Stem every token of the sentence; a stem is not guaranteed to be a dictionary word.
stem_list = [ps.stem(token) for token in nltk.word_tokenize(sentence)]
print(stem_list)

['leav', 'are', 'fall', 'from', 'tree', 'becaus', 'of', 'autumn']


## Lemmatization

In [None]:
# Lemmatize with POS Tag
from nltk.corpus import wordnet
# WordNetLemmatizer is instantiated in the next cell; without this import a
# fresh kernel raises NameError there.
from nltk.stem import WordNetLemmatizer

# Resources for nltk.pos_tag() (tagger model) and nltk.word_tokenize() (Punkt models).
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

In [None]:

def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag is inspected: J -> adjective, N -> noun,
    V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    _, full_tag = nltk.pos_tag([word])[0]
    initial = full_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag are both treated as nouns.
    return wordnet.NOUN


# Create the lemmatizer instance
lemmatizer = WordNetLemmatizer()

# Lemmatize every token of the sentence, passing along the POS looked up for that token
sentence = "leaves are falling from trees because of autumn"
lemm_list = [
    lemmatizer.lemmatize(token, get_wordnet_pos(token))
    for token in nltk.word_tokenize(sentence)
]
print(lemm_list)

['leaf', 'be', 'fall', 'from', 'tree', 'because', 'of', 'autumn']


## Comparing the outputs

In [None]:
# Tokens of the unmodified sentence: the reference side of the comparisons that follow.
actual_sent_list = nltk.word_tokenize(sentence)
actual_sent_list

['leaves', 'are', 'falling', 'from', 'trees', 'because', 'of', 'autumn']

In [None]:
from nltk.corpus import wordnet

# Pair each original token with its lemma and score the pair with
# wup_similarity (Wu-Palmer) between the FIRST WordNet synset of each word.
# A pair is skipped when either word has no synsets at all.
comp1_list = []
for original, lemma in zip(actual_sent_list, lemm_list):
    # Look each word up once and reuse the result for both the check and the score.
    original_synsets = wordnet.synsets(original)
    lemma_synsets = wordnet.synsets(lemma)
    if original_synsets and lemma_synsets:
        score = original_synsets[0].wup_similarity(lemma_synsets[0])
        comp1_list.append([original, lemma, score])
comp1_list

[['leaves', 'leaf', 1.0],
 ['are', 'be', 0.26666666666666666],
 ['falling', 'fall', 0.18181818181818182],
 ['trees', 'tree', 1.0],
 ['autumn', 'autumn', 1.0]]

In [None]:
from nltk.corpus import wordnet

# Pair each original token with its stem and score the pair with
# wup_similarity (Wu-Palmer) between the FIRST WordNet synset of each word.
# A pair is skipped when either word has no synsets at all, which is what
# happens when a stem is not a word WordNet knows.
comp2_list = []
for original, stem in zip(actual_sent_list, stem_list):
    # Look each word up once and reuse the result for both the check and the score.
    original_synsets = wordnet.synsets(original)
    stem_synsets = wordnet.synsets(stem)
    if original_synsets and stem_synsets:
        score = original_synsets[0].wup_similarity(stem_synsets[0])
        comp2_list.append([original, stem, score])
comp2_list

[['are', 'are', 1.0],
 ['falling', 'fall', 0.18181818181818182],
 ['trees', 'tree', 1.0],
 ['autumn', 'autumn', 1.0]]

Since stemming did not produce a valid dictionary word for 'leaves' (it yields 'leav'), WordNet has no synsets for that stem, so the pair is skipped and no word-similarity score can be computed for it.