In [1]:
#importing libraries and the corpus
import nltk
from nltk.corpus import gutenberg
import string
import matplotlib.pyplot as plt
import pandas as pd


In [2]:
# Load "carroll-alice.txt" (Alice's Adventures in Wonderland) from NLTK's
# Gutenberg corpus as a sequence of word tokens.
corp = gutenberg.words("carroll-alice.txt")

In [3]:
# Keep only the purely alphabetic tokens and normalise them to lower case
# in a single pass over the corpus.
corp = [token.lower() for token in corp if token.isalpha()]

In [4]:
# Work on the first 200 cleaned tokens only, to keep the printed tables short.
text = corp[:200]

In [5]:
from nltk import PorterStemmer, LancasterStemmer

In [6]:
#Step 1
# Create one instance of each stemmer so their outputs can be compared.
portstem, lancstem = PorterStemmer(), LancasterStemmer()

In [7]:
#Step 2
# Stem every token once with each stemmer and keep the results, so the stems
# are available after this cell instead of being computed and thrown away
# inside the print call.
port_stemmed = [portstem.stem(word) for word in text]
lanc_stemmed = [lancstem.stem(word) for word in text]

# Print a three-column table: original token, Porter stem, Lancaster stem.
row_format = "{0:20}{1:20}{2:20}"
print(row_format.format('Word', 'Porter Stemmer', 'Lancaster Stemmer'))
for word, port_stem, lanc_stem in zip(text, port_stemmed, lanc_stemmed):
    print(row_format.format(word, port_stem, lanc_stem))

Word                Porter Stemmer      Lancaster Stemmer   
alice               alic                al                  
s                   s                   s                   
adventures          adventur            adv                 
in                  in                  in                  
wonderland          wonderland          wonderland          
by                  by                  by                  
lewis               lewi                lew                 
carroll             carrol              carrol              
chapter             chapter             chapt               
i                   i                   i                   
down                down                down                
the                 the                 the                 
rabbit              rabbit              rabbit              
hole                hole                hol                 
alice               alic                al                  
was                 wa  

<ol>
    <li>The word 'this' is stemmed to 'thi' by both stemmers</li>
    <li>'Alice' is stemmed to 'alic' by the Porter Stemmer and 'al' by the Lancaster Stemmer</li>
    <li>'very' is stemmed to 'veri' by the Porter Stemmer and is untouched by the Lancaster Stemmer</li>  
</ol>
In general, we can see that the Lancaster Stemmer stems much more aggressively than the Porter Stemmer and often strips away many more characters (for example, 'adventures' becomes 'adv' rather than 'adventur').

In [8]:
#Step 3
# WordNet-based lemmatizer; used in the next cell to look up the lemma of
# each token in `text`.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [9]:
# Print a two-column table: each token next to the lemma returned for it.
# NOTE(review): lemmatize() is called without a `pos` argument, so the
# lemmatizer's default part of speech applies to every token -- presumably
# why 'was' comes out as 'wa' in the output; confirm and consider POS tags.
lemma_row = "{0:20}{1:20}"
print(lemma_row.format("Word", "Lemma"))
for word in text:
    lemma = lemmatizer.lemmatize(word)
    print(lemma_row.format(word, lemma))

Word                Lemma               
alice               alice               
s                   s                   
adventures          adventure           
in                  in                  
wonderland          wonderland          
by                  by                  
lewis               lewis               
carroll             carroll             
chapter             chapter             
i                   i                   
down                down                
the                 the                 
rabbit              rabbit              
hole                hole                
alice               alice               
was                 wa                  
beginning           beginning           
to                  to                  
get                 get                 
very                very                
tired               tired               
of                  of                  
sitting             sitting             
by              

Step 4<br>
Stemming is useful when the goal is to extract information from a text rather than to produce text. For example, in sentiment analysis we are more interested in the root stems of words than in which inflections are present.<br>
Conversely, in language generation, stemming would make the generated text incoherent.<br>

In [10]:
# Word pairs for comparing Porter stems: test_list_1[i] is compared with
# test_list_2[i], so both lists must have the same length.
test_list_1 = ['abandon', 'absorbency', 'marketing', 'university', 'volume']
test_list_2 = ['abandonment', 'absorbent', 'markets', 'universe', 'volumes']

In [11]:
# Print the Porter stems of each word pair side by side.
# zip() pairs the two lists element-wise, so the loop follows the lists'
# actual length instead of a hard-coded count of 5.
print("{0:20}{1:20}".format("First Word Stem","Second Word Stem"))
for first_word, second_word in zip(test_list_1, test_list_2):
    print("{0:20}{1:20}".format(portstem.stem(first_word), portstem.stem(second_word)))

First Word Stem     Second Word Stem    
abandon             abandon             
absorb              absorb              
market              market              
univers             univers             
volum               volum               


Here we can see that, for each word pair, both words are reduced to the same stem.<br>
However, 'universe' and 'university' are not inflections of the stem 'univers'. The two words do not have a shared root.<br>
Similarly, 'marketing' and 'markets' both result in the stem 'market', which is also incorrect.<br>