Stemming :

Porter Stemmer

In [3]:
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import RegexpStemmer
from nltk.stem import SnowballStemmer

%pip install krovetz
import krovetz

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


--Word--            --Stem--            
Russian             russian             
warships            warship             
ready               readi               
to                  to                  
strike              strike              
terrorists          terrorist           
near                near                
Aleppo              aleppo              
08112016            08112016            
Source              sourc               
Source              sourc               
Milru               milru               
Attack              attack              
aircraft            aircraft            
of                  of                  
the                 the                 
Russian             russian             
aircraft            aircraft            
carrier             carrier             
Admiral             admir               
Kuznetsov           kuznetsov           
get                 get                 
ready               readi               
to              

In [None]:

nltk.download('punkt')

# Example text
example_sentence = '''
                  Russian warships ready to strike terrorists near Aleppo 08.11.2016 |
                  Source: Source: Mil.ru Attack aircraft of the Russian aircraft carrier
                  Admiral Kuznetsov get ready to strike terrorists' positions in the vicinity
                  of Aleppo, sources at the Russian Defense Ministry said, RBC reports.
                  "Insurgents' attempts to break into Aleppo from outside are meaningless," the source said.
                  The main task of the aircraft carrier aviation group is to strike missile and
                  air blows on the terrorists , whose goal is to enter Aleppo. "After the attacks
                   on terrorists' positions, one will have to forget about the support for
                   insurgents from the outside," the source said.
                   The Russian group in the Mediterranean Sea consists of the Admiral Kuznetsov aircraft
                   carrier , the heavy nuclear missile cruiser Pyotr Velikiy (Peter the Great) and
                   large anti-submarine ships Severomorsk and Vice-Admiral Kulakov. Russia has increased
                   intelligence activities in Syria to establish the areas, where terrorists are concentrated,
                   as well as the routes that they use to move from one area to another.
                   "The militants took advantage of the humanitarian pause and regrouped their forces to prepare
                   for a new breakthrough into the eastern part of Aleppo," the source added. According to the source,
                   Russia will use new weapons during the upcoming attacks on terrorists . It was said that the Russian warships
                   in the Mediterranean Sea will launch "Caliber" cruise missiles, although it
                   was not specified which ships would be responsible for the launches.
                   Pravda.Ru Russian warships travel to Syria
'''

# Remove punctuation
example_sentence_no_punct = example_sentence.translate(str.maketrans("", "", string.punctuation))

# Create tokens
word_tokens = word_tokenize(example_sentence_no_punct)

# Initialize the Porter Stemmer
ps = PorterStemmer()

# Perform stemming
print("{0:20}{1:20}".format("--Word--","--Stem--"))
for word in word_tokens:
    print("{0:20}{1:20}".format(word, ps.stem(word)))


Lancaster Stemmer

In [4]:
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()

print("{0:20}{1:20}".format("--Word--","--Stem--"))
for word in word_tokens:
    print("{0:20}{1:20}".format(word, st.stem(word)))

--Word--            --Stem--            
Russian             russ                
warships            war                 
ready               ready               
to                  to                  
strike              strike              
terrorists          ter                 
near                near                
Aleppo              aleppo              
08112016            08112016            
Source              sourc               
Source              sourc               
Milru               milru               
Attack              attack              
aircraft            aircraft            
of                  of                  
the                 the                 
Russian             russ                
aircraft            aircraft            
carrier             carry               
Admiral             admir               
Kuznetsov           kuznetsov           
get                 get                 
ready               ready               
to              

RegEx-Based Stemmer


In [6]:
st = RegexpStemmer('ing$|s$|e$|able$|ern$|ed$y', min=4)

print("{0:20}{1:20}".format("--Word--","--Stem--"))
for word in word_tokens:
    print("{0:20}{1:20}".format(word, st.stem(word)))


--Word--            --Stem--            
Russian             Russian             
warships            warship             
ready               ready               
to                  to                  
strike              strik               
terrorists          terrorist           
near                near                
Aleppo              Aleppo              
08112016            08112016            
Source              Sourc               
Source              Sourc               
Milru               Milru               
Attack              Attack              
aircraft            aircraft            
of                  of                  
the                 the                 
Russian             Russian             
aircraft            aircraft            
carrier             carrier             
Admiral             Admiral             
Kuznetsov           Kuznetsov           
get                 get                 
ready               ready               
to              

Krovetz Stemmer
  

In [11]:
# Initialize the Krovetz Stemmer
ks = krovetz.PyKrovetzStemmer()

print("{0:20}{1:20}".format("--Word--","--Stem--"))
for word in word_tokens:
    print("{0:20}{1:20}".format(word, ps.stem(word)))

--Word--            --Stem--            
Russian             russian             
warships            warship             
ready               readi               
to                  to                  
strike              strike              
terrorists          terrorist           
near                near                
Aleppo              aleppo              
08112016            08112016            
Source              sourc               
Source              sourc               
Milru               milru               
Attack              attack              
aircraft            aircraft            
of                  of                  
the                 the                 
Russian             russian             
aircraft            aircraft            
carrier             carrier             
Admiral             admir               
Kuznetsov           kuznetsov           
get                 get                 
ready               readi               
to              

Snowball Stemmer
  

In [12]:
# Initialize the Snowball Stemmer
stemmer = SnowballStemmer('english')

print("{0:20}{1:20}".format("--Word--","--Stem--"))
for word in word_tokens:
    print("{0:20}{1:20}".format(word, st.stem(word)))

--Word--            --Stem--            
Russian             Russian             
warships            warship             
ready               ready               
to                  to                  
strike              strik               
terrorists          terrorist           
near                near                
Aleppo              Aleppo              
08112016            08112016            
Source              Sourc               
Source              Sourc               
Milru               Milru               
Attack              Attack              
aircraft            aircraft            
of                  of                  
the                 the                 
Russian             Russian             
aircraft            aircraft            
carrier             carrier             
Admiral             Admiral             
Kuznetsov           Kuznetsov           
get                 get                 
ready               ready               
to              

In [None]:
# Summary : The Porter Stemmer applies a set of rules to strip suffixes from words. 
# The Snowball Stemmer improves upon the Porter algorithm, offering better accuracy and support for multiple languages. 
# The Lancaster Stemmer, known for its aggressive stemming approach, reduces words more aggressively than other methods.
# The Regexp Stemmer uses regular expressions for custom stemming rules, providing flexibility for specific applications. 
# The Krovetz Stemmer, known for handling morphological variations effectively, enhances word normalization. 