Stemming -> reduce different forms of a word that essentially mean the same thing to one common stem (e.g. running, runs -> run)

In [1]:
import nltk

In [3]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
# Three stemmers, compared side by side on the same word list.
porter_stemmer = PorterStemmer()               # most commonly used of the three
snowball_stemmer = SnowballStemmer('english')  # language is selectable; other languages are supported
lancaster_stemmer = LancasterStemmer()         # more aggressive than the other two

words = ['running', 'runs', 'runners', 'programming', 'studies', 'better']

for word in words:
    # One report per word: the original word, then each stemmer's result.
    # The trailing empty string plus print's own newline yields the blank
    # separator line between reports.
    report_lines = (
        f"Real word: {word}",
        f"Porter: {porter_stemmer.stem(word)}",
        f"Snowball: {snowball_stemmer.stem(word)}",
        f"Lancaster: {lancaster_stemmer.stem(word)}",
        "",
    )
    print(*report_lines, sep="\n")



Real word: running
Porter: run
Snowball: run
Lancaster: run

Real word: runs
Porter: run
Snowball: run
Lancaster: run

Real word: runners
Porter: runner
Snowball: runner
Lancaster: run

Real word: programming
Porter: program
Snowball: program
Lancaster: program

Real word: studies
Porter: studi
Snowball: studi
Lancaster: study

Real word: better
Porter: better
Snowball: better
Lancaster: bet



In [4]:
# Lemmatization
# Fetch the 'wordnet' NLTK data package; presumably required by the
# WordNetLemmatizer used in the following cells. The call's return value
# (True in the recorded output) is displayed as the cell result.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\norbe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# lemmatize() is called without a `pos` argument in this loop (NLTK documents
# the default as noun). 'better' is an adjective; the other part-of-speech
# categories are noun, verb and adverb.
words = ['running', 'runs', 'runners', 'programming', 'studies', 'better']

for word in words:
    # Original word followed by its lemma; the trailing empty string plus
    # print's own newline yields the blank separator line between entries.
    entry_lines = (
        f"Real word: {word}",
        f"Lemmatizer: {wordnet_lemmatizer.lemmatize(word)}",
        "",
    )
    print(*entry_lines, sep="\n")

Real word: running
Lemmatizer: running

Real word: runs
Lemmatizer: run

Real word: runners
Lemmatizer: runner

Real word: programming
Lemmatizer: programming

Real word: studies
Lemmatizer: study

Real word: better
Lemmatizer: better



In [7]:
# pos='a' tags the word as an adjective, so 'better' is lemmatized to 'good'.
wordnet_lemmatizer.lemmatize('better', pos='a')

'good'

In [8]:
# pos='v' tags the word as a verb: the irregular past form 'fought' maps to 'fight'.
wordnet_lemmatizer.lemmatize('fought', pos='v')

'fight'

In [9]:
# pos='n' tags the word as a noun: the plural 'cats' maps to the singular 'cat'.
wordnet_lemmatizer.lemmatize('cats', pos='n')

'cat'

In [10]:
# pos='r' tags the word as an adverb; 'quickly' comes back unchanged.
wordnet_lemmatizer.lemmatize('quickly', pos='r')

'quickly'