In [1]:
# Two ways to reduce words to their root form:
# stemming - strips word endings with crude rules; the result need not be a real word
# lemmatization - maps a word to its dictionary root (lemma), taking its role in the sentence into account

In [2]:
import nltk
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
# Collect NLTK's English stop-word list into a set so that membership
# checks can be made directly against it.
stop_words = {*stopwords.words('english')}
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [4]:
# Membership check against the stop-word set: the possessive 'its' is listed.
'its' in stop_words

True

In [5]:
# Porter stemmer instance shared by the Porter examples in the following cells.
stemmer = PorterStemmer()

In [6]:
# Sample words for the stemming demo. Despite the name, the list mixes plural
# nouns with past-tense, "-ing" and derived forms.
plurals = [
    # -s / -es endings
    'caresses', 'flies', 'dies', 'mules',
    # -ed endings
    'denied', 'died', 'agreed', 'owned', 'humbled', 'sized',
    # -ing endings
    'meeting', 'stating', 'siezing',
    # derivational suffixes
    'itemization', 'sensational', 'traditional', 'reference', 'colonizer',
    # doubled consonant before -ed
    'plotted',
]

In [7]:
# Show every sample word next to its Porter stem as "word - stem".
[
    f'{word} - {stemmer.stem(word)}'
    for word in plurals
]

['caresses - caress',
 'flies - fli',
 'dies - die',
 'mules - mule',
 'denied - deni',
 'died - die',
 'agreed - agre',
 'owned - own',
 'humbled - humbl',
 'sized - size',
 'meeting - meet',
 'stating - state',
 'siezing - siez',
 'itemization - item',
 'sensational - sensat',
 'traditional - tradit',
 'reference - refer',
 'colonizer - colon',
 'plotted - plot']

In [8]:
# Build one space-separated string from the sample words; the [:-1] slice
# leaves out the last entry ('plotted').
a = " ".join(plurals[:-1])
a

'caresses flies dies mules denied died agreed owned humbled sized meeting stating siezing itemization sensational traditional reference colonizer'

In [9]:
# Passing the whole string to stem() treats it as a single token: in the
# displayed result only the tail changes ('colonizer' -> 'colon'). To stem
# every word, call stem() per word as in the list comprehension cell above.
stemmer.stem(a)

'caresses flies dies mules denied died agreed owned humbled sized meeting stating siezing itemization sensational traditional reference colon'

In [10]:
# Two English Snowball stemmers that differ only in stop-word handling:
#   stemmer1 stems every token; stemmer2 returns stop words unchanged
#   (compare 'only' and "it's" in the two listings that follow).
stemmer1, stemmer2 = (
    SnowballStemmer("english", ignore_stopwords=skip_stopwords)
    for skip_stopwords in (False, True)
)

In [11]:
# Mixed-case sample paragraph with punctuation, used as input for the
# Snowball stemmer cells. Rebinds the name `a`.
a = (
    "The MAIN APPLICATION of Porter Stemmer include data mining and "
    "Information retrieval. However it's applications are only limited "
    "to English words. Also, the group of stems is mapped on to the same "
    "stem and the output stem is not necessarily a meaningful word"
)

In [12]:
# Whitespace-split the sample text and list "token - stem" using the
# Snowball stemmer that does not skip stop words.
[
    f'{token} - {stemmer1.stem(token)}'
    for token in a.split()
]

['The - the',
 'MAIN - main',
 'APPLICATION - applic',
 'of - of',
 'Porter - porter',
 'Stemmer - stemmer',
 'include - includ',
 'data - data',
 'mining - mine',
 'and - and',
 'Information - inform',
 'retrieval. - retrieval.',
 'However - howev',
 "it's - it",
 'applications - applic',
 'are - are',
 'only - onli',
 'limited - limit',
 'to - to',
 'English - english',
 'words. - words.',
 'Also, - also,',
 'the - the',
 'group - group',
 'of - of',
 'stems - stem',
 'is - is',
 'mapped - map',
 'on - on',
 'to - to',
 'the - the',
 'same - same',
 'stem - stem',
 'and - and',
 'the - the',
 'output - output',
 'stem - stem',
 'is - is',
 'not - not',
 'necessarily - necessarili',
 'a - a',
 'meaningful - meaning',
 'word - word']

In [13]:
# Same listing with the stop-word-aware Snowball stemmer: stop words such
# as 'only' and "it's" come back unchanged.
[
    f'{token} - {stemmer2.stem(token)}'
    for token in a.split()
]

['The - the',
 'MAIN - main',
 'APPLICATION - applic',
 'of - of',
 'Porter - porter',
 'Stemmer - stemmer',
 'include - includ',
 'data - data',
 'mining - mine',
 'and - and',
 'Information - inform',
 'retrieval. - retrieval.',
 'However - howev',
 "it's - it's",
 'applications - applic',
 'are - are',
 'only - only',
 'limited - limit',
 'to - to',
 'English - english',
 'words. - words.',
 'Also, - also,',
 'the - the',
 'group - group',
 'of - of',
 'stems - stem',
 'is - is',
 'mapped - map',
 'on - on',
 'to - to',
 'the - the',
 'same - same',
 'stem - stem',
 'and - and',
 'the - the',
 'output - output',
 'stem - stem',
 'is - is',
 'not - not',
 'necessarily - necessarili',
 'a - a',
 'meaningful - meaning',
 'word - word']

In [14]:
# The full paragraph is handed to stem() as one token: the displayed result
# is just the lowercased text, with no individual word stemmed. Split the
# text and stem word by word (as in the listings above) to stem each word.
stemmer1.stem(a)

"the main application of porter stemmer include data mining and information retrieval. however it's applications are only limited to english words. also, the group of stems is mapped on to the same stem and the output stem is not necessarily a meaningful word"

In [15]:
# Make sure the WordNet corpus is installed before the lemmatizer is first
# used, so the notebook survives Restart & Run All on a fresh environment.
# quiet=True keeps this cell free of download log output.
nltk.download('wordnet', quiet=True)

# WordNet-based lemmatizer shared by the lemmatization cells below.
lemma = WordNetLemmatizer()

In [16]:
# Lower-cased variant of the sample paragraph with an extra trailing word
# ('rocks'); rebinds the name `a`.
a = (
    "the main application of porter stemmer include data mining and "
    "information retrieval. however, it's applications are only limited "
    "to english words. also, the group of stems is mapped on to the same "
    "stem and the output stem is not necessarily a meaningful word rocks"
)
# List "token - lemma" for every whitespace-separated token.
# In the displayed result plural nouns are reduced ('applications', 'stems',
# 'rocks') while verb forms such as 'mapped' and 'mining' are unchanged.
# NOTE(review): presumably because lemmatize() is called without a
# part-of-speech argument - confirm against the NLTK documentation.
[
    f'{token} - {lemma.lemmatize(token)}'
    for token in a.split()
]

['the - the',
 'main - main',
 'application - application',
 'of - of',
 'porter - porter',
 'stemmer - stemmer',
 'include - include',
 'data - data',
 'mining - mining',
 'and - and',
 'information - information',
 'retrieval. - retrieval.',
 'however, - however,',
 "it's - it's",
 'applications - application',
 'are - are',
 'only - only',
 'limited - limited',
 'to - to',
 'english - english',
 'words. - words.',
 'also, - also,',
 'the - the',
 'group - group',
 'of - of',
 'stems - stem',
 'is - is',
 'mapped - mapped',
 'on - on',
 'to - to',
 'the - the',
 'same - same',
 'stem - stem',
 'and - and',
 'the - the',
 'output - output',
 'stem - stem',
 'is - is',
 'not - not',
 'necessarily - necessarily',
 'a - a',
 'meaningful - meaningful',
 'word - word',
 'rocks - rock']

In [17]:
# Download (or confirm the presence of) the 'wordnet' package; the log below
# reports it as already up-to-date and the call returns True.
# Note: the lemmatizer is first called in an earlier cell, so this cell alone
# does not guarantee that the corpus is present at that point.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
# Two inflected forms of "go", repeated twice, used to compare the stemmer
# with the lemmatizer.
b = [form for _ in range(2) for form in ('goes', 'going')]
b

['goes', 'going', 'goes', 'going']

In [19]:
# Snowball stems of the verb forms (the displayed result gives 'goe' for
# 'goes' and 'go' for 'going').
list(map(stemmer1.stem, b))

['goe', 'go', 'goe', 'go']

In [20]:
# WordNet lemmas of the same verb forms. In the displayed result 'goes'
# becomes 'go' but 'going' is returned unchanged.
# NOTE(review): presumably lemmatize() needs a verb part-of-speech argument
# to reduce 'going' - confirm against the NLTK documentation.
list(map(lemma.lemmatize, b))

['go', 'going', 'go', 'going']