### Lemmatization

Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as a single item.

In [1]:
from nltk.stem import WordNetLemmatizer

In [5]:
# Inflected verb forms for the lemmatization demo, one base verb per row.
words = [
    "walk", "walking", "walked", "walks",
    "study", "studies", "studying",
    "gone", "going", "went",
    "believes",
]

In [6]:
# Single lemmatizer instance; reused by the print loop and the distinct-lemma count.
# NOTE(review): NLTK's WordNet lemmatizer presumably needs the 'wordnet' corpus
# installed locally (e.g. via nltk.download) - confirm the environment provides it.
lemmatiser = WordNetLemmatizer()

In [8]:
# Show each word next to its lemma, treating every word as a verb (pos="v").
for word in words:
    lemma = lemmatiser.lemmatize(word, pos="v")
    print("{} ---> {}".format(word, lemma))

walk ---> walk
walking ---> walk
walked ---> walk
walks ---> walk
study ---> study
studies ---> study
studying ---> study
gone ---> go
going ---> go
went ---> go
believes ---> believe


### Stemming
Stemming is the process of reducing the inflected (morphological) variants of a word to a common root/base form, called the stem.

It is similar to lemmatization, except that a stem might not be an actual word (e.g. `studi`), whereas a lemma is always an actual word of the language.

In [9]:
from nltk.stem import PorterStemmer

In [10]:
# Verb forms for the stemming demo; order is kept because the loop prints in list order.
words = [
    "walk", "walking", "walked", "walks",
    "lying", "laid", "lay", "layed", "lied", "laying",
    "study", "studies", "studying",
    "gone", "went",
    "believes",
]


In [11]:
# Single Porter stemmer instance; reused by the print loop and the distinct-stem count.
ps = PorterStemmer()

In [12]:
# Show each word next to the stem produced by the Porter stemmer.
for word in words:
    stemmed = ps.stem(word)
    print("{} ---> {}".format(word, stemmed))

walk ---> walk
walking ---> walk
walked ---> walk
walks ---> walk
lying ---> lie
laid ---> laid
lay ---> lay
layed ---> lay
lied ---> lie
laying ---> lay
study ---> studi
studies ---> studi
studying ---> studi
gone ---> gone
went ---> went
believes ---> believ


In [13]:
# Larger vocabulary used to compare how much lemmatization and stemming shrink it.
# Note: `words` is rebound here from a list to a set.
# Entries are lower-case tokens; several keep attached punctuation (e.g. 'data' vs
# 'data.', 'report' vs 'report,'), so those count as distinct elements of the set.
# NOTE(review): presumably produced by whitespace-splitting a text after stop-word
# removal - confirm the source of this vocabulary.
words = {'+',
 '2021,',
 '500',
 'able',
 'access',
 'acknowledgments',
 'advantage',
 'also',
 'american',
 'analyst',
 'analytical',
 'analytics',
 'appa',
 'application',
 'arduous.',
 'association',
 'association,',
 'automate',
 'automated',
 'automation',
 'available',
 'award-winning',
 'bay',
 'became',
 'benchmarking',
 'benefits,',
 'cascade',
 'case',
 'center',
 'challenge',
 'change',
 'changes',
 'charts',
 'click',
 'code.',
 'communities.',
 'computation,',
 'computational',
 'compute',
 'could',
 'crosscompute',
 'css',
 'css.',
 'custom',
 'data',
 'data.',
 'dataset',
 'decided',
 'delay',
 'deliver',
 'delivered',
 'department',
 'deployed',
 'developed',
 'downstream',
 'drag',
 'drop',
 'electric',
 'energy,',
 'ereliability',
 'every',
 'faithfully',
 'faster.',
 'five',
 'flexible',
 'foundation,',
 'framework.',
 'free',
 'frequently',
 'full',
 'generate',
 'help',
 'helps',
 'highly',
 'hour,',
 'however,',
 'hundred',
 'improve',
 'increasingly',
 'innovate',
 'innovation',
 'intensive',
 'inter-departmental',
 'internally',
 'iterate',
 'iterations',
 'iterative',
 'jupyterlab',
 'labor',
 'leverage',
 'library',
 'making',
 'many',
 'markdown',
 'matplotlib',
 'means',
 'metrics',
 'metrics.',
 'microsoft',
 'month',
 'months,',
 'multiplied,',
 'new',
 'next',
 'non-technical',
 'number',
 'numpy',
 'office',
 'open',
 'originally',
 'outcome',
 'overview',
 'packages',
 'pandas',
 'part',
 'pdf',
 'performance',
 'plots.',
 'possible.',
 'power',
 'process',
 'process.',
 'public',
 'pyramid',
 'python',
 'python.',
 'receive',
 'recreate',
 'regenerate',
 'reliability',
 'report',
 "report's",
 'report,',
 'reports',
 'review',
 'rich',
 'saved',
 'seaborn',
 'sections',
 'semi-manual',
 'services.',
 'show',
 'significantly',
 'six',
 'software',
 'solution',
 'source',
 'standard',
 'statistics',
 'study,',
 'style',
 'subscribers.',
 'subscription',
 'subscriptions',
 'subsequent',
 'successfully',
 'tables',
 'tailored',
 'take',
 'tampa',
 'team',
 'tedious',
 'ten',
 'thank',
 'three',
 'time',
 'tools',
 'touching',
 'track',
 'tracker',
 'transform',
 'trigger',
 'two',
 'u.s.',
 'underlying',
 'updates',
 'used',
 'users',
 'uses',
 'using',
 'utilities',
 'utility',
 "utility's",
 'valuable',
 'various',
 'visualization',
 'web',
 'web-based',
 'within',
 'without',
 'work',
 'years'}

In [14]:
# Number of distinct raw tokens: the baseline for the lemma and stem counts that follow.
len(words)

192

In [15]:
# Distinct forms left after lemmatizing every token as a verb (pos="v").
len(set(lemmatiser.lemmatize(token, pos="v") for token in words))

185

In [16]:
# Distinct stems left after running the Porter stemmer over every token.
len(set(map(ps.stem, words)))

177