# Lemmatization using NLTK

In [1]:
import nltk

In [2]:
from nltk.stem import WordNetLemmatizer

In [3]:
from nltk.tokenize import word_tokenize

In [4]:
from nltk.corpus import wordnet

In [5]:
# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases (the same
# ones that use the 'averaged_perceptron_tagger_eng' resource name) load it from
# the 'punkt_tab' package rather than the pickled 'punkt' one, so fetch both to
# keep the notebook runnable from a fresh environment on old and new NLTK alike.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nairm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Fetch the WordNet corpus that WordNetLemmatizer looks words up in
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nairm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Fetch the English perceptron tagger model used by nltk.pos_tag
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\nairm\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [8]:
# Create a single WordNetLemmatizer instance that is reused for every token
lemmatizer = WordNetLemmatizer()

In [9]:
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank-style POS tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag (including an empty string) falls back to noun.

    Args:
        treebank_tag: POS tag string, e.g. 'VBG', 'NNS', 'JJR', 'RB'.

    Returns:
        One of wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV.
    """
    # Ordered (prefix, WordNet POS) pairs; the first matching prefix wins.
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wn_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [10]:
# Sample input mixing verb, noun, adverb and adjective forms
text = "running runners run easily fairer fairly"

In [11]:
# Split the sample text into a list of word tokens
words = word_tokenize(text)

In [12]:
words

['running', 'runners', 'run', 'easily', 'fairer', 'fairly']

In [13]:
# POS tagging: pair each token with its part-of-speech tag (e.g. 'VBG', 'NNS')
pos_tags = nltk.pos_tag(words)

In [14]:
pos_tags

[('running', 'VBG'),
 ('runners', 'NNS'),
 ('run', 'VBP'),
 ('easily', 'RB'),
 ('fairer', 'JJR'),
 ('fairly', 'RB')]

In [15]:
# Lemmatize every token, converting its POS tag to a WordNet POS first
lemmatized_words = [
    lemmatizer.lemmatize(token_text, pos=get_wordnet_pos(tag))
    for token_text, tag in pos_tags
]

In [16]:
lemmatized_words

['run', 'runner', 'run', 'easily', 'fair', 'fairly']

# Lemmatization using spaCy

In [17]:
import spacy

In [18]:
# Load spaCy's 'en_core_web_sm' pipeline; the model package must already be
# installed (e.g. `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm')

In [19]:
# Same sample sentence as in the NLTK section, so both lemmatizers see identical input
text = "running runners run easily fairer fairly"

In [20]:
# Process the text with spaCy; iterating over the resulting `doc` yields its tokens
doc = nlp(text)

In [21]:
# Collect the lemma string (token.lemma_) of every token in the doc.
# NOTE(review): this rebinds `lemmatized_words`, discarding the NLTK result computed earlier.
lemmatized_words = [token.lemma_ for token in doc]

In [22]:
lemmatized_words

['running', 'runner', 'run', 'easily', 'fair', 'fairly']