# NLTK Complete Guide - Section 6: Lemmatization

This notebook covers:
- What is Lemmatization?
- WordNet Lemmatizer
- POS-aware Lemmatization
- Stemming vs Lemmatization

In [None]:
import nltk

# Data packages needed by this notebook. Newer NLTK releases (3.9 and later)
# load the tokenizer and tagger from 'punkt_tab' and
# 'averaged_perceptron_tagger_eng', while older releases look for 'punkt' and
# 'averaged_perceptron_tagger'. Requesting both keeps the notebook runnable
# on either side of that change.
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag

## 6.1 What is Lemmatization?

**Lemmatization** reduces words to their dictionary form (lemma).

| Word | Stem (Porter) | Lemma |
|------|---------------|-------|
| running | run | running (as noun) / run (as verb) |
| better | better | good |
| studies | studi | study |
| geese | gees | goose |

✅ **Lemmas are valid words** (unlike stems)

In [None]:
# Shared lemmatizer instance; later cells call it as well.
lemmatizer = WordNetLemmatizer()

# Plural nouns, lemmatized below without passing a pos argument
words = ["cats", "dogs", "children", "mice", "geese", "feet"]

print("Basic Lemmatization (as nouns)")
print("=" * 35)
for word in words:
    lemma = lemmatizer.lemmatize(word)
    # ljust(15) pads the word column exactly like the `<15` format spec
    print(word.ljust(15), "→", lemma)

## 6.2 Part of Speech Matters!

Lemmatization needs the correct part-of-speech (POS) tag to work properly: `WordNetLemmatizer` treats every word as a noun unless you pass `pos`.

In [None]:
# Same word, three readings: default (noun), verb, adjective
word = "running"

print(f"Word: '{word}'")
as_noun = lemmatizer.lemmatize(word)
print(f"  As noun (default): {as_noun}")
as_verb = lemmatizer.lemmatize(word, pos='v')
print(f"  As verb:           {as_verb}")
as_adjective = lemmatizer.lemmatize(word, pos='a')
print(f"  As adjective:      {as_adjective}")

In [None]:
# Human-readable names for the single-letter WordNet POS codes
pos_names = {'n': 'noun', 'v': 'verb', 'a': 'adjective', 'r': 'adverb'}

# (word, WordNet POS code) pairs to lemmatize
examples = [
    ("running", "v"),   # verb
    ("better", "a"),    # adjective
    ("studies", "n"),   # noun
    ("studies", "v"),   # verb
    ("quickly", "r"),   # adverb
]

# One template shared by the header and every data row
row_template = "{:<15} {:<10} {:<15}"
rule_width = 45

print("POS-specific Lemmatization")
print("=" * rule_width)
print(row_template.format('Word', 'POS', 'Lemma'))
print("-" * rule_width)

for word, pos in examples:
    lemma = lemmatizer.lemmatize(word, pos=pos)
    print(row_template.format(word, pos_names[pos], lemma))

## 6.3 WordNet POS Tags

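WordNet uses its own single-letter POS tags, which differ from the Penn Treebank tags (`NN`, `VBG`, `JJ`, ...) returned by `pos_tag`: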
- `n` = Noun
- `v` = Verb
- `a` = Adjective
- `r` = Adverb

In [None]:
def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag onto the matching WordNet POS constant.

    Only the tag's leading letter is inspected: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag is
    treated as a noun.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),     # 'a'
        ('V', wordnet.VERB),    # 'v'
        ('N', wordnet.NOUN),    # 'n'
        ('R', wordnet.ADV),     # 'r'
    )
    for prefix, wn_pos in prefix_to_wordnet:
        if treebank_tag.startswith(prefix):
            return wn_pos
    # No prefix matched: default to noun
    return wordnet.NOUN

# Sample Treebank tags: nouns, verbs, adjectives, an adverb and a determiner
treebank_tags = ['NN', 'NNS', 'VB', 'VBG', 'JJ', 'JJR', 'RB', 'DT']

print("TreeBank to WordNet POS Conversion")
print("-" * 35)
for tag in treebank_tags:
    wn_tag = get_wordnet_pos(tag)
    # ljust(8) pads the tag column exactly like the `<8` format spec
    print(tag.ljust(8), "→", wn_tag)

## 6.4 Automatic POS-aware Lemmatization

In [None]:
def lemmatize_sentence(sentence):
    """Tokenize, POS-tag and lemmatize *sentence*.

    Returns a ``(lemmas, tagged)`` pair: ``tagged`` is the list of
    ``(token, treebank_tag)`` tuples from ``pos_tag``, and ``lemmas`` holds
    the lemma of each lowercased token, in the same order.
    """
    wnl = WordNetLemmatizer()

    # Tokenize and tag in one step
    tagged = pos_tag(word_tokenize(sentence))

    # Each token is lemmatized under the WordNet POS derived from its tag
    lemmas = [
        wnl.lemmatize(token.lower(), pos=get_wordnet_pos(treebank_tag))
        for token, treebank_tag in tagged
    ]

    return lemmas, tagged

In [None]:
sentence = "The striped bats are hanging on their feet for best"

lemmas, tagged = lemmatize_sentence(sentence)

# One template shared by the header and every data row
table_row = "{:<12} {:<6} {:<12}"

print(f"Sentence: {sentence}\n")
print(table_row.format('Word', 'POS', 'Lemma'))
print("-" * 30)

# tagged and lemmas are parallel lists, one entry per token
for (word, tag), lemma in zip(tagged, lemmas):
    print(table_row.format(word, tag, lemma))

In [None]:
# A few more sentences run through lemmatize_sentence()
sentences = [
    "The dogs are running quickly through the fields.",
    "She studies better when the weather is good.",
    "The children were playing happily in the garden.",
    "He has been running marathons for years.",
]

print("Sentence Lemmatization")
print("=" * 60)

for sent in sentences:
    # The tagged tokens are not needed here, only the lemmas
    lemmas, _ = lemmatize_sentence(sent)
    joined = ' '.join(lemmas)
    print(f"\nOriginal:   {sent}", f"Lemmatized: {joined}", sep="\n")

## 6.5 Stemming vs Lemmatization

In [None]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# (word, WordNet POS code) pairs fed to both the stemmer and the lemmatizer
words_with_pos = [
    ("running", "v"),
    ("runs", "v"),
    ("better", "a"),
    ("studies", "v"),
    ("studying", "v"),
    ("feet", "n"),
    ("geese", "n"),
    ("happiness", "n"),
    ("happily", "r"),
    ("organization", "n"),
]


def validity_mark(candidate):
    """Return a check mark if WordNet has any synset for *candidate*, else a cross."""
    if wordnet.synsets(candidate):
        return "✅"
    return "❌"


print("Stemming vs Lemmatization")
print("=" * 55)
print(f"{'Word':<15} {'Stem':<12} {'Lemma':<12} {'Valid?'}")
print("-" * 55)

for word, pos in words_with_pos:
    stem = ps.stem(word)
    lemma = lemmatizer.lemmatize(word, pos=pos)

    # Simple validity check: does WordNet know the resulting string at all?
    stem_valid = validity_mark(stem)
    lemma_valid = validity_mark(lemma)

    print(f"{word:<15} {stem:<12} {lemma:<12} Stem:{stem_valid} Lemma:{lemma_valid}")

### Key Differences

| Aspect | Stemming | Lemmatization |
|--------|----------|---------------|
| **Output** | Root form (may not be valid word) | Dictionary form (valid word) |
| **Speed** | Faster | Slower |
| **Accuracy** | Less accurate | More accurate |
| **Requires** | Just the word | Word + POS tag |
| **"better"** | better | good |
| **"studies"** | studi | study |

In [None]:
# Irregular forms: (word, WordNet POS code, lemma we expect back)
special_cases = [
    ("better", "a", "good"),      # Comparative adjective
    ("best", "a", "good"),        # Superlative adjective
    ("worse", "a", "bad"),        # Comparative
    ("went", "v", "go"),          # Irregular past
    ("mice", "n", "mouse"),       # Irregular plural
    ("geese", "n", "goose"),      # Irregular plural
    ("feet", "n", "foot"),        # Irregular plural
    ("children", "n", "child"),   # Irregular plural
]

# Four left-aligned 12-character columns, shared by header and data rows
four_columns = "{:<12} {:<12} {:<12} {:<12}"

print("Special Cases (Lemmatization Wins!)")
print("=" * 55)
print(four_columns.format('Word', 'Stem', 'Lemma', 'Expected'))
print("-" * 55)

for word, pos, expected in special_cases:
    stem = ps.stem(word)
    lemma = lemmatizer.lemmatize(word, pos=pos)
    # Flag whether the lemmatizer produced the expected dictionary form
    if lemma == expected:
        match = "✅"
    else:
        match = "❌"
    print(four_columns.format(word, stem, lemma, expected), match)

## 6.6 Practical Lemmatization Pipeline

In [None]:
from nltk.corpus import stopwords
import string

nltk.download('stopwords', quiet=True)

class TextLemmatizer:
    """POS-aware lemmatization pipeline built on NLTK.

    Text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``.
    Punctuation tokens and, optionally, English stopwords are dropped, and
    the remaining tokens are lemmatized with ``WordNetLemmatizer`` using the
    WordNet POS derived from each Treebank tag.
    """

    # Set form of string.punctuation for per-character membership tests.
    _PUNCT_CHARS = frozenset(string.punctuation)

    def __init__(self, remove_stopwords=True, lowercase=True):
        """Configure the pipeline.

        Args:
            remove_stopwords: When true, tokens found in NLTK's English
                stopword list are left out of the output.
            lowercase: When true, tokens are lowercased before being
                lemmatized; otherwise their original casing is kept.
        """
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english')) if remove_stopwords else set()
        self.lowercase = lowercase

    @classmethod
    def _has_word_chars(cls, token):
        """Return True if *token* has at least one non-punctuation character."""
        return any(ch not in cls._PUNCT_CHARS for ch in token)

    def get_wordnet_pos(self, tag):
        """Convert a Treebank tag to a WordNet POS constant (noun by default)."""
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        return wordnet.NOUN

    def lemmatize(self, text):
        """Run the full pipeline on one string.

        Args:
            text: Raw text to process.

        Returns:
            A list of lemmas, in token order, with punctuation tokens and
            (if enabled) stopwords removed.
        """
        # Tokenize, then POS tag
        tagged = pos_tag(word_tokenize(text))

        lemmas = []
        for word, tag in tagged:
            # Skip tokens made up solely of punctuation. A plain
            # `word in string.punctuation` is a substring test, so it lets
            # multi-character tokens such as "...", "--" or "``" through.
            if not self._has_word_chars(word):
                continue

            # NLTK's English stopwords are lowercase, so compare
            # case-insensitively even when the output keeps its casing.
            if word.lower() in self.stop_words:
                continue

            token = word.lower() if self.lowercase else word
            wn_pos = self.get_wordnet_pos(tag)
            lemmas.append(self.lemmatizer.lemmatize(token, pos=wn_pos))

        return lemmas

    def lemmatize_batch(self, texts):
        """Lemmatize each string in *texts*; returns one lemma list per text."""
        return [self.lemmatize(text) for text in texts]

In [None]:
# Build the pipeline with stopword removal switched on
lemmatizer_pipeline = TextLemmatizer(remove_stopwords=True)

texts = [
    "The cats are running quickly through the beautiful gardens.",
    "She has been studying machine learning for several years.",
    "The children were happily playing with their new toys.",
]

print("Lemmatization Pipeline Results")
print("=" * 60)

for text in texts:
    lemmas = lemmatizer_pipeline.lemmatize(text)
    # Two output lines per text: the input, then its list of lemmas
    print(f"\nOriginal:   {text}", f"Lemmatized: {lemmas}", sep="\n")

## 6.7 When to Use What?

### Use Stemming When:
- Speed is critical
- Working with search/IR systems
- Exact word form doesn't matter
- Quick prototyping

### Use Lemmatization When:
- Accuracy is important
- Working with chatbots/NLU
- Need valid dictionary words
- Doing sentiment analysis
- Text generation tasks

## Summary

| Method | Code |
|--------|------|
| Create lemmatizer | `WordNetLemmatizer()` |
| Lemmatize (noun) | `lemmatizer.lemmatize(word)` |
| Lemmatize (verb) | `lemmatizer.lemmatize(word, pos='v')` |
| Lemmatize (adj) | `lemmatizer.lemmatize(word, pos='a')` |
| Lemmatize (adv) | `lemmatizer.lemmatize(word, pos='r')` |

### WordNet POS Tags
- `wordnet.NOUN` or `'n'`
- `wordnet.VERB` or `'v'`
- `wordnet.ADJ` or `'a'`
- `wordnet.ADV` or `'r'`