# NLTK Complete Guide - Section 7: Part-of-Speech (POS) Tagging

This notebook covers:
- What is POS Tagging?
- NLTK POS Taggers
- Penn Treebank Tag Set
- Universal Tagset
- Tagging Multiple Sentences
- Context-Dependent POS
- Extracting Words by POS
- Custom Taggers
- Practical Applications

In [None]:
import nltk

# Fetch every data package this notebook relies on.
# NLTK >= 3.9 loads the tokenizer, the English tagger and the tag-help data
# from new resource IDs ('punkt_tab', 'averaged_perceptron_tagger_eng',
# 'tagsets_json'); earlier releases use the original IDs. Requesting both
# sets keeps Restart & Run All working on either version.
NLTK_RESOURCES = [
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'tagsets',
    'tagsets_json',
    'universal_tagset',
    'brown',
]

for resource in NLTK_RESOURCES:
    nltk.download(resource, quiet=True)

from nltk import pos_tag, pos_tag_sents
from nltk.tokenize import word_tokenize, sent_tokenize

## 7.1 What is POS Tagging?

**Part-of-Speech (POS) Tagging** assigns grammatical categories to words:
- Noun, Verb, Adjective, Adverb
- Pronoun, Preposition, Conjunction
- And more specific subcategories

In [None]:
sentence = "The quick brown fox jumps over the lazy dog."

# Split the sentence into word tokens, then label each token with its tag
tokens = word_tokenize(sentence)
tagged = pos_tag(tokens)

# `tagged` is a list of (word, tag) pairs in token order
print(f"Sentence: {sentence}\n")
print("POS Tags:", tagged, sep="\n")

In [None]:
# Pretty print: one aligned row per token with a readable tag description.
# Lookup table for the tags expected in the example sentence.
tag_descriptions = {
    'DT': 'Determiner',
    'JJ': 'Adjective',
    'NN': 'Noun (singular)',
    'NNS': 'Noun (plural)',
    'VBZ': 'Verb (3rd person singular)',
    'IN': 'Preposition',
    '.': 'Punctuation',
}

print(f"{'Word':<12} {'Tag':<6} {'Description'}")
print("-" * 50)

for word, tag in tagged:
    # Any tag missing from the lookup table is reported as 'Other'
    desc = tag_descriptions[tag] if tag in tag_descriptions else 'Other'
    print(f"{word:<12} {tag:<6} {desc}")

## 7.2 Penn Treebank Tag Set

NLTK's `pos_tag` labels English text with the Penn Treebank tagset by default.

In [None]:
# Reference table of frequently used Penn Treebank tags, grouped by word class
common_tags = {
    # --- Nouns ---
    'NN': 'Noun, singular (dog, city)',
    'NNS': 'Noun, plural (dogs, cities)',
    'NNP': 'Proper noun, singular (John, London)',
    'NNPS': 'Proper noun, plural (Americans)',

    # --- Verbs ---
    'VB': 'Verb, base form (run, eat)',
    'VBD': 'Verb, past tense (ran, ate)',
    'VBG': 'Verb, gerund (running, eating)',
    'VBN': 'Verb, past participle (eaten, written)',
    'VBP': 'Verb, non-3rd person (run, eat)',
    'VBZ': 'Verb, 3rd person singular (runs, eats)',

    # --- Adjectives ---
    'JJ': 'Adjective (big, green)',
    'JJR': 'Adjective, comparative (bigger)',
    'JJS': 'Adjective, superlative (biggest)',

    # --- Adverbs ---
    'RB': 'Adverb (quickly, very)',
    'RBR': 'Adverb, comparative (faster)',
    'RBS': 'Adverb, superlative (fastest)',

    # --- Everything else ---
    'PRP': 'Personal pronoun (I, you, he)',
    'PRP$': 'Possessive pronoun (my, your)',
    'DT': 'Determiner (the, a, an)',
    'IN': 'Preposition (in, on, at)',
    'CC': 'Coordinating conjunction (and, or)',
    'TO': 'to',
    'MD': 'Modal (can, will, should)',
}

print("Common Penn Treebank POS Tags")
print("=" * 60)
# Left-align each tag in a 6-character column, followed by its description
for tag, description in common_tags.items():
    print(tag.ljust(6), description)

In [None]:
# Get help on a specific tag: prints NLTK's built-in reference entry for the
# Penn Treebank tag 'VBG'. Relies on the tag-help data ('tagsets') fetched in
# the setup cell at the top of the notebook.
nltk.help.upenn_tagset('VBG')

In [None]:
# All noun tags: per the NLTK docs the argument is a regular expression, so
# 'NN.*' selects every tag that starts with "NN" (NN, NNS, NNP, NNPS).
nltk.help.upenn_tagset('NN.*')

## 7.3 Universal Tagset

The universal tagset is a simplified, coarse-grained set of 12 tags designed to work across languages.

In [None]:
sentence = "The quick brown fox jumps over the lazy dog."
tokens = word_tokenize(sentence)

# Tag the same tokens twice: once with the default (Penn Treebank) tags and
# once with the tags mapped onto the universal tagset.
penn_tags = pos_tag(tokens)
universal_tags = pos_tag(tokens, tagset='universal')

print(f"{'Word':<12} {'Penn':<8} {'Universal':<10}")
print("-" * 35)

# Both results follow token order, so the pairs line up position by position
for penn_pair, univ_pair in zip(penn_tags, universal_tags):
    word, penn = penn_pair
    univ = univ_pair[1]
    print(f"{word:<12} {penn:<8} {univ:<10}")

In [None]:
# Universal tags: each coarse tag mapped to a short description
universal_tagset = {
    'NOUN': 'Nouns',
    'VERB': 'Verbs',
    'ADJ': 'Adjectives',
    'ADV': 'Adverbs',
    'PRON': 'Pronouns',
    'DET': 'Determiners',
    'ADP': 'Adpositions (prepositions)',
    'NUM': 'Numbers',
    'CONJ': 'Conjunctions',
    'PRT': 'Particles',
    '.': 'Punctuation',
    'X': 'Other',
}

print("Universal Tagset")
print("=" * 40)
# Left-align each tag in an 8-character column, followed by its description
for tag, desc in universal_tagset.items():
    print(tag.ljust(8), desc)

## 7.4 Tagging Multiple Sentences

In [None]:
text = """Natural language processing is fascinating.
It enables computers to understand human language.
Many applications use NLP today."""

# Method 1: split the text into sentences, then tokenize and tag each one
# with its own pos_tag call.
sentences = sent_tokenize(text)

print("Method 1: Individual sentences")
print("=" * 50)
for sent in sentences:
    tokens = word_tokenize(sent)
    tagged = pos_tag(tokens)
    # Blank line, the sentence itself, then its (word, tag) pairs
    print(f"\n{sent}\nTags: {tagged}")

In [None]:
# Method 2: Batch tagging (more efficient).
# Reuses the `sentences` list produced by the Method 1 cell: tokenize every
# sentence first, then hand the whole batch to a single pos_tag_sents call.
tokenized_sents = list(map(word_tokenize, sentences))
tagged_sents = pos_tag_sents(tokenized_sents)

print("Method 2: Batch tagging (pos_tag_sents)")
print("=" * 50)
for i, tagged in enumerate(tagged_sents, start=1):
    print(f"\nSentence {i}: {tagged}")

## 7.5 Context-Dependent POS

The same word can have different POS tags depending on context.

In [None]:
# "book" as noun vs verb
# Kept under its own name so the `sentences` list that the batch-tagging
# cell in section 7.4 reads is not overwritten when cells are re-run.
book_sentences = [
    "I read a book.",           # book = noun
    "Please book a table.",     # book = verb
]

print("'book' in different contexts:")
print("-" * 40)

for sent in book_sentences:
    tokens = word_tokenize(sent)
    tagged = pos_tag(tokens)
    # Take the first matching tag lazily; fall back to 'N/A' rather than
    # raising IndexError when the word is absent from the tokens.
    book_tag = next((t for w, t in tagged if w.lower() == 'book'), 'N/A')
    print(f"{sent:<30} book = {book_tag}")

In [None]:
# More examples of context-dependent tags.
# Each entry pairs a sentence with the (lower-case) word whose tag we inspect.
ambiguous_examples = [
    ("I run every day.", "run"),
    ("The run was exhausting.", "run"),
    ("She can fish.", "fish"),
    ("I caught a fish.", "fish"),
    ("Light the candle.", "light"),
    ("The light is bright.", "light"),
    ("This box is light.", "light"),
]

print("Context-Dependent POS Tags")
print("=" * 55)
print(f"{'Sentence':<35} {'Word':<8} {'Tag'}")
print("-" * 55)

for sent, target_word in ambiguous_examples:
    tokens = word_tokenize(sent)
    tagged = pos_tag(tokens)
    # Tokens are lower-cased before comparison so a sentence-initial "Light"
    # still matches. next() stops at the first hit and yields 'N/A' instead
    # of raising IndexError when the target word is not among the tokens.
    word_tag = next((t for w, t in tagged if w.lower() == target_word), 'N/A')
    print(f"{sent:<35} {target_word:<8} {word_tag}")

## 7.6 Extracting Words by POS

In [None]:
def extract_by_pos(text, target_tags):
    """Extract words with specific POS tags.

    Args:
        text: Raw text; it is tokenized with ``word_tokenize`` and tagged
            with ``pos_tag``.
        target_tags: The tag(s) to keep. Either a single tag string (e.g.
            ``'NN'``) or any iterable of tag strings (list, tuple, set,
            generator, ...).

    Returns:
        A list of the words whose tag is one of ``target_tags``, in the order
        they appear in ``text``.
    """
    # A bare string would otherwise be tested by substring ('NN' in 'NNS' is
    # True), silently matching unintended tags; treat it as one exact tag.
    if isinstance(target_tags, str):
        target_tags = (target_tags,)
    # Materialise once: gives constant-time lookups and keeps one-shot
    # iterators from being exhausted by the first membership test.
    wanted = frozenset(target_tags)

    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    return [word for word, tag in tagged if tag in wanted]

text = """The beautiful princess quickly ran through the dark forest.
She was searching for her magical golden crown."""

print(f"Text: {text}")

# Pull out each major word class using its group of Penn Treebank tags
nouns = extract_by_pos(text, ['NN', 'NNS', 'NNP', 'NNPS'])
verbs = extract_by_pos(text, ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])
adjectives = extract_by_pos(text, ['JJ', 'JJR', 'JJS'])
adverbs = extract_by_pos(text, ['RB', 'RBR', 'RBS'])

# A blank line separates the echoed text from the four result lines
print()
for label, words in (
    ("Nouns", nouns),
    ("Verbs", verbs),
    ("Adjectives", adjectives),
    ("Adverbs", adverbs),
):
    print(f"{label}: {words}")

In [None]:
# POS distribution
from collections import Counter

def pos_distribution(text):
    """Count how often each POS tag occurs in ``text``.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``.

    Returns:
        A ``Counter`` mapping each tag to its number of occurrences.
    """
    tag_counts = Counter()
    for _, tag in pos_tag(word_tokenize(text)):
        tag_counts[tag] += 1
    return tag_counts

text = """Machine learning is transforming how computers understand and process 
human language. Natural language processing applications are becoming 
increasingly sophisticated and accurate."""

dist = pos_distribution(text)

# FULL BLOCK (U+2588), written as an escape so the bar character cannot be
# corrupted by a re-encoding of the file. The original literal had turned
# into mojibake ('â–ˆ': the character's UTF-8 bytes read as Windows-1252),
# so every bar printed three junk characters per occurrence.
bar_char = "\u2588"

print("POS Tag Distribution")
print("=" * 30)
# Most frequent tag first; the bar length equals the tag's count
for tag, count in dist.most_common():
    print(f"{tag:<6} {count:>3} {bar_char * count}")

## 7.7 Custom POS Taggers

In [None]:
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.corpus import brown

# Load the POS-tagged sentences of the Brown corpus 'news' category
brown_tagged = brown.tagged_sents(categories='news')

# Sequential split: the first 80% of sentences train the taggers and the
# remaining 20% are held out for evaluation.
train_size = int(len(brown_tagged) * 0.8)
train_sents, test_sents = brown_tagged[:train_size], brown_tagged[train_size:]

for label, sents in (("Training", train_sents), ("Test", test_sents)):
    print(f"{label} sentences: {len(sents)}")

In [None]:
# Default Tagger (assigns same tag to everything): every token is labelled
# 'NN'. It is the baseline here and the last link of the backoff chain that
# the taggers in the following cells are built on.
default_tagger = DefaultTagger('NN')
# accuracy() scores the tagger against the held-out test sentences
print(f"Default Tagger accuracy: {default_tagger.accuracy(test_sents):.2%}")

In [None]:
# Unigram Tagger (learns most common tag for each word) from the training
# sentences. When it cannot decide on a tag (e.g. a word it never saw in
# training, per the NLTK docs) it defers to `default_tagger` via `backoff`.
unigram_tagger = UnigramTagger(train_sents, backoff=default_tagger)
print(f"Unigram Tagger accuracy: {unigram_tagger.accuracy(test_sents):.2%}")

In [None]:
# Bigram Tagger: per the NLTK docs it picks a tag from the current word plus
# the *tag* assigned to the previous token (not the previous word itself).
# Contexts it cannot resolve fall back to `unigram_tagger`.
bigram_tagger = BigramTagger(train_sents, backoff=unigram_tagger)
print(f"Bigram Tagger accuracy: {bigram_tagger.accuracy(test_sents):.2%}")

In [None]:
# Trigram Tagger: per the NLTK docs it picks a tag from the current word plus
# the *tags* of the two preceding tokens (not the words themselves).
# Contexts it cannot resolve fall back to `bigram_tagger`, completing the
# chain trigram -> bigram -> unigram -> default.
trigram_tagger = TrigramTagger(train_sents, backoff=bigram_tagger)
print(f"Trigram Tagger accuracy: {trigram_tagger.accuracy(test_sents):.2%}")

In [None]:
# Compare all taggers: score each one on the same held-out test sentences
print("\nTagger Comparison")
print("=" * 35)

# (display name, tagger) pairs, ordered from the simplest model upwards
taggers = list(zip(
    ("Default", "Unigram", "Bigram", "Trigram"),
    (default_tagger, unigram_tagger, bigram_tagger, trigram_tagger),
))

for name, tagger in taggers:
    acc = tagger.accuracy(test_sents)
    print(f"{name:<10} {acc:.2%}")

## 7.8 Practical Application: Text Analysis

In [None]:
def analyze_text(text):
    """Summarise ``text`` by major word class using POS tags.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    the words are then bucketed by groups of Penn Treebank tags.

    Returns:
        A dict keyed by 'Nouns', 'Verbs', 'Adjectives', 'Adverbs' and
        'Pronouns' (in that order). Each value is a dict with 'count' (number
        of matching words) and 'words' (those words in text order).
    """
    tagged = pos_tag(word_tokenize(text))

    # Word class label -> the tags that belong to it
    tag_groups = (
        ('Nouns', ('NN', 'NNS', 'NNP', 'NNPS')),
        ('Verbs', ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')),
        ('Adjectives', ('JJ', 'JJR', 'JJS')),
        ('Adverbs', ('RB', 'RBR', 'RBS')),
        ('Pronouns', ('PRP', 'PRP$', 'WP', 'WP$')),
    )

    summary = {}
    for label, group in tag_groups:
        matches = [token for token, tag in tagged if tag in group]
        summary[label] = {'count': len(matches), 'words': matches}
    return summary

text = """The ambitious young scientist quickly discovered a remarkable 
breakthrough in artificial intelligence. She carefully analyzed the complex 
data and brilliantly solved the challenging problem."""

print(f"Text: {text}\n")

# Bucket the words of the sample text by major word class
analysis = analyze_text(text)

print("Text Analysis")
print("=" * 50)
for category, data in analysis.items():
    # Heading with the match count, then the matching words indented below it
    print(f"\n{category} ({data['count']}):\n  {data['words']}")

## Summary

| Method | Description |
|--------|-------------|
| `pos_tag(tokens)` | Tag a list of tokens |
| `pos_tag(tokens, tagset='universal')` | Use universal tagset |
| `pos_tag_sents(list_of_sents)` | Batch tag multiple sentences |
| `nltk.help.upenn_tagset('TAG')` | Get tag description |

### Common Tags
- **Nouns**: NN, NNS, NNP, NNPS
- **Verbs**: VB, VBD, VBG, VBN, VBP, VBZ
- **Adjectives**: JJ, JJR, JJS
- **Adverbs**: RB, RBR, RBS