# NLTK Complete Guide - Section 9: Chunking

This notebook covers:
- What is Chunking?
- Noun Phrase Chunking
- Chunk Grammar Rules
- Chinking (Excluding Patterns)
- Custom Chunkers
- Practical Applications

In [None]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.chunk import RegexpParser
from nltk.tree import Tree

# Data packages needed by word_tokenize() and pos_tag().
# NLTK 3.9 and later load the "punkt_tab" and "averaged_perceptron_tagger_eng"
# packages, while older releases use the legacy names. Fetching both sets keeps
# the notebook runnable from a fresh kernel on either generation.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)

for resource in NLTK_RESOURCES:
    nltk.download(resource, quiet=True)

## 9.1 What is Chunking?

**Chunking** (also called shallow parsing) groups words into meaningful phrases:

- **Noun Phrases (NP)**: "the big dog", "a beautiful sunset"
- **Verb Phrases (VP)**: "is running", "has been working"
- **Prepositional Phrases (PP)**: "in the house", "on the table"

Chunking uses POS tags to identify patterns.

In [None]:
# Tokenize the example sentence, then attach a POS tag to every token
sentence = "The quick brown fox jumps over the lazy dog."
tokens = word_tokenize(sentence)
tagged = pos_tag(tokens)

# Header and tagged list on separate lines
print("POS Tagged:", tagged, sep="\n")

In [None]:
# Noun-phrase rule: optional determiner, any number of adjectives,
# then one or more noun tags (anything starting with NN)
grammar = "NP: {<DT>?<JJ>*<NN.*>+}"

# Build the parser and run it over the already-tagged sentence
chunk_parser = RegexpParser(grammar)
tree = chunk_parser.parse(tagged)

print("Chunked:")
print(tree.pformat())

## 9.2 Chunk Grammar Syntax

| Symbol | Meaning |
|--------|----------|
| `<TAG>` | Match a specific POS tag |
| `<TAG1\|TAG2>` | Match either tag |
| `?` | Optional (0 or 1) |
| `*` | Zero or more |
| `+` | One or more |
| `{pattern}` | Chunk pattern (include) |
| `}pattern{` | Chink pattern (exclude) |

In [None]:
# Named single-rule grammars to compare on one tagged sentence
grammars = {
    "Simple NP": "NP: {<DT>?<NN>}",
    "NP with adjectives": "NP: {<DT>?<JJ>*<NN>}",
    "NP with multiple nouns": "NP: {<DT>?<JJ>*<NN.*>+}",
    "Verb phrase": "VP: {<VB.*><RB>?}",
    "Prepositional phrase": "PP: {<IN><DT>?<NN.*>}",
}

sentence = "The quick brown fox jumps quickly over the lazy dog."
tagged = pos_tag(word_tokenize(sentence))

print(f"Sentence: {sentence}")
print(f"Tagged: {tagged}\n")

for name, grammar in grammars.items():
    tree = RegexpParser(grammar).parse(tagged)

    # Direct children that are Tree nodes are the chunks; the remaining
    # children are plain (word, tag) tuples that no rule matched
    chunks = [
        ' '.join(token for token, _ in node.leaves())
        for node in tree
        if isinstance(node, Tree)
    ]

    print(f"{name}: {chunks}")

## 9.3 Noun Phrase Chunking (NP)

In [None]:
# Noun-phrase grammar: three rules in one NP stage, applied in order.
# NOTE: <NN.*> in the first rule already covers NNP, so the {<NNP>+} rule
# has nothing left to match; it is kept for illustration.
np_grammar = r"""
    NP: {<DT|PRP\$>?<JJ>*<NN.*>+}   # Determiner + adjectives + nouns
        {<NNP>+}                      # Proper nouns
        {<PRP>}                       # Pronouns
"""

np_parser = RegexpParser(np_grammar)

sentences = [
    "The big brown dog chased the small cat.",
    "My beautiful garden has colorful flowers.",
    "John and Mary visited New York City.",
    "She bought an expensive red sports car.",
]

print("Noun Phrase Chunking")
print("=" * 60)

for sent in sentences:
    tagged = pos_tag(word_tokenize(sent))
    tree = np_parser.parse(tagged)

    # Collect the text of every top-level chunk
    nps = []
    for node in tree:
        if isinstance(node, Tree):
            words = [token for token, _ in node.leaves()]
            nps.append(' '.join(words))

    print(f"\nSentence: {sent}\nNPs: {nps}")

## 9.4 Verb Phrase Chunking (VP)

In [None]:
# Verb phrase grammar.
# Rules inside one stage are applied in order, and a chunk rule only matches
# tokens that are not already inside a chunk. The modal rule therefore has to
# come first: if the general verb rule ran before it, the <VB> would already be
# chunked and "<MD><VB>" could never match (e.g. "can swim" would lose "can").
vp_grammar = r"""
    VP: {<MD><VB>}              # Modal + base verb
        {<VB.*><RB.*>?<VB.*>*}  # Verb + optional adverb + more verbs
"""

vp_parser = RegexpParser(vp_grammar)

sentences = [
    "She is running quickly.",
    "They have been working hard.",
    "He can swim very fast.",
    "The dog was barking loudly.",
]

print("Verb Phrase Chunking")
print("=" * 60)

for sent in sentences:
    tagged = pos_tag(word_tokenize(sent))
    tree = vp_parser.parse(tagged)

    # Text of every top-level VP chunk
    vps = [' '.join(w for w, t in subtree.leaves())
           for subtree in tree if isinstance(subtree, Tree)]

    print(f"\nSentence: {sent}")
    print(f"VPs: {vps}")

## 9.5 Chinking (Excluding Patterns)

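**Chinking** is the inverse of chunking: a chink rule (`}pattern{`) removes the tokens that match the pattern from an existing chunk, shrinking the chunk or splitting it in two.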

In [None]:
# Start from one chunk covering the whole sentence, then carve out (chink)
# every verb and preposition
chink_grammar = r"""
    NP: {<.*>+}         # Chunk everything
        }<VB.*|IN>{     # Chink verbs and prepositions
"""

chink_parser = RegexpParser(chink_grammar)

sentence = "The dog ran through the park and jumped over the fence."
tagged = pos_tag(word_tokenize(sentence))

print(f"Sentence: {sentence}\n")
print("Tagged:", tagged, sep="\n")

tree = chink_parser.parse(tagged)
print("\nChunked (with chinking):")
print(tree.pformat())

In [None]:
# Compare with and without chinking
sentence = "The cat sat on the mat near the door."
tagged = pos_tag(word_tokenize(sentence))

# Without chinking - everything becomes one chunk
no_chink = RegexpParser("NP: {<.*>+}")
tree1 = no_chink.parse(tagged)

# With chinking - breaks at prepositions.
# RegexpParser reads one rule per line, so the chunk rule and the chink rule
# must sit on separate lines; writing both on a single line raises
# "ValueError: Illegal chunk pattern".
with_chink = RegexpParser(r"""
    NP: {<.*>+}   # Chunk everything
        }<IN>{    # Chink prepositions
""")
tree2 = with_chink.parse(tagged)

print(f"Sentence: {sentence}\n")

print("Without chinking:")
tree1.pprint()

print("\nWith chinking (exclude prepositions):")
tree2.pprint()

## 9.6 Complex Multi-Level Grammar

In [None]:
# Four-stage grammar: later stages (PP, CLAUSE) are written in terms of the
# chunk labels produced by earlier ones
complex_grammar = r"""
    NP: {<DT|PRP\$>?<JJ>*<NN.*>+}  # Noun phrases
    VP: {<VB.*>+}                   # Verb phrases
    PP: {<IN><NP>}                  # Prepositional phrases
    CLAUSE: {<NP><VP><NP>?<PP>*}   # Simple clause
"""

complex_parser = RegexpParser(complex_grammar)

sentence = "The young student studies hard in the library."
tagged = pos_tag(word_tokenize(sentence))

print(f"Sentence: {sentence}\n")

# One "word: tag" line per token
print("Tagged:")
for token, pos in tagged:
    print(f"  {token}: {pos}")

tree = complex_parser.parse(tagged)
print("\nChunked tree:")
print(tree.pformat())

## 9.7 Extracting Chunks Programmatically

In [None]:
def extract_chunks(text, grammar):
    """Extract chunks from text using the given chunk grammar.

    Args:
        text: Raw sentence; it is tokenized with ``word_tokenize`` and tagged
            with ``pos_tag`` before parsing.
        grammar: Grammar string handed to ``RegexpParser``.

    Returns:
        dict mapping each chunk label to a list of chunk texts (the chunk's
        words joined by single spaces), in the order ``tree.subtrees()``
        yields them. Every subtree except the root is visited, so nested
        chunks are reported as well. Labels with no chunks are absent.
    """
    parser = RegexpParser(grammar)
    tagged = pos_tag(word_tokenize(text))
    tree = parser.parse(tagged)

    chunks = {}
    for subtree in tree.subtrees():
        # Skip the root by identity rather than by label: comparing against
        # 'S' would also drop any chunk that the grammar itself labels 'S'.
        if subtree is tree:
            continue
        chunk_text = ' '.join(word for word, tag in subtree.leaves())
        chunks.setdefault(subtree.label(), []).append(chunk_text)

    return chunks

In [None]:
# Two-stage grammar: noun phrases, then a verb with an optional adverb
grammar = r"""
    NP: {<DT|PRP\$>?<JJ>*<NN.*>+}
    VP: {<VB.*><RB>?}
"""

text = "The clever student quickly solved the difficult math problem."
print(f"Text: {text}\n")

# One output line per chunk label
chunks = extract_chunks(text, grammar)
for label, phrases in chunks.items():
    print(f"{label}: {phrases}")

## 9.8 Practical: Information Extraction

In [None]:
def extract_subject_verb_object(sentence):
    """Simple SVO extraction using chunking.

    The sentence is chunked into top-level NP and VP chunks. The first VP is
    taken as the verb, the first NP before it as the subject and the first NP
    after it as the object. Anchoring on the verb keeps an NP that precedes
    the verb (e.g. inside "on the mat") from being reported as the object.
    When no VP is found, the first two NPs are returned as subject and object.

    Args:
        sentence: Raw sentence text.

    Returns:
        dict with keys 'subject', 'verb' and 'object'; each value is the
        chunk text, or None when no matching chunk exists.
    """
    # Grammar for SVO patterns
    grammar = r"""
        NP: {<DT|PRP\$>?<JJ>*<NN.*>+|<PRP>|<NNP>+}
        VP: {<VB.*>}
    """

    parser = RegexpParser(grammar)
    tagged = pos_tag(word_tokenize(sentence))
    tree = parser.parse(tagged)

    # Top-level chunks in sentence order, as (label, text) pairs
    chunks = [
        (subtree.label(), ' '.join(w for w, t in subtree.leaves()))
        for subtree in tree
        if isinstance(subtree, Tree)
    ]

    verb_index = next(
        (i for i, (ctype, _) in enumerate(chunks) if ctype == 'VP'),
        None,
    )

    if verb_index is None:
        # No verb to anchor on: fall back to the purely positional reading
        verb = None
        before = [text for ctype, text in chunks if ctype == 'NP']
        after = before[1:]
    else:
        verb = chunks[verb_index][1]
        before = [text for ctype, text in chunks[:verb_index] if ctype == 'NP']
        after = [text for ctype, text in chunks[verb_index + 1:] if ctype == 'NP']

    return {
        'subject': before[0] if before else None,
        'verb': verb,
        'object': after[0] if after else None,
    }

In [None]:
# Simple declarative sentences to run through the SVO extractor
sentences = [
    "The cat chased the mouse.",
    "John loves pizza.",
    "The happy children played games.",
    "Scientists discovered a new planet.",
    "She wrote an interesting book.",
]

print("Subject-Verb-Object Extraction")
print("=" * 60)

for sent in sentences:
    svo = extract_subject_verb_object(sent)
    # Blank line, the sentence, then one indented line per role
    print(
        f"\n{sent}",
        f"  Subject: {svo['subject']}",
        f"  Verb:    {svo['verb']}",
        f"  Object:  {svo['object']}",
        sep="\n",
    )

## 9.9 Chunk Parser Class

In [None]:
class ChunkExtractor:
    """Reusable chunk extraction utility.

    Wraps a ``RegexpParser`` built from a chunk grammar and exposes helpers
    that return chunk texts grouped by chunk label.
    """

    # Stages run top to bottom, and each later stage sees the chunks built by
    # earlier stages as single <LABEL> items. By the time PP runs, the nouns
    # are already wrapped in NP chunks, so PP is written in terms of <NP>; a
    # pattern over raw <DT>/<JJ>/<NN.*> tags would never match at that point.
    DEFAULT_GRAMMAR = r"""
        NP: {<DT|PRP\$>?<JJ>*<NN.*>+}
        VP: {<VB.*>+<RB>?}
        PP: {<IN><NP>}
    """

    def __init__(self, grammar=None):
        """Build the parser.

        Args:
            grammar: Chunk grammar string; ``DEFAULT_GRAMMAR`` is used when
                this is None or empty.
        """
        self.grammar = grammar or self.DEFAULT_GRAMMAR
        self.parser = RegexpParser(self.grammar)

    def parse(self, text):
        """Tokenize and POS-tag ``text``, then return the chunk tree."""
        tagged = pos_tag(word_tokenize(text))
        return self.parser.parse(tagged)

    def extract_all(self, text):
        """Extract all chunks as a dict of label -> list of chunk texts.

        Every subtree except the root is visited, so chunks nested inside
        other chunks (such as the NP inside a PP) are listed under their own
        label as well. Labels with no chunks are absent from the result.
        """
        tree = self.parse(text)
        chunks = {}

        for subtree in tree.subtrees():
            # Skip the root by identity rather than by label, so a chunk the
            # grammar labels 'S' is not dropped along with it.
            if subtree is tree:
                continue
            ctext = ' '.join(w for w, t in subtree.leaves())
            chunks.setdefault(subtree.label(), []).append(ctext)

        return chunks

    def get_noun_phrases(self, text):
        """Get noun phrases only (empty list when there are none)."""
        return self.extract_all(text).get('NP', [])

    def get_verb_phrases(self, text):
        """Get verb phrases only (empty list when there are none)."""
        return self.extract_all(text).get('VP', [])

    def get_prep_phrases(self, text):
        """Get prepositional phrases only (empty list when there are none)."""
        return self.extract_all(text).get('PP', [])

In [None]:
# Run the extractor with its default grammar on a sample sentence
chunker = ChunkExtractor()

text = "The young scientist conducted experiments in the modern laboratory."
print(f"Text: {text}\n")

# Per-type helpers
noun_phrases = chunker.get_noun_phrases(text)
verb_phrases = chunker.get_verb_phrases(text)
prep_phrases = chunker.get_prep_phrases(text)
print(f"Noun Phrases: {noun_phrases}")
print(f"Verb Phrases: {verb_phrases}")
print(f"Prep Phrases: {prep_phrases}")

# Everything at once, grouped by chunk label
print("\nAll chunks:")
for label, phrases in chunker.extract_all(text).items():
    print(f"  {label}: {phrases}")

## Summary

| Syntax | Meaning |
|--------|----------|
| `{pattern}` | Chunk (include) |
| `}pattern{` | Chink (exclude) |
| `<TAG>` | Match tag |
| `<TAG1\|TAG2>` | Match either |
| `?` | Optional |
| `*` | Zero or more |
| `+` | One or more |

### Common Patterns
```python
# Noun phrase
NP: {<DT>?<JJ>*<NN.*>+}

# Verb phrase
VP: {<VB.*>+}

# Prepositional phrase (must come after the NP rule, so that <NP> chunks already exist)
PP: {<IN><NP>}
```