# NLTK Complete Guide - Section 16: Advanced Topics

This notebook covers:
- Parsing and Grammar
- Context-Free Grammar (CFG)
- Dependency Parsing
- Information Extraction
- Regular Expression Patterns
- Performance Optimization

In [None]:
import nltk
import re

# Data packages used by the tokenizer, POS tagger and NE chunker.
# NLTK 3.9 replaced the pickled models with renamed packages
# ("punkt_tab", "averaged_perceptron_tagger_eng", "maxent_ne_chunker_tab").
# Both the legacy and the current identifiers are requested so the notebook
# survives Restart & Run All on old and new NLTK installs alike.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
):
    nltk.download(resource, quiet=True)

from nltk import CFG, ChartParser, RecursiveDescentParser
from nltk import pos_tag, word_tokenize, ne_chunk
from nltk.chunk import RegexpParser
from nltk.tree import Tree

## 16.1 Context-Free Grammar (CFG)

A context-free grammar (CFG) is a set of rewrite rules (productions) that describe the syntactic structure of sentences.

In [None]:
# Toy grammar: a sentence (S) is a noun phrase followed by a verb phrase.
# Quoted symbols are terminals (actual words); the rest are non-terminals.
grammar = CFG.fromstring("""
    S -> NP VP
    NP -> Det N | Det Adj N | 'I'
    VP -> V NP | V
    Det -> 'the' | 'a'
    N -> 'dog' | 'cat' | 'ball' | 'park'
    Adj -> 'big' | 'small' | 'happy'
    V -> 'chased' | 'saw' | 'ran'
""")

# List every production, one per line, indented under the heading.
print("Grammar productions:")
for rule in grammar.productions():
    print("  ", rule, sep="")

In [None]:
# Build a chart parser for the grammar and parse a whitespace-tokenized sentence.
parser = ChartParser(grammar)

sentence = ["the", "big", "dog", "chased", "a", "cat"]

print("Sentence:", " ".join(sentence), end="\n\n")
print("Parse trees:")

# parse() yields one tree per valid derivation; show each in bracketed
# form and then as an ASCII diagram.
for parse_tree in parser.parse(sentence):
    print(parse_tree)
    parse_tree.pretty_print()

In [None]:
# More complex grammar with recursion: NP can contain a PP, and PP contains
# an NP, so prepositional phrases can nest.
complex_grammar = CFG.fromstring("""
    S -> NP VP
    NP -> Det N | Det N PP | 'I' | N
    VP -> V | V NP | V NP PP
    PP -> P NP
    Det -> 'the' | 'a' | 'my'
    N -> 'dog' | 'cat' | 'park' | 'telescope' | 'man' | 'hill'
    V -> 'saw' | 'walked' | 'chased'
    P -> 'in' | 'on' | 'with' | 'by'
""")

parser = ChartParser(complex_grammar)

# Ambiguous sentence: the PP "with the telescope" can attach to the verb
# (VP -> V NP PP) or to the noun (NP -> Det N PP).
# Do NOT lowercase the sentence: the grammar's pronoun terminal is the
# capitalized 'I', and parse() raises ValueError when an input word is not
# covered by the grammar.
sentence = "I saw the man with the telescope".split()

print(f"Sentence: {' '.join(sentence)}")
print("\nPossible interpretations:")

for i, tree in enumerate(parser.parse(sentence), 1):
    print(f"\nInterpretation {i}:")
    tree.pretty_print()

## 16.2 Probabilistic CFG (PCFG)

In [None]:
from nltk import PCFG, ViterbiParser

# Probabilistic grammar: each alternative carries a probability in brackets,
# and the alternatives for one left-hand side sum to 1.0.
pcfg = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | Det Adj N [0.3] | 'I' [0.2]
    VP -> V NP [0.7] | V [0.3]
    Det -> 'the' [0.6] | 'a' [0.4]
    N -> 'dog' [0.4] | 'cat' [0.3] | 'ball' [0.3]
    Adj -> 'big' [0.5] | 'small' [0.5]
    V -> 'chased' [0.5] | 'saw' [0.5]
""")

# Show only the first eight productions to keep the output short.
print("PCFG Productions:")
first_rules = pcfg.productions()[:8]
for rule in first_rules:
    print("  ", rule, sep="")

In [None]:
# The Viterbi parser returns the single most probable parse under the PCFG.
viterbi_parser = ViterbiParser(pcfg)

sentence = ["the", "dog", "chased", "a", "cat"]

print("Sentence:", " ".join(sentence), end="\n\n")

# Each yielded tree carries the probability of its derivation.
for best_tree in viterbi_parser.parse(sentence):
    print("Probability: {:.6f}".format(best_tree.prob()))
    best_tree.pretty_print()

## 16.3 Regular Expression for Information Extraction

In [None]:
# Extract patterns using regex
text = """Contact us at support@example.com or sales@company.org.
Call 123-456-7890 or (555) 123-4567 for assistance.
Visit https://www.example.com or http://test.org for more info.
Prices: $19.99, $150, $1,299.00"""

# Email pattern: local part, "@", domain, and a final ".tld".
# Requiring word characters at the end keeps a sentence-ending "." out.
email_pattern = r'[\w.-]+@[\w.-]+\.\w+'
emails = re.findall(email_pattern, text)
print(f"Emails: {emails}")

# Phone pattern: optional parentheses around the area code and optional
# "-", "." or whitespace separators between the digit groups.
phone_pattern = r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
phones = re.findall(phone_pattern, text)
print(f"Phones: {phones}")

# URL pattern: hyphens are allowed (e.g. "my-site.org"), and the match must
# end on a word character or "/", so trailing sentence punctuation such as
# the "." in "see http://test.org." is not captured.
url_pattern = r'https?://[\w./-]*[\w/]'
urls = re.findall(url_pattern, text)
print(f"URLs: {urls}")

# Price pattern: digits, optional ",ddd" thousands groups, optional decimals.
# A looser character class like [\d,]+ would swallow the list separator and
# return "$150," for the sample text above.
price_pattern = r'\$\d+(?:,\d{3})*(?:\.\d+)?'
prices = re.findall(price_pattern, text)
print(f"Prices: {prices}")

In [None]:
class PatternExtractor:
    """Extract common surface patterns (e-mails, phones, URLs, ...) from text.

    All patterns are applied with ``re.IGNORECASE``. The ``patterns`` mapping
    is a plain class attribute, so callers may add or override entries.
    """

    # Pattern name -> regular expression source.
    patterns = {
        'email': r'[\w.-]+@[\w.-]+\.\w+',
        'phone': r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
        # Must end on a word character or "/" so that a sentence-ending
        # "." or "-" is not captured as part of the URL.
        'url': r'https?://[\w./-]*[\w/]',
        # Digits, optional ",ddd" thousands groups, optional decimals.
        # Avoids capturing trailing separators such as the "," in "$150,".
        'price': r'\$\d+(?:,\d{3})*(?:\.\d+)?',
        'date': r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',
        'time': r'\d{1,2}:\d{2}(?:\s?[AP]M)?',
        'hashtag': r'#\w+',
        # The lookbehind rejects "@" inside an e-mail address, so
        # "john@email.com" does not yield the bogus mention "@email".
        'mention': r'(?<![\w.-])@\w+',
    }

    @classmethod
    def extract(cls, text, pattern_name):
        """Return all matches of one named pattern.

        Args:
            text: String to search.
            pattern_name: Key of ``cls.patterns``.

        Returns:
            List of matched substrings, in order of appearance (empty if
            nothing matches).

        Raises:
            ValueError: If ``pattern_name`` is not a known pattern.
        """
        if pattern_name not in cls.patterns:
            raise ValueError(f"Unknown pattern: {pattern_name}")
        return re.findall(cls.patterns[pattern_name], text, re.IGNORECASE)

    @classmethod
    def extract_all(cls, text):
        """Run every known pattern against ``text``.

        Args:
            text: String to search.

        Returns:
            Dict mapping pattern name to its list of matches. Patterns with
            no match are omitted from the result.
        """
        results = {}
        for name in cls.patterns:
            matches = cls.extract(text, name)
            if matches:
                results[name] = matches
        return results

In [None]:
# Small text containing one or more examples of most supported patterns.
sample_text = """
Meeting scheduled for 01/15/2024 at 2:30 PM.
Contact john@email.com or call (555) 123-4567.
Check out our website: https://www.example.com
Follow us @company #innovation #tech
Special offer: $99.99!
"""

results = PatternExtractor.extract_all(sample_text)

# Report each pattern type that produced one or more matches.
print("Extracted Information:", "=" * 40, sep="\n")
for pattern_type in results:
    print(f"{pattern_type}: {results[pattern_type]}")

## 16.4 Relation Extraction

In [None]:
def extract_relations(text):
    """Find noun-phrase / verb-phrase / noun-phrase sequences in ``text``.

    The text is tokenized and POS-tagged, then chunked in three cascaded
    stages: noun phrases (NP), verb phrases (VP), and finally RELATION
    chunks made of an NP, a VP and another NP in a row.

    Args:
        text: Raw sentence to analyse.

    Returns:
        List of tuples of strings, one tuple per RELATION chunk, holding the
        words of each phrase inside the chunk joined by single spaces.
    """
    # Cascaded chunk grammar; later rules operate on chunks built by earlier ones.
    chunk_rules = r"""
        NP: {<DT>?<JJ>*<NN.*>+}
        VP: {<VB.*><RB>?}
        RELATION: {<NP><VP><NP>}
    """

    tagged_tokens = pos_tag(word_tokenize(text))
    chunked = RegexpParser(chunk_rules).parse(tagged_tokens)

    def phrase_text(chunk):
        # Leaves are (word, tag) pairs; keep only the words.
        return ' '.join(word for word, _tag in chunk.leaves())

    triples = []
    for node in chunked.subtrees(lambda t: t.label() == 'RELATION'):
        # Only sub-chunks count as phrases; bare tagged tokens are skipped.
        pieces = [phrase_text(child) for child in node if isinstance(child, Tree)]
        if len(pieces) >= 2:
            triples.append(tuple(pieces))

    return triples

In [None]:
# Simple subject-verb-object sentences to run through extract_relations().
sentences = [
    "The company acquired the startup.",
    "John founded a technology company.",
    "The scientists discovered a new species.",
]

print("Relation Extraction", "=" * 50, sep="\n")

# For each sentence: blank line, the sentence itself, then the triples found.
for sent in sentences:
    relations = extract_relations(sent)
    print()
    print(sent)
    print("Relations:", relations)

## 16.5 Text Normalization Pipeline

In [None]:
import unicodedata
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

class TextNormalizer:
    """Comprehensive text normalization pipeline.

    Steps, in order: contraction expansion, accent removal, optional
    lowercasing, tokenization, then per-token filtering (punctuation,
    numbers, stopwords) and lemmatization.
    """

    def __init__(self, language='english'):
        """Load the stopword list and create the lemmatizer.

        Args:
            language: Language name passed to ``stopwords.words``.
        """
        self.stop_words = set(stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()

        # Common contractions.
        # Entries whose expansion starts with a space are *suffixes* attached
        # to the preceding word ("they're" -> "they are"); the others are
        # whole words ("can't" -> "cannot").
        # NOTE: "'s" is ambiguous (is / has / possessive) and is always
        # expanded to " is".
        self.contractions = {
            "won't": "will not", "can't": "cannot",
            "n't": " not", "'re": " are",
            "'s": " is", "'d": " would",
            "'ll": " will", "'ve": " have",
            "'m": " am",
        }

    def expand_contractions(self, text):
        """Expand the contractions listed in ``self.contractions``.

        Matching is case-insensitive, so capitalized forms such as "Can't"
        are handled; a leading capital is carried over to whole-word
        expansions ("Can't" -> "Cannot"). Suffix contractions only match when
        attached to a word, so quoted words like ``'dog'`` are left intact.
        The typographic apostrophe (U+2019) is treated like ``'``.

        Args:
            text: Input string.

        Returns:
            The string with contractions expanded.
        """
        # Typographic apostrophes would otherwise never match the ASCII keys.
        text = text.replace('\u2019', "'")

        # Longest keys first so specific forms ("can't") are replaced before
        # the generic suffix ("n't"), regardless of dict insertion order.
        ordered = sorted(
            self.contractions.items(),
            key=lambda item: len(item[0]),
            reverse=True,
        )

        for contraction, expansion in ordered:
            if not contraction:
                continue  # an empty key would match at every position

            if expansion[:1].isspace():
                # Suffix contraction: must directly follow a word character.
                pattern = r'(?<=\w)' + re.escape(contraction) + r'\b'
            else:
                # Whole-word contraction.
                pattern = r'\b' + re.escape(contraction) + r'\b'

            def _expand(match, expansion=expansion):
                # A function replacement avoids backslash-escape surprises
                # and lets the leading capital be preserved.
                matched = match.group(0)
                if matched[0].isupper() and expansion[:1].isalpha():
                    return expansion[0].upper() + expansion[1:]
                return expansion

            text = re.sub(pattern, _expand, text, flags=re.IGNORECASE)

        return text

    def remove_accents(self, text):
        """Strip combining marks after NFKD decomposition.

        Args:
            text: Input string.

        Returns:
            The decomposed string with all combining characters removed
            (e.g. "café" -> "cafe").
        """
        nfkd = unicodedata.normalize('NFKD', text)
        return ''.join(c for c in nfkd if not unicodedata.combining(c))

    def normalize(self, text,
                  lowercase=True,
                  remove_punctuation=True,
                  remove_numbers=False,
                  remove_stopwords=True,
                  lemmatize=True):
        """Run the full normalization pipeline.

        Args:
            text: Raw input string.
            lowercase: Lowercase the text before tokenizing.
            remove_punctuation: Drop every token that is not purely
                alphanumeric. This also drops tokens with internal
                punctuation such as "well-known" or "3.14".
            remove_numbers: Drop tokens made only of digits.
            remove_stopwords: Drop tokens found in the stopword list
                (compared in lowercase).
            lemmatize: Lemmatize each kept token with the WordNet
                lemmatizer's default part of speech.

        Returns:
            List of normalized tokens.
        """
        # Contractions are expanded first, while apostrophes are still
        # attached to their words (tokenization would split them apart).
        text = self.expand_contractions(text)
        text = self.remove_accents(text)

        if lowercase:
            text = text.lower()

        tokens = word_tokenize(text)

        filtered = []
        for token in tokens:
            if remove_punctuation and not token.isalnum():
                continue

            if remove_numbers and token.isdigit():
                continue

            if remove_stopwords and token.lower() in self.stop_words:
                continue

            if lemmatize:
                token = self.lemmatizer.lemmatize(token)

            filtered.append(token)

        return filtered

In [None]:
normalizer = TextNormalizer()

# Sample inputs covering contractions, digits and an accented word.
# "café" carries a real accented character so that accent removal is
# actually exercised (the literal was previously mis-encoded).
texts = [
    "I can't believe it's already 2024! The café was amazing.",
    "They're running 5 miles every day. She's been training hard.",
    "The dogs were happily playing with their toys in the gardens.",
]

print("Text Normalization")
print("=" * 60)

for text in texts:
    normalized = normalizer.normalize(text)
    print(f"\nOriginal: {text}")
    print(f"Normalized: {normalized}")

## 16.6 Performance Optimization

In [None]:
import time
from functools import lru_cache

# One shared lemmatizer for both the cached and the uncached path, so the
# comparison measures the cache and not object construction.
lemmatizer = WordNetLemmatizer()

# Caching for repeated operations
@lru_cache(maxsize=10000)
def cached_lemmatize(word):
    """Return the WordNet lemma of ``word``, memoized per distinct word."""
    return lemmatizer.lemmatize(word)

# NLTK loads WordNet lazily on first use; trigger that load up front so it
# is not charged to whichever run happens to go first.
lemmatizer.lemmatize("warm-up")

# Test performance
words = ["running", "dogs", "happily", "better"] * 1000

# perf_counter() is the high-resolution clock intended for measuring short
# intervals; time.time() can be too coarse and report 0.0 for the cached run.

# Without cache
start = time.perf_counter()
result1 = [lemmatizer.lemmatize(w) for w in words]
time1 = time.perf_counter() - start

# With cache (first run)
cached_lemmatize.cache_clear()
start = time.perf_counter()
result2 = [cached_lemmatize(w) for w in words]
time2 = time.perf_counter() - start

# With cache (second run - cached)
start = time.perf_counter()
result3 = [cached_lemmatize(w) for w in words]
time3 = time.perf_counter() - start

# Caching must never change the answers.
assert result1 == result2 == result3

print("Performance Comparison")
print("=" * 40)
print(f"Without cache:     {time1:.4f}s")
print(f"With cache (1st):  {time2:.4f}s")
print(f"With cache (2nd):  {time3:.4f}s")
# Guard the ratio: a zero-length measurement would raise ZeroDivisionError.
if time3 > 0:
    print(f"\nSpeedup: {time1/time3:.1f}x")
else:
    print("\nSpeedup: n/a (cached run too fast to measure)")

In [None]:
# Batch processing for efficiency
from nltk import pos_tag_sents

# 300 pre-tokenized sentences (three distinct ones, repeated).
sentences = [
    word_tokenize("The quick brown fox jumps."),
    word_tokenize("Natural language processing is fascinating."),
    word_tokenize("Machine learning transforms industries."),
] * 100

# perf_counter() is the high-resolution clock intended for measuring short
# intervals; time.time() can be too coarse for sub-second benchmarks.

# Individual processing: one pos_tag() call per sentence
start = time.perf_counter()
result1 = [pos_tag(sent) for sent in sentences]
time1 = time.perf_counter() - start

# Batch processing: a single call for all sentences
start = time.perf_counter()
result2 = pos_tag_sents(sentences)
time2 = time.perf_counter() - start

# Both strategies must produce the same tags.
assert result1 == result2

print("Batch vs Individual Processing")
print("=" * 40)
print(f"Individual: {time1:.4f}s")
print(f"Batch:      {time2:.4f}s")
# Guard the ratio: a zero-length measurement would raise ZeroDivisionError.
if time2 > 0:
    print(f"Speedup:    {time1/time2:.1f}x")
else:
    print("Speedup:    n/a (batch run too fast to measure)")

## Summary

### Parsing
- `CFG.fromstring()` - Define grammar
- `ChartParser` - General parsing
- `RecursiveDescentParser` - Top-down parsing
- `ViterbiParser` - Probabilistic parsing

### Information Extraction
- Regular expressions for patterns
- Chunking for phrases
- NER for named entities
- Relation extraction for triples

### Optimization Tips
- Use `lru_cache` for repeated operations
- Use batch functions (`pos_tag_sents`)
- Precompile regex patterns
- Limit vocabulary size for features