# NLTK Complete Guide - Section 8: Named Entity Recognition (NER)

This notebook covers:
- What is NER?
- NLTK's Named Entity Chunker
- Entity Types
- Extracting Entities
- Practical Applications

In [None]:
from collections import Counter

import nltk
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

# Data packages used by the tokenize -> POS-tag -> NE-chunk pipeline.
# Recent NLTK releases (3.9+) look up renamed packages ('punkt_tab',
# 'averaged_perceptron_tagger_eng', 'maxent_ne_chunker_tab'), while older
# releases use the original names. Requesting both keeps the notebook
# runnable from a fresh kernel on either generation of NLTK.
NLTK_RESOURCES = [
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
]

for resource in NLTK_RESOURCES:
    nltk.download(resource, quiet=True)

## 8.1 What is Named Entity Recognition?

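**NER** identifies and classifies named entities in text. Commonly used categories are listed below; the labels that NLTK's chunker is shown to use in this notebook are the subset listed in Section 8.3: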
- **PERSON**: People's names
- **ORGANIZATION**: Companies, institutions
- **GPE**: Geo-Political Entities (countries, cities)
- **LOCATION**: Mountains, rivers, regions
- **DATE/TIME**: Temporal expressions
- **MONEY**: Monetary values
- **PERCENT**: Percentages

In [None]:
text = "Apple Inc. was founded by Steve Jobs in California."

# Each stage feeds the next: raw string -> tokens -> (token, POS) pairs -> chunk tree.
tokens = word_tokenize(text)
tagged = pos_tag(tokens)
entities = ne_chunk(tagged)

# A single print call; sep="\n" reproduces the line breaks of separate prints.
print(f"Text: {text}\n", "Named Entities:", entities, sep="\n")

In [None]:
# Visualize the tree in a separate window.
# NOTE(review): Tree.draw() appears to open a Tkinter GUI window rather than
# rendering inline, so it needs a desktop display — confirm before enabling.
# entities.draw()  # Uncomment to see tree visualization

# Print the tree structure as indented, bracketed text.
# Relies on `entities` being the chunk tree built in the previous cell.
entities.pprint()

## 8.2 Extracting Named Entities

In [None]:
def extract_entities(text):
    """Return the named entities found in ``text``.

    Runs the tokenize -> POS-tag -> NE-chunk pipeline and collects every
    labelled chunk that sits directly under the root of the resulting tree.

    Args:
        text: Raw input string.

    Returns:
        A list of ``(entity_text, entity_type)`` tuples in order of
        appearance. Multi-word entities are joined with single spaces.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))

    # Entity chunks are Tree nodes; every other child is a plain (word, tag) pair.
    return [
        (' '.join(word for word, _tag in node.leaves()), node.label())
        for node in chunked
        if isinstance(node, Tree)
    ]

In [None]:
text = """Barack Obama was the 44th President of the United States.
He was born in Hawaii and studied at Harvard University.
Microsoft and Google are major tech companies in America."""

print(f"Text:\n{text}\n")

entities = extract_entities(text)

# Heading and rule in one call; one row per entity with the type in a 15-wide column.
print("Extracted Entities:", "-" * 40, sep="\n")
for name, label in entities:
    print(f"{label:<15} {name}")

## 8.3 Entity Types in NLTK

In [None]:
# Label -> human-readable description for the entity types covered in this section.
entity_types = dict(
    PERSON='People, including fictional characters',
    ORGANIZATION='Companies, agencies, institutions',
    GPE='Countries, cities, states (Geo-Political Entities)',
    LOCATION='Non-GPE locations (mountains, rivers)',
    FACILITY='Buildings, airports, highways',
    GSP='Geo-Socio-Political groups',
)

print("NLTK Named Entity Types")
print("=" * 60)
# Dict iteration follows insertion order, so rows print in the order declared above.
for label in entity_types:
    print(f"{label:<15} {entity_types[label]}")

In [None]:
# Examples of each entity type
# (expected entity type, sentence) pairs; the expected type is shown for
# comparison only and is not checked against the chunker's output.
examples = [
    ("PERSON", "Elon Musk founded SpaceX."),
    ("ORGANIZATION", "NASA launched a new satellite."),
    ("GPE", "Tokyo is the capital of Japan."),
    ("LOCATION", "Mount Everest is the tallest mountain."),
]

print("Entity Type Examples", "=" * 60, sep="\n")

for expected_type, text in examples:
    entities = extract_entities(text)
    # One call per example; sep="\n" puts each field on its own line.
    print(
        f"\nText: {text}",
        f"Expected: {expected_type}",
        f"Found: {entities}",
        sep="\n",
    )

## 8.4 Binary NER (Named Entity or Not)

In [None]:
text = "Steve Jobs founded Apple in California."
tokens = word_tokenize(text)
tagged = pos_tag(tokens)

# Typed mode: binary=False is passed explicitly to contrast with the call below.
entities_typed = ne_chunk(tagged, binary=False)
print("With entity types (binary=False):")
entities_typed.pprint()

# Blank line, 50-character rule, blank line.
print("", "=" * 50, "", sep="\n")

# Binary mode: the same tagged tokens, chunked with binary=True.
entities_binary = ne_chunk(tagged, binary=True)
print("Binary mode (binary=True):")
entities_binary.pprint()

## 8.5 Extracting Entities by Type

In [None]:
def extract_entities_by_type(text):
    """Group the named entities found in ``text`` by entity type.

    Delegates the tokenize -> POS-tag -> NE-chunk work to
    ``extract_entities`` so the pipeline is defined in one place, then
    buckets the resulting pairs by their type label.

    Args:
        text: Raw input string.

    Returns:
        A dict mapping each entity type to the list of entity strings of
        that type, in order of appearance (duplicates are kept). Types
        with no entities are absent; text with no entities yields ``{}``.
    """
    entities_by_type = {}
    for entity_text, entity_type in extract_entities(text):
        # setdefault creates the bucket the first time a type is seen.
        entities_by_type.setdefault(entity_type, []).append(entity_text)
    return entities_by_type

In [None]:
text = """Bill Gates and Satya Nadella lead Microsoft in Redmond, Washington.
Tim Cook is the CEO of Apple, headquartered in Cupertino, California.
Google was founded by Larry Page and Sergey Brin at Stanford University."""

print(f"Text:\n{text}\n")

entities = extract_entities_by_type(text)

print("Entities by Type:", "=" * 50, sep="\n")
# One headed section per entity type, one bullet per entity.
for label in entities:
    print(f"\n{label}:")
    for member in entities[label]:
        print(f"  • {member}")

## 8.6 Entity Counter

In [None]:
from collections import Counter

def count_entities(text):
    """Tally how often each named entity occurs in ``text``.

    Returns:
        A ``Counter`` keyed by ``(entity_text, entity_type)`` tuples, so the
        same surface string under two different types is counted separately.
    """
    tally = Counter()
    tally.update(extract_entities(text))
    return tally

text = """Apple announced new products. Apple's CEO Tim Cook presented.
Microsoft also had announcements. Google and Apple compete in many markets.
Tim Cook mentioned Apple's commitment to privacy."""

print(f"Text:\n{text}\n")

entity_counts = count_entities(text)

print("Entity Counts:", "-" * 40, sep="\n")
# most_common() yields ((entity, type), count) pairs, highest count first.
for pair, count in entity_counts.most_common():
    entity, etype = pair
    print(f"{entity:<20} ({etype:<12}) {count}x")

## 8.7 Complete NER Pipeline Class

In [None]:
class NERExtractor:
    """Named Entity Recognition utility class.

    Wraps the tokenize -> POS-tag -> NE-chunk pipeline and offers helpers
    that return entities as plain Python lists.

    Args:
        binary: Forwarded to ``ne_chunk``. The type-specific helpers
            (``get_people``, ``get_organizations``, ``get_locations``,
            ``summary``) filter on the 'PERSON', 'ORGANIZATION', 'GPE' and
            'LOCATION' labels, so they are only meaningful with the default
            ``binary=False``.
    """

    def __init__(self, binary=False):
        self.binary = binary

    def process(self, text):
        """Process text and return the NE chunk tree."""
        tokens = word_tokenize(text)
        tagged = pos_tag(tokens)
        return ne_chunk(tagged, binary=self.binary)

    def extract_all(self, text):
        """Extract all entities as a list of ``(entity_text, entity_type)`` tuples.

        Only labelled chunks directly under the root are collected, in order
        of appearance; multi-word entities are joined with single spaces.
        """
        return [
            (' '.join(w for w, t in subtree.leaves()), subtree.label())
            for subtree in self.process(text)
            if isinstance(subtree, Tree)
        ]

    @staticmethod
    def _filter_by_type(entities, target_type):
        """Return the texts of already-extracted entities labelled ``target_type``."""
        return [e for e, t in entities if t == target_type]

    def extract_by_type(self, text, target_type):
        """Extract entities of a specific type (runs the pipeline once)."""
        return self._filter_by_type(self.extract_all(text), target_type)

    def get_people(self, text):
        """Extract person names."""
        return self.extract_by_type(text, 'PERSON')

    def get_organizations(self, text):
        """Extract organization names."""
        return self.extract_by_type(text, 'ORGANIZATION')

    def get_locations(self, text):
        """Extract locations: all GPE entities followed by all LOCATION entities."""
        # Run the pipeline once and filter twice, instead of re-processing
        # the text for each label.
        entities = self.extract_all(text)
        return (
            self._filter_by_type(entities, 'GPE')
            + self._filter_by_type(entities, 'LOCATION')
        )

    def summary(self, text):
        """Get a summary of people, organizations and locations.

        Returns:
            A dict with keys 'people', 'organizations' and 'locations', each
            mapping to a list of entity strings. 'locations' lists GPE
            entities first, then LOCATION entities.
        """
        # A single pipeline pass feeds all three categories.
        entities = self.extract_all(text)
        return {
            'people': self._filter_by_type(entities, 'PERSON'),
            'organizations': self._filter_by_type(entities, 'ORGANIZATION'),
            'locations': (
                self._filter_by_type(entities, 'GPE')
                + self._filter_by_type(entities, 'LOCATION')
            ),
        }

In [None]:
# Use the NER class
ner = NERExtractor()

text = """Elon Musk is the CEO of Tesla and SpaceX.
Tesla is headquartered in Austin, Texas.
SpaceX launches rockets from Cape Canaveral, Florida.
Mark Zuckerberg runs Meta in Menlo Park, California."""

print(f"Text:\n{text}\n")

summary = ner.summary(text)

print("Entity Summary:", "=" * 50, sep="\n")
# One upper-cased heading per category, one bullet per entity.
for category in summary:
    heading = category.upper()
    print(f"\n{heading}:")
    for entity in summary[category]:
        print(f"  • {entity}")

## 8.8 Practical Application: News Article Analysis

In [None]:
def analyze_article(text):
    """Analyze a news article for named entities.

    Returns:
        A dict with:
        - 'total_entities': number of entity mentions found
        - 'unique_entities': number of distinct entity strings
        - 'type_distribution': dict of entity type -> mention count
        - 'top_entities': up to five ``(entity_text, count)`` pairs, most
          frequent first
        - 'summary': the result of ``NERExtractor.summary`` for the text
    """
    ner = NERExtractor()
    all_entities = ner.extract_all(text)

    # Tally mentions per type and per entity string in one pass.
    type_counts = Counter()
    entity_counts = Counter()
    for entity_text, entity_type in all_entities:
        type_counts[entity_type] += 1
        entity_counts[entity_text] += 1

    return {
        'total_entities': len(all_entities),
        # Each distinct entity string is exactly one key of the counter.
        'unique_entities': len(entity_counts),
        'type_distribution': dict(type_counts),
        'top_entities': entity_counts.most_common(5),
        'summary': ner.summary(text)
    }

In [None]:
article = """Technology giants Apple, Google, and Microsoft reported strong quarterly earnings.
Apple CEO Tim Cook announced record iPhone sales in China and Europe.
Google's Sundar Pichai highlighted growth in cloud computing services.
Microsoft's Satya Nadella discussed the company's AI investments.
Wall Street analysts predict continued growth for these Silicon Valley companies.
Apple's headquarters in Cupertino and Google's campus in Mountain View remain innovation hubs."""

# Banner: title, rule, the article framed by blank lines, rule.
rule = "=" * 60
print("NEWS ARTICLE ANALYSIS", rule, f"\n{article}\n", rule, sep="\n")

analysis = analyze_article(article)

print(
    f"\nTotal entities found: {analysis['total_entities']}",
    f"Unique entities: {analysis['unique_entities']}",
    sep="\n",
)

print("\nEntity Type Distribution:")
type_distribution = analysis['type_distribution']
for etype in type_distribution:
    print(f"  {etype}: {type_distribution[etype]}")

print("\nTop Mentioned Entities:")
for entity, count in analysis['top_entities']:
    print(f"  {entity}: {count}x")

## Summary

| Method | Description |
|--------|-------------|
| `ne_chunk(tagged)` | Perform NER on POS-tagged tokens |
| `ne_chunk(tagged, binary=True)` | Binary NER (entity or not) |
| `tree.label()` | Get entity type |
| `tree.leaves()` | Get words in entity |

### NER Pipeline
```python
tokens = word_tokenize(text)     # 1. Tokenize
tagged = pos_tag(tokens)         # 2. POS Tag
entities = ne_chunk(tagged)      # 3. NE Chunk
```

### Entity Types
- **PERSON**: People
- **ORGANIZATION**: Companies, institutions
- **GPE**: Countries, cities, states
- **LOCATION**: Non-GPE locations