# NLTK Complete Guide - Section 15: Corpus Management

This notebook covers:
- Built-in Corpora
- Loading Custom Corpora
- Creating Your Own Corpus
- Corpus Readers
- Practical Applications

In [None]:
import nltk
import os

# Fetch every resource this notebook reads. Besides the corpora themselves,
# the Punkt sentence-tokenizer models are needed: PlaintextCorpusReader-based
# corpora (gutenberg, inaugural, and the custom corpora built later) use Punkt
# when .sents() is called and fail on a fresh install without it.
# Recent NLTK releases load the models from 'punkt_tab', older ones from
# 'punkt'; both are requested so the notebook runs on either. With its default
# arguments nltk.download() reports an unavailable package instead of raising.
for resource in (
    'gutenberg',
    'brown',
    'reuters',
    'inaugural',
    'webtext',
    'nps_chat',
    'treebank',
    'punkt',
    'punkt_tab',
):
    nltk.download(resource, quiet=True)

from nltk.corpus import gutenberg, brown, reuters, inaugural, webtext
from nltk.corpus import PlaintextCorpusReader, TaggedCorpusReader
from nltk.tokenize import word_tokenize

## 15.1 Built-in Corpora Overview

NLTK includes many corpora for different purposes.

In [None]:
# Short catalogue of well-known NLTK corpora: corpus id -> one-line summary
corpora_info = dict([
    ('gutenberg', 'Classic literature (18 texts)'),
    ('brown', 'Categorized text (news, fiction, etc.)'),
    ('reuters', 'News articles with categories'),
    ('inaugural', 'US Presidential inaugural addresses'),
    ('webtext', 'Web and chat text'),
    ('treebank', 'Parsed Wall Street Journal'),
    ('movie_reviews', 'Positive/negative movie reviews'),
    ('stopwords', 'Stop words in multiple languages'),
    ('wordnet', 'Lexical database'),
    ('names', 'Male and female names'),
])

# Two-column listing: the id is left-aligned and padded to 15 characters
print("Popular NLTK Corpora")
print("=" * 55)
for corpus_id in corpora_info:
    print(corpus_id.ljust(15), corpora_info[corpus_id])

## 15.2 Gutenberg Corpus

In [None]:
# One line per Gutenberg text: file id followed by its token count
print("Gutenberg Files:")
print("-" * 40)
for text_id in gutenberg.fileids():
    token_count = len(gutenberg.words(text_id))
    print("{:<30} {:>8,} words".format(text_id, token_count))

In [None]:
# The three basic views of a single text: raw string, word tokens, sentences
fileid = 'austen-emma.txt'

print(f"Accessing '{fileid}':")
print("-" * 40)

# Raw text: the whole file as one string
raw = gutenberg.raw(fileid)
raw_preview = raw[:200]
print(f"Raw text (first 200 chars): {raw_preview}...")

# Words: token sequence for the file
words = gutenberg.words(fileid)
leading_words = list(words[:20])
print(f"\nWords (first 20): {leading_words}")

# Sentences: each sentence is itself a sequence of tokens
sents = gutenberg.sents(fileid)
opening_sentence = list(sents[0])
print(f"\nFirst sentence: {opening_sentence}")

## 15.3 Brown Corpus (Categorized)

In [None]:
# Per-category size of the Brown corpus: number of files and number of tokens
print("Brown Corpus Categories:")
print("-" * 40)
for category in brown.categories():
    token_count = len(brown.words(categories=category))
    file_count = len(brown.fileids(categories=category))
    print("{:<20} {:>3} files, {:>8,} words".format(category, file_count, token_count))

In [None]:
# Select words by a single category name...
news_words = brown.words(categories='news')
fiction_words = brown.words(categories='fiction')

news_total = len(news_words)
fiction_total = len(fiction_words)
print(f"News words: {news_total:,}")
print(f"Fiction words: {fiction_total:,}")

# ...or by a list of categories, which yields the words of all of them
multi_words = brown.words(categories=['news', 'editorial'])
multi_total = len(multi_words)
print(f"News + Editorial words: {multi_total:,}")

In [None]:
# Brown ships with POS annotations: take the first ten tagged tokens
# from the 'news' category
news_tagged = brown.tagged_words(categories='news')
tagged = news_tagged[:10]
print("Tagged words (news):", tagged, sep="\n")

## 15.4 Reuters Corpus (Multi-label)

In [None]:
# Size of the Reuters corpus and a sample of its category labels
reuters_file_total = len(reuters.fileids())
print(f"Reuters files: {reuters_file_total}")

reuters_labels = reuters.categories()
print(f"Reuters categories: {len(reuters_labels)}")
print(f"\nSample categories: {reuters_labels[:15]}")

In [None]:
# A Reuters document may carry several category labels at once;
# inspect the first file as an example
sample_file = reuters.fileids()[0]
print(f"File: {sample_file}")

sample_labels = reuters.categories(sample_file)
print(f"Categories: {sample_labels}")

sample_excerpt = reuters.raw(sample_file)[:300]
print(f"\nText: {sample_excerpt}...")

In [None]:
# The built-in train/test split is encoded in the file id prefix
# ('training/...' vs 'test/...'), so one pass over the ids sorts them
train_files = []
test_files = []
for doc_id in reuters.fileids():
    if doc_id.startswith('training/'):
        train_files.append(doc_id)
    elif doc_id.startswith('test/'):
        test_files.append(doc_id)

print(f"Training files: {len(train_files)}")
print(f"Test files: {len(test_files)}")

## 15.5 Creating a Custom Corpus

In [None]:
# Create sample corpus directory (exist_ok makes the cell safe to re-run)
corpus_dir = './my_corpus'
os.makedirs(corpus_dir, exist_ok=True)

# Sample documents: filename -> text, three sentences per document
texts = {
    'doc1.txt': """Natural language processing is a field of computer science.
It deals with the interaction between computers and humans.
NLP is used in many applications today.""",

    'doc2.txt': """Machine learning is transforming how we build software.
Deep learning models can understand complex patterns.
AI is becoming more accessible to developers.""",

    'doc3.txt': """Python is a popular programming language.
It is widely used in data science and web development.
Python has a rich ecosystem of libraries.""",
}

# Write as UTF-8 explicitly: PlaintextCorpusReader decodes files as UTF-8 by
# default, while open() without an encoding falls back to the platform locale,
# so non-ASCII text could otherwise be written in one encoding and read back
# in another.
for filename, content in texts.items():
    with open(os.path.join(corpus_dir, filename), 'w', encoding='utf-8') as f:
        f.write(content)

print(f"Created corpus in '{corpus_dir}' with {len(texts)} files")

In [None]:
# Build a reader over every .txt file found under the corpus directory
txt_file_pattern = r'.*\.txt'
my_corpus = PlaintextCorpusReader(corpus_dir, txt_file_pattern)

print("Custom Corpus:", "-" * 40, sep="\n")
print(f"Files: {my_corpus.fileids()}")

In [None]:
# The custom reader exposes the same accessors as the built-in corpora
total_words = len(my_corpus.words())
print(f"\nTotal words: {total_words}")
total_sentences = len(my_corpus.sents())
print(f"Total sentences: {total_sentences}")

# Passing a file id restricts the view to that single document
doc1_words = list(my_corpus.words('doc1.txt'))
print(f"\nWords in doc1.txt: {doc1_words}")

doc2_first_sentence = list(my_corpus.sents('doc2.txt')[0])
print(f"\nFirst sentence of doc2.txt: {doc2_first_sentence}")

## 15.6 Categorized Corpus

In [None]:
from nltk.corpus import CategorizedPlaintextCorpusReader

# Create categorized corpus directory: one sub-directory per category
# (exist_ok makes the cell safe to re-run)
cat_corpus_dir = './categorized_corpus'
os.makedirs(os.path.join(cat_corpus_dir, 'tech'), exist_ok=True)
os.makedirs(os.path.join(cat_corpus_dir, 'science'), exist_ok=True)

# Documents are keyed by '<category>/<filename>' relative to the corpus root

# Tech documents
tech_docs = {
    'tech/software.txt': "Software development requires programming skills and creativity.",
    'tech/hardware.txt': "Computer hardware includes processors, memory, and storage devices.",
}

# Science documents
science_docs = {
    'science/biology.txt': "Biology studies living organisms and their interactions.",
    'science/physics.txt': "Physics explains the fundamental laws of the universe.",
}

# Write as UTF-8 explicitly: the NLTK plaintext readers decode files as UTF-8
# by default, while open() without an encoding uses the platform locale.
for filepath, content in {**tech_docs, **science_docs}.items():
    with open(os.path.join(cat_corpus_dir, filepath), 'w', encoding='utf-8') as f:
        f.write(content)

print("Created categorized corpus")

In [None]:
# Load the categorized corpus; each file's category is taken from the
# directory component of its id
nested_txt_pattern = r'.*/.*\.txt'      # any .txt file inside a sub-directory
category_from_dir = r'(\w+)/.*'         # capture group = first directory name
cat_corpus = CategorizedPlaintextCorpusReader(
    cat_corpus_dir,
    nested_txt_pattern,
    cat_pattern=category_from_dir,
)

found_categories = cat_corpus.categories()
print(f"Categories: {found_categories}")
found_files = cat_corpus.fileids()
print(f"All files: {found_files}")

In [None]:
# File ids and words can both be filtered by category name
tech_files = cat_corpus.fileids(categories='tech')
print(f"\nTech files: {tech_files}")
science_files = cat_corpus.fileids(categories='science')
print(f"Science files: {science_files}")

tech_words = list(cat_corpus.words(categories='tech'))
print(f"\nTech words: {tech_words}")

## 15.7 Corpus Statistics

In [None]:
def corpus_statistics(corpus, name="Corpus"):
    """Calculate summary statistics for a corpus.

    Args:
        corpus: Object exposing ``fileids()``, ``words()``, ``sents()`` and
            ``raw()`` (for example an NLTK corpus reader).
        name: Label stored under the ``'name'`` key of the result.

    Returns:
        dict with the keys:
            'name'              -- the ``name`` argument
            'files'             -- number of file ids
            'words'             -- number of tokens, punctuation included
            'alpha_words'       -- number of purely alphabetic tokens
            'unique_words'      -- distinct lower-cased alphabetic tokens
            'sentences'         -- number of sentences
            'chars'             -- length of the raw text
            'avg_word_length'   -- mean length of the alphabetic tokens
            'avg_sent_length'   -- tokens per sentence
            'lexical_diversity' -- unique_words / alpha_words
        Each ratio is 0.0 when its denominator is zero (e.g. empty corpus).
    """
    # Single pass over the tokens: corpus views can be expensive to traverse,
    # so every word-level counter is gathered at once.
    total_tokens = 0
    alpha_tokens = 0
    alpha_chars = 0
    vocabulary = set()
    for token in corpus.words():
        total_tokens += 1
        if token.isalpha():
            alpha_tokens += 1
            alpha_chars += len(token)
            vocabulary.add(token.lower())

    stats = {
        'name': name,
        'files': len(corpus.fileids()),
        'words': total_tokens,
        'alpha_words': alpha_tokens,
        'unique_words': len(vocabulary),
        'sentences': len(corpus.sents()),
        'chars': len(corpus.raw()),
    }

    def _ratio(numerator, denominator):
        # Guard against empty corpora instead of raising ZeroDivisionError
        return numerator / denominator if denominator else 0.0

    # Both word-level ratios use the alphabetic tokens as their population:
    # the numerators only ever count alphabetic tokens, so dividing by the
    # full token count (punctuation included) would understate them.
    stats['avg_word_length'] = _ratio(alpha_chars, alpha_tokens)
    stats['avg_sent_length'] = _ratio(stats['words'], stats['sentences'])
    stats['lexical_diversity'] = _ratio(stats['unique_words'], alpha_tokens)

    return stats

In [None]:
# Run corpus_statistics over several built-in corpora and print one
# report per corpus
corpora_to_analyze = [
    (gutenberg, 'Gutenberg'),
    (brown, 'Brown'),
    (inaugural, 'Inaugural'),
]

print("Corpus Statistics Comparison")
print("=" * 70)

for corpus, name in corpora_to_analyze:
    stats = corpus_statistics(corpus, name)
    # Assemble the whole report first, then emit it in a single print
    report_lines = [
        f"\n{name}:",
        f"  Files: {stats['files']:,}",
        f"  Words: {stats['words']:,}",
        f"  Unique words: {stats['unique_words']:,}",
        f"  Sentences: {stats['sentences']:,}",
        f"  Avg word length: {stats['avg_word_length']:.2f}",
        f"  Avg sentence length: {stats['avg_sent_length']:.1f} words",
        f"  Lexical diversity: {stats['lexical_diversity']:.4f}",
    ]
    print("\n".join(report_lines))

## 15.8 Corpus Utility Class

In [None]:
class CorpusManager:
    """Utility class for managing a directory-backed plaintext corpus.

    Instance fields:
        path    -- corpus root directory
        pattern -- regular expression selecting the files of the corpus
        corpus  -- PlaintextCorpusReader built from ``path`` and ``pattern``
    """

    def __init__(self, corpus_path, pattern=r'.*\.txt'):
        """Open the corpus rooted at ``corpus_path``.

        Args:
            corpus_path: Directory containing the corpus files.
            pattern: Regular expression matched against file names to decide
                which files belong to the corpus.
        """
        self.path = corpus_path
        # Remembered so that reloading keeps the caller's file selection
        self.pattern = pattern
        self._reload()

    def _reload(self):
        """(Re)create the reader from the stored path and pattern."""
        self.corpus = PlaintextCorpusReader(self.path, self.pattern)

    def add_document(self, filename, content):
        """Add a new document to the corpus.

        Writes ``content`` as UTF-8 to ``filename`` under the corpus root
        (overwriting an existing file of that name) and rebuilds the reader
        with the pattern given at construction time.

        Args:
            filename: File name relative to the corpus root.
            content: Text to write.
        """
        filepath = os.path.join(self.path, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        # Reload corpus with the original pattern rather than a hard-coded one
        self._reload()

    def get_statistics(self):
        """Get corpus statistics as returned by ``corpus_statistics``."""
        return corpus_statistics(self.corpus, self.path)

    def search(self, term):
        """Search for a term in the corpus, ignoring case.

        Args:
            term: Substring to look for in the raw text of each file.

        Returns:
            List of ``(fileid, count)`` tuples for the files containing the
            term, sorted by count in descending order. An empty term matches
            nothing and yields an empty list.
        """
        needle = term.lower()
        if not needle:
            # str.count('') would report len(text) + 1 for every file
            return []

        results = []
        for fileid in self.corpus.fileids():
            count = self.corpus.raw(fileid).lower().count(needle)
            if count:
                results.append((fileid, count))
        return sorted(results, key=lambda hit: hit[1], reverse=True)

    def get_concordance(self, word, width=40):
        """Print a concordance for a word.

        The output is produced by ``nltk.Text.concordance``; nothing is
        returned.

        Args:
            word: Word to look up.
            width: Forwarded to ``concordance`` as its ``width`` argument.
        """
        from nltk import Text
        text = Text(self.corpus.words())
        text.concordance(word, width=width)

    def vocabulary(self, min_freq=1):
        """Get the vocabulary with a frequency filter.

        Args:
            min_freq: Minimum number of occurrences a word needs to be kept.

        Returns:
            dict mapping each lower-cased alphabetic word to its count,
            restricted to words occurring at least ``min_freq`` times.
        """
        from collections import Counter
        freq = Counter(w.lower() for w in self.corpus.words() if w.isalpha())
        return {w: c for w, c in freq.items() if c >= min_freq}

In [None]:
# Wrap the sample corpus directory in a manager
manager = CorpusManager('./my_corpus')

# Add a fourth document, then confirm the reader lists it
doc4_text = (
    "Data analysis is essential for business intelligence.\n"
    "Visualization helps communicate insights effectively.\n"
    "Python and R are popular tools for data analysis."
)
manager.add_document('doc4.txt', doc4_text)

print(f"Files: {manager.corpus.fileids()}")

# Case-insensitive search across every file
python_hits = manager.search('python')
print(f"\nSearch for 'Python': {python_hits}")

# Aggregate statistics for the whole corpus
stats = manager.get_statistics()
print(f"\nTotal words: {stats['words']}")

In [None]:
# Cleanup: delete the directories this notebook created; ignore_errors
# keeps the cell from failing when a directory is already gone
import shutil

for temp_dir in ('./my_corpus', './categorized_corpus'):
    shutil.rmtree(temp_dir, ignore_errors=True)

print("Cleaned up temporary corpus directories")

## Summary

| Corpus Reader | Use Case |
|---------------|----------|
| `PlaintextCorpusReader` | Plain text files |
| `CategorizedPlaintextCorpusReader` | Categorized text |
| `TaggedCorpusReader` | POS-tagged text |
| `BracketParseCorpusReader` | Parsed trees |

### Common Methods
- `corpus.fileids()` - List files
- `corpus.raw()` - Raw text
- `corpus.words()` - Tokenized words
- `corpus.sents()` - Sentences
- `corpus.categories()` - Categories (if available)