# NLTK Complete Guide - Section 2: Text Processing Fundamentals

This notebook covers:
- Working with Text
- NLTK Text Object
- Loading Sample Texts
- Text Statistics

In [None]:
import nltk
from nltk.text import Text
from nltk.tokenize import word_tokenize, sent_tokenize

## 2.1 Working with Text

Basic string operations on text data.

In [None]:
# Sample passage for the basic string operations in this section.
# The line break (and the space before it) is part of the string.
text = (
    "Natural Language Processing (NLP) is a field of artificial intelligence \n"
    "that gives computers the ability to understand text and spoken words."
)

print("Original text:", text, sep="\n")
print(f"\nLength: {len(text)} characters")
# Splitting on whitespace is only a rough count: punctuation stays attached to words
print(f"Word count (simple): {len(text.split())}")

In [None]:
# Case transformations: upper()/lower() return new strings, `text` is not modified
print("Uppercase:", text.upper(), sep="\n")
print("\nLowercase:", text.lower(), sep="\n")

## 2.2 NLTK Text Object

The `Text` class provides useful methods for text analysis.

In [None]:
# word_tokenize (and sent_tokenize, used later) need the Punkt tokenizer models.
# Download them here, before first use, so the notebook runs from a fresh kernel.
# Newer NLTK releases look for 'punkt_tab', older ones for 'punkt'.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Create NLTK Text object
sample_text = """Natural Language Processing enables computers to understand human language.
Language processing involves many complex tasks. Processing text requires 
understanding grammar and semantics. Computers can now process language effectively.
Natural language understanding is a key challenge in artificial intelligence.
Language models have revolutionized natural language processing."""

# Text wraps a plain list of tokens; case is preserved, so "Language" and
# "language" are distinct tokens here.
tokens = word_tokenize(sample_text)
nltk_text = Text(tokens)

print(f"Total tokens: {len(nltk_text)}")
print(f"Unique tokens: {len(set(nltk_text))}")

### Concordance
Shows a word in its context (surrounding words).

In [None]:
# Concordance: print up to 5 occurrences of the word, each in a 60-character window
print("Concordance for 'language':")
nltk_text.concordance(word="language", lines=5, width=60)

In [None]:
# Same concordance view, this time for a different target word
print("Concordance for 'processing':")
nltk_text.concordance(word="processing", lines=5, width=60)

### Word Count

In [None]:
# Count specific words (exact token matches, so capitalisation matters)
for word in ("language", "processing", "Natural"):
    print(f"'{word}' appears: {nltk_text.count(word)} times")

### Vocabulary

In [None]:
# Frequency distribution over the tokens of the Text object
vocab = nltk_text.vocab()

print("Top 15 most common words:")
print("-" * 30)
# Left-align each token in a 20-character column, followed by its frequency
for token, freq in vocab.most_common(15):
    print(token.ljust(20), freq)

### Finding Similar Words
Words that appear in similar contexts.

In [None]:
# A toy corpus of parallel "cat"/"dog" sentences, so the two words share
# enough contexts for the similarity methods to have something to find
larger_text = (
    "The cat sat on the mat. The dog sat on the rug. \n"
    "The cat chased the mouse. The dog chased the cat.\n"
    "A happy cat is a good cat. A happy dog is a good dog.\n"
    "The cat sleeps on the bed. The dog sleeps on the floor.\n"
    "My cat likes fish. My dog likes meat."
)

# Lowercase before tokenizing so "The" and "the" become the same token
tokens_large = word_tokenize(larger_text.lower())
text_large = Text(tokens_large)

print("Words similar to 'cat':")
text_large.similar(word="cat")

In [None]:
# Same lookup from the other direction: which words share contexts with "dog"?
print("Words similar to 'dog':")
text_large.similar(word="dog")

### Common Contexts

In [None]:
# Contexts (surrounding word pairs) in which both words occur
print("Common contexts for 'cat' and 'dog':")
text_large.common_contexts(words=["cat", "dog"])

### Dispersion Plot
Visualize where words appear throughout the text.

In [None]:
# Dispersion plot (requires matplotlib, hence the import next to its only use)
import matplotlib.pyplot as plt

# One row per word; marks show the token offsets at which each word occurs
words_to_plot = ["cat", "dog", "sat", "chased"]
text_large.dispersion_plot(words_to_plot)
plt.show()

## 2.3 Loading Sample Texts

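NLTK provides access to many ready-made corpora. They are not bundled with the library itself; each one is fetched on demand with `nltk.download()`.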

In [None]:
# Download the Gutenberg and Brown corpora used by the following cells.
# quiet=True is passed to both calls to keep the cell output short.
# NOTE(review): presumably nltk.download skips data that is already installed
# ("if needed" in the original comment) - confirm against the NLTK docs.
nltk.download('gutenberg', quiet=True)
nltk.download('brown', quiet=True)

In [None]:
from nltk.corpus import gutenberg

# File identifiers of every text in the Gutenberg corpus
print("Available Gutenberg texts:", gutenberg.fileids(), sep="\n")

In [None]:
# Read one text from the corpus as a single raw string
emma_text = gutenberg.raw('austen-emma.txt')

print("Emma by Jane Austen")
print(f"Total characters: {len(emma_text):,}")
print("\nFirst 500 characters:")
print(emma_text[:500])

In [None]:
# The same text, this time as a sequence of words and a sequence of sentences
emma_id = 'austen-emma.txt'
emma_words = gutenberg.words(emma_id)
emma_sents = gutenberg.sents(emma_id)

print(f"Total words: {len(emma_words):,}")
print(f"Total sentences: {len(emma_sents):,}")
# Slice before converting to a list so only the first 20 words are shown
print(f"\nFirst 20 words: {list(emma_words[:20])}")
print(f"\nFirst sentence: {emma_sents[0]}")

### Gutenberg Corpus Statistics

In [None]:
# Statistics for all Gutenberg texts
print(f"{'File':<35} {'Chars':>10} {'Words':>10} {'Sents':>8} {'Avg Word':>10} {'Avg Sent':>10}")
print("-" * 85)

for fileid in gutenberg.fileids():
    words = gutenberg.words(fileid)
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(words)
    num_sents = len(gutenberg.sents(fileid))

    # Average word length over alphabetic tokens only. Dividing the raw
    # character count by the token count would also count whitespace and
    # punctuation, and would disagree with text_statistics() in section 2.4.
    alpha_lengths = [len(w) for w in words if w.isalpha()]
    avg_word_len = sum(alpha_lengths) / len(alpha_lengths)
    # Average sentence length in tokens (punctuation tokens included)
    avg_sent_len = num_words / num_sents

    print(f"{fileid:<35} {num_chars:>10,} {num_words:>10,} {num_sents:>8,} {avg_word_len:>10.1f} {avg_sent_len:>10.1f}")

### Brown Corpus (Categorized)

In [None]:
from nltk.corpus import brown

# The Brown corpus is organised by category; list the category names
print("Brown corpus categories:", brown.categories(), sep="\n")

In [None]:
# Restrict the corpus reader to a single category
news_words = brown.words(categories='news')
first_twenty = list(news_words[:20])

print(f"News category: {len(news_words):,} words")
print(f"First 20 words: {first_twenty}")

## 2.4 Text Statistics

In [None]:
def text_statistics(text):
    """Calculate comprehensive text statistics.

    Args:
        text: Raw text to analyse.

    Returns:
        dict with the keys 'characters', 'words', 'sentences',
        'unique_words', 'avg_word_length', 'avg_sentence_length' and
        'lexical_diversity'. The three ratio values are floats and are
        0.0 when their denominator is zero (for example, empty text or
        text without any alphabetic token).
    """
    tokens = word_tokenize(text)
    sentences = sent_tokenize(text)

    # Basic counts ('words' counts every token, punctuation included)
    char_count = len(text)
    word_count = len(tokens)
    sent_count = len(sentences)

    # Word-level measures use alphabetic tokens only; build that list once
    alpha_tokens = [t for t in tokens if t.isalpha()]
    alpha_count = len(alpha_tokens)
    unique_words = len({t.lower() for t in alpha_tokens})

    # Averages, guarded so empty input yields 0.0 instead of ZeroDivisionError
    avg_word_len = sum(len(t) for t in alpha_tokens) / alpha_count if alpha_count else 0.0
    avg_sent_len = word_count / sent_count if sent_count else 0.0

    # Lexical diversity: distinct (case-folded) words / total alphabetic words
    lexical_diversity = unique_words / alpha_count if alpha_count else 0.0

    return {
        'characters': char_count,
        'words': word_count,
        'sentences': sent_count,
        'unique_words': unique_words,
        'avg_word_length': avg_word_len,
        'avg_sentence_length': avg_sent_len,
        'lexical_diversity': lexical_diversity
    }

In [None]:
# Five-sentence passage used to demonstrate text_statistics()
sample = (
    "Natural Language Processing (NLP) is a field of artificial intelligence.\n"
    "It enables computers to understand, interpret, and generate human language.\n"
    "NLP combines computational linguistics with machine learning.\n"
    "Applications include translation, sentiment analysis, and chatbots.\n"
    "Modern NLP uses deep learning for better results."
)

stats = text_statistics(sample)

print("Text Statistics")
print("=" * 40)
for key, value in stats.items():
    # 'avg_word_length' -> 'Avg Word Length'
    label = key.replace('_', ' ').title()
    # Ratios get two decimals; counts get thousands separators
    formatted = f"{value:.2f}" if isinstance(value, float) else f"{value:,}"
    print(f"{label:<25} {formatted}")

### Lexical Diversity Comparison

In [None]:
def lexical_diversity(text):
    """Calculate lexical diversity (unique words / total words).

    Only alphabetic tokens are counted, and they are lowercased first so
    that "The" and "the" are the same word.

    Args:
        text: Raw text to analyse.

    Returns:
        A float between 0 and 1. Returns 0.0 when the text contains no
        alphabetic tokens, instead of raising ZeroDivisionError.
    """
    tokens = [t.lower() for t in word_tokenize(text) if t.isalpha()]
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)

# Compare texts: the first 1000 Brown tokens of each genre, re-joined into a
# string because lexical_diversity() expects raw text. The 1000 tokens include
# punctuation, which lexical_diversity() then filters out.
genre_categories = {
    'News': 'news',
    'Romance': 'romance',
    'Science Fiction': 'science_fiction',
}
texts = {
    genre: ' '.join(brown.words(categories=category)[:1000])
    for genre, category in genre_categories.items()
}

print("Lexical Diversity by Genre (first 1000 words)")
print("-" * 40)
# The loop variable is deliberately not called `text`: that name holds the
# sample passage from section 2.1 and must not be overwritten here.
for genre, genre_text in texts.items():
    diversity = lexical_diversity(genre_text)
    print(f"{genre:<20} {diversity:.2%}")

## Summary

| Method | Description |
|--------|-------------|
| `Text(tokens)` | Create NLTK Text object |
| `.concordance(word)` | Show word in context |
| `.similar(word)` | Find words in similar contexts |
| `.common_contexts([w1, w2])` | Find shared contexts |
| `.dispersion_plot(words)` | Visualize word positions |
| `.vocab()` | Get frequency distribution |
| `.count(word)` | Count word occurrences |

### Built-in Corpora
- `gutenberg` - Classic literature
- `brown` - Categorized text (news, romance, etc.)
- `reuters` - News articles
- `movie_reviews` - Movie reviews (positive/negative)