# NLTK Complete Guide - Section 11: Frequency Distribution

This notebook covers:
- FreqDist Basics
- Word Frequency Analysis
- Conditional Frequency Distribution
- Visualization
- Practical Applications

In [None]:
import nltk

# Fetch every corpus/model this notebook reads. word_tokenize loads 'punkt'
# on older NLTK releases and 'punkt_tab' on newer ones, so both are requested
# to keep Restart & Run All working regardless of the installed version.
for resource in ('punkt', 'punkt_tab', 'gutenberg', 'stopwords', 'brown', 'inaugural'):
    nltk.download(resource, quiet=True)

from nltk import FreqDist, ConditionalFreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import gutenberg, stopwords, brown, inaugural

## 11.1 FreqDist Basics

**FreqDist** counts the frequency of each item in a collection.

In [None]:
# Tokenize a short sample sentence, then count how often each token occurs.
text = "the cat sat on the mat the cat is fat"
tokens = word_tokenize(text)
fdist = FreqDist(tokens)

# Show the input, its tokens, and the token -> count mapping.
print(
    f"Text: {text}",
    f"Tokens: {tokens}",
    f"\nFrequency Distribution: {dict(fdist)}",
    sep="\n",
)

In [None]:
# Core FreqDist queries: N() is the total number of counted samples, B() the
# number of distinct samples. 'dog' never occurs in the sample sentence, so
# the last line shows what a lookup of an unseen sample reports.
print(
    f"Total words: {fdist.N()}",
    f"Unique words: {fdist.B()}",
    f"Most common: {fdist.most_common(3)}",
    f"Frequency of 'the': {fdist['the']}",
    f"Frequency of 'dog': {fdist['dog']}",
    sep="\n",
)

In [None]:
# The most frequent sample and its share of all tokens, followed by the raw
# samples and counts stored in the distribution.
print(
    f"Max frequency word: {fdist.max()}",
    f"Frequency of max: {fdist.freq(fdist.max()):.2%}",
    f"\nAll items: {list(fdist.keys())}",
    f"All counts: {list(fdist.values())}",
    sep="\n",
)

## 11.2 Analyzing Real Text

In [None]:
# Load the tokenized text of Jane Austen's "Emma" from the Gutenberg corpus.
emma = gutenberg.words('austen-emma.txt')

# Report the token count and peek at the first 20 tokens.
print(
    f"Total words in Emma: {len(emma):,}",
    f"Sample: {list(emma[:20])}",
    sep="\n",
)

In [None]:
# Count every token of Emma as-is: no case folding and no filtering, so
# punctuation and capitalized variants are counted as separate samples.
fdist_emma = FreqDist(emma)

print(
    f"Unique words: {fdist_emma.B():,}",
    f"\nTop 20 most common words:",
    "-" * 30,
    sep="\n",
)

# One row per sample: left-aligned word, right-aligned count.
for word, count in fdist_emma.most_common(20):
    print(f"{word:<15} {count:>6}")

In [None]:
# Keep only alphabetic tokens, lowercase them, and drop English stopwords so
# the counts reflect content words rather than punctuation and function words.
stop_words = set(stopwords.words('english'))

filtered_words = [
    token
    for token in (w.lower() for w in emma if w.isalpha())
    if token not in stop_words
]

fdist_filtered = FreqDist(filtered_words)

print("Top 20 meaningful words (no stopwords):", "-" * 35, sep="\n")

# One row per word: left-aligned word, right-aligned count.
for word, count in fdist_filtered.most_common(20):
    print(f"{word:<15} {count:>6}")

## 11.3 Frequency Analysis Methods

In [None]:
# Hapaxes are the samples that occur exactly once in the distribution.
hapaxes = fdist_filtered.hapaxes()

# The percentage is relative to the number of distinct words (B()), not to
# the total number of tokens.
print(
    f"Words appearing only once: {len(hapaxes):,}",
    f"Percentage: {len(hapaxes)/fdist_filtered.B():.1%}",
    f"\nSample hapaxes: {hapaxes[:20]}",
    sep="\n",
)

In [None]:
# Words with specific frequency
def words_with_freq(fdist, min_freq, max_freq=None):
    """Return the samples whose count lies in ``[min_freq, max_freq]``.

    Args:
        fdist: Mapping of sample -> count (e.g. a ``FreqDist``).
        min_freq: Smallest count to include (inclusive).
        max_freq: Largest count to include (inclusive). ``None`` means no
            upper bound.

    Returns:
        A list of matching samples, in the mapping's iteration order.
    """
    upper = float('inf') if max_freq is None else max_freq

    selected = []
    for sample, occurrences in fdist.items():
        if min_freq <= occurrences <= upper:
            selected.append(sample)
    return selected

# Select two frequency bands from the filtered distribution: words seen
# exactly 5 times, and words seen 100 times or more (no upper bound).
freq_5 = words_with_freq(fdist_filtered, min_freq=5, max_freq=5)
freq_100_plus = words_with_freq(fdist_filtered, min_freq=100)

print(
    f"Words appearing exactly 5 times: {len(freq_5)}",
    f"Sample: {freq_5[:15]}",
    f"\nWords appearing 100+ times: {len(freq_100_plus)}",
    f"Words: {freq_100_plus}",
    sep="\n",
)

In [None]:
# Relative frequency: each word's share of all counted tokens, shown next to
# its absolute count for the ten most common words.
print("Relative Frequencies (Top 10):", "-" * 35, sep="\n")

for word, count in fdist_filtered.most_common(10):
    # freq() returns the count divided by the total number of samples.
    rel_freq = fdist_filtered.freq(word)
    print(f"{word:<15} {count:>6}  ({rel_freq:.2%})")

## 11.4 Visualization

In [None]:
import matplotlib.pyplot as plt

# Plot the 30 most common words of the filtered distribution.
plt.figure(figsize=(12, 5))

# show=False: depending on the NLTK version, FreqDist.plot() calls plt.show()
# itself by default. The tight_layout()/show() calls below would then act on
# a new, empty figure instead of this one.
fdist_filtered.plot(30, title="Top 30 Words in Emma (excluding stopwords)", show=False)
plt.tight_layout()
plt.show()

In [None]:
# Cumulative frequency plot of the 50 most common words.
plt.figure(figsize=(12, 5))

# show=False: depending on the NLTK version, FreqDist.plot() calls plt.show()
# itself by default. The tight_layout()/show() calls below would then act on
# a new, empty figure instead of this one.
fdist_filtered.plot(50, cumulative=True,
                    title="Cumulative Frequency Distribution",
                    show=False)
plt.tight_layout()
plt.show()

In [None]:
# Hand-built bar chart of the 15 most common words, using matplotlib's
# explicit figure/axes interface.
top_words = fdist_filtered.most_common(15)
words = [pair[0] for pair in top_words]
counts = [pair[1] for pair in top_words]

fig, ax = plt.subplots(figsize=(12, 5))
ax.bar(words, counts, color='steelblue')
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
ax.set_title('Top 15 Words in Emma')
# Slant the word labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

## 11.5 Conditional Frequency Distribution

**ConditionalFreqDist** tracks frequencies for different conditions/categories.

In [None]:
# Word frequency by genre in the Brown corpus: one FreqDist per genre,
# counting lowercased alphabetic tokens only.
cfd = ConditionalFreqDist()
for genre in brown.categories():
    for word in brown.words(categories=genre):
        if word.isalpha():
            cfd[genre][word.lower()] += 1

print(f"Conditions (genres): {cfd.conditions()}")

In [None]:
# Compare how often a few words occur in selected genres, laid out as a
# fixed-width table: one row per word, one column per genre.
target_words = ['love', 'money', 'government', 'science', 'god']
genres = ['romance', 'news', 'religion', 'science_fiction']

header = f"{'Word':<12}" + "".join(f"{genre:<15}" for genre in genres)
print("Word Frequency by Genre", "=" * 60, header, "-" * 60, sep="\n")

for word in target_words:
    cells = "".join(f"{cfd[genre][word]:<15}" for genre in genres)
    print(f"{word:<12}" + cells)

In [None]:
# Built-in equivalent of the hand-rolled table: rows are the selected
# conditions (genres), columns the selected samples (words).
cfd.tabulate(
    samples=target_words,
    conditions=genres,
)

In [None]:
# Plot conditional frequency: one line per genre across the chosen words.
plt.figure(figsize=(10, 6))
# show=False: depending on the NLTK version, ConditionalFreqDist.plot() calls
# plt.show() itself by default, which would render the figure before the
# title below is applied (and leave the title on a new, empty figure).
cfd.plot(conditions=['news', 'romance', 'religion'],
         samples=['the', 'love', 'god', 'money', 'war'],
         show=False)
plt.title('Word Frequency Comparison Across Genres')
plt.show()

## 11.6 Inaugural Address Analysis

In [None]:
# Word usage over time in the inaugural addresses: one FreqDist per address,
# keyed by the year taken from the first four characters of the file id.
cfd_inaugural = ConditionalFreqDist()
for fileid in inaugural.fileids():
    year = fileid[:4]
    for word in inaugural.words(fileid):
        if word.isalpha():
            cfd_inaugural[year][word.lower()] += 1

print(f"Years available: {list(cfd_inaugural.conditions())[:10]}...")

In [None]:
# Track specific words over time
target_words = ['america', 'citizen', 'freedom', 'war', 'peace']

# Inaugurations take place the year after an election (1901, 1905, ...), so
# round years such as 1900 or 1920 are never conditions in cfd_inaugural.
# Tabulating them prints all-zero rows and, because indexing a
# ConditionalFreqDist creates missing keys, registers empty conditions.
# Sample every 20 years starting at 1901 and keep only years that exist.
years = [str(y) for y in range(1901, 2021, 20) if str(y) in cfd_inaugural]

print("Word Usage in Inaugural Addresses (1900-2020)")
print("=" * 70)
cfd_inaugural.tabulate(conditions=years, samples=target_words)

In [None]:
# Plot word trends: raw count of each tracked word per inaugural address.
plt.figure(figsize=(12, 6))

# Only plot years that actually contain words. Indexing a ConditionalFreqDist
# with an unseen key (for example, tabulating a year with no address)
# silently registers an empty condition, which would otherwise appear here as
# a spurious zero on every line.
years = sorted(
    year for year in cfd_inaugural.conditions() if cfd_inaugural[year].N() > 0
)
words_to_track = ['america', 'freedom', 'government']

for word in words_to_track:
    freqs = [cfd_inaugural[year][word] for year in years]
    plt.plot(years, freqs, marker='o', label=word, markersize=3)

plt.xlabel('Year')
plt.ylabel('Frequency')
plt.title('Word Usage in US Inaugural Addresses Over Time')
plt.legend()
# Label every fifth address to keep the x axis readable.
plt.xticks(years[::5], rotation=45)
plt.tight_layout()
plt.show()

## 11.7 Practical: Text Statistics Class

In [None]:
class TextStatistics:
    """Frequency statistics over the words of a single text.

    The text is lowercased and tokenized with ``word_tokenize``; only purely
    alphabetic tokens are kept, and English stopwords are dropped unless
    ``remove_stopwords`` is False. Every statistic is computed from the
    resulting word list.
    """

    def __init__(self, text, remove_stopwords=True):
        """Tokenize ``text`` and build its word frequency distribution.

        Args:
            text: Raw text to analyze.
            remove_stopwords: When True (default), drop NLTK's English
                stopwords before counting.
        """
        self.raw_text = text  # original, unmodified input
        self.tokens = word_tokenize(text.lower())  # every lowercased token
        self.words = [w for w in self.tokens if w.isalpha()]  # alphabetic only

        if remove_stopwords:
            stop_words = set(stopwords.words('english'))
            self.words = [w for w in self.words if w not in stop_words]

        self.fdist = FreqDist(self.words)  # word -> occurrence count

    def summary(self):
        """Return summary statistics as a dict.

        Keys: ``total_words``, ``unique_words``, ``lexical_diversity``
        (unique / total), ``hapaxes`` (number of words seen once) and
        ``avg_word_length``. The two ratios are 0.0 when no words remain
        after filtering, instead of raising ZeroDivisionError.
        """
        total = self.fdist.N()
        unique = self.fdist.B()
        word_count = len(self.words)
        return {
            'total_words': total,
            'unique_words': unique,
            'lexical_diversity': unique / total if total else 0.0,
            'hapaxes': len(self.fdist.hapaxes()),
            'avg_word_length': (
                sum(len(w) for w in self.words) / word_count if word_count else 0.0
            ),
        }

    def top_words(self, n=10):
        """Return the ``n`` most common words as ``(word, count)`` pairs."""
        return self.fdist.most_common(n)

    def word_lengths(self):
        """Return a FreqDist mapping word length -> number of words."""
        return FreqDist(len(w) for w in self.words)

    def search(self, word):
        """Return frequency, rank and relative frequency of ``word``.

        The lookup is case-insensitive because every token was lowercased
        when the distribution was built. Rank 1 is the most frequent word;
        words with equal counts share the best rank among them. A word that
        does not occur gets frequency 0, rank ``None`` and percentage 0.0.

        Args:
            word: The word to look up; echoed back unchanged under ``'word'``.

        Returns:
            A dict with keys ``word``, ``frequency``, ``rank``, ``percentage``.
        """
        key = word.lower()
        freq = self.fdist[key]
        if freq == 0:
            return {'word': word, 'frequency': 0, 'rank': None, 'percentage': 0.0}

        # Rank = 1 + number of words that are strictly more frequent. This is
        # the position of the first equal count in a descending sort, without
        # sorting the whole vocabulary on every call.
        rank = 1 + sum(1 for count in self.fdist.values() if count > freq)
        return {
            'word': word,
            'frequency': freq,
            'rank': rank,
            'percentage': self.fdist.freq(key)
        }

In [None]:
# Run the statistics class over the raw text of Emma and print its summary.
emma_text = gutenberg.raw('austen-emma.txt')
stats = TextStatistics(emma_text)

print("Emma - Text Statistics", "=" * 40, sep="\n")

summary = stats.summary()
for key, value in summary.items():
    # Ratios get four decimals; integer counts get thousands separators.
    print(f"{key}: {value:.4f}" if isinstance(value, float) else f"{key}: {value:,}")

In [None]:
# Fixed-width listing of the 15 most common words and their counts.
print("\nTop 15 Words:", "-" * 25, sep="\n")
for word, count in stats.top_words(15):
    print(f"{word:<15} {count:>6}")

In [None]:
# Look up frequency and rank for a handful of words of interest.
for word in ('emma', 'love', 'happy', 'marriage'):
    result = stats.search(word)
    print(f"{word}: freq={result['frequency']}, rank={result['rank']}")

## Summary

| Method | Description |
|--------|-------------|
| `FreqDist(samples)` | Create frequency distribution |
| `fdist.N()` | Total number of samples |
| `fdist.B()` | Number of unique samples |
| `fdist.most_common(n)` | Top n items |
| `fdist.hapaxes()` | Items appearing exactly once |
| `fdist.freq(sample)` | Relative frequency |
| `fdist.plot()` | Plot distribution |
| `ConditionalFreqDist` | Frequency by condition |
| `cfd.tabulate()` | Tabular display |