# NLTK Complete Guide - Section 10: N-Grams & Language Models

This notebook covers:
- What are N-Grams?
- Generating N-Grams
- N-Gram Frequency Analysis
- Collocations
- Simple Language Models
- Text Generation

In [None]:
import nltk
import random
from collections import Counter, defaultdict

# Fetch the tokenizer models and corpora used throughout this notebook.
# Recent NLTK releases load the Punkt sentence tokenizer from the 'punkt_tab'
# package instead of the pickled 'punkt' package, so request both: otherwise
# word_tokenize()/sent_tokenize() raise LookupError on newer installs.
for resource in ('punkt', 'punkt_tab', 'gutenberg', 'stopwords'):
    nltk.download(resource, quiet=True)

from nltk import ngrams, bigrams, trigrams
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import gutenberg, stopwords
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures

## 10.1 What are N-Grams?

**N-grams** are contiguous sequences of n items from text:

| Type | N | Example ("I love NLP") |
|------|---|------------------------|
| Unigram | 1 | ["I", "love", "NLP"] |
| Bigram | 2 | [("I", "love"), ("love", "NLP")] |
| Trigram | 3 | [("I", "love", "NLP")] |
| 4-gram | 4 | Not enough words! |

In [None]:
# Tokenize a short example sentence and show the raw tokens.
text = "I love natural language processing"
tokens = word_tokenize(text)
print(f"Text: {text}")
print(f"Tokens: {tokens}\n")

# Build the 1-, 2-, 3- and 4-grams in one go; each result is materialized
# as a list so it can be printed below.
unigrams, bi_grams, tri_grams, four_grams = (
    list(ngrams(tokens, order)) for order in (1, 2, 3, 4)
)

print(f"Unigrams (1): {unigrams}")
print(f"Bigrams (2):  {bi_grams}")
print(f"Trigrams (3): {tri_grams}")
print(f"4-grams (4):  {four_grams}")

## 10.2 NLTK Convenience Functions

In [None]:
# Lowercase before tokenizing so "The" and "the" become the same token.
text = "The quick brown fox jumps over the lazy dog"
tokens = word_tokenize(text.lower())
print(f"Text: {text}\n")

# bigrams() / trigrams() are shorthand for ngrams(tokens, 2) / ngrams(tokens, 3)
print("Bigrams (using bigrams()):")
for pair in bigrams(tokens):
    print(f"  {pair}")

print("\nTrigrams (using trigrams()):")
for triple in trigrams(tokens):
    print(f"  {triple}")

## 10.3 N-Gram with Padding

Pad each sentence with start (`<s>`) and end (`</s>`) markers so that a language model can also learn which words tend to begin and end a sentence.

In [None]:
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends

text = "I love NLP"
tokens = word_tokenize(text)
print(f"Text: {text}")
print(f"Tokens: {tokens}\n")

# Baseline: bigrams over the bare tokens
print("Bigrams without padding:")
print(list(bigrams(tokens)))

# Same sentence with boundary markers added on both ends (n=2 -> bigram padding)
print("\nBigrams with padding:")
padded = list(pad_both_ends(tokens, n=2))
padded_pairs = list(bigrams(padded))
print(f"Padded tokens: {padded}")
print(f"Padded bigrams: {padded_pairs}")

## 10.4 N-Gram Frequency Analysis

In [None]:
# Take the first 10K characters of "Emma" as a small working sample
text = gutenberg.raw('austen-emma.txt')[:10000]

# Lowercase, tokenize, and keep purely alphabetic tokens in a single pass
# (drops punctuation and numbers).
tokens = [tok for tok in word_tokenize(text.lower()) if tok.isalpha()]

print(f"Total tokens: {len(tokens)}")
print(f"Sample: {tokens[:20]}")

In [None]:
# Count how often each adjacent word pair occurs in the sample
bi_grams = list(bigrams(tokens))
bigram_freq = Counter(bi_grams)

print("Top 15 Most Common Bigrams:")
print("-" * 40)
# Unpack each pair directly so the two words can be column-aligned
for (first, second), occurrences in bigram_freq.most_common(15):
    print(f"{first:<10} {second:<10} {occurrences:>5}")

In [None]:
# Count how often each run of three adjacent words occurs in the sample
tri_grams = list(trigrams(tokens))
trigram_freq = Counter(tri_grams)

print("Top 15 Most Common Trigrams:")
print("-" * 50)
# Unpack each triple directly so the three words can be column-aligned
for (first, second, third), occurrences in trigram_freq.most_common(15):
    print(f"{first:<10} {second:<10} {third:<10} {occurrences:>5}")

## 10.5 Collocations

**Collocations** are words that appear together more often than by chance.

In [None]:
# Use the whole novel this time
text = gutenberg.raw('austen-emma.txt')

# Keep alphabetic tokens of at least three characters; this removes
# punctuation as well as very short words such as "a", "of", "to".
tokens = [
    tok
    for tok in word_tokenize(text.lower())
    if len(tok) > 2 and tok.isalpha()
]

print(f"Total tokens: {len(tokens):,}")

In [None]:
# Build a collocation finder over the token stream and discard bigrams
# that occur fewer than 5 times.
bigram_finder = BigramCollocationFinder.from_words(tokens)
bigram_finder.apply_freq_filter(5)

# Rank the remaining bigrams by PMI (Pointwise Mutual Information)
bigram_measures = BigramAssocMeasures()

print("Top 15 Bigram Collocations (PMI):")
print("-" * 40)
for first, second in bigram_finder.nbest(bigram_measures.pmi, 15):
    print(f"  {first} {second}")

In [None]:
# Rank the same filtered bigrams with two other association measures.
# Each entry pairs the heading to print with the scoring function to use.
for heading, measure in (
    ("Top 10 by Likelihood Ratio:", bigram_measures.likelihood_ratio),
    ("\nTop 10 by Chi-Square:", bigram_measures.chi_sq),
):
    print(heading)
    for first, second in bigram_finder.nbest(measure, 10):
        print(f"  {first} {second}")

In [None]:
# Same idea as the bigram finder, but over word triples; keep only
# trigrams that occur at least 3 times.
trigram_finder = TrigramCollocationFinder.from_words(tokens)
trigram_finder.apply_freq_filter(3)

trigram_measures = TrigramAssocMeasures()

print("Top 15 Trigram Collocations:")
print("-" * 50)
for triple in trigram_finder.nbest(trigram_measures.pmi, 15):
    phrase = ' '.join(triple)
    print(f"  {phrase}")

## 10.6 Simple Language Model

In [None]:
class SimpleBigramModel:
    """Bigram language model built from raw co-occurrence counts.

    P(word | context) is estimated by maximum likelihood: the number of
    times ``word`` directly followed ``context`` divided by the number of
    times any word followed ``context``. No smoothing is applied, so pairs
    that were never observed have probability 0.
    """

    def __init__(self):
        # context word -> Counter of the words observed directly after it
        self.bigram_counts = defaultdict(Counter)
        # word -> number of occurrences in the training tokens
        self.unigram_counts = Counter()

    def train(self, tokens):
        """Add the unigram and bigram counts of ``tokens`` to the model.

        Counts accumulate across calls, so the model can be trained
        incrementally on several token sequences.

        Args:
            tokens: Iterable of word tokens in text order. One-shot
                iterables (e.g. generators) are accepted.
        """
        # Materialize once: the tokens are walked twice (unigrams, then
        # bigrams), which would silently yield no bigrams for a generator.
        tokens = list(tokens)

        # update() rather than reassignment keeps the unigram counts in step
        # with bigram_counts, which also accumulates across calls.
        self.unigram_counts.update(tokens)

        # Count bigrams (word1 -> word2)
        for w1, w2 in bigrams(tokens):
            self.bigram_counts[w1][w2] += 1

    def probability(self, word, context):
        """Return P(word | context) as a float.

        Returns 0.0 when ``context`` has no recorded followers or ``word``
        was never seen after it.
        """
        # .get() avoids inserting an empty Counter into the defaultdict, and
        # the falsy check also covers an empty Counter left behind by a
        # plain ``model.bigram_counts[x]`` lookup (would divide by zero).
        followers = self.bigram_counts.get(context)
        if not followers:
            return 0.0
        return followers[word] / sum(followers.values())

    def next_word_probs(self, context):
        """Return ``{next_word: probability}`` for every word seen after ``context``.

        Returns an empty dict when ``context`` has no recorded followers.
        """
        followers = self.bigram_counts.get(context)
        if not followers:
            return {}

        total = sum(followers.values())
        return {word: count / total for word, count in followers.items()}

    def generate(self, start_word, length=10, random_seed=None):
        """Generate text by repeatedly sampling the next word.

        Args:
            start_word: First word of the output; also the first context.
            length: Maximum number of words in the output, including
                ``start_word``.
            random_seed: Optional seed for reproducible output. When None,
                the module-level ``random`` generator is used.

        Returns:
            The generated words joined by single spaces. Generation stops
            early if the current word has no recorded followers.
        """
        # A private Random instance keeps seeded runs reproducible without
        # disturbing the global random state.
        rng = random if random_seed is None else random.Random(random_seed)

        words = [start_word]
        current = start_word

        for _ in range(length - 1):
            followers = self.bigram_counts.get(current)
            if not followers:
                break

            # Raw counts are valid weights: choices() normalizes them, so
            # this samples from the same distribution as the probabilities.
            candidates = list(followers)
            weights = list(followers.values())
            current = rng.choices(candidates, weights=weights)[0]
            words.append(current)

        return ' '.join(words)

In [None]:
# Train the model
text = gutenberg.raw('austen-emma.txt')

# word_tokenize keeps the period attached to mid-sentence abbreviations
# ("mr.", "mrs."), and str.isalpha() rejects such tokens. Strip trailing
# periods first so titles survive as "mr" / "mrs"; otherwise the "mr"
# context queried in the following cells is never seen by the model.
tokens = [t.rstrip('.') for t in word_tokenize(text.lower())]
# Keep alphabetic tokens only (bare "." becomes "" above and is dropped here)
tokens = [t for t in tokens if t.isalpha()]

model = SimpleBigramModel()
model.train(tokens)

print(f"Vocabulary size: {len(model.unigram_counts):,}")
print(f"Unique bigram contexts: {len(model.bigram_counts):,}")

In [None]:
# Inspect the distribution of words that follow a given context word
context = "mr"
print(f"Words that follow '{context}':")
print("-" * 30)

probs = model.next_word_probs(context)

# Most probable followers first; only the top 10 are shown
sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
for follower, likelihood in sorted_probs[:10]:
    print(f"  {follower:<15} {likelihood:.2%}")

In [None]:
# Generate text
print("Generated text samples:")
print("=" * 60)

start_words = ["the", "she", "he", "it", "mr"]

for start in start_words:
    generated = model.generate(start, length=12)
    # The arrow was mojibake (a mis-decoded UTF-8 "→"); it is written as an
    # escape so it survives any future re-encoding of this file.
    print(f"\n'{start}' \u2192 {generated}")

## 10.7 NLTK's Language Model

In [None]:
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

# Prepare training data: first 50K characters, split into sentences
text = gutenberg.raw('austen-emma.txt')[:50000]
sentences = sent_tokenize(text)

# word_tokenize keeps the period attached to mid-sentence abbreviations
# ("mr.", "mrs."), and str.isalpha() rejects such tokens. Strip trailing
# periods first so titles survive as "mr" / "mrs"; otherwise contexts such
# as ["mr"] scored later in this section are out of vocabulary.
tokenized_sents = [
    [t.rstrip('.') for t in word_tokenize(s.lower())] for s in sentences
]
# Keep alphabetic tokens only (bare "." becomes "" above and is dropped here)
tokenized_sents = [[t for t in s if t.isalpha()] for s in tokenized_sents]

# Remove empty sentences
tokenized_sents = [s for s in tokenized_sents if len(s) > 0]

print(f"Number of sentences: {len(tokenized_sents)}")
print(f"Sample: {tokenized_sents[0][:10]}")

In [None]:
# Create training data with padding
n = 3  # trigram model
# The pipeline returns two values from the tokenized sentences: the training
# n-grams and the vocabulary stream, which are passed to fit() below.
# NOTE(review): these appear to be one-shot lazy iterators — if so, this line
# must be re-run before calling fit() again; confirm against the nltk.lm docs.
train_data, vocab = padded_everygram_pipeline(n, tokenized_sents)

# Train MLE (Maximum Likelihood Estimation) model
lm = MLE(n)  # the model order matches the n used to build the training data
lm.fit(train_data, vocab)

print(f"Vocabulary size: {len(lm.vocab):,}")

In [None]:
# Query the fitted model for a few conditional probabilities
print("P(word | context)")
print("-" * 40)

# Each entry is (preceding words, candidate next word)
contexts = [
    (["she", "was"], "very"),
    (["she", "was"], "not"),
    (["mr"], "knightley"),
    (["mr"], "woodhouse"),
]

for context, word in contexts:
    history = ' '.join(context)
    print(f"P({word} | {history}) = {lm.score(word, context):.4f}")

In [None]:
# Sample five 15-token sequences from the fitted model, using the seeds
# 0..4 so each run of this cell prints the same samples.
print("Generated text (NLTK MLE model):")
print("=" * 50)

for number, seed in enumerate(range(5), start=1):
    sample = lm.generate(15, random_seed=seed)
    print(f"{number}. {' '.join(sample)}")

## 10.8 Practical: N-Gram Text Analysis

In [None]:
def analyze_ngrams(text, n=2, top_k=10, remove_stopwords=True):
    """Summarize the n-grams of a text.

    The text is lowercased and tokenized, and only alphabetic tokens are
    kept. Stopword removal (English list) happens before the n-grams are
    formed, so an n-gram may join words that were separated by a stopword
    in the original text.

    Args:
        text: Raw input text.
        n: Size of the n-grams to build.
        top_k: Number of most frequent n-grams to report.
        remove_stopwords: Whether to drop English stopwords first.

    Returns:
        A dict with keys 'total_ngrams' (number of n-grams), 'unique_ngrams'
        (number of distinct n-grams) and 'top_ngrams' (list of
        (ngram, count) pairs, most frequent first).
    """
    words = [tok for tok in word_tokenize(text.lower()) if tok.isalpha()]

    if remove_stopwords:
        ignored = set(stopwords.words('english'))
        words = [tok for tok in words if tok not in ignored]

    # Counting straight from the n-gram stream; the sum of all counts is
    # the total number of n-grams produced.
    counts = Counter(ngrams(words, n))

    return {
        'total_ngrams': sum(counts.values()),
        'unique_ngrams': len(counts),
        'top_ngrams': counts.most_common(top_k),
    }

In [None]:
# Small sample document with deliberately repeated phrases
text = """Machine learning is a subset of artificial intelligence.
Machine learning enables computers to learn from data.
Deep learning is a subset of machine learning.
Natural language processing uses machine learning.
Machine learning models can process natural language."""

print(f"Text:\n{text}\n")
print("=" * 50)

# Report unigram, bigram and trigram statistics with stopwords removed
for n in (1, 2, 3):
    result = analyze_ngrams(text, n=n, remove_stopwords=True)
    total, unique = result['total_ngrams'], result['unique_ngrams']

    print(f"\n{n}-grams Analysis:")
    print(f"  Total: {total}")
    print(f"  Unique: {unique}")
    print(f"  Top {n}-grams:")
    for words, occurrences in result['top_ngrams']:
        print(f"    {' '.join(words)}: {occurrences}")

## Summary

| Function | Description |
|----------|-------------|
| `ngrams(tokens, n)` | Generate n-grams |
| `bigrams(tokens)` | Generate bigrams |
| `trigrams(tokens)` | Generate trigrams |
| `BigramCollocationFinder` | Find significant bigrams |
| `TrigramCollocationFinder` | Find significant trigrams |

### Collocation Measures
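- **PMI**: Pointwise Mutual Information — how much more often two words co-occur than would be expected if they were independent; it favors rare pairs, which is why a frequency filter is applied first
- **Chi-Square**: Tests whether the observed co-occurrence counts differ significantly from the counts expected under independence
- **Likelihood Ratio**: Compares how well "the words are dependent" versus "the words are independent" explains the observed counts; generally more reliable than chi-square on sparse data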

### Use Cases
- Text generation
- Autocomplete / suggestion
- Keyphrase extraction
- Language detection
- Spell checking