# NLTK Complete Guide - Section 14: Text Classification

This notebook covers:
- Feature Extraction
- Naive Bayes Classifier
- Training and Evaluation
- Document Classification
- Practical Applications

In [None]:
import nltk
import random
from collections import Counter

# Fetch the corpora and tokenizer models this notebook relies on.
nltk.download('movie_reviews', quiet=True)  # labelled pos/neg reviews (sections 14.3+)
nltk.download('punkt', quiet=True)          # tokenizer models for word_tokenize
# Recent NLTK releases load the Punkt models from 'punkt_tab' instead of
# 'punkt'; without it word_tokenize raises LookupError on those versions.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)      # English stop word list (14.4, 14.7)
nltk.download('names', quiet=True)          # male/female name lists (14.2)

from nltk.corpus import movie_reviews, names, stopwords
from nltk.classify import NaiveBayesClassifier, accuracy
from nltk.classify.util import apply_features
from nltk.tokenize import word_tokenize

## 14.1 Introduction to Text Classification

**Text Classification** assigns categories to documents:
- Spam detection
- Sentiment analysis
- Topic categorization
- Language detection

## 14.2 Simple Example: Gender Classification

In [None]:
def gender_features(name):
    """Extract classification features from a first name.

    Args:
        name: The name as a string. May be empty.

    Returns:
        A dict with the lowercased last letter, last two letters and first
        letter of ``name``, plus its length. For an empty name the three
        letter features are empty strings and the length is 0.
    """
    # Slices (rather than name[-1] / name[0]) keep an empty name from
    # raising IndexError while giving the same result for non-empty names.
    return {
        'last_letter': name[-1:].lower(),
        'last_two': name[-2:].lower(),
        'first_letter': name[:1].lower(),
        'length': len(name),
    }

# Show the feature dictionary produced for a few sample names
print("Feature extraction examples:")
for sample_name in ('John', 'Mary', 'Alexandra'):
    print(f"  {sample_name}: {gender_features(sample_name)}")

In [None]:
# Pair every name in the corpus with the gender of the list it came from
male_names = [(entry, 'male') for entry in names.words('male.txt')]
female_names = [(entry, 'female') for entry in names.words('female.txt')]
all_names = [*male_names, *female_names]

# Fixed seed so the shuffle (and therefore the train/test split) is repeatable
random.seed(42)
random.shuffle(all_names)

print(f"Total names: {len(all_names)}")
print(f"Male: {len(male_names)}, Female: {len(female_names)}")

In [None]:
# Turn each (name, gender) pair into a (feature dict, gender) pair
featuresets = [(gender_features(n), g) for n, g in all_names]

# First 80% of the shuffled data trains the model, the rest tests it
train_size = int(len(featuresets) * 0.8)
train_set, test_set = featuresets[:train_size], featuresets[train_size:]

print(f"Training set: {len(train_set)}")
print(f"Test set: {len(test_set)}")

In [None]:
# Train a Naive Bayes classifier on the (feature dict, label) pairs in train_set
classifier = NaiveBayesClassifier.train(train_set)

# Evaluate on the held-out test_set; acc is printed as a percentage
acc = accuracy(classifier, test_set)
print(f"Accuracy: {acc:.2%}")

In [None]:
# Print the 10 most informative features of the gender classifier
print("\nMost Informative Features:")
classifier.show_most_informative_features(10)

In [None]:
# Classify a handful of unseen names and report the model's confidence
test_names = ['Michael', 'Jessica', 'Alex', 'Taylor', 'Jordan', 'Emily', 'James']

print("\nPredictions:")
print("-" * 30)
for name in test_names:
    features = gender_features(name)
    # One probability distribution yields both the winning label and its
    # probability, so the classifier only has to score each name once.
    prob = classifier.prob_classify(features)
    prediction = prob.max()
    confidence = prob.prob(prediction)
    # The arrow was previously mis-encoded as "â†’" (UTF-8 read as cp1252)
    print(f"{name:<12} → {prediction:<8} ({confidence:.1%})")

## 14.3 Document Classification: Movie Reviews

In [None]:
# Collect every review as a (token list, category) pair, category by category
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

# Fixed seed so the shuffled order (and later train/test split) is repeatable
random.seed(42)
random.shuffle(documents)

print(f"Total documents: {len(documents)}")
print(f"Categories: {movie_reviews.categories()}")
print(f"\nSample document (first 20 words): {documents[0][0][:20]}")
print(f"Label: {documents[0][1]}")

In [None]:
# Number of most-frequent corpus words to use as bag-of-words features
num_word_features = 2000

# Count every alphabetic token in the corpus, case-folded
all_words = [w.lower() for w in movie_reviews.words() if w.isalpha()]
word_freq = Counter(all_words)
common_words = [w for w, _ in word_freq.most_common(num_word_features)]

print(f"Vocabulary size: {len(word_freq)}")
# Report the real feature count instead of repeating the literal 2000, so the
# message cannot drift from the value actually used.
print(f"Using top {len(common_words)} words as features")

In [None]:
def document_features(document, word_features):
    """Build word-presence features for a document.

    Args:
        document: Iterable of tokens making up the document.
        word_features: Iterable of words to test for.

    Returns:
        A dict mapping ``'contains(<word>)'`` to True/False for each word in
        ``word_features``, depending on whether it occurs in ``document``.
    """
    # A set makes each membership test O(1) regardless of document length
    present = set(document)
    return {f'contains({w})': (w in present) for w in word_features}

# Example: features of the first shuffled review, limited to the 10 most
# common corpus words so the output stays short
sample_features = document_features(documents[0][0], common_words[:10])
print("Sample features (first 10 words):")
for feat, value in sample_features.items():
    print(f"  {feat}: {value}")

In [None]:
# Create feature sets. NOTE: this builds every feature dict eagerly in memory;
# the imported apply_features helper is not used here.
featuresets = [
    (document_features(doc, common_words), category)
    for (doc, category) in documents
]

# Split data: the first 1600 shuffled documents train the model, the rest test it
train_set = featuresets[:1600]
test_set = featuresets[1600:]

print(f"Training: {len(train_set)}, Testing: {len(test_set)}")

In [None]:
# Train a Naive Bayes classifier on the review feature sets.
# This rebinds `classifier`, replacing the gender model from section 14.2.
print("Training classifier...")
classifier = NaiveBayesClassifier.train(train_set)

# Evaluate on the held-out reviews; `acc` is reused later as the baseline
# ("Original Accuracy") when the improved classifier is reported.
acc = accuracy(classifier, test_set)
print(f"\nAccuracy: {acc:.2%}")

In [None]:
# Print the 15 most informative features of the review classifier
print("\nMost Informative Features for Sentiment:")
classifier.show_most_informative_features(15)

## 14.4 Improved Feature Extraction

In [None]:
# English stop words as a set for O(1) membership tests; read as a
# module-level global by improved_features below.
stop_words = set(stopwords.words('english'))

def improved_features(document, word_features):
    """Extract word-presence features plus a few document-level signals.

    Args:
        document: Sequence of tokens making up the document.
        word_features: Iterable of candidate feature words; any that appear
            in the module-level ``stop_words`` set are skipped.

    Returns:
        A dict with one ``'contains(<word>)'`` boolean per non-stop-word
        feature (matched case-insensitively against alphabetic tokens), plus
        ``'doc_length'`` (more than 500 tokens), ``'has_exclamation'`` and
        ``'has_question'`` booleans.
    """
    document_words = {w.lower() for w in document if w.isalpha()}

    # Word presence (excluding stopwords)
    features = {
        f'contains({word})': (word in document_words)
        for word in word_features
        if word not in stop_words
    }

    # Additional features. Punctuation is checked token by token: a single
    # character can never span the join separator, so this matches the old
    # "'!' in ' '.join(document)" test without building the joined string
    # twice per document.
    features['doc_length'] = len(document) > 500
    features['has_exclamation'] = any('!' in token for token in document)
    features['has_question'] = any('?' in token for token in document)

    return features

In [None]:
# Drop stop words from the common-word list, then keep at most 1500 of the rest
filtered_words = []
for candidate in common_words:
    if candidate in stop_words:
        continue
    filtered_words.append(candidate)
filtered_words = filtered_words[:1500]

print(f"Filtered features: {len(filtered_words)}")

# Rebuild the feature sets with the improved extractor
featuresets_improved = []
for doc, category in documents:
    featuresets_improved.append((improved_features(doc, filtered_words), category))

# Same 1600 / remainder split as the baseline feature sets
train_improved, test_improved = featuresets_improved[:1600], featuresets_improved[1600:]

In [None]:
# Train a second classifier on the improved feature sets
print("Training improved classifier...")
classifier_improved = NaiveBayesClassifier.train(train_improved)

# Compare against the baseline. `acc` is whatever the most recently run cell
# assigned to it -- run the section 14.3 evaluation cell first so that it
# holds the baseline review accuracy.
acc_improved = accuracy(classifier_improved, test_improved)
print(f"\nImproved Accuracy: {acc_improved:.2%}")
print(f"Original Accuracy: {acc:.2%}")

## 14.5 Cross-Validation

In [None]:
def cross_validate(featuresets, num_folds=5):
    """Perform k-fold cross validation with a Naive Bayes classifier.

    The data is split into ``num_folds`` contiguous folds. Each fold is used
    once as the test set while the remaining folds form the training set.
    The per-fold accuracy is printed as it is computed.

    Args:
        featuresets: List of ``(features, label)`` pairs. Must support
            ``len``, slicing and ``+`` (i.e. a list); shuffle it beforehand
            if the order is not already random.
        num_folds: Number of folds; between 2 and ``len(featuresets)``.

    Returns:
        ``(avg_accuracy, accuracies)`` -- the mean accuracy and the list of
        per-fold accuracies.

    Raises:
        ValueError: If ``num_folds`` is smaller than 2 or larger than the
            number of feature sets.
    """
    total = len(featuresets)
    if num_folds < 2:
        raise ValueError(f"num_folds must be at least 2, got {num_folds}")
    if num_folds > total:
        raise ValueError(
            f"num_folds ({num_folds}) cannot exceed the number of "
            f"feature sets ({total})"
        )

    # Spread any remainder over the first folds so every sample is tested
    # exactly once; when total divides evenly this is the plain equal split.
    base_size, remainder = divmod(total, num_folds)
    accuracies = []
    test_start = 0

    for i in range(num_folds):
        test_end = test_start + base_size + (1 if i < remainder else 0)

        test_fold = featuresets[test_start:test_end]
        train_fold = featuresets[:test_start] + featuresets[test_end:]

        # Train and evaluate
        classifier = NaiveBayesClassifier.train(train_fold)
        acc = accuracy(classifier, test_fold)
        accuracies.append(acc)
        print(f"Fold {i+1}: {acc:.2%}")

        test_start = test_end

    avg_accuracy = sum(accuracies) / len(accuracies)
    return avg_accuracy, accuracies

In [None]:
# Run 5-fold cross validation on the improved feature sets; cross_validate
# prints each fold's accuracy and returns the mean plus the per-fold list.
print("5-Fold Cross Validation:")
print("-" * 30)
avg_acc, fold_accs = cross_validate(featuresets_improved, num_folds=5)
print(f"\nAverage Accuracy: {avg_acc:.2%}")

## 14.6 Confusion Matrix and Metrics

In [None]:
def evaluate_classifier(classifier, test_set):
    """Calculate a confusion matrix and precision, recall, F1 for each class.

    Args:
        classifier: Any object with a ``classify(features)`` method.
        test_set: Iterable of ``(features, label)`` pairs.

    Returns:
        ``(confusion, metrics)`` where ``confusion[true][pred]`` is the number
        of samples with gold label ``true`` predicted as ``pred``, and
        ``metrics[label]`` is a dict with ``'precision'``, ``'recall'`` and
        ``'f1'``. A metric whose denominator is zero is reported as 0.
    """
    # Materialise once so a generator input is not exhausted by the first pass
    test_set = list(test_set)

    # Get predictions
    predictions = [classifier.classify(features) for features, _ in test_set]
    actual = [label for _, label in test_set]

    # Include predicted labels too: the classifier may output a class that
    # never occurs among the gold labels, which would otherwise be a KeyError.
    # Sorting (by str, so mixed label types cannot fail) keeps the label
    # order deterministic across runs.
    labels = sorted(set(actual) | set(predictions), key=str)

    # Build confusion matrix
    confusion = {
        true_label: {pred_label: 0 for pred_label in labels}
        for true_label in labels
    }
    for true_label, pred_label in zip(actual, predictions):
        confusion[true_label][pred_label] += 1

    # Calculate metrics
    metrics = {}
    for label in labels:
        tp = confusion[label][label]
        # Column sum minus the diagonal: other classes predicted as `label`
        fp = sum(confusion[other][label] for other in labels if other != label)
        # Row sum minus the diagonal: `label` samples predicted as something else
        fn = sum(confusion[label][other] for other in labels if other != label)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        metrics[label] = {
            'precision': precision,
            'recall': recall,
            'f1': f1,
        }

    return confusion, metrics

In [None]:
# Score the improved classifier on its held-out reviews
confusion, metrics = evaluate_classifier(classifier_improved, test_improved)

# Confusion matrix: one row per actual class, one column per predicted class
print("Confusion Matrix:")
print("=" * 40)
print(f"{'':>12} {'Predicted':<20}")
print(f"{'Actual':<12} {'neg':<10} {'pos':<10}")
print("-" * 40)
class_order = ['neg', 'pos']
for true_label in class_order:
    cells = [f"{confusion[true_label][pred_label]:<10}" for pred_label in class_order]
    print(f"{true_label:<12}" + ''.join(cells))

# Per-class precision / recall / F1 table
print("\nMetrics by Class:")
print("=" * 40)
print(f"{'Class':<10} {'Precision':<12} {'Recall':<12} {'F1':<10}")
print("-" * 40)
for label, m in metrics.items():
    print(f"{label:<10} {m['precision']:<12.2%} {m['recall']:<12.2%} {m['f1']:<10.2%}")

## 14.7 Complete Classifier Class

In [None]:
class TextClassifier:
    """Bag-of-words Naive Bayes text classification pipeline.

    Documents may be given either as raw strings (tokenised with
    ``word_tokenize``) or as pre-tokenised sequences of words. Features are
    ``'contains(<word>)'`` booleans for the ``num_features`` most frequent
    non-stop-word alphabetic tokens seen during training.
    """

    def __init__(self, num_features=1500):
        """Create an untrained classifier.

        Args:
            num_features: Maximum number of vocabulary words used as features.
        """
        self.num_features = num_features
        self.word_features = None  # vocabulary; set by train()
        self.classifier = None     # fitted NaiveBayesClassifier; set by train()
        self.stop_words = set(stopwords.words('english'))

    @staticmethod
    def _normalize(document):
        """Return the lowercased alphabetic tokens of a document, in order."""
        if isinstance(document, str):
            document = word_tokenize(document.lower())
        return [w.lower() for w in document if w.isalpha()]

    def _features_from_tokens(self, tokens):
        """Build the word-presence feature dict from normalised tokens."""
        present = set(tokens)
        return {f'contains({word})': (word in present) for word in self.word_features}

    def _require_classifier(self):
        """Raise a clear error when a prediction is requested before training."""
        if self.classifier is None:
            raise RuntimeError("TextClassifier has not been trained; call train() first")

    def extract_features(self, document):
        """Extract features from a document.

        Args:
            document: A raw string or a sequence of tokens.

        Returns:
            A dict mapping ``'contains(<word>)'`` to True/False for every
            vocabulary word.

        Raises:
            RuntimeError: If no vocabulary has been built yet (train() has
                not been called).
        """
        if self.word_features is None:
            raise RuntimeError("TextClassifier has no vocabulary; call train() first")
        return self._features_from_tokens(self._normalize(document))

    def train(self, documents, labels):
        """Build the vocabulary from ``documents`` and train the classifier.

        Args:
            documents: Iterable of documents (strings or token sequences).
            labels: Iterable of labels, one per document.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If ``documents`` and ``labels`` differ in length.
        """
        # Materialise so generators survive and lengths can be compared;
        # zip() would otherwise silently drop the unmatched tail.
        documents = list(documents)
        labels = list(labels)
        if len(documents) != len(labels):
            raise ValueError(
                f"documents and labels must have the same length "
                f"({len(documents)} != {len(labels)})"
            )

        # Tokenise every document once and reuse the result for both the
        # vocabulary count and the feature extraction below.
        token_lists = [self._normalize(doc) for doc in documents]

        # Build vocabulary
        word_freq = Counter(
            token
            for tokens in token_lists
            for token in tokens
            if token not in self.stop_words
        )
        self.word_features = [w for w, _ in word_freq.most_common(self.num_features)]

        # Create feature sets
        featuresets = [
            (self._features_from_tokens(tokens), label)
            for tokens, label in zip(token_lists, labels)
        ]

        # Train
        self.classifier = NaiveBayesClassifier.train(featuresets)

        return self

    def predict(self, document):
        """Predict the class label for a document.

        Raises:
            RuntimeError: If the classifier has not been trained.
        """
        self._require_classifier()
        features = self.extract_features(document)
        return self.classifier.classify(features)

    def predict_proba(self, document):
        """Return a ``{label: probability}`` dict for a document.

        Raises:
            RuntimeError: If the classifier has not been trained.
        """
        self._require_classifier()
        features = self.extract_features(document)
        prob_dist = self.classifier.prob_classify(features)
        return {label: prob_dist.prob(label) for label in prob_dist.samples()}

    def evaluate(self, documents, labels):
        """Return the accuracy on labelled test documents.

        Raises:
            RuntimeError: If the classifier has not been trained.
            ValueError: If ``documents`` and ``labels`` differ in length.
        """
        self._require_classifier()
        documents = list(documents)
        labels = list(labels)
        if len(documents) != len(labels):
            raise ValueError(
                f"documents and labels must have the same length "
                f"({len(documents)} != {len(labels)})"
            )
        featuresets = [
            (self.extract_features(doc), label)
            for doc, label in zip(documents, labels)
        ]
        return accuracy(self.classifier, featuresets)

    def show_features(self, n=10):
        """Print the ``n`` most informative features.

        Raises:
            RuntimeError: If the classifier has not been trained.
        """
        self._require_classifier()
        self.classifier.show_most_informative_features(n)

In [None]:
# Use the class
# Prepare data: one token list and one category label per review file
fileids = movie_reviews.fileids()
docs = [list(movie_reviews.words(fid)) for fid in fileids]
labels = [movie_reviews.categories(fid)[0] for fid in fileids]

# Shuffle documents and labels together. Seed first, as the earlier cells do,
# so the split -- and therefore the reported accuracy -- is reproducible.
combined = list(zip(docs, labels))
random.seed(42)
random.shuffle(combined)
docs, labels = zip(*combined)

# Split: first 1600 reviews for training, the remainder for testing
train_docs, test_docs = docs[:1600], docs[1600:]
train_labels, test_labels = labels[:1600], labels[1600:]

# Train
clf = TextClassifier(num_features=1500)
clf.train(train_docs, train_labels)

# Evaluate
acc = clf.evaluate(test_docs, test_labels)
print(f"Accuracy: {acc:.2%}")

In [None]:
# Test with new text: raw strings, which TextClassifier tokenises itself
test_reviews = [
    "This movie was absolutely fantastic! Great acting and storyline.",
    "Terrible film. Waste of time and money. Very disappointing.",
    "It was okay. Not great, not terrible. Average movie.",
]

print("Predictions on new reviews:")
print("=" * 60)

for review in test_reviews:
    prediction = clf.predict(review)
    probs = clf.predict_proba(review)
    
    # Only the first 50 characters of the review are echoed
    print(f"\nReview: {review[:50]}...")
    print(f"Prediction: {prediction}")
    # Assumes the classifier was trained with exactly the labels 'pos' and 'neg'
    print(f"Confidence: pos={probs['pos']:.1%}, neg={probs['neg']:.1%}")

## Summary

| Step | Code |
|------|------|
| Create features | `features = {...}` dictionary |
| Create featuresets | `[(features, label), ...]` |
| Train classifier | `NaiveBayesClassifier.train(train_set)` |
| Classify | `classifier.classify(features)` |
| Get probabilities | `classifier.prob_classify(features)` |
| Evaluate | `accuracy(classifier, test_set)` |
| Show features | `classifier.show_most_informative_features(n)` |

### Classification Pipeline
1. **Collect** labeled data
2. **Extract** features from text
3. **Split** into train/test sets
4. **Train** classifier
5. **Evaluate** performance
6. **Tune** features and parameters