# NLTK Complete Guide - Section 4: Stopwords & Text Cleaning

This notebook covers:
- Stopwords Basics
- Custom Stopwords
- Text Cleaning Functions
- Complete Preprocessing Pipeline

In [None]:
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the data used throughout the notebook. Newer NLTK releases load the
# 'punkt_tab' resource in word_tokenize; 'punkt' is kept for older releases.
# With quiet=True an unavailable resource is reported via the return value
# rather than console output.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

## 4.1 Stopwords Basics

**Stopwords** are very common words (the, is, at, which, on, etc.) that usually carry little meaning on their own, so they are often removed before analysis.

In [None]:
# Load the English stopword list into a set for fast membership tests
stop_words = set(stopwords.words('english'))

stopword_total = len(stop_words)
print(f"Number of English stopwords: {stopword_total}")

# Sorting gives a stable, alphabetical preview of the (unordered) set
first_thirty = sorted(stop_words)[:30]
print("\nFirst 30 stopwords (alphabetically):")
print(first_thirty)

In [None]:
# List every file id exposed by the stopwords corpus (one per language)
available_languages = stopwords.fileids()
print("Available languages:")
print(available_languages)

### Stopwords by Language

In [None]:
# Compare the size of the stopword list across a handful of languages
languages = ['english', 'spanish', 'french', 'german', 'italian', 'portuguese']

header = f"{'Language':<12} {'Count':>6}  Sample words"
print(header)
print("-" * 60)

for lang in languages:
    words = stopwords.words(lang)
    sample = words[:5]  # first five entries, in corpus order
    row = f"{lang.capitalize():<12} {len(words):>6}  {sample}"
    print(row)

### Removing Stopwords

In [None]:
text = "This is a sample sentence showing off the stop words filtration."
stop_words = set(stopwords.words('english'))

# Tokenize the lowercased sentence
tokens = word_tokenize(text.lower())

# Pass 1: drop stopwords only (punctuation tokens survive)
filtered = [tok for tok in tokens if tok not in stop_words]

# Pass 2: additionally drop every token that is not purely alphanumeric
filtered_clean = [tok for tok in filtered if tok.isalnum()]

print(f"Original: {text}")
print(f"\nTokens ({len(tokens)}): {tokens}")
print(f"\nWithout stopwords ({len(filtered)}): {filtered}")
print(f"\nWithout stopwords + punct ({len(filtered_clean)}): {filtered_clean}")

## 4.2 Custom Stopwords

Modify stopwords for your specific use case.

In [None]:
# Stock English list; the following cells derive customized variants from it
base_stopwords = set(stopwords.words('english'))
base_count = len(base_stopwords)
print(f"Base stopwords: {base_count}")

### Extend Stopwords

In [None]:
# Extra filler words to treat as stopwords on top of the base list
extra_words = {'also', 'however', 'therefore', 'thus', 'hence', 'would', 'could'}
extended_stopwords = base_stopwords | extra_words

print(f"Extended stopwords: {len(extended_stopwords)}")
# Only words that were not already in the base list show up here
print(f"Added words: {extended_stopwords - base_stopwords}")

### Keep Negations (for Sentiment Analysis)

In [None]:
# Negation words flip the meaning of a sentence, so take them out of the
# stopword set to make sure they survive filtering (important for sentiment!)
negations = {'not', 'no', 'nor', 'neither', 'never', "don't", "won't", "can't", "isn't", "aren't"}
sentiment_stopwords = base_stopwords.difference(negations)

print(f"Sentiment stopwords: {len(sentiment_stopwords)}")
# Shows only the negations that were actually present in the base list
print(f"Kept words: {base_stopwords - sentiment_stopwords}")

### Comparison: Standard vs Sentiment-Aware

In [None]:
text = "This product is not good and I would never recommend it to anyone."
tokens = word_tokenize(text.lower())

# Standard removal
standard = [w for w in tokens if w not in base_stopwords and w.isalnum()]

# Sentiment-aware removal
sentiment = [w for w in tokens if w not in sentiment_stopwords and w.isalnum()]

# Words that only the sentiment-aware list lets through. Derived from the two
# results so the message below always matches what was actually filtered.
lost = sorted(set(sentiment) - set(standard))

print(f"Original: {text}")
print(f"\nStandard stopword removal: {standard}")
print(f"  ⚠️  {lost} removed - loses negative sentiment!")
print(f"\nSentiment-aware removal: {sentiment}")
print(f"  ✅ {lost} preserved - keeps negative sentiment!")

### Domain-Specific Stopwords

In [None]:
# Medical domain: terms that appear in nearly every clinical document
medical_terms = {
    'patient', 'patients', 'doctor', 'hospital', 'treatment',
    'medical', 'clinical', 'symptoms', 'condition', 'diagnosis',
}
medical_stopwords = base_stopwords | medical_terms

# Legal domain: boilerplate vocabulary of legal texts
legal_terms = {
    'court', 'plaintiff', 'defendant', 'hereby', 'whereas',
    'shall', 'pursuant', 'herein', 'thereof', 'aforementioned',
}
legal_stopwords = base_stopwords | legal_terms

print(f"Medical stopwords: {len(medical_stopwords)}")
print(f"Legal stopwords: {len(legal_stopwords)}")

## 4.3 Text Cleaning Functions

### Basic Text Cleaning

In [None]:
def clean_text_basic(text):
    """Lowercase *text*, strip ASCII punctuation and collapse whitespace.

    Args:
        text: Input string.

    Returns:
        The cleaned string, with words separated by single spaces and no
        leading or trailing whitespace.
    """
    text = text.lower()

    # Delete every character listed in string.punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Collapse whitespace last: removing a free-standing punctuation mark
    # ("a - b") would otherwise leave a double space behind.
    return ' '.join(text.split())

# Quick demonstration on a noisy string
dirty = "  Hello,   WORLD!!!   How are   you???  "
cleaned = clean_text_basic(dirty)
print(f"Before: '{dirty}'")
print(f"After:  '{cleaned}'")

### Advanced Text Cleaning

In [None]:
def clean_text_advanced(text):
    """Strip web noise from *text* and keep only lowercase ASCII words.

    Steps, in order: lowercase, remove URLs, HTML tags, e-mail addresses,
    @mentions and #hashtags, drop every character that is not an ASCII letter
    or whitespace, then collapse whitespace.

    Args:
        text: Input string.

    Returns:
        The cleaned string, words separated by single spaces.
    """
    text = text.lower()

    # URLs: require a real scheme ("http://", "https://") or a "www." prefix
    # at a word boundary so ordinary words such as "awwww" are left alone.
    text = re.sub(r'\bhttps?://\S+|\bwww\.\S+', '', text)

    # HTML tags become a space so "<p>one</p><p>two</p>" does not turn into
    # "onetwo"; [^>]+ also matches tags that span a line break.
    text = re.sub(r'<[^>]+>', ' ', text)

    # E-mail addresses must go before mentions, which would otherwise eat
    # only the "@domain" half.
    text = re.sub(r'\S+@\S+', '', text)

    # Mentions (@username)
    text = re.sub(r'@\w+', '', text)

    # Hashtags (tag text included)
    text = re.sub(r'#\w+', '', text)

    # Everything that is not an ASCII letter or whitespace: digits,
    # punctuation, and also non-ASCII letters.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Collapse the whitespace left behind by the removals above
    return ' '.join(text.split())

In [None]:
# Sample containing a URL, HTML, a mention, a hashtag, an e-mail and a price
dirty_text = """
Check out https://example.com!!! 
<b>AMAZING</b> deal @store #sale 
Contact: user@email.com
Price: $99.99 (50% OFF!!!)
"""

print("Original:")
print(dirty_text)

# Run both cleaners on the same input so their output can be compared
for label, cleaner_fn in (
    ("Basic cleaning:", clean_text_basic),
    ("Advanced cleaning:", clean_text_advanced),
):
    print(f"\n{label}")
    print(cleaner_fn(dirty_text))

## 4.4 Complete Preprocessing Pipeline

In [None]:
def preprocess_text(text,
                    lowercase=True,
                    remove_urls=True,
                    remove_html=True,
                    remove_emails=True,
                    remove_mentions=True,
                    remove_hashtags=False,  # Keep hashtag text
                    remove_numbers=True,
                    remove_punctuation=True,
                    remove_stopwords_flag=True,
                    min_word_length=2,
                    custom_stopwords=None):
    """
    Complete text preprocessing pipeline.

    Args:
        text: Input string.
        lowercase: Lowercase the text before any other step.
        remove_urls: Remove http(s):// and www. URLs.
        remove_html: Replace HTML tags with a space.
        remove_emails: Remove whitespace-delimited chunks containing '@'.
        remove_mentions: Remove @username mentions.
        remove_hashtags: If True drop hashtags entirely; if False keep the
            tag text and only strip the leading '#'.
        remove_numbers: Remove runs of digits.
        remove_punctuation: Remove every character that is neither a word
            character nor whitespace.
        remove_stopwords_flag: Drop English stopwords (plus custom_stopwords).
        min_word_length: Minimum token length to keep.
        custom_stopwords: Optional extra stopwords (a string or an iterable
            of strings), added to the English list.

    Returns: List of cleaned tokens
    """

    if lowercase:
        text = text.lower()

    if remove_urls:
        # Require a scheme or "www." at a word boundary so ordinary words
        # ("awwww", "httpd") survive; case-insensitive for lowercase=False.
        text = re.sub(r'\bhttps?://\S+|\bwww\.\S+', '', text,
                      flags=re.IGNORECASE)

    if remove_html:
        # A space keeps the words on either side of a tag apart
        text = re.sub(r'<[^>]+>', ' ', text)

    if remove_emails:
        # Must precede mention removal, which would strip only "@domain"
        text = re.sub(r'\S+@\S+', '', text)

    if remove_mentions:
        text = re.sub(r'@\w+', '', text)

    if remove_hashtags:
        text = re.sub(r'#\w+', '', text)
    else:
        # Keep hashtag text, just remove #
        text = re.sub(r'#(\w+)', r'\1', text)

    if remove_numbers:
        text = re.sub(r'\d+', '', text)

    if remove_punctuation:
        # NOTE: this runs before tokenization, so a contraction such as
        # "don't" reaches the stopword filter as "dont".
        text = re.sub(r'[^\w\s]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    if remove_stopwords_flag:
        stop_words = set(stopwords.words('english'))
        if custom_stopwords:
            if isinstance(custom_stopwords, str):
                # A bare string is one word, not an iterable of characters
                custom_stopwords = [custom_stopwords]
            stop_words.update(word.lower() for word in custom_stopwords)
        # Compare case-insensitively so "The" is still dropped when the
        # caller passed lowercase=False.
        tokens = [t for t in tokens if t.lower() not in stop_words]

    # Filter by length
    return [t for t in tokens if len(t) >= min_word_length]

In [None]:
# Sample input mixing an emoji, a URL, HTML, an e-mail address, a mention,
# hashtags, prices and heavy punctuation
messy_text = """
🎉 Check out our AMAZING new product at https://shop.example.com! 
<p>Contact support@company.com for help.</p>
@customer said: "This is the BEST thing I've bought in 2024!!!"
#happy #satisfied
Price: only $49.99 (was $99.99) - 50% OFF!!! 
Limited time offer... Don't miss out!!!
"""

print("Original text:")
print(messy_text)

In [None]:
# Run the pipeline with every option left at its default value
tokens = preprocess_text(messy_text)
print("Default preprocessing:", tokens, sep="\n")

In [None]:
# Keep hashtag text (only the '#' is stripped). remove_hashtags=False is the
# function's default; it is spelled out here for emphasis.
hashtag_options = {"remove_hashtags": False}
tokens = preprocess_text(messy_text, **hashtag_options)
print("Keep hashtags:", tokens, sep="\n")

In [None]:
# For sentiment analysis (keep negations)
sentiment_stops = set(stopwords.words('english')) - {'not', 'no', 'never', "don't", "won't"}

# preprocess_text always filters against the full English list (custom
# stopwords can only add to it), so skip its stopword step and apply the
# sentiment-aware set here instead.
tokens = preprocess_text(
    "This is not good and I don't like it at all!",
    remove_stopwords_flag=False,
)
tokens = [t for t in tokens if t not in sentiment_stops]
print("For sentiment:")
print(tokens)

In [None]:
# Minimal preprocessing: keep stopwords, digits and one-character tokens
minimal_options = {
    "remove_stopwords_flag": False,
    "remove_numbers": False,
    "min_word_length": 1,
}
tokens = preprocess_text(messy_text, **minimal_options)
print("Minimal preprocessing:", tokens, sep="\n")

## 4.5 Text Normalization

In [None]:
def expand_contractions(text):
    """Expand common English contractions.

    Irregular forms (won't, can't, shan't, let's) are matched as whole words,
    case-insensitively, and keep a leading capital. The remaining contractions
    are expanded by their lowercase suffix (n't, 're, 's, ...).

    Note that "'s" is always expanded to " is", so possessives ("John's") and
    "has" contractions are expanded incorrectly.

    Args:
        text: Input string.

    Returns:
        The text with contractions expanded.
    """
    # Whole-word forms whose stem changes; must run before the "n't" suffix
    irregular = {
        "won't": "will not",
        "can't": "cannot",
        "shan't": "shall not",
        "let's": "let us",
    }
    # Generic suffix rewrites, applied in this order
    suffixes = {
        "n't": " not",
        "'re": " are",
        "'s": " is",
        "'d": " would",
        "'ll": " will",
        "'ve": " have",
        "'m": " am",
    }

    # Treat a typographic apostrophe inside a word like a plain one
    text = re.sub(r"(?<=\w)\u2019(?=\w)", "'", text)

    def _expand_irregular(match):
        word = match.group(0)
        expansion = irregular[word.lower()]
        # Preserve sentence-initial capitalisation ("Can't" -> "Cannot")
        return expansion.capitalize() if word[0].isupper() else expansion

    pattern = r"\b(?:" + "|".join(map(re.escape, irregular)) + r")\b"
    text = re.sub(pattern, _expand_irregular, text, flags=re.IGNORECASE)

    for suffix, expansion in suffixes.items():
        text = text.replace(suffix, expansion)

    return text

# Demonstrate on a sentence with three different contractions
text = "I can't believe it's working! We're so happy!"
expanded = expand_contractions(text)
print(f"Original:  {text}")
print(f"Expanded:  {expanded}")

In [None]:
def normalize_repeated_chars(text):
    """Reduce elongated character runs (coooool -> cool).

    Any character repeated three or more times in a row is reduced to exactly
    two occurrences. Digits are left untouched so numbers such as "1000000"
    are not corrupted; newlines are not collapsed either.

    Args:
        text: Input string.

    Returns:
        The text with runs of 3+ identical non-digit characters cut to 2.
    """
    # [^\d\n]: any character except digits and the newline
    return re.sub(r'([^\d\n])\1{2,}', r'\1\1', text)

# Demonstrate on text with stretched vowels and repeated punctuation
text = "This is sooooo coooool! I loooove it!!!"
normalized = normalize_repeated_chars(text)
print(f"Original:   {text}")
print(f"Normalized: {normalized}")

## 4.6 Batch Processing

In [None]:
def preprocess_corpus(documents, **kwargs):
    """Run preprocess_text over every document.

    Args:
        documents: Iterable of raw text strings.
        **kwargs: Options forwarded unchanged to preprocess_text.

    Returns:
        A list with one token list per document, in input order.
    """
    cleaned_docs = []
    for document in documents:
        cleaned_docs.append(preprocess_text(document, **kwargs))
    return cleaned_docs

# Example documents
documents = [
    "Natural language processing is fascinating!",
    "Machine learning enables NLP applications.",
    "Deep learning has transformed NLP research.",
    "Text preprocessing is essential for NLP.",
    "NLP combines linguistics and computer science.",
]

# Preprocess all
processed = preprocess_corpus(documents)

# Show each raw document next to its token list
print("Original → Processed")
print("=" * 60)
for orig, proc in zip(documents, processed):
    print(orig)
    print(f"  → {proc}\n")

### Create Vocabulary

In [None]:
from collections import Counter

def create_vocabulary(tokenized_docs, min_freq=1):
    """Build a vocabulary and frequency table from tokenized documents.

    Args:
        tokenized_docs: Iterable of token collections, one per document.
        min_freq: Minimum total count a word needs to enter the vocabulary.

    Returns:
        A (vocab, word_counts) tuple: the set of words whose count is at
        least min_freq, and a Counter holding the count of every word
        (including those below the threshold).
    """
    word_counts = Counter()
    for doc_tokens in tokenized_docs:
        word_counts.update(doc_tokens)

    # Keep only the words that occur often enough
    vocab = {word for word in word_counts if word_counts[word] >= min_freq}

    return vocab, word_counts

# Build the vocabulary from the documents preprocessed above
vocab, counts = create_vocabulary(processed)

print(f"Vocabulary size: {len(vocab)}")
print("\nMost common words:")
top_ten = counts.most_common(10)
for word, count in top_ten:
    print(f"  {word}: {count}")

## 4.7 TextCleaner Class

In [None]:
class TextCleaner:
    """Reusable text cleaning utility.

    Holds a stopword set for one language plus a set of user-supplied
    stopwords. The configuration methods return ``self`` so calls can be
    chained.
    """

    def __init__(self, language='english'):
        """Load the stopword list for *language* from the stopwords corpus."""
        # Stopwords loaded from the corpus; keep_words() removes entries
        self.stop_words = set(stopwords.words(language))
        # Extra stopwords supplied through add_stopwords()
        self.custom_stopwords = set()

    @staticmethod
    def _as_word_set(words):
        """Return *words* (a string or an iterable of strings) as a lowercase set."""
        if isinstance(words, str):
            # A bare string is one word, not an iterable of characters
            words = [words]
        # clean() lowercases its input, so only lowercase entries can match
        return {word.lower() for word in words}

    def add_stopwords(self, words):
        """Add custom stopwords (a single word or an iterable of words)."""
        self.custom_stopwords |= self._as_word_set(words)
        return self

    def keep_words(self, words):
        """Remove words from stopwords (keep them).

        The words are dropped from both the language list and the custom
        list, so a word added earlier with add_stopwords() is kept as well.
        """
        kept = self._as_word_set(words)
        self.stop_words -= kept
        self.custom_stopwords -= kept
        return self

    def clean(self, text):
        """Clean *text* and return its tokens.

        Lowercases, removes URLs, HTML tags, punctuation and digits,
        tokenizes, then drops stopwords and one-character tokens.
        """
        text = text.lower()

        # URLs: http(s):// or www. at a word boundary
        text = re.sub(r'\bhttps?://\S+|\bwww\.\S+', '', text)

        # HTML tags become a space so adjacent words are not glued together
        text = re.sub(r'<[^>]+>', ' ', text)

        # Punctuation (anything that is not a word character or whitespace)
        text = re.sub(r'[^\w\s]', '', text)

        # Numbers
        text = re.sub(r'\d+', '', text)

        tokens = word_tokenize(text)

        all_stopwords = self.stop_words | self.custom_stopwords
        return [t for t in tokens if len(t) > 1 and t not in all_stopwords]

    def clean_batch(self, texts):
        """Clean multiple texts; returns one token list per input text."""
        return [self.clean(text) for text in texts]

In [None]:
# Build a cleaner configured for sentiment analysis; the configuration
# methods return the cleaner itself, so the calls can be chained.
cleaner = (
    TextCleaner()
    .keep_words(['not', 'no', 'never'])   # Keep negations
    .add_stopwords(['said', 'also'])      # Remove common words
)

# Try it on a sentence containing both negations and the custom stopwords
text = "I said this product is not good. Also, it never works properly!"
cleaned_tokens = cleaner.clean(text)
print(f"Text: {text}")
print(f"Cleaned: {cleaned_tokens}")

## Summary

| Task | Code |
|------|------|
| Get stopwords | `stopwords.words('english')` |
| Remove stopwords | `[w for w in tokens if w not in stop_words]` |
| Add stopwords | `stop_words.union({'word1', 'word2'})` |
| Remove from stopwords | `stop_words - {'not', 'no'}` |
| Remove URLs | `re.sub(r'http\S+', '', text)` |
| Remove HTML | `re.sub(r'<[^>]+>', '', text)` |
| Remove punctuation | `re.sub(r'[^\w\s]', '', text)` |

### Best Practices
1. **For sentiment analysis**: Keep negations (not, no, never)
2. **For topic modeling**: Remove domain-specific common words
3. **For search**: More aggressive cleaning is usually better
4. **Always**: Remove URLs, HTML, and normalize whitespace