In [1]:
"""
📌 NLP Preprocessing Pipeline (Reusable Python Template)
---------------------------------------------------------
This script provides a general-purpose text preprocessing pipeline for common NLP tasks, including:
- Text classification
- Sentiment analysis
- Topic modeling
- Named entity recognition

You can choose whether to apply:
- Lemmatization
- Stemming
- Both
- Or none

Usage:
------
Import `preprocess_text()` into your project, or run this script directly to test on a sample text.
"""

import re
import nltk
import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

# 🔧 Download required NLTK resources (only first time)
# "punkt_tab" is the tokenizer data that newer NLTK releases load inside
# word_tokenize(); "punkt" is still requested so older releases keep working.
# NOTE(review): on an NLTK version whose index has no "punkt_tab" package the
# call is expected to report failure without raising — confirm for the
# pinned version.
for _resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(_resource, quiet=True)

# ⚙️ Initialize tools
# Built once at import time and shared by every preprocess_text() call.
_stopwords = set(stopwords.words("english"))  # set => O(1) membership tests
_lemmatizer = WordNetLemmatizer()
_stemmer = PorterStemmer()


def preprocess_text(text: str, lemmatize: bool = True, stem: bool = False) -> list:
    """
    Preprocesses raw input text through a standard NLP pipeline.

    Steps:
        1. Remove HTML tags and comments (each one becomes a space)
        2. Convert to lowercase
        3. Expand contractions
        4. Remove URLs
        5. Remove punctuation and special characters
        6. Normalize whitespace
        7. Tokenize text
        8. Remove stopwords
        9. Lemmatize tokens (only if `lemmatize` is True)
        10. Stem tokens (only if `stem` is True)

    When both `lemmatize` and `stem` are True, tokens are lemmatized first
    and the lemmas are then stemmed.

    Args:
        text (str): Raw input text.
        lemmatize (bool): Whether to apply lemmatization.
        stem (bool): Whether to apply stemming.

    Returns:
        list: List of clean tokens (empty if nothing survives the cleaning).
    """

    # 1. Remove HTML comments and tags.
    #    - A tag must start with a letter (optionally preceded by "/" or "!"),
    #      so plain comparisons such as "a < b and c > d" are left alone.
    #    - Each tag is replaced by a space, not an empty string, so that
    #      "foo<br>bar" yields two words instead of "foobar"; the surplus
    #      whitespace is collapsed in step 6.
    #    - DOTALL lets a comment span several lines.
    text = re.sub(r"<!--.*?-->|<[/!]?[a-zA-Z][^>]*>", " ", text, flags=re.DOTALL)

    # 2. Convert to lowercase
    text = text.lower()

    # 3. Expand contractions
    text = contractions.fix(text)

    # 4. Remove URLs.
    #    Requires "://" after the scheme and "." after "www" so that ordinary
    #    words which merely start with those letters (e.g. "httpd") are kept.
    text = re.sub(r"https?://\S+|\bwww\.\S+", "", text)

    # 5. Remove punctuation and special characters.
    #    Everything that is not an ASCII letter, digit or whitespace is
    #    deleted (not replaced by a space), so "well-known" -> "wellknown".
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)

    # 6. Normalize extra whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # 7. Tokenize
    tokens = word_tokenize(text)

    # 8. Remove stopwords
    tokens = [token for token in tokens if token not in _stopwords]

    # 9. Lemmatize
    if lemmatize:
        tokens = [_lemmatizer.lemmatize(token) for token in tokens]

    # 10. Stem (runs on the lemmas when lemmatize is also True)
    if stem:
        tokens = [_stemmer.stem(token) for token in tokens]

    return tokens


# 🚀 Example usage
if __name__ == "__main__":
    sample_text = """
    <p>Hello there! I'm testing this NLP pipeline using a sample URL: https://example.com,
    along with contractions like can't and numbers 123.</p>
    """

    # Each entry: (heading to print, lemmatize flag, stem flag).
    demo_runs = (
        ("▶ Lemmatization only:", True, False),
        ("\n▶ Stemming only:", False, True),
        ("\n▶ Both Lemmatization + Stemming:", True, True),
    )

    # Run the same sample through every configuration and show the tokens.
    for heading, use_lemmatizer, use_stemmer in demo_runs:
        print(heading)
        print(preprocess_text(sample_text, lemmatize=use_lemmatizer, stem=use_stemmer))


▶ Lemmatization only:
['hello', 'testing', 'nlp', 'pipeline', 'using', 'sample', 'url', 'along', 'contraction', 'like', 'number', '123']

▶ Stemming only:
['hello', 'test', 'nlp', 'pipelin', 'use', 'sampl', 'url', 'along', 'contract', 'like', 'number', '123']

▶ Both Lemmatization + Stemming:
['hello', 'test', 'nlp', 'pipelin', 'use', 'sampl', 'url', 'along', 'contract', 'like', 'number', '123']
