In [41]:
text = (
    "Text preprocessing is a crucial step in natural language processing (NLP) and machine learning workflows, enabling more accurate and efficient text analysis. Raw text data is often messy and unstructured, containing inconsistencies, irrelevant characters, stop words, and varied forms of words. Through text preprocessing, this data can be cleaned, standardized, and transformed into a format better suited for algorithms to understand and analyze. Key steps include tokenization, which breaks down text into individual words or tokens; lemmatization, which converts words to their base or root forms; and removing stop words, which are common words (like "
    'the," "and," "is'
    ") that don’t add substantial meaning to the text. Additionally, text normalization techniques—such as lowercasing and punctuation removal—help in reducing dimensionality and ensuring consistency. Effective text preprocessing not only improves model performance by reducing noise and redundancy but also enhances the interpretability of text-based insights in applications like sentiment analysis, document classification, and chatbot responses."
)

In [42]:
import pandas as pd
import string

df = pd.DataFrame([text])
df.head()

Unnamed: 0,0
0,Text preprocessing is a crucial step in natura...


In [43]:
df["text"] = text

In [44]:
df = df.drop(columns=[0], axis=1)

In [45]:
df.head()

Unnamed: 0,text
0,Text preprocessing is a crucial step in natura...


# **Convert to lowercase**

In [46]:
df["cleaned_text"] = df["text"].str.lower()

In [47]:
df.head()

Unnamed: 0,text,cleaned_text
0,Text preprocessing is a crucial step in natura...,text preprocessing is a crucial step in natura...


# **Remove Punctuation**

In [48]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [49]:
def remove_punctuation(text):
    punctuations = string.punctuation
    return text.translate(str.maketrans("", "", punctuations))

In [50]:
df["cleaned_text"] = df["cleaned_text"].apply(lambda x: remove_punctuation(x))
df.head()

Unnamed: 0,text,cleaned_text
0,Text preprocessing is a crucial step in natura...,text preprocessing is a crucial step in natura...


# **Remove stop words**

In [51]:
from nltk.corpus import stopwords

stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [52]:
STOPWORDS = set(stopwords.words("english"))


def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

In [53]:
df["cleaned_text"] = df["cleaned_text"].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,text,cleaned_text
0,Text preprocessing is a crucial step in natura...,text preprocessing crucial step natural langua...


# **Remove Frequent words**

In [54]:
from collections import Counter

word_counts = Counter()
for text in df["cleaned_text"]:
    for word in text.split():
        word_counts[word] += 1
word_counts.most_common(10)

[('text', 8),
 ('words', 6),
 ('preprocessing', 3),
 ('analysis', 2),
 ('data', 2),
 ('stop', 2),
 ('forms', 2),
 ('like', 2),
 ('reducing', 2),
 ('crucial', 1)]

In [55]:
FREQUENT_WORDS = set([word for (word, count) in word_counts.most_common(10)])


def remove_frequent_words(text):
    return " ".join([word for word in str(text).split() if word not in FREQUENT_WORDS])


df["cleaned_text"] = df["cleaned_text"].apply(lambda x: remove_frequent_words(x))
df.head()

Unnamed: 0,text,cleaned_text
0,Text preprocessing is a crucial step in natura...,step natural language processing nlp machine l...


# **Remove Rare Words**

In [59]:
RARE_WORDS = set(word for (word, count) in word_counts.most_common()[:-10:-1])
RARE_WORDS

{'applications',
 'chatbot',
 'classification',
 'document',
 'insights',
 'interpretability',
 'responses',
 'sentiment',
 'textbased'}

In [60]:
def remove_rare_words(text):
    return " ".join([word for word in str(text).split() if word not in RARE_WORDS])


df["cleaned_text"] = df["cleaned_text"].apply(lambda x: remove_rare_words(x))
df.head()

Unnamed: 0,text,cleaned_text
0,Text preprocessing is a crucial step in natura...,step natural language processing nlp machine l...


# **Remove Special Character**

In [62]:
import re


def remove_special_characters(text):
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text


df["cleaned_text"] = df["cleaned_text"].apply(lambda x: remove_special_characters(x))
df.head()

Unnamed: 0,text,cleaned_text
0,Text preprocessing is a crucial step in natura...,step natural language processing nlp machine l...


# **Stemming**

In [64]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()


def stem_text(text):
    return " ".join([ps.stem(word) for word in text.split()])


df["stemmed_text"] = df["cleaned_text"].apply(lambda x: stem_text(x))
df.head()

Unnamed: 0,text,cleaned_text,stemmed_text
0,Text preprocessing is a crucial step in natura...,step natur languag process nlp machin learn wo...,step natur languag process nlp machin learn wo...


# **Lemmatization and POS tagging**

In [None]:
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer


lemmatizer = WordNetLemmatizer()
wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}


def lemmatize_text(text):
    pos_text = pos_tag(text.split())
    return " ".join(
        [
            lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN))
            for word, pos in pos_text
        ]
    )

[nltk_data] Error loading averaged_perceptron_tagger_english: Package
[nltk_data]     'averaged_perceptron_tagger_english' not found in
[nltk_data]     index


In [68]:
df["lemmatized_text"] = df["cleaned_text"].apply(lambda x: lemmatize_text(x))
df.head()

LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger_eng[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger_eng')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger_eng/[0m

  Searched in:
    - 'C:\\Users\\Gaurav/nltk_data'
    - 'c:\\Users\\Gaurav\\OneDrive\\Desktop\\TPT\\.venv\\nltk_data'
    - 'c:\\Users\\Gaurav\\OneDrive\\Desktop\\TPT\\.venv\\share\\nltk_data'
    - 'c:\\Users\\Gaurav\\OneDrive\\Desktop\\TPT\\.venv\\lib\\nltk_data'
    - 'C:\\Users\\Gaurav\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


# **Remove URL**

In [69]:
text = "https://www.google.com is a website"

In [72]:
def remove_url(text):
    return re.sub(r"https?://\S+|www\.\S+", "", text)


remove_url(text)

' is a website'

# **Remove HTML Tags**

In [76]:
text = "<html><body><h1>My First Heading </h1><p>My first paragraph.</p></body></html>"

In [77]:
def remove_html_tags(text):
    return re.sub(r"<[^>]+>", "", text)


remove_html_tags(text)

'My First Heading My first paragraph.'

# **Spelling Correction**

In [100]:
text = " welcme to my wordl. mye naame is Gaurav"

In [101]:
from spellchecker import SpellChecker

spell = SpellChecker()


def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_word = spell.correction(word)
            corrected_text.append(corrected_word)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

In [102]:
correct_spellings(text)

'welcome to my world my name is Gaurav'

In [116]:
# text_preprocessing_toolkit/preprocessing.py


import string
import re
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from spellchecker import SpellChecker

import nltk

nltk.download("averaged_perceptron_tagger_eng")
nltk.download("wordnet")
nltk.download("omw-1.4")  # Optional, improves WordNet lemmatization


class TextPreprocessor:
    def __init__(
        self,
        remove_punctuation=True,
        remove_stopwords=True,
        get_frequent_words=True,
        remove_frequent_words=True,
        get_rare_words=True,
        remove_rare_words=True,
        remove_special_characters=True,
        stem_text=True,
        lemmatize_text=True,
        correct_spellings=True,
    ):
        self.stopwords = set(stopwords.words("english"))
        self.lemmatizer = WordNetLemmatizer()
        self.spell = SpellChecker()
        self.ps = PorterStemmer()
        self.wordnet_map = {
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "J": wordnet.ADJ,
            "R": wordnet.ADV,
        }

    def remove_punctuation(self, text):
        return text.translate(str.maketrans("", "", string.punctuation))

    def remove_stopwords(self, text):
        return " ".join(
            [word for word in str(text).split() if word not in self.stopwords]
        )

    def get_frequent_words(self, texts, top_n=10):
        word_counts = Counter()
        for text in texts:
            for word in text.split():
                word_counts[word] += 1
        return set([word for word, count in word_counts.most_common(top_n)])

    def remove_frequent_words(self, text, frequent_words):
        return " ".join(
            [word for word in str(text).split() if word not in frequent_words]
        )

    def get_rare_words(self, texts, bottom_n=10):
        word_counts = Counter()
        for text in texts:
            for word in text.split():
                word_counts[word] += 1
        return set(
            word for word, count in word_counts.most_common()[: -bottom_n - 1 : -1]
        )

    def remove_rare_words(self, text, rare_words):
        return " ".join([word for word in str(text).split() if word not in rare_words])

    def remove_special_characters(self, text):
        text = re.sub(r"[^a-zA-Z0-9]", " ", text)
        text = re.sub(r"\s+", " ", text)
        return text

    def stem_text(self, text):
        return " ".join([self.ps.stem(word) for word in text.split()])

    def lemmatize_text(self, text):
        pos_text = pos_tag(text.split())
        return " ".join(
            [
                self.lemmatizer.lemmatize(
                    word, self.wordnet_map.get(pos[0], wordnet.NOUN)
                )
                for word, pos in pos_text
            ]
        )

    # def lemmatize_text(self, text):
    #     pos_text = pos_tag(text.split())
    #     return " ".join([self.lemmatizer.lemmatize(word, self.wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])

    def remove_url(self, text):
        return re.sub(r"https?://\S+|www\.\S+", "", text)

    def remove_html_tags(self, text):
        return re.sub(r"<[^>]+>", "", text)

    def correct_spellings(self, text):
        corrected_text = []
        misspelled_words = self.spell.unknown(text.split())
        for word in text.split():
            if word in misspelled_words:
                corrected_word = self.spell.correction(word)
                corrected_text.append(corrected_word)
            else:
                corrected_text.append(word)
        return " ".join(corrected_text)

    def lowercase(self, text):
        return text.lower()

    def preprocess(self, text, steps=None):
        if steps is None:
            steps = [
                "lowercase",
                "remove_punctuation",
                "remove_stopwords",
                "remove_special_characters",
                "remove_url",
                "remove_html_tags",
                "correct_spellings",
                "lemmatize_text",
            ]
        for step in steps:
            text = getattr(self, step)(text)
        return text

    def head(self, texts, n=5):
        """
        Display the first few entries of the dataset for quick visualization.

        Parameters:
        texts (list or pd.Series): The dataset or list of text entries to display.
        n (int): The number of rows to display. Default is 5.

        Returns:
        None
        """
        if isinstance(texts, (list, pd.Series)):
            data = pd.DataFrame({"Text": texts[:n]})
            print(data)
        else:
            raise ValueError(
                "The input should be a list or pandas Series of text entries."
            )

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [117]:
text = [
    "Text preprocessing is a crucial step in natural language processing (NLP) and machine learning workflows, enabling more accurate and efficient text analysis. Raw text data is often messy and unstructured, containing inconsistencies, irrelevant characters, stop words, and varied forms of words.",
    "Through text preprocessing, this data can be cleaned, standardized, and transformed into a format better suited for algorithms to understand and analyze.",
    " Key steps include tokenization, which breaks down text into individual words or tokens; lemmatization, which converts words to their base or root forms; and removing stop words, which are common words (like "
    'the," "and," "is'
    ") that don’t add substantial meaning to the text.",
    "Additionally, text normalization techniques—such as lowercasing and punctuation removal—help in reducing dimensionality and ensuring consistency.",
    "Effective text preprocessing not only improves model performance by reducing noise and redundancy but also enhances the interpretability of text-based insights in applications like sentiment analysis, document classification, and chatbot responses.",
]

In [119]:
processor = TextPreprocessor()

# Sample text
text = "Hello, world! This is a test text with HTML <html> tags and a URL: https://example.com."

# Removing punctuation
no_punctuation_text = processor.remove_punctuation(text)
print("After removing punctuation:", no_punctuation_text)

# Lowercasing text
lowercase_text = processor.lowercase(text)
print("Lowercase text:", lowercase_text)

# Removing stopwords
no_stopwords_text = processor.remove_stopwords(text)
print("After removing stopwords:", no_stopwords_text)

# Removing URLs
no_urls_text = processor.remove_url(text)
print("After removing URLs:", no_urls_text)

# Removing HTML tags
no_html_text = processor.remove_html_tags(text)
print("After removing HTML tags:", no_html_text)

# Correcting spellings
corrected_text = processor.correct_spellings("This is a tst txt for spellng corection")
print("After spell correction:", corrected_text)

# Lemmatizing text
lemmatized_text = processor.lemmatize_text("running jumped plays better")
print("After lemmatization:", lemmatized_text)


# Stemming text
stemmed_text = processor.stem_text("running jumped plays better")
print("After stemming:", stemmed_text)

After removing punctuation: Hello world This is a test text with HTML html tags and a URL httpsexamplecom
Lowercase text: hello, world! this is a test text with html <html> tags and a url: https://example.com.
After removing stopwords: Hello, world! This test text HTML <html> tags URL: https://example.com.
After removing URLs: Hello, world! This is a test text with HTML <html> tags and a URL: 
After removing HTML tags: Hello, world! This is a test text with HTML  tags and a URL: https://example.com.
After spell correction: This is a test text for spelling correction
After lemmatization: run jump play well
After stemming: run jump play better


In [120]:
processor.preprocess(text)

TypeError: sequence item 8: expected str instance, NoneType found

In [1]:
# text_preprocessing_toolkit/preprocessing.py

import string
import re
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from spellchecker import SpellChecker
import nltk
import matplotlib.pyplot as plt

# Download required NLTK resources if not already downloaded
nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("stopwords")


class TextPreprocessor:
    def __init__(self):
        self.stopwords = set(stopwords.words("english"))
        self.lemmatizer = WordNetLemmatizer()
        self.spell = SpellChecker()
        self.ps = PorterStemmer()
        self.wordnet_map = {
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "J": wordnet.ADJ,
            "R": wordnet.ADV,
        }

    def remove_punctuation(self, text):
        return text.translate(str.maketrans("", "", string.punctuation))

    def remove_stopwords(self, text):
        return " ".join([word for word in text.split() if word not in self.stopwords])

    def remove_special_characters(self, text):
        text = re.sub(r"[^a-zA-Z0-9]", " ", text)
        text = re.sub(r"\s+", " ", text)
        return text

    def stem_text(self, text):
        return " ".join([self.ps.stem(word) for word in text.split()])

    def lemmatize_text(self, text):
        pos_text = pos_tag(text.split())
        return " ".join(
            [
                self.lemmatizer.lemmatize(
                    word, self.wordnet_map.get(pos[0], wordnet.NOUN)
                )
                for word, pos in pos_text
            ]
        )

    def remove_url(self, text):
        return re.sub(r"https?://\S+|www\.\S+", "", text)

    def remove_html_tags(self, text):
        return re.sub(r"<[^>]+>", "", text)

    def correct_spellings(self, text):
        corrected_text = []
        misspelled_words = self.spell.unknown(text.split())
        for word in text.split():
            if word in misspelled_words:
                corrected_word = self.spell.correction(word)
                corrected_text.append(corrected_word)
            else:
                corrected_text.append(word)
        return " ".join(corrected_text)

    def lowercase(self, text):
        return text.lower()

    def preprocess(self, text, custom_steps=None):
        """
        Automatically preprocess text with a default pipeline.
        User can specify custom_steps for specific preprocessing order.

        Parameters:
        text (str): Text to preprocess.
        custom_steps (list): List of preprocessing steps in desired order.

        Returns:
        str: Preprocessed text.
        """
        # Define the default pipeline
        default_pipeline = [
            "lowercase",
            "remove_punctuation",
            "remove_stopwords",
            "remove_special_characters",
            "remove_url",
            "remove_html_tags",
            "correct_spellings",
            "lemmatize_text",
        ]

        # Use custom steps if provided, otherwise default steps
        steps = custom_steps if custom_steps else default_pipeline
        for step in steps:
            text = getattr(self, step)(text)
        return text

    def head(self, texts, n=5):
        """
        Display a summary of the first few entries of the dataset for quick visualization.

        Parameters:
        texts (list or pd.Series): The dataset or list of text entries to display.
        n (int): The number of rows to display. Default is 5.

        Returns:
        None
        """
        if isinstance(texts, (list, pd.Series)):
            data = pd.DataFrame({"Text": texts[:n]})
            data["Word Count"] = data["Text"].apply(lambda x: len(x.split()))
            data["Character Count"] = data["Text"].apply(len)
            display(data)

            # Plotting word counts for quick overview
            plt.figure(figsize=(8, 5))
            plt.bar(range(n), data["Word Count"], color="skyblue")
            plt.xticks(range(n), [f"Text {i+1}" for i in range(n)], rotation=45)
            plt.xlabel("Text Entries")
            plt.ylabel("Word Count")
            plt.title("Word Count of First Few Text Entries")
            plt.show()
        else:
            raise ValueError(
                "The input should be a list or pandas Series of text entries."
            )

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
processor = TextPreprocessor()

# Sample text
text = "Hello, world! This is a test text with HTML <html> tags and a URL: https://example.com."

# Removing punctuation
no_punctuation_text = processor.remove_punctuation(text)
print("After removing punctuation:", no_punctuation_text)

# Lowercasing text
lowercase_text = processor.lowercase(text)
print("Lowercase text:", lowercase_text)

# Removing stopwords
no_stopwords_text = processor.remove_stopwords(text)
print("After removing stopwords:", no_stopwords_text)

# Removing URLs
no_urls_text = processor.remove_url(text)
print("After removing URLs:", no_urls_text)

# Removing HTML tags
no_html_text = processor.remove_html_tags(text)
print("After removing HTML tags:", no_html_text)

# Correcting spellings
corrected_text = processor.correct_spellings("This is a tst txt for spellng corection")
print("After spell correction:", corrected_text)

# Lemmatizing text
lemmatized_text = processor.lemmatize_text("running jumped plays better")
print("After lemmatization:", lemmatized_text)


# Stemming text
stemmed_text = processor.stem_text("running jumped plays better")
print("After stemming:", stemmed_text)

After removing punctuation: Hello world This is a test text with HTML html tags and a URL httpsexamplecom
Lowercase text: hello, world! this is a test text with html <html> tags and a url: https://example.com.
After removing stopwords: Hello, world! This test text HTML <html> tags URL: https://example.com.
After removing URLs: Hello, world! This is a test text with HTML <html> tags and a URL: 
After removing HTML tags: Hello, world! This is a test text with HTML  tags and a URL: https://example.com.
After spell correction: This is a test text for spelling correction
After lemmatization: run jump play well
After stemming: run jump play better


: 

In [129]:
# Import the text preprocessing toolkit

# Create an instance of the TextPreprocessor class
preprocessor = TextPreprocessor()

# Example text

# Perform automatic preprocessing
processed_text = preprocessor.preprocess(text)

# Display the preprocessed text
print("Processed Text:", processed_text)

AttributeError: 'list' object has no attribute 'lower'

In [131]:
# Import the text preprocessing toolkit

# Create an instance of the TextPreprocessor class
preprocessor = TextPreprocessor()

# Sample dataset of text entries
texts = [
    "Natural Language Processing (NLP) is fascinating! Visit https://nlp.com for more info.",
    "Text processing includes <html> tags, URLs like http://example.com, and stopwords.",
    "Punctuation, typos, and other #special characters are common in real-world texts.",
    "NLP is transforming industries by making sense of large-scale textual data!",
    "Let's clean and prepare our dataset for machine learning models.",
]

# Step 1: Preprocess each text in the list
processed_texts = [preprocessor.preprocess(text) for text in texts]

# Step 2: Use the head function to display the first few processed entries
print("Processed Text Entries:")
preprocessor.head(processed_texts, n=5)

TypeError: sequence item 6: expected str instance, NoneType found

In [1]:
import string
import re
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from spellchecker import SpellChecker
import nltk
from IPython.display import display
from typing import List, Optional, Union

# Download required NLTK resources if not already downloaded
nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("stopwords")


class TextPreprocessor:
    def __init__(self) -> None:
        self.stopwords: set[str] = set(stopwords.words("english"))
        self.lemmatizer: WordNetLemmatizer = WordNetLemmatizer()
        self.spell: SpellChecker = SpellChecker()
        self.ps: PorterStemmer = PorterStemmer()
        self.wordnet_map: dict[str, wordnet] = {
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "J": wordnet.ADJ,
            "R": wordnet.ADV,
        }

    def remove_punctuation(self, text: Optional[str]) -> Optional[str]:
        if text:
            return text.translate(str.maketrans("", "", string.punctuation))
        return text

    def remove_stopwords(self, text: Optional[str]) -> Optional[str]:
        if text:
            return " ".join(
                [word for word in text.split() if word not in self.stopwords]
            )
        return text

    def remove_special_characters(self, text: Optional[str]) -> Optional[str]:
        if text:
            text = re.sub(r"[^a-zA-Z0-9]", " ", text)
            text = re.sub(r"\s+", " ", text)
            return text
        return text

    def stem_text(self, text: Optional[str]) -> Optional[str]:
        if text:
            return " ".join([self.ps.stem(word) for word in text.split()])
        return text

    def lemmatize_text(self, text: Optional[str]) -> Optional[str]:
        if text:
            pos_text = pos_tag(text.split())
            return " ".join(
                [
                    self.lemmatizer.lemmatize(
                        word, self.wordnet_map.get(pos[0], wordnet.NOUN)
                    )
                    for word, pos in pos_text
                ]
            )
        return text

    def remove_url(self, text: Optional[str]) -> Optional[str]:
        if text:
            return re.sub(r"https?://\S+|www\.\S+", "", text)
        return text

    def remove_html_tags(self, text: Optional[str]) -> Optional[str]:
        if text:
            return re.sub(r"<[^>]+>", "", text)
        return text

    def correct_spellings(self, text: Optional[str]) -> Optional[str]:
        if text:
            corrected_text: List[str] = []
            misspelled_words = self.spell.unknown(text.split())
            for word in text.split():
                if word in misspelled_words:
                    corrected_word = self.spell.correction(word)
                    corrected_text.append(corrected_word)
                else:
                    corrected_text.append(word)
            return " ".join(corrected_text)
        return text

    def lowercase(self, text: Optional[str]) -> Optional[str]:
        if text:
            return text.lower()
        return text

    def preprocess(self, text: str, steps: Optional[List[str]] = None) -> str:
        """
        Automatically preprocess text with a default pipeline.
        User can specify steps for specific preprocessing order.

        Parameters:
        text (str): Text to preprocess.
        steps (list): List of preprocessing steps in desired order.

        Returns:
        str: Preprocessed text.
        """
        default_pipeline = [
            "lowercase",
            "remove_punctuation",
            "remove_stopwords",
            "remove_special_characters",
            "remove_url",
            "remove_html_tags",
            "correct_spellings",
            "lemmatize_text",
        ]
        steps = steps if steps else default_pipeline
        for step in steps:
            text = getattr(self, step)(text)  # type: ignore
        return text

    def head(self, texts: Union[List[str], pd.Series], n: int = 5) -> None:
        """
        Display a summary of the first few entries of the dataset for quick visualization.

        Parameters:
        texts (list or pd.Series): The dataset or list of text entries to display.
        n (int): The number of rows to display. Default is 5.

        Returns:
        None
        """
        if isinstance(texts, (list, pd.Series)):
            data = pd.DataFrame({"Text": texts[:n]})
            data["Word Count"] = data["Text"].apply(lambda x: len(x.split()))
            data["Character Count"] = data["Text"].apply(len)
            display(data)


if __name__ == "__main__":
    TextPreprocessor()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Import the TextPreprocessor class

# Initialize the TextPreprocessor
processor = TextPreprocessor()

# Sample text data
texts = [
    "This is a sample sentence, with punctuation!",
    "Check out the website: https://example.com for more information.",
    "<p>This is a paragraph with HTML tags.</p>",
    "I'm here to test spellng corrections.",
    "Another example sentence for testing preprocessing steps.",
]

# Apply preprocessing with a custom pipeline
cleaned_texts = [
    processor.preprocess(
        text,
        steps=[
            "lowercase",
            "remove_punctuation",
            "remove_stopwords",
            "lemmatize_text",
            "remove_special_characters",
            "remove_url",
            "remove_html_tags",
        ],
    )
    for text in texts
]

# Display the first few cleaned texts
processor.head(cleaned_texts)

Unnamed: 0,Text,Word Count,Character Count
0,sample sentence punctuation,3,27
1,check website httpsexamplecom information,4,41
2,pthis paragraph html tagsp,4,26
3,im test spellng correction,4,26
4,another example sentence test preprocessing step,6,48
