In [None]:
# text_preprocessing_toolkit/preprocessor.py

import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


class TextPreprocessor:
    def __init__(
        self,
        remove_stopwords=True,
        lemmatize=True,
        lowercase=True,
        remove_punctuation=True,
    ):
        self.remove_stopwords = remove_stopwords
        self.lemmatize = lemmatize
        self.lowercase = lowercase
        self.remove_punctuation = remove_punctuation
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words("english"))

    def preprocess(self, text):
        if self.lowercase:
            text = text.lower()
        if self.remove_punctuation:
            text = self._remove_punctuation(text)
        tokens = word_tokenize(text)
        if self.remove_stopwords:
            tokens = self._remove_stopwords(tokens)
        if self.lemmatize:
            tokens = self._lemmatize(tokens)
        return " ".join(tokens)

    def _remove_punctuation(self, text):
        return text.translate(str.maketrans("", "", string.punctuation))

    def _remove_stopwords(self, tokens):
        return [word for word in tokens if word not in self.stop_words]

    def _lemmatize(self, tokens):
        return [self.lemmatizer.lemmatize(word) for word in tokens]

In [22]:
text = (
    "Text preprocessing is a crucial step in natural language processing (NLP) and machine learning workflows, enabling more accurate and efficient text analysis. Raw text data is often messy and unstructured, containing inconsistencies, irrelevant characters, stop words, and varied forms of words. Through text preprocessing, this data can be cleaned, standardized, and transformed into a format better suited for algorithms to understand and analyze. Key steps include tokenization, which breaks down text into individual words or tokens; lemmatization, which converts words to their base or root forms; and removing stop words, which are common words (like "
    'the," "and," "is'
    ") that don’t add substantial meaning to the text. Additionally, text normalization techniques—such as lowercasing and punctuation removal—help in reducing dimensionality and ensuring consistency. Effective text preprocessing not only improves model performance by reducing noise and redundancy but also enhances the interpretability of text-based insights in applications like sentiment analysis, document classification, and chatbot responses."
)

In [34]:
data = TextPreprocessor(remove_stopwords=True)

# **Main Code**

In [70]:
# text_preprocessing_toolkit/preprocessing.py


import string
import re
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from spellchecker import SpellChecker

import nltk

nltk.download("averaged_perceptron_tagger_eng")
nltk.download("wordnet")
nltk.download("omw-1.4")  # Optional, improves WordNet lemmatization


class TextPreprocessor:
    def __init__(self):
        self.stopwords = set(stopwords.words("english"))
        self.lemmatizer = WordNetLemmatizer()
        self.spell = SpellChecker()
        self.ps = PorterStemmer()
        self.wordnet_map = {
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "J": wordnet.ADJ,
            "R": wordnet.ADV,
        }

    def remove_punctuation(self, text):
        return text.translate(str.maketrans("", "", string.punctuation))

    def remove_stopwords(self, text):
        return " ".join(
            [word for word in str(text).split() if word not in self.stopwords]
        )

    def get_frequent_words(self, texts, top_n=10):
        word_counts = Counter()
        for text in texts:
            for word in text.split():
                word_counts[word] += 1
        return set([word for word, count in word_counts.most_common(top_n)])

    def remove_frequent_words(self, text, frequent_words):
        return " ".join(
            [word for word in str(text).split() if word not in frequent_words]
        )

    def get_rare_words(self, texts, bottom_n=10):
        word_counts = Counter()
        for text in texts:
            for word in text.split():
                word_counts[word] += 1
        return set(
            word for word, count in word_counts.most_common()[: -bottom_n - 1 : -1]
        )

    def remove_rare_words(self, text, rare_words):
        return " ".join([word for word in str(text).split() if word not in rare_words])

    def remove_special_characters(self, text):
        text = re.sub(r"[^a-zA-Z0-9]", " ", text)
        text = re.sub(r"\s+", " ", text)
        return text

    def stem_text(self, text):
        return " ".join([self.ps.stem(word) for word in text.split()])

    def lemmatize_text(self, text):
        pos_text = pos_tag(text.split())
        return " ".join(
            [
                self.lemmatizer.lemmatize(
                    word, self.wordnet_map.get(pos[0], wordnet.NOUN)
                )
                for word, pos in pos_text
            ]
        )

    # def lemmatize_text(self, text):
    #     pos_text = pos_tag(text.split())
    #     return " ".join([self.lemmatizer.lemmatize(word, self.wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])

    def remove_url(self, text):
        return re.sub(r"https?://\S+|www\.\S+", "", text)

    def remove_html_tags(self, text):
        return re.sub(r"<[^>]+>", "", text)

    def correct_spellings(self, text):
        corrected_text = []
        misspelled_words = self.spell.unknown(text.split())
        for word in text.split():
            if word in misspelled_words:
                corrected_word = self.spell.correction(word)
                corrected_text.append(corrected_word)
            else:
                corrected_text.append(word)
        return " ".join(corrected_text)

    def lowercase(self, text):
        return text.lower()

    def preprocess(self, text, steps=None):
        if steps is None:
            steps = [
                "lowercase",
                "remove_punctuation",
                "remove_stopwords",
                "remove_special_characters",
                "remove_url",
                "remove_html_tags",
                "correct_spellings",
                "lemmatize_text",
            ]
        for step in steps:
            text = getattr(self, step)(text)
        return text

    def head(self, texts, n=5):
        """
        Display the first few entries of the dataset for quick visualization.

        Parameters:
        texts (list or pd.Series): The dataset or list of text entries to display.
        n (int): The number of rows to display. Default is 5.

        Returns:
        None
        """
        if isinstance(texts, (list, pd.Series)):
            data = pd.DataFrame({"Text": texts[:n]})
            print(data)
        else:
            raise ValueError(
                "The input should be a list or pandas Series of text entries."
            )

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [72]:
processor = TextPreprocessor()

# Sample text
text = "Hello, world! This is a test text with HTML <html> tags and a URL: https://example.com."

# Removing punctuation
no_punctuation_text = processor.remove_punctuation(text)
print("After removing punctuation:", no_punctuation_text)

# Lowercasing text
lowercase_text = processor.lowercase(text)
print("Lowercase text:", lowercase_text)

# Removing stopwords
no_stopwords_text = processor.remove_stopwords(text)
print("After removing stopwords:", no_stopwords_text)

# Removing URLs
no_urls_text = processor.remove_url(text)
print("After removing URLs:", no_urls_text)

# Removing HTML tags
no_html_text = processor.remove_html_tags(text)
print("After removing HTML tags:", no_html_text)

# Correcting spellings
corrected_text = processor.correct_spellings("This is a tst txt for spellng corection")
print("After spell correction:", corrected_text)

# Lemmatizing text
lemmatized_text = processor.lemmatize_text("running jumped plays better")
print("After lemmatization:", lemmatized_text)


# Stemming text
stemmed_text = processor.stem_text("running jumped plays better")
print("After stemming:", stemmed_text)

After removing punctuation: Hello world This is a test text with HTML html tags and a URL httpsexamplecom
Lowercase text: hello, world! this is a test text with html <html> tags and a url: https://example.com.
After removing stopwords: Hello, world! This test text HTML <html> tags URL: https://example.com.
After removing URLs: Hello, world! This is a test text with HTML <html> tags and a URL: 
After removing HTML tags: Hello, world! This is a test text with HTML  tags and a URL: https://example.com.
After spell correction: This is a test text for spelling correction
After lemmatization: run jump play well
After stemming: run jump play better


In [52]:
import pandas as pd


class TextPreprocessor:
    def __init__(self):
        # Initialize the attributes for text preprocessing
        self.stopwords = set(stopwords.words("english"))
        self.lemmatizer = WordNetLemmatizer()
        self.spell = SpellChecker()
        self.ps = PorterStemmer()
        self.wordnet_map = {
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "J": wordnet.ADJ,
            "R": wordnet.ADV,
        }

    # Existing preprocessing functions
    # ...

    def head(self, data, n=5):
        """
        Display the first few entries of the data for quick visualization.

        Parameters:
        data (list, pd.Series, pd.DataFrame): The data to display.
        n (int): The number of rows to display. Default is 5.

        Returns:
        None
        """
        if isinstance(data, list):
            data = pd.DataFrame({"Text": data})

        if isinstance(data, pd.Series):
            data = data.to_frame("Text")

        if isinstance(data, pd.DataFrame):
            print(data.head(n))
        else:
            raise ValueError("The input should be a list, pandas Series, or DataFrame.")


# Sample list of text entries
sample_texts = [
    "This is the first entry.",
    "Another example sentence.",
    "Text preprocessing is essential.",
    "This toolkit is very useful.",
    "It provides various functions for NLP.",
]

# Sample Series of text entries
sample_series = pd.Series(sample_texts)

# Sample DataFrame
sample_df = pd.DataFrame(
    {"Text": sample_texts, "Length": [len(text) for text in sample_texts]}
)

# Initialize the TextPreprocessor instance
processor = TextPreprocessor()

# Using head() on a list
processor.head(sample_texts, n=3)

# Using head() on a Series
processor.head(sample_series, n=3)

# Using head() on a DataFrame
processor.head(sample_df, n=3)

                               Text
0          This is the first entry.
1         Another example sentence.
2  Text preprocessing is essential.
                               Text
0          This is the first entry.
1         Another example sentence.
2  Text preprocessing is essential.
                               Text  Length
0          This is the first entry.      24
1         Another example sentence.      25
2  Text preprocessing is essential.      32


In [61]:
import string
import re
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from spellchecker import SpellChecker
import nltk

# Download necessary NLTK data files
nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")
nltk.download("omw-1.4")  # Optional, improves WordNet lemmatization


class TextPreprocessor:
    def __init__(self):
        self.stopwords = set(stopwords.words("english"))
        self.lemmatizer = WordNetLemmatizer()
        self.spell = SpellChecker()
        self.ps = PorterStemmer()
        self.wordnet_map = {
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "J": wordnet.ADJ,
            "R": wordnet.ADV,
        }

    # Other preprocessing methods...

    def head(self, texts, n=5):
        """
        Display the first few entries of the dataset for quick visualization.

        Parameters:
        texts (list or pd.Series): The dataset or list of text entries to display.
        n (int): The number of rows to display. Default is 5.

        Returns:
        None
        """
        if isinstance(texts, (list, pd.Series)):
            data = pd.DataFrame({"Text": texts[:n]})
            print(data)
        else:
            raise ValueError(
                "The input should be a list or pandas Series of text entries."
            )


processor = TextPreprocessor()
processor.head(sample_texts, n=3)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


                               Text
0          This is the first entry.
1         Another example sentence.
2  Text preprocessing is essential.


In [55]:
lemmatized_text = processor.lemmatize_text("running jumped plays better")
print("After lemmatization:", lemmatized_text)

AttributeError: 'TextPreprocessor' object has no attribute 'lemmatize_text'

In [66]:
processor = TextPreprocessor()

# Sample text
text = "Hello, world! This is a test text with HTML <html> tags and a URL: https://example.com."

# Removing punctuation
no_punctuation_text = processor.remove_punctuation(text)
print("After removing punctuation:", no_punctuation_text)

# Lowercasing text
lowercase_text = processor.lowercase(text)
print("Lowercase text:", lowercase_text)

# Removing stopwords
no_stopwords_text = processor.remove_stopwords(text)
print("After removing stopwords:", no_stopwords_text)

# Removing URLs
no_urls_text = processor.remove_url(text)
print("After removing URLs:", no_urls_text)

# Removing HTML tags
no_html_text = processor.remove_html_tags(text)
print("After removing HTML tags:", no_html_text)

# Correcting spellings
corrected_text = processor.correct_spellings("This is a tst txt for spellng corection")
print("After spell correction:", corrected_text)

# Lemmatizing text
lemmatized_text = processor.lemmatize_text("running jumped plays better")
print("After lemmatization:", lemmatized_text)


# Stemming text
stemmed_text = processor.stem_text("running jumped plays better")
print("After stemming:", stemmed_text)

After removing punctuation: Hello world This is a test text with HTML html tags and a URL httpsexamplecom
Lowercase text: hello, world! this is a test text with html <html> tags and a url: https://example.com.
After removing stopwords: Hello, world! This test text HTML <html> tags URL: https://example.com.
After removing URLs: Hello, world! This is a test text with HTML <html> tags and a URL: 
After removing HTML tags: Hello, world! This is a test text with HTML  tags and a URL: https://example.com.
After spell correction: This is a test text for spelling correction


LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger_eng[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger_eng')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger_eng/[0m

  Searched in:
    - 'C:\\Users\\Gaurav/nltk_data'
    - 'c:\\Users\\Gaurav\\OneDrive\\Desktop\\TPT\\.venv\\nltk_data'
    - 'c:\\Users\\Gaurav\\OneDrive\\Desktop\\TPT\\.venv\\share\\nltk_data'
    - 'c:\\Users\\Gaurav\\OneDrive\\Desktop\\TPT\\.venv\\lib\\nltk_data'
    - 'C:\\Users\\Gaurav\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [78]:
def head(self, texts, n=5):
    """
    Display a summary of the first few entries of the dataset for quick visualization.

    Parameters:
    texts (list or pd.Series): The dataset or list of text entries to display.
    n (int): The number of rows to display. Default is 5.

    Returns:
    None
    """
    if isinstance(texts, (list, pd.Series)):
        data = pd.DataFrame({"Text": texts[:n]})
        data["Word Count"] = data["Text"].apply(lambda x: len(x.split()))
        data["Character Count"] = data["Text"].apply(len)
        display(data)

        # Plotting word counts for quick overview
        plt.figure(figsize=(8, 5))
        plt.bar(range(n), data["Word Count"], color="skyblue")
        plt.xticks(range(n), [f"Text {i+1}" for i in range(n)], rotation=45)
        plt.xlabel("Text Entries")
        plt.ylabel("Word Count")
        plt.title("Word Count of First Few Text Entries")
        plt.show()
    else:
        raise ValueError("The input should be a list or pandas Series of text entries.")

In [79]:
# text_preprocessing_toolkit/preprocessing.py


import string
import re
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from spellchecker import SpellChecker

import nltk

nltk.download("averaged_perceptron_tagger_eng")
nltk.download("wordnet")
nltk.download("omw-1.4")  # Optional, improves WordNet lemmatization


class TextPreprocessor:
    def __init__(self):
        self.stopwords = set(stopwords.words("english"))
        self.lemmatizer = WordNetLemmatizer()
        self.spell = SpellChecker()
        self.ps = PorterStemmer()
        self.wordnet_map = {
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "J": wordnet.ADJ,
            "R": wordnet.ADV,
        }

    def remove_punctuation(self, text):
        return text.translate(str.maketrans("", "", string.punctuation))

    def remove_stopwords(self, text):
        return " ".join(
            [word for word in str(text).split() if word not in self.stopwords]
        )

    def get_frequent_words(self, texts, top_n=10):
        word_counts = Counter()
        for text in texts:
            for word in text.split():
                word_counts[word] += 1
        return set([word for word, count in word_counts.most_common(top_n)])

    def remove_frequent_words(self, text, frequent_words):
        return " ".join(
            [word for word in str(text).split() if word not in frequent_words]
        )

    def get_rare_words(self, texts, bottom_n=10):
        word_counts = Counter()
        for text in texts:
            for word in text.split():
                word_counts[word] += 1
        return set(
            word for word, count in word_counts.most_common()[: -bottom_n - 1 : -1]
        )

    def remove_rare_words(self, text, rare_words):
        return " ".join([word for word in str(text).split() if word not in rare_words])

    def remove_special_characters(self, text):
        text = re.sub(r"[^a-zA-Z0-9]", " ", text)
        text = re.sub(r"\s+", " ", text)
        return text

    def stem_text(self, text):
        return " ".join([self.ps.stem(word) for word in text.split()])

    def lemmatize_text(self, text):
        pos_text = pos_tag(text.split())
        return " ".join(
            [
                self.lemmatizer.lemmatize(
                    word, self.wordnet_map.get(pos[0], wordnet.NOUN)
                )
                for word, pos in pos_text
            ]
        )

    # def lemmatize_text(self, text):
    #     pos_text = pos_tag(text.split())
    #     return " ".join([self.lemmatizer.lemmatize(word, self.wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])

    def remove_url(self, text):
        return re.sub(r"https?://\S+|www\.\S+", "", text)

    def remove_html_tags(self, text):
        return re.sub(r"<[^>]+>", "", text)

    def correct_spellings(self, text):
        corrected_text = []
        misspelled_words = self.spell.unknown(text.split())
        for word in text.split():
            if word in misspelled_words:
                corrected_word = self.spell.correction(word)
                corrected_text.append(corrected_word)
            else:
                corrected_text.append(word)
        return " ".join(corrected_text)

    def lowercase(self, text):
        return text.lower()

    def preprocess(self, text, steps=None):
        if steps is None:
            steps = [
                "lowercase",
                "remove_punctuation",
                "remove_stopwords",
                "remove_special_characters",
                "remove_url",
                "remove_html_tags",
                "correct_spellings",
                "lemmatize_text",
            ]
        for step in steps:
            text = getattr(self, step)(text)
        return text

    def head(self, texts, n=5):
        """
        Display a summary of the first few entries of the dataset for quick visualization.

        Parameters:
        texts (list or pd.Series): The dataset or list of text entries to display.
        n (int): The number of rows to display. Default is 5.

        Returns:
        None
        """
        if isinstance(texts, (list, pd.Series)):
            data = pd.DataFrame({"Text": texts[:n]})
            data["Word Count"] = data["Text"].apply(lambda x: len(x.split()))
            data["Character Count"] = data["Text"].apply(len)
            display(data)

            # Plotting word counts for quick overview
            plt.figure(figsize=(8, 5))
            plt.bar(range(n), data["Word Count"], color="skyblue")
            plt.xticks(range(n), [f"Text {i+1}" for i in range(n)], rotation=45)
            plt.xlabel("Text Entries")
            plt.ylabel("Word Count")
            plt.title("Word Count of First Few Text Entries")
            plt.show()
        else:
            raise ValueError(
                "The input should be a list or pandas Series of text entries."
            )

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Gaurav\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [84]:
processor = TextPreprocessor()

# Sample text
text = "Hello, world! This is a test text with HTML <html> tags and a URL: https://example.com."

# Removing punctuation
no_punctuation_text = processor.remove_punctuation(text)
print("After removing punctuation:", no_punctuation_text)

# Lowercasing text
lowercase_text = processor.lowercase(text)
print("Lowercase text:", lowercase_text)

# Removing stopwords
no_stopwords_text = processor.remove_stopwords(text)
print("After removing stopwords:", no_stopwords_text)

# Removing URLs
no_urls_text = processor.remove_url(text)
print("After removing URLs:", no_urls_text)

# Removing HTML tags
no_html_text = processor.remove_html_tags(text)
print("After removing HTML tags:", no_html_text)

# Correcting spellings
corrected_text = processor.correct_spellings("This is a tst txt for spellng corection")
print("After spell correction:", corrected_text)

# Lemmatizing text
# lemmatized_text = processor.lemmatize_text("running jumped plays better")
# print("After lemmatization:", lemmatized_text)


# Stemming text
stemmed_text = processor.stem_text("running jumped plays better")
print("After stemming:", stemmed_text)

After removing punctuation: Hello world This is a test text with HTML html tags and a URL httpsexamplecom
Lowercase text: hello, world! this is a test text with html <html> tags and a url: https://example.com.
After removing stopwords: Hello, world! This test text HTML <html> tags URL: https://example.com.
After removing URLs: Hello, world! This is a test text with HTML <html> tags and a URL: 
After removing HTML tags: Hello, world! This is a test text with HTML  tags and a URL: https://example.com.
After spell correction: This is a test text for spelling correction
After stemming: run jump play better


In [86]:
import matplotlib.pyplot as plt

In [90]:
t = text.DataFrame(text)

AttributeError: 'str' object has no attribute 'DataFrame'

In [88]:
processor.head(text, n=3)

ValueError: The input should be a list or pandas Series of text entries.