# Explanation:
Static Data:

The data dictionary contains a simple dataset with a text column (text_column) and a numeric column (numeric_column).
Initialization:

The DataPreprocessor class initializes with the DataFrame created from the static data.
Handling Missing Values:

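The handle_missing_values method fills missing values in numeric columns with the column median and in all other (e.g. text) columns with the column mode; when several values tie for the mode, the first one in sorted order is used.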
Text Normalization:

The lowercase_text method converts text to lowercase.
The remove_special_characters method replaces every non-word character (anything other than letters, digits, and underscores) in the text with a space.
The tokenize_text method tokenizes the text using NLTK.
The remove_stop_words method removes common English stop words.
Preprocessing Pipeline:

The preprocess_pipeline method combines all preprocessing steps into a single pipeline for ease of use.
Display Preprocessed Data:

The preprocessed DataFrame is displayed using print.

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

# Download NLTK data.
# word_tokenize needs the Punkt tokenizer models: recent NLTK releases load
# them from the 'punkt_tab' resource, while older releases use 'punkt', so
# both are fetched. 'stopwords' provides the English stop-word list.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Define the preprocessing class
class DataPreprocessor:
    """Clean a DataFrame: impute missing values and normalize one text column.

    The DataFrame passed to the constructor is stored by reference, and every
    method updates it in place (columns are reassigned on the same object) and
    also returns it, so the caller's frame reflects each step.
    """

    def __init__(self, dataframe: pd.DataFrame) -> None:
        """Store *dataframe* (not a copy) as the frame to preprocess."""
        self.df = dataframe

    def handle_missing_values(self) -> pd.DataFrame:
        """Fill missing values column by column.

        Numeric columns are filled with their median; all other columns are
        filled with their mode (the first mode when several values tie).
        A column with no usable fill value (every entry missing) is left
        unchanged instead of raising.

        Returns:
            The stored DataFrame with missing values filled.
        """
        for column in self.df.columns:
            series = self.df[column]
            if pd.api.types.is_numeric_dtype(series):
                fill_value = series.median()
                # The median of an all-missing column is itself missing.
                if pd.isna(fill_value):
                    continue
            else:
                modes = series.mode()
                # mode() is empty when the column holds no non-missing value.
                if modes.empty:
                    continue
                fill_value = modes.iloc[0]
            # Assign the filled column back explicitly: calling
            # fillna(inplace=True) on a selected column is chained assignment,
            # which is deprecated and does not update the frame under pandas
            # Copy-on-Write.
            self.df[column] = series.fillna(fill_value)
        return self.df

    def lowercase_text(self, text_column: str) -> pd.DataFrame:
        """Convert the strings in *text_column* to lowercase."""
        self.df[text_column] = self.df[text_column].str.lower()
        return self.df

    def remove_special_characters(self, text_column: str) -> pd.DataFrame:
        """Replace every non-word character in *text_column* with a space.

        Each value is passed through ``str()`` first, so non-string values
        are converted to their string form before substitution.
        """
        self.df[text_column] = self.df[text_column].apply(
            lambda value: re.sub(r'\W', ' ', str(value))
        )
        return self.df

    def tokenize_text(self, text_column: str) -> pd.DataFrame:
        """Split each string in *text_column* into a list of tokens.

        Uses ``nltk.word_tokenize``, which requires the NLTK tokenizer data
        to be downloaded beforehand.
        """
        self.df[text_column] = self.df[text_column].apply(nltk.word_tokenize)
        return self.df

    def remove_stop_words(self, text_column: str) -> pd.DataFrame:
        """Drop English stop words from each token list in *text_column*.

        Expects the column to already hold lists of tokens. Matching is
        exact, so tokens should be lowercased first.
        """
        # Build the set once so each membership test is O(1).
        stop_words = set(stopwords.words('english'))
        self.df[text_column] = self.df[text_column].apply(
            lambda tokens: [word for word in tokens if word not in stop_words]
        )
        return self.df

    def preprocess_pipeline(self, text_column: str) -> pd.DataFrame:
        """Run all preprocessing steps on the frame and *text_column*.

        Order matters: missing values are filled first so the text steps see
        real strings, lowercasing precedes stop-word removal so matching
        works, and tokenization precedes stop-word removal because the latter
        operates on token lists.

        Returns:
            The stored DataFrame after all steps.
        """
        self.handle_missing_values()
        self.lowercase_text(text_column)
        self.remove_special_characters(text_column)
        self.tokenize_text(text_column)
        self.remove_stop_words(text_column)
        return self.df

# Sample dataset: one free-text column and one numeric column, each with a
# single missing entry (np.nan) so the imputation step has work to do.
data = {
    'text_column': [
        'Hello world! This is a test.',
        'Preprocessing is crucial for AI models.',
        'Missing values should be handled properly.',
        np.nan,
        'Special characters & stopwords need removal!',
    ],
    'numeric_column': [1, 2, np.nan, 4, 5],
}

# Build the DataFrame from the column-oriented dictionary.
df = pd.DataFrame.from_dict(data)

# Wrap the frame and run the full pipeline on the text column.
preprocessor = DataPreprocessor(df)
processed_df = preprocessor.preprocess_pipeline('text_column')

# Show the result: heading on one line, the frame below it.
print("Preprocessed DataFrame:", processed_df, sep="\n")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\iShop\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iShop\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Preprocessed DataFrame:
                                       text_column  numeric_column
0                             [hello, world, test]             1.0
1             [preprocessing, crucial, ai, models]             2.0
2             [missing, values, handled, properly]             3.0
3                             [hello, world, test]             4.0
4  [special, characters, stopwords, need, removal]             5.0
