# Preprocessing Text Data
## Import Necessary Libraries

In [3]:

import re
import string
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# Fetch the NLTK data packages the later cells rely on: the Punkt tokenizer
# models for word_tokenize, the stop word lists, and the WordNet corpus for
# WordNetLemmatizer.
# Recent NLTK releases (3.9 and later) load the Punkt models from the
# 'punkt_tab' package instead of the pickled 'punkt' one, so both are
# requested to keep word_tokenize working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


ModuleNotFoundError: No module named 'nltk'

In [None]:

# Example data
# Three short raw sentences held under a single 'text' key.
data = dict(
    text=[
        "This is a sample sentence!",
        "Preprocessing text data is essential for NLP.",
        "Text preprocessing involves several steps.",
    ],
)

# One row per sentence, in a single 'text' column.
df = pd.DataFrame(data=data)
print("Original Data:\n", df)


In [None]:

# Lowercasing
# Convert every sentence in the 'text' column to lower case, in place.
lowercased_text = df['text'].str.lower()
df['text'] = lowercased_text
print("\nAfter Lowercasing:\n", df)


In [None]:

# Remove Punctuation
# str.maketrans('', '', chars) builds a table that maps every character in
# `chars` to None, so translate() deletes it. The table is the same for every
# row, so it is built once here instead of inside the per-row lambda.
# Note: string.punctuation covers ASCII punctuation only.
punctuation_table = str.maketrans('', '', string.punctuation)
df['text'] = df['text'].apply(lambda x: x.translate(punctuation_table))
print("\nAfter Removing Punctuation:\n", df)


In [None]:

# Remove Numbers
# Delete every run of digits from each sentence; the surrounding text is
# left as it is.
digit_run = re.compile(r'\d+')
df['text'] = df['text'].apply(lambda sentence: digit_run.sub('', sentence))
print("\nAfter Removing Numbers:\n", df)


In [None]:

# Remove Whitespace
# Deleting punctuation and digits can leave runs of spaces inside a sentence,
# and strip() on its own only trims the two ends. Collapse every whitespace
# run to a single space first, then trim leading/trailing whitespace.
df['text'] = df['text'].str.replace(r'\s+', ' ', regex=True).str.strip()
print("\nAfter Removing Whitespace:\n", df)


In [None]:

# Tokenization
# Run NLTK's word_tokenize over each cleaned sentence and keep the resulting
# token lists in a new 'tokens' column.
tokenized = df['text'].map(word_tokenize)
df['tokens'] = tokenized
print("\nAfter Tokenization:\n", df)


In [None]:

# Remove Stopwords
# A set gives constant-time membership tests while filtering.
stop_words = set(stopwords.words('english'))


def _without_stopwords(tokens):
    """Return the tokens that are not in the English stop word set."""
    return [token for token in tokens if token not in stop_words]


df['tokens'] = df['tokens'].apply(_without_stopwords)
print("\nAfter Removing Stopwords:\n", df)


In [None]:

# Lemmatization
lemmatizer = WordNetLemmatizer()


def _lemmatize_all(tokens):
    """Return a list with every token passed through lemmatizer.lemmatize.

    Only the word is supplied, so the lemmatizer's default part-of-speech
    setting applies to every token.
    """
    return list(map(lemmatizer.lemmatize, tokens))


df['tokens'] = df['tokens'].apply(_lemmatize_all)
print("\nAfter Lemmatization:\n", df)


In [None]:

# Joining Tokens Back to Strings
# Rebuild one space-separated string per row from its token list and store
# it in a new 'processed_text' column.
joined_text = df['tokens'].map(' '.join)
df['processed_text'] = joined_text
print("\nProcessed Text:\n", df)


In [None]:

# Bag of Words (Count Vectorizer)
count_vectorizer = CountVectorizer()

# Learn the vocabulary from the processed text and build the term-count
# matrix in one step.
X_counts = count_vectorizer.fit_transform(df['processed_text'])

count_feature_names = count_vectorizer.get_feature_names_out()
print("\nCount Vectorizer Feature Names:", count_feature_names)

# Convert to a dense array only for display.
count_matrix = X_counts.toarray()
print("Count Vectorizer Output:\n", count_matrix)


In [None]:

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary from the processed text and build the TF-IDF weighted
# matrix in one step.
X_tfidf = tfidf_vectorizer.fit_transform(df['processed_text'])

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
print("\nTF-IDF Vectorizer Feature Names:", tfidf_feature_names)

# Convert to a dense array only for display.
tfidf_matrix = X_tfidf.toarray()
print("TF-IDF Vectorizer Output:\n", tfidf_matrix)
