# Preprocessing Text Data
## Import Necessary Libraries

In [1]:

import re
import string
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# Fetch the NLTK resources used by the cells below:
#   - 'punkt' / 'punkt_tab': sentence/word tokenizer models for word_tokenize.
#     Recent NLTK releases (3.8.2 and later) load the tokenizer from
#     'punkt_tab' instead of the pickled 'punkt' package, so both are requested
#     to keep the notebook runnable on old and new installations. On releases
#     that do not know 'punkt_tab' the call just reports an error and returns
#     False without raising.
#   - 'stopwords': word lists read by stopwords.words('english').
#   - 'wordnet': dictionary data behind WordNetLemmatizer.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...


True

In [2]:

# Example data
# A toy corpus of three short English sentences; each preprocessing step in
# this notebook is demonstrated on it.
sample_sentences = [
    "This is a sample sentence!",
    "Preprocessing text data is essential for NLP.",
    "Text preprocessing involves several steps.",
]
data = {'text': sample_sentences}

# Single-column frame: one row per sentence, stored under 'text'.
df = pd.DataFrame(data)
print("Original Data:\n", df)


Original Data:
                                             text
0                     This is a sample sentence!
1  Preprocessing text data is essential for NLP.
2     Text preprocessing involves several steps.


In [3]:

# Lowercasing
# Fold every sentence to lower case so that words differing only in
# capitalisation (e.g. "Text" and "text") end up as the same string.
lowercased = df['text'].str.lower()
df['text'] = lowercased
print("\nAfter Lowercasing:\n", df)



After Lowercasing:
                                             text
0                     this is a sample sentence!
1  preprocessing text data is essential for nlp.
2     text preprocessing involves several steps.


In [4]:

# Remove Punctuation
# The deletion table maps every character in string.punctuation to None.
# It does not depend on the row, so build it once instead of once per sentence.
punctuation_table = str.maketrans('', '', string.punctuation)

# Use the vectorised .str accessor rather than a per-row lambda; missing values
# are passed through unchanged instead of raising. Punctuation is deleted, not
# replaced by a space, so "e.g." becomes "eg".
df['text'] = df['text'].str.translate(punctuation_table)
print("\nAfter Removing Punctuation:\n", df)



After Removing Punctuation:
                                            text
0                     this is a sample sentence
1  preprocessing text data is essential for nlp
2     text preprocessing involves several steps


In [5]:

# Remove Numbers
# Delete every run of digits. Only the digits themselves are removed, so any
# spaces that surrounded a number stay in the sentence.
digit_run = re.compile(r'\d+')
df['text'] = df['text'].apply(lambda sentence: digit_run.sub('', sentence))
print("\nAfter Removing Numbers:\n", df)



After Removing Numbers:
                                            text
0                     this is a sample sentence
1  preprocessing text data is essential for nlp
2     text preprocessing involves several steps


In [6]:

# Remove Whitespace
# Deleting digits can leave doubled spaces inside a sentence
# ("has 3 items" -> "has  items"), which a plain strip() would not touch.
# Collapse every run of whitespace to a single space first, then trim the
# leading and trailing whitespace.
df['text'] = df['text'].str.replace(r'\s+', ' ', regex=True).str.strip()
print("\nAfter Removing Whitespace:\n", df)



After Removing Whitespace:
                                            text
0                     this is a sample sentence
1  preprocessing text data is essential for nlp
2     text preprocessing involves several steps


In [7]:

# Tokenization
# Split each cleaned sentence into a list of word tokens with NLTK's
# word_tokenize and store the lists in a new 'tokens' column; the 'text'
# column itself is left as is.
df['tokens'] = df['text'].map(word_tokenize)
print("\nAfter Tokenization:\n", df)



After Tokenization:
                                            text  \
0                     this is a sample sentence   
1  preprocessing text data is essential for nlp   
2     text preprocessing involves several steps   

                                              tokens  
0                    [this, is, a, sample, sentence]  
1  [preprocessing, text, data, is, essential, for...  
2    [text, preprocessing, involves, several, steps]  


In [8]:

# Remove Stopwords
# A set gives constant-time membership checks while filtering.
stop_words = set(stopwords.words('english'))


def _drop_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``, keeping their order."""
    return [token for token in tokens if token not in stop_words]


df['tokens'] = df['tokens'].apply(_drop_stopwords)
print("\nAfter Removing Stopwords:\n", df)



After Removing Stopwords:
                                            text  \
0                     this is a sample sentence   
1  preprocessing text data is essential for nlp   
2     text preprocessing involves several steps   

                                            tokens  
0                               [sample, sentence]  
1      [preprocessing, text, data, essential, nlp]  
2  [text, preprocessing, involves, several, steps]  


In [9]:

# Lemmatization
lemmatizer = WordNetLemmatizer()

# Every token is passed to lemmatize() with no part-of-speech argument, i.e.
# with the lemmatizer's default setting. In the recorded output "steps"
# becomes "step" while "involves" is left unchanged.
df['tokens'] = df['tokens'].map(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)
print("\nAfter Lemmatization:\n", df)



After Lemmatization:
                                            text  \
0                     this is a sample sentence   
1  preprocessing text data is essential for nlp   
2     text preprocessing involves several steps   

                                           tokens  
0                              [sample, sentence]  
1     [preprocessing, text, data, essential, nlp]  
2  [text, preprocessing, involves, several, step]  


In [10]:

# Joining Tokens Back to Strings
# Glue each token list back into one space-separated string per row and keep
# it in a new 'processed_text' column.
df['processed_text'] = df['tokens'].map(' '.join)
print("\nProcessed Text:\n", df)



Processed Text:
                                            text  \
0                     this is a sample sentence   
1  preprocessing text data is essential for nlp   
2     text preprocessing involves several steps   

                                           tokens  \
0                              [sample, sentence]   
1     [preprocessing, text, data, essential, nlp]   
2  [text, preprocessing, involves, several, step]   

                             processed_text  
0                           sample sentence  
1     preprocessing text data essential nlp  
2  text preprocessing involves several step  


In [11]:

# Bag of Words (Count Vectorizer)
# Learn the vocabulary from the processed sentences and count, per sentence,
# how often each vocabulary term occurs.
count_vectorizer = CountVectorizer()
X_counts = count_vectorizer.fit_transform(df['processed_text'])

# Column i of the matrix corresponds to entry i of the feature-name array.
count_feature_names = count_vectorizer.get_feature_names_out()
count_matrix = X_counts.toarray()  # dense copy, fine for this tiny corpus
print("\nCount Vectorizer Feature Names:", count_feature_names)
print("Count Vectorizer Output:\n", count_matrix)



Count Vectorizer Feature Names: ['data' 'essential' 'involves' 'nlp' 'preprocessing' 'sample' 'sentence'
 'several' 'step' 'text']
Count Vectorizer Output:
 [[0 0 0 0 0 1 1 0 0 0]
 [1 1 0 1 1 0 0 0 0 1]
 [0 0 1 0 1 0 0 1 1 1]]


In [12]:

# TF-IDF Vectorizer
# Same corpus as the bag-of-words cell, but each term is weighted by TF-IDF
# instead of a raw count.
processed_corpus = df['processed_text']
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(processed_corpus)

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_matrix = X_tfidf.toarray()  # dense copy for printing
print("\nTF-IDF Vectorizer Feature Names:", tfidf_feature_names)
print("TF-IDF Vectorizer Output:\n", tfidf_matrix)



TF-IDF Vectorizer Feature Names: ['data' 'essential' 'involves' 'nlp' 'preprocessing' 'sample' 'sentence'
 'several' 'step' 'text']
TF-IDF Vectorizer Output:
 [[0.         0.         0.         0.         0.         0.70710678
  0.70710678 0.         0.         0.        ]
 [0.49047908 0.49047908 0.         0.49047908 0.37302199 0.
  0.         0.         0.         0.37302199]
 [0.         0.         0.49047908 0.         0.37302199 0.
  0.         0.49047908 0.49047908 0.37302199]]
