In [1]:
# Import necessary libraries
import warnings
warnings.filterwarnings('ignore')

# Step 0: Ensure necessary NLTK resources are downloaded
import nltk
nltk.download('punkt')  # For tokenization
nltk.download('stopwords')  # For stopword removal
nltk.download('wordnet')  # For lemmatization


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mkha7672\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mkha7672\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mkha7672\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
!pip install pyspellchecker



In [3]:
raw_docs = [
    "I loooove NLP!!! It's sooo coool. #nlp #fun @ai",
    "Artificial Intelligence is the future of technology... $$$",
    "Python is great for Data Science, and it's widely used in AI & ML."
]

# Step 1: Convert to Lowercase

In [4]:
import string

# Step 1: Convert to lowercase
raw_docs = [doc.lower() for doc in raw_docs]
print("Lowercased documents:\n", raw_docs)


Lowercased documents:
 ["i loooove nlp!!! it's sooo coool. #nlp #fun @ai", 'artificial intelligence is the future of technology... $$$', "python is great for data science, and it's widely used in ai & ml."]


# Step 2: Tokenization (Word and Sentence Tokenization)


In [5]:
from nltk.tokenize import word_tokenize, sent_tokenize

# Word tokenization
tokenized_docs = [word_tokenize(doc) for doc in raw_docs]
print("Word Tokenized documents:\n", tokenized_docs)

# Sentence tokenization
sent_token = [sent_tokenize(doc) for doc in raw_docs]
print("\nSentence Tokenized documents:\n", sent_token)


Word Tokenized documents:
 [['i', 'loooove', 'nlp', '!', '!', '!', 'it', "'s", 'sooo', 'coool', '.', '#', 'nlp', '#', 'fun', '@', 'ai'], ['artificial', 'intelligence', 'is', 'the', 'future', 'of', 'technology', '...', '$', '$', '$'], ['python', 'is', 'great', 'for', 'data', 'science', ',', 'and', 'it', "'s", 'widely', 'used', 'in', 'ai', '&', 'ml', '.']]

Sentence Tokenized documents:
 [['i loooove nlp!!!', "it's sooo coool.", '#nlp #fun @ai'], ['artificial intelligence is the future of technology... $$$'], ["python is great for data science, and it's widely used in ai & ml."]]


# Step 3: Punctuation Removal
Removing punctuation helps reduce noise in the text data.

re.sub(), which is a function for substituting patterns in text. 

In [6]:
#using re to remove punctuation from text by finding and replacing any punctuation marks(like . , ? ! etc.) with an empty string.
import re

# Removing punctuation
regex = re.compile('[%s]' % re.escape(string.punctuation)) #escapes any special characters

tokenized_docs_no_punctuation = []

for review in tokenized_docs:
    new_review = []
    for token in review:
        new_token = regex.sub(u'', token) # Replace punctuation with nothing
        if new_token != u'':  # Only keep non-empty tokens after removing punctuation
            new_review.append(new_token)
    tokenized_docs_no_punctuation.append(new_review)

print("\nDocuments after punctuation removal:\n", tokenized_docs_no_punctuation)



Documents after punctuation removal:
 [['i', 'loooove', 'nlp', 'it', 's', 'sooo', 'coool', 'nlp', 'fun', 'ai'], ['artificial', 'intelligence', 'is', 'the', 'future', 'of', 'technology'], ['python', 'is', 'great', 'for', 'data', 'science', 'and', 'it', 's', 'widely', 'used', 'in', 'ai', 'ml']]


# Step 4: Remove Stopwords
We remove common words that don’t contribute much to the meaning.

In [7]:
from nltk.corpus import stopwords

# Removing stopwords
stop_words = set(stopwords.words('english'))

tokenized_docs_no_stopwords = []
for doc in tokenized_docs_no_punctuation:
    new_term_vector = [word for word in doc if word not in stop_words]
    tokenized_docs_no_stopwords.append(new_term_vector)

print("\nDocuments after stopword removal:\n", tokenized_docs_no_stopwords)



Documents after stopword removal:
 [['loooove', 'nlp', 'sooo', 'coool', 'nlp', 'fun', 'ai'], ['artificial', 'intelligence', 'future', 'technology'], ['python', 'great', 'data', 'science', 'widely', 'used', 'ai', 'ml']]


# Step 5: Lemmatization
Lemmatization reduces words to their root form, ensuring they are valid dictionary words.

In [8]:
from nltk.stem import WordNetLemmatizer

# Initialize lemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Apply lemmatization
preprocessed_docs = []
for doc in tokenized_docs_no_stopwords:
    final_doc = [wordnet_lemmatizer.lemmatize(word) for word in doc]
    preprocessed_docs.append(final_doc)

print("\nDocuments after lemmatization:\n", preprocessed_docs)



Documents after lemmatization:
 [['loooove', 'nlp', 'sooo', 'coool', 'nlp', 'fun', 'ai'], ['artificial', 'intelligence', 'future', 'technology'], ['python', 'great', 'data', 'science', 'widely', 'used', 'ai', 'ml']]


In [9]:
#Using PorterStemmer for Stemming
from nltk.stem.porter import PorterStemmer

# Initialize the Porter Stemmer
porter_stemmer = PorterStemmer()

# Apply stemming
stemmed_docs = []
for doc in tokenized_docs_no_stopwords:
    final_doc = [porter_stemmer.stem(word) for word in doc]
    stemmed_docs.append(final_doc)

print("\nDocuments after stemming (Porter Stemmer):\n", stemmed_docs)



Documents after stemming (Porter Stemmer):
 [['loooov', 'nlp', 'sooo', 'coool', 'nlp', 'fun', 'ai'], ['artifici', 'intellig', 'futur', 'technolog'], ['python', 'great', 'data', 'scienc', 'wide', 'use', 'ai', 'ml']]


Stemming is good when you need speed and don’t care too much about whether the root word is a valid word. It's often used in information retrieval and search engines.

Lemmatization is preferred when you need accuracy and want to retain the correct word forms. It is better for tasks like machine translation or where the output needs to be human-readable.

# Advanced Steps
Spell Checking (Optional) Spell checking can correct misspelled words like "loooove" and "sooo."

In [10]:
from spellchecker import SpellChecker

# Initialize the spell checker
spell = SpellChecker()

# List of domain-specific words that shouldn't be corrected
# These words might be specific to your domain or dataset
custom_words = ['nlp', 'ai', 'ml']

# Sample tokenized documents after stopword removal
# In a real scenario, this would be the output from previous preprocessing steps
tokenized_docs_no_stopwords = [
    ['loooove', 'nlp', 'sooo', 'coool', 'nlp', 'fun', 'ai'],
    ['artificial', 'intelligence', 'future', 'technology'],
    ['python', 'great', 'data', 'science', 'widely', 'used', 'ai', 'ml']
]

# This prevents the spell checker from trying to correct these words
spell.word_frequency.load_words(custom_words)

# Function to correct spelling in a single document
def correct_spelling(doc):
    corrected_doc = []
    for word in doc:
        if word in custom_words:
            # Skip spell check for custom words
            corrected_doc.append(word)
        else:
            # Attempt to correct the word
            corrected_word = spell.correction(word)
            # If correction fails (returns None), keep the original word
            corrected_doc.append(corrected_word if corrected_word else word)
    return corrected_doc

# Correct spelling mistakes in all documents
corrected_docs = [correct_spelling(doc) for doc in tokenized_docs_no_stopwords]

# Print the results
print("\nOriginal documents:")
for doc in tokenized_docs_no_stopwords:
    print(doc)

print("\nDocuments after spell correction:")
for doc in corrected_docs:
    print(doc)

# Example of how spelling correction can affect word counts
from collections import Counter

print("\nWord frequency before correction:")
before_correction = Counter([word for doc in tokenized_docs_no_stopwords for word in doc])
print(before_correction.most_common(5))

print("\nWord frequency after correction:")
after_correction = Counter([word for doc in corrected_docs for word in doc])
print(after_correction.most_common(5))



Original documents:
['loooove', 'nlp', 'sooo', 'coool', 'nlp', 'fun', 'ai']
['artificial', 'intelligence', 'future', 'technology']
['python', 'great', 'data', 'science', 'widely', 'used', 'ai', 'ml']

Documents after spell correction:
['loooove', 'nlp', 'soon', 'cool', 'nlp', 'fun', 'ai']
['artificial', 'intelligence', 'future', 'technology']
['python', 'great', 'data', 'science', 'widely', 'used', 'ai', 'ml']

Word frequency before correction:
[('nlp', 2), ('ai', 2), ('loooove', 1), ('sooo', 1), ('coool', 1)]

Word frequency after correction:
[('nlp', 2), ('ai', 2), ('loooove', 1), ('soon', 1), ('cool', 1)]
