# Assignment: Text Cleaning and Preprocessing with NLTK

Coder: Hisham D Macaraya

### Importing necessary libraries and packages

In [1]:
# Importing NLTK library for natural language processing tasks
import nltk
from nltk.tokenize import word_tokenize  # For tokenizing text into words
from nltk.corpus import stopwords  # For accessing common stop words
from nltk.stem import PorterStemmer  # For stemming words to their root form

In [2]:
# Fetch the NLTK resources that the later cells rely on. The two calls are
# kept as separate statements: the value of the last expression is what the
# notebook displays for this cell (True in the recorded output). Re-running
# the cell is harmless; the recorded log shows NLTK reporting each package
# as "already up-to-date".
# NOTE(review): newer NLTK releases reportedly load the tokenizer data from a
# 'punkt_tab' resource instead of 'punkt' — confirm against the installed
# version if word_tokenize raises a LookupError.
nltk.download('punkt')  # Tokenizer data needed for tokenization
nltk.download('stopwords')  # Stop-word lists exposed through nltk.corpus.stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hisha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Sample Dataset: User Reviews

In [3]:
# Sample corpus: six short user reviews of a hypothetical product.
# The texts mix repeated punctuation ("!!!", "..."), contractions ("can't",
# "it's") and capitalised sentence starts, which the preprocessing steps in
# the following cells have to deal with.
reviews = [
    # Repeated exclamation marks at the end
    "I absolutely love this product! It has changed my life!!!",
    # Two short sentences, starts with a negated imperative
    "Do not buy this. It broke after just a day.",
    # Trailing ellipsis
    "Decent product for the price, I guess...",
    # Contains the contraction "can't"
    "Absolutely fantastic! I can't believe how good this is.",
    # Two short sentences
    "Waste of money. Stopped working after a week.",
    # Contraction "it's" and an ellipsis without surrounding spaces
    "For the price, it's okay...not great, but not terrible either.",
]

### Task 1: Tokenization

In [4]:
# Split every review into word and punctuation tokens.
# The result is one token list per review, in the same order as `reviews`,
# so later steps can work on small units instead of raw strings.
tokenized_reviews = list(map(word_tokenize, reviews))

# Show the first two reviews as a quick sanity check; the heading and the
# token list are printed on separate lines.
print("Tokens for the first review:", tokenized_reviews[0], sep="\n")
print("\nTokens for the second review:", tokenized_reviews[1], sep="\n")

Tokens for the first review:
['I', 'absolutely', 'love', 'this', 'product', '!', 'It', 'has', 'changed', 'my', 'life', '!', '!', '!']

Tokens for the second review:
['Do', 'not', 'buy', 'this', '.', 'It', 'broke', 'after', 'just', 'a', 'day', '.']


### Task 2: Lowercasing

In [5]:
# Normalise case: every token of every review is lower-cased so that
# variants such as 'Product' and 'product' compare equal downstream.
# A new nested list is built; `tokenized_reviews` is left untouched.
lowercased_reviews = [list(map(str.lower, tokens)) for tokens in tokenized_reviews]

# Display the third review (index 2) to confirm the conversion.
print("\nLowercased tokens for the third review:", lowercased_reviews[2], sep="\n")


Lowercased tokens for the third review:
['decent', 'product', 'for', 'the', 'price', ',', 'i', 'guess', '...']


### Task 3: Removing Punctuation and Stopwords

In [6]:
# Drop punctuation tokens and English stop words from each review.
# Stop words are very common words (e.g. 'the', 'is') that add little
# meaning; a set is used so each membership test is cheap.
stop_words = {*stopwords.words('english')}

# A token is discarded when it is not purely alphanumeric (punctuation,
# fragments containing an apostrophe) or when it is a stop word.
# NOTE(review): the tokenizer splits contractions, so the alphanumeric half
# still passes this filter — the recorded output for the fourth review keeps
# 'ca' from "can't". Negation words may also be removed as stop words;
# confirm that is acceptable for the intended analysis.
cleaned_reviews = [
    [token for token in tokens if not (not token.isalnum() or token in stop_words)]
    for tokens in lowercased_reviews
]

# Display the fourth review (index 3) after cleaning.
print("\nCleaned tokens for the fourth review (no punctuation and stopwords):", cleaned_reviews[3], sep="\n")


Cleaned tokens for the fourth review (no punctuation and stopwords):
['absolutely', 'fantastic', 'ca', 'believe', 'good']


### Task 4: Stemming

In [7]:
# Reduce each cleaned token to its stem with the Porter algorithm so that
# inflected forms collapse onto a common base (e.g. 'running' -> 'run').
# Stems are not guaranteed to be dictionary words (see 'wast' in the output).
stemmer = PorterStemmer()
stemmed_reviews = [list(map(stemmer.stem, tokens)) for tokens in cleaned_reviews]

# Display the fifth review (index 4) after stemming.
print("\nStemmed tokens for the fifth review:", stemmed_reviews[4], sep="\n")


Stemmed tokens for the fifth review:
['wast', 'money', 'stop', 'work', 'week']
