# Data Cleaning and Preprocessing

This notebook demonstrates the data cleaning and preprocessing steps for retail review sentiment analysis.


In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download required NLTK data.
# 'punkt_tab' is the tokenizer resource that newer NLTK releases load inside
# word_tokenize; 'punkt' is kept so older installs keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource, quiet=True)

print("Libraries imported successfully!")


## Load Raw Data


In [None]:
# Read the raw review export into a DataFrame, then report its row count
# and column names before previewing the first rows.
raw_reviews_path = '../data/raw_reviews.csv'
df = pd.read_csv(raw_reviews_path)

print(f"Loaded {df.shape[0]} reviews")
column_names = df.columns.tolist()
print(f"\nColumns: {column_names}")
print(f"\nFirst few rows:")
df.head()


## Data Cleaning Functions


In [None]:
# Shared NLP resources read by tokenize_and_lemmatize:
# - stop_words: NLTK's English stopword list, held as a set for fast lookups
# - lemmatizer: one WordNet lemmatizer instance reused for every token
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize a raw review into lowercase, letters-only, single-spaced text.

    Steps, in order: replace HTML tags with a space, remove URLs, delete
    apostrophes, turn every remaining non-letter into a space, lowercase,
    and collapse runs of whitespace.

    Args:
        text: Raw review value. Anything ``pd.isna`` reports as missing
            (``None``, ``NaN``) is treated as empty; other values are
            converted with ``str()``.

    Returns:
        The cleaned string, or ``""`` when the input is missing or contains
        nothing but markup, punctuation, or digits.
    """
    if pd.isna(text):
        return ""

    text = str(text)
    # Tags become a space so "great<br>product" does not fuse into one word.
    text = re.sub(r'<[^>]+>', ' ', text)
    # The dot after "www" is escaped and the match starts at a word boundary;
    # an unescaped dot also matches a space and would eat text like "awww so".
    text = re.sub(r'\b(?:https?://|www\.)\S+', '', text)
    # Apostrophes are deleted (not spaced) so contractions stay one token:
    # "don't" -> "dont".
    text = re.sub(r"['’]", '', text)
    # Any other non-letter becomes a space so words joined only by
    # punctuation or digits ("good.bad") remain separate words.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = text.lower()
    # Collapse the whitespace introduced above and trim the ends.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def tokenize_and_lemmatize(text):
    """Split text into tokens, drop stopwords and short tokens, lemmatize the rest.

    Args:
        text: Text to process. A falsy value (empty string, ``None``)
            short-circuits to an empty result.

    Returns:
        The lemmatized tokens that survived filtering, joined by single
        spaces; ``""`` when the input is falsy or no token survives.
    """
    if not text:
        return ""

    kept = []
    for word in word_tokenize(text):
        # Skip tokens of one or two characters and English stopwords.
        if len(word) <= 2 or word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))

    return ' '.join(kept)

print("Cleaning functions defined!")


## Preprocessing Steps


In [None]:
# Step 1: discard rows that have no review text at all.
print("Handling missing values...")
df = df.dropna(subset=['review_text'])
print(f"Reviews after removing missing values: {df.shape[0]}")

# Step 2: discard rows whose review text repeats an earlier row
# (the first occurrence is the one that is kept).
print("\nRemoving duplicates...")
initial_count = df.shape[0]
df = df.drop_duplicates(subset=['review_text'], keep='first')
remaining_count = df.shape[0]
duplicates_removed = initial_count - remaining_count
print(f"Removed {duplicates_removed} duplicate reviews")
print(f"Reviews remaining: {remaining_count}")


In [None]:
# Clean text: apply clean_text to every review, then drop rows whose
# cleaned text came out empty.
print("Cleaning text...")
df['cleaned_text'] = df['review_text'].apply(clean_text)
# .copy() makes the filtered frame independent of the original, so adding
# columns to it afterwards does not trigger pandas' SettingWithCopyWarning.
df = df[df['cleaned_text'].str.len() > 0].copy()
print(f"Reviews after cleaning: {len(df)}")

# Show example. Skipped when every review was filtered out, because
# df.iloc[0] raises IndexError on an empty frame.
if not df.empty:
    print("\nExample of cleaned text:")
    print(f"Original: {df.iloc[0]['review_text']}")
    print(f"Cleaned: {df.iloc[0]['cleaned_text']}")


In [None]:
# Tokenize and lemmatize: apply tokenize_and_lemmatize to the cleaned text,
# then drop rows where no token survived.
print("Tokenizing and lemmatizing...")
df['processed_text'] = df['cleaned_text'].apply(tokenize_and_lemmatize)
# .copy() keeps the filtered frame independent, so any later column
# assignment does not trigger pandas' SettingWithCopyWarning.
df = df[df['processed_text'].str.len() > 0].copy()
print(f"Final processed reviews: {len(df)}")

# Show example. Skipped when every review was filtered out, because
# df.iloc[0] raises IndexError on an empty frame.
if not df.empty:
    print("\nExample of processed text:")
    print(f"Cleaned: {df.iloc[0]['cleaned_text']}")
    print(f"Processed: {df.iloc[0]['processed_text']}")


## Save Processed Data


In [None]:
# Write the processed reviews to CSV without the row index, report the
# final shape, and preview the first rows.
processed_path = '../data/processed_reviews.csv'
df.to_csv(processed_path, index=False)
print(f"Processed data saved to {processed_path}")
print(f"\nFinal dataset shape: {df.shape}")
df.head()
