In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import re
import string
import warnings
warnings.filterwarnings('ignore')

# NLTK imports
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download required NLTK data
print("Downloading NLTK resources...")
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('omw-1.4', quiet=True)
print(" NLTK resources downloaded!")

Downloading NLTK resources...
 NLTK resources downloaded!


In [4]:
# Load the raw data
data_path = r'reviews_data_dump\reviews_tea\data.csv'
df = pd.read_csv(data_path)

print(f" Dataset loaded: {len(df)} reviews")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

 Dataset loaded: 9170 reviews

Columns: ['reviewer_name', 'reviewer_rating', 'review_title', 'review_text', 'place_of_review', 'Date_of_review', 'up_votes', 'Down_votes']


Unnamed: 0,reviewer_name,reviewer_rating,review_title,review_text,place_of_review,Date_of_review,up_votes,Down_votes
0,Subhro Banerjee,5,Worth every penny,Great product ü§ó with great deals üòçüòç Tata Tea G...,"Certified Buyer, Budge Budge",Subhro Banerjee,236,59
1,Shiv chandra Jha,5,Great product,Very nice and super qwality tea taste are grea...,"Certified Buyer, Saharsa",Shiv chandra Jha,225,79
2,Flipkart Customer,5,Highly recommended,Great test great quality great price point tim...,"Certified Buyer, Sri Ganganagar",Flipkart Customer,89,27
3,DTH Y,4,Very Good,Nice üòäREAD MORE,"Certified Buyer, Phaltan",DTH Y,30,6
4,Bhavesh Godhani,5,Classy product,Very Good Tata tea product.READ MORE,"Certified Buyer, Ahmedabad",Bhavesh Godhani,69,22


In [5]:
# Handle missing values
print("Handling missing values...")
print("\nMissing values before:")
print(df.isnull().sum())

# Fill missing reviews with empty string
df['review_text'] = df['review_text'].fillna('')

# Remove rows where Review is completely empty (after filling)
df = df[df['review_text'].str.strip() != '']

print(f"\n Removed empty reviews")
print(f"Remaining reviews: {len(df)}")

Handling missing values...

Missing values before:
reviewer_name      0
reviewer_rating    0
review_title       0
review_text        0
place_of_review    0
Date_of_review     0
up_votes           0
Down_votes         0
dtype: int64

 Removed empty reviews
Remaining reviews: 9170


In [6]:
# Create sentiment labels
print("Creating sentiment labels...")
print("=" * 50)

def create_sentiment_label(rating):
    """
    Convert rating to binary sentiment:
    - Positive: 4-5 stars
    - Negative: 1-2 stars
    - Neutral (3 stars) will be excluded
    """
    if rating in [4, 5]:
        return 1  # Positive
    elif rating in [1, 2]:
        return 0  # Negative
    else:
        return -1  # Neutral (to be excluded)

df['sentiment'] = df['reviewer_rating'].apply(create_sentiment_label)

print(f"Label distribution before filtering:")
print(df['sentiment'].value_counts())

# Exclude neutral reviews (3 stars)
df_filtered = df[df['sentiment'] != -1].copy()

print(f"\n Excluding neutral reviews (Rating = 3)")
print(f"Dataset size after filtering: {len(df_filtered)}")
print(f"\nFinal sentiment distribution:")
print(df_filtered['sentiment'].value_counts())
print(f"\nPositive: {(df_filtered['sentiment'] == 1).sum()} ({(df_filtered['sentiment'] == 1).sum()/len(df_filtered)*100:.1f}%)")
print(f"Negative: {(df_filtered['sentiment'] == 0).sum()} ({(df_filtered['sentiment'] == 0).sum()/len(df_filtered)*100:.1f}%)")

Creating sentiment labels...
Label distribution before filtering:
sentiment
1    8253
0     917
Name: count, dtype: int64

 Excluding neutral reviews (Rating = 3)
Dataset size after filtering: 9170

Final sentiment distribution:
sentiment
1    8253
0     917
Name: count, dtype: int64

Positive: 8253 (90.0%)
Negative: 917 (10.0%)


In [7]:
# Text cleaning functions
print("Defining text preprocessing functions...\n")

# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Remove negation words from stop words (important for sentiment)
negation_words = {'not', 'no', 'nor', 'neither', 'never', 'none', 'nothing', 'nobody', 'nowhere'}
stop_words = stop_words - negation_words

def clean_text(text):
    """
    Comprehensive text cleaning:
    1. Convert to lowercase
    2. Remove URLs
    3. Remove HTML tags
    4. Remove special characters and digits
    5. Remove extra whitespace
    """
    # Convert to string and lowercase
    text = str(text).lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    
    # Remove special characters and digits (keep only letters)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def remove_stopwords(text):
    """
    Remove stop words while preserving negation words
    """
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

def lemmatize_text(text):
    """
    Lemmatize words to their base form
    """
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

def preprocess_text(text):
    """
    Complete preprocessing pipeline
    """
    text = clean_text(text)
    text = remove_stopwords(text)
    text = lemmatize_text(text)
    return text

print(" Text preprocessing functions defined!")

# Test preprocessing on sample text
sample_text = "This is an AMAZING product!!! I love it üòä http://example.com"
print(f"\nSample preprocessing:")
print(f"Original: {sample_text}")
print(f"Cleaned: {clean_text(sample_text)}")
print(f"After stopwords removal: {remove_stopwords(clean_text(sample_text))}")
print(f"After lemmatization: {preprocess_text(sample_text)}")

Defining text preprocessing functions...

 Text preprocessing functions defined!

Sample preprocessing:
Original: This is an AMAZING product!!! I love it üòä http://example.com
Cleaned: this is an amazing product i love it
After stopwords removal: amazing product love
After lemmatization: amazing product love


In [8]:
# Apply preprocessing to all reviews
print("Applying preprocessing to all reviews...")
print("This may take a few minutes...\n")

import time
start_time = time.time()

# Store original review
df_filtered['original_review'] = df_filtered['review_text']

# Apply preprocessing
df_filtered['cleaned_review'] = df_filtered['review_text'].apply(preprocess_text)

# Remove reviews that became empty after preprocessing
df_filtered = df_filtered[df_filtered['cleaned_review'].str.strip() != '']

end_time = time.time()
print(f" Preprocessing completed in {end_time - start_time:.2f} seconds")
print(f"Final dataset size: {len(df_filtered)} reviews")

Applying preprocessing to all reviews...
This may take a few minutes...

 Preprocessing completed in 1.00 seconds
Final dataset size: 9170 reviews


In [9]:
# Show preprocessing examples
print("Preprocessing Examples:")
print("=" * 80)

# Show 5 random examples
sample_indices = df_filtered.sample(5).index

for i, idx in enumerate(sample_indices, 1):
    print(f"\nExample {i}:")
    print(f"Rating: {df_filtered.loc[idx, 'reviewer_rating']} stars")
    print(f"Sentiment: {'Positive' if df_filtered.loc[idx, 'sentiment'] == 1 else 'Negative'}")
    print(f"\nOriginal:")
    original = df_filtered.loc[idx, 'original_review']
    print(f"  {original[:200]}{'...' if len(original) > 200 else ''}")
    print(f"\nCleaned:")
    print(f"  {df_filtered.loc[idx, 'cleaned_review'][:200]}")
    print("-" * 80)

Preprocessing Examples:

Example 1:
Rating: 5 stars
Sentiment: Positive

Original:
  Very nice and super qwality tea taste are greater so i am really empress your service and qwality price some one high but I don't compare price and choices so i am so happy to choose this product than...

Cleaned:
  nice super qwality tea taste greater really empress service qwality price one high dont compare price choice happy choose product thank much ekart seller faster deliveryread
--------------------------------------------------------------------------------

Example 2:
Rating: 4 stars
Sentiment: Positive

Original:
  Nice üòäREAD MORE

Cleaned:
  nice read
--------------------------------------------------------------------------------

Example 3:
Rating: 5 stars
Sentiment: Positive

Original:
  Very nice and super qwality tea taste are greater so i am really empress your service and qwality price some one high but I don't compare price and choices so i am so happy to choose this product than

In [8]:
# Calculate text statistics after preprocessing
print("Text Statistics After Preprocessing:")
print("=" * 50)

df_filtered['cleaned_word_count'] = df_filtered['cleaned_review'].apply(lambda x: len(str(x).split()))

print(f"Average words per review: {df_filtered['cleaned_word_count'].mean():.1f}")
print(f"Median words per review: {df_filtered['cleaned_word_count'].median():.0f}")
print(f"Min words: {df_filtered['cleaned_word_count'].min():.0f}")
print(f"Max words: {df_filtered['cleaned_word_count'].max():.0f}")

print(f"\nWord count by sentiment:")
print(df_filtered.groupby('sentiment')['cleaned_word_count'].agg(['mean', 'median', 'min', 'max']))

Text Statistics After Preprocessing:
Average words per review: 10.8
Median words per review: 6
Min words: 1
Max words: 33

Word count by sentiment:
           mean  median  min  max
sentiment                        
0           9.0     9.0    9    9
1          11.0     4.0    1   33


In [10]:
# Create final dataset with required columns
print("Creating final preprocessed dataset...\n")

# Select and rename columns
processed_df = df_filtered[['original_review', 'cleaned_review', 'reviewer_rating', 'sentiment']].copy()
processed_df.columns = ['original_review', 'processed_review', 'rating', 'sentiment']

# Reset index
processed_df = processed_df.reset_index(drop=True)

print(f"Final dataset shape: {processed_df.shape}")
print(f"\nColumns: {processed_df.columns.tolist()}")
print(f"\nFirst few rows:")
processed_df.head()

Creating final preprocessed dataset...

Final dataset shape: (9170, 4)

Columns: ['original_review', 'processed_review', 'rating', 'sentiment']

First few rows:


Unnamed: 0,original_review,processed_review,rating,sentiment
0,Great product ü§ó with great deals üòçüòç Tata Tea G...,great product great deal tata tea gold best ti...,5,1
1,Very nice and super qwality tea taste are grea...,nice super qwality tea taste greater really em...,5,1
2,Great test great quality great price point tim...,great test great quality great price point tim...,5,1
3,Nice üòäREAD MORE,nice read,4,1
4,Very Good Tata tea product.READ MORE,good tata tea productread,5,1


In [11]:
# Train-test split
from sklearn.model_selection import train_test_split

print("Performing train-test split...")
print("=" * 50)

# Split with stratification to maintain class balance
X = processed_df['processed_review']
y = processed_df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

print(f"\n Data split completed!")
print(f"\nTraining set size: {len(X_train)} ({len(X_train)/len(processed_df)*100:.1f}%)")
print(f"Test set size: {len(X_test)} ({len(X_test)/len(processed_df)*100:.1f}%)")

print(f"\nTraining set sentiment distribution:")
print(y_train.value_counts())
print(f"Positive: {(y_train == 1).sum()/len(y_train)*100:.1f}%")
print(f"Negative: {(y_train == 0).sum()/len(y_train)*100:.1f}%")

print(f"\nTest set sentiment distribution:")
print(y_test.value_counts())
print(f"Positive: {(y_test == 1).sum()/len(y_test)*100:.1f}%")
print(f"Negative: {(y_test == 0).sum()/len(y_test)*100:.1f}%")

Performing train-test split...

 Data split completed!

Training set size: 7336 (80.0%)
Test set size: 1834 (20.0%)

Training set sentiment distribution:
sentiment
1    6602
0     734
Name: count, dtype: int64
Positive: 90.0%
Negative: 10.0%

Test set sentiment distribution:
sentiment
1    1651
0     183
Name: count, dtype: int64
Positive: 90.0%
Negative: 10.0%


In [12]:
# Save preprocessed data
import os

print("Saving preprocessed data...")
print("=" * 50)

# Create data directory if it doesn't exist
os.makedirs('data', exist_ok=True)

# Save full processed dataset
output_path = 'data/processed_reviews.csv'
processed_df.to_csv(output_path, index=False)
print(f" Saved: {output_path}")

# Save train and test splits
train_df = pd.DataFrame({
    'review': X_train,
    'sentiment': y_train
})
test_df = pd.DataFrame({
    'review': X_test,
    'sentiment': y_test
})

train_df.to_csv('data/train_reviews.csv', index=False)
print(f" Saved: data/train_reviews.csv")

test_df.to_csv('data/test_reviews.csv', index=False)
print(f" Saved: data/test_reviews.csv")

print(f"\n" + "=" * 50)
print(" All preprocessing completed successfully!")
print("=" * 50)

Saving preprocessed data...
 Saved: data/processed_reviews.csv
 Saved: data/train_reviews.csv
 Saved: data/test_reviews.csv

 All preprocessing completed successfully!


In [12]:
# Summary
print("\n" + "=" * 70)
print("PREPROCESSING SUMMARY")
print("=" * 70)

print(f"\n‚úì COMPLETED STEPS:")
print(f"  1. Loaded {len(df)} raw reviews")
print(f"  2. Removed missing/empty reviews")
print(f"  3. Created binary sentiment labels (Positive: 1, Negative: 0)")
print(f"  4. Excluded neutral reviews (3 stars)")
print(f"  5. Applied text cleaning (URLs, HTML, special chars)")
print(f"  6. Removed stop words (kept negation words)")
print(f"  7. Applied lemmatization")
print(f"  8. Split data (80% train, 20% test)")
print(f"  9. Saved preprocessed datasets")

print(f"\nüìä FINAL STATISTICS:")
print(f"  - Total usable reviews: {len(processed_df)}")
print(f"  - Training samples: {len(X_train)}")
print(f"  - Test samples: {len(X_test)}")
print(f"  - Positive reviews: {(processed_df['sentiment'] == 1).sum()} ({(processed_df['sentiment'] == 1).sum()/len(processed_df)*100:.1f}%)")
print(f"  - Negative reviews: {(processed_df['sentiment'] == 0).sum()} ({(processed_df['sentiment'] == 0).sum()/len(processed_df)*100:.1f}%)")
print(f"  - Average words per review: {processed_df['processed_review'].apply(lambda x: len(str(x).split())).mean():.1f}")

print(f"\nüìÅ OUTPUT FILES:")
print(f"  - data/processed_reviews.csv (full dataset)")
print(f"  - data/train_reviews.csv (training set)")
print(f"  - data/test_reviews.csv (test set)")

print(f"\nüéØ NEXT STEP:")
print(f"  ‚Üí Proceed to feature extraction (3_feature_extraction.ipynb)")

print("\n" + "=" * 70)


PREPROCESSING SUMMARY

‚úì COMPLETED STEPS:
  1. Loaded 9170 raw reviews
  2. Removed missing/empty reviews
  3. Created binary sentiment labels (Positive: 1, Negative: 0)
  4. Excluded neutral reviews (3 stars)
  5. Applied text cleaning (URLs, HTML, special chars)
  6. Removed stop words (kept negation words)
  7. Applied lemmatization
  8. Split data (80% train, 20% test)
  9. Saved preprocessed datasets

üìä FINAL STATISTICS:
  - Total usable reviews: 9170
  - Training samples: 7336
  - Test samples: 1834
  - Positive reviews: 8253 (90.0%)
  - Negative reviews: 917 (10.0%)
  - Average words per review: 10.8

üìÅ OUTPUT FILES:
  - data/processed_reviews.csv (full dataset)
  - data/train_reviews.csv (training set)
  - data/test_reviews.csv (test set)

üéØ NEXT STEP:
  ‚Üí Proceed to feature extraction (3_feature_extraction.ipynb)

