In [2]:
import pandas as pd
import nltk
from textblob import TextBlob
import string
from nltk.corpus import stopwords

# Make sure the NLTK 'stopwords' corpus is available locally before it is read.
nltk.download('stopwords')
# English stopword list, materialised once as a set so the membership test used
# during text cleaning (`w not in STOPWORDS`) is a constant-time lookup.
STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mahaant\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the raw Flipkart product-review CSV into the working DataFrame `df`.
# NOTE(review): the file is decoded as ISO-8859-1, yet the preview still shows
# '?' / 'ÿ' artefacts in ProductName and Price, so the declared encoding may not
# match how the file was written — TODO confirm against the data source.
df = pd.read_csv('../data/raw/flipkart_product.csv', encoding='ISO-8859-1')

# Show the column names, then a three-row preview as the cell's rich output.
print("Original Columns:", df.columns)
df.head(3)

Original Columns: Index(['ProductName', 'Price', 'Rate', 'Review', 'Summary'], dtype='object')


Unnamed: 0,ProductName,Price,Rate,Review,Summary
0,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",5,Super!,Great cooler.. excellent air flow and for this...
1,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",5,Awesome,Best budget 2 fit cooler. Nice cooling
2,Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...,"??3,999",3,Fair,The quality is good but the power of air is de...


In [5]:
def clean_price(price):
    """
    Convert a raw price label into a whole number, e.g. '??3,999' -> 3999.

    Every character other than the ASCII digits 0-9 is discarded (currency
    symbols, thousands separators, mis-decoded bytes, superscript numerals
    such as '¹'); the digits that remain are concatenated and parsed as an int.

    Returns 0 when `price` is not a string or contains no ASCII digit.

    NOTE: a decimal point is discarded like any other symbol, so a value such
    as '3,999.50' would come out as 399950.
    """
    if isinstance(price, str):
        # Range comparison on single characters admits exactly '0'..'9' and
        # rejects other Unicode numerals.
        digits_only = ''.join(filter(lambda ch: '0' <= ch <= '9', price))
        if digits_only:
            return int(digits_only)
    return 0

def clean_text_manual(text):
    """
    Normalise a review string for downstream text processing.

    Steps, in order: lowercase the text, delete every character listed in
    string.punctuation, split on whitespace, drop tokens that appear in
    STOPWORDS, and re-join the remaining tokens with single spaces.

    Returns "" for any non-string input (for example a missing cell).
    """
    if isinstance(text, str):
        # Translation table that maps each punctuation character to "deleted".
        strip_punct = str.maketrans('', '', string.punctuation)
        tokens = text.lower().translate(strip_punct).split()
        return " ".join(tok for tok in tokens if tok not in STOPWORDS)
    return ""

# Apply the cleaning. Each assignment below adds one derived column to `df` in place.
# 1. Alias 'Summary' as our main 'Review_Text' column because that's where the
#    review body is, and alias 'Rate' as 'Star_Rating'.
df['Review_Text'] = df['Summary']  
df['Star_Rating'] = df['Rate']

# 2. Clean the text (lowercase, strip punctuation, drop stopwords).
df['Cleaned_Review'] = df['Review_Text'].apply(clean_text_manual)

# 3. (Optional) Clean the price: keep only the digits 0-9 and parse them as an int.
df['Cleaned_Price'] = df['Price'].apply(clean_price)

# Preview the raw summary next to the cleaned text and the parsed price.
print("Cleaning Complete. Check the new columns:")
df[['Summary', 'Cleaned_Review', 'Cleaned_Price']].head(10)


Cleaning Complete. Check the new columns:


Unnamed: 0,Summary,Cleaned_Review,Cleaned_Price
0,Great cooler.. excellent air flow and for this...,great cooler excellent air flow price amazing ...,3999
1,Best budget 2 fit cooler. Nice cooling,best budget 2 fit cooler nice cooling,3999
2,The quality is good but the power of air is de...,quality good power air decent,3999
3,Very bad product it's a only a fan,bad product fan,3999
4,Ok ok product,ok ok product,3999
5,The cooler is really fantastic and provides go...,cooler really fantastic provides good air flow...,3999
6,Very good product,good product,3999
7,Very nice,nice,3999
8,Very bad cooler,bad cooler,3999
9,Very good,good,3999


In [6]:
# Dimensions of `df` as (rows, columns), inspected after the derived columns were added.
df.shape

(189874, 9)

In [7]:
def get_sentiment(text, threshold=0.1):
    """
    Label a piece of text as 'Positive', 'Negative' or 'Neutral'.

    The label is derived from TextBlob's polarity score for `text`:
    strictly above `threshold` is 'Positive', strictly below `-threshold`
    is 'Negative', and anything inside the band (bounds included) is 'Neutral'.

    Parameters
    ----------
    text : object
        The text to score. Non-string values (such as a missing cell) and
        blank or whitespace-only strings are labelled 'Neutral' without
        being scored.
    threshold : float, default 0.1
        Half-width of the neutral band around a polarity of 0. The default
        reproduces the previously hard-coded cut-offs of +0.1 / -0.1.

    Returns
    -------
    str
        One of 'Positive', 'Negative', 'Neutral'.
    """
    # Nothing to score: treat missing or blank input as neutral.
    if not isinstance(text, str) or not text.strip():
        return 'Neutral'

    score = TextBlob(text).sentiment.polarity
    if score > threshold:
        return 'Positive'
    if score < -threshold:
        return 'Negative'
    return 'Neutral'

# Score the unstripped review text ('Review_Text') rather than 'Cleaned_Review'.
# The cleaning step removes NLTK stopwords, and that list contains negations
# ("not", "no") and intensifiers ("very") which TextBlob's polarity scorer uses
# to flip or scale a score. With them removed, "not good" is scored as "good".
# get_sentiment already labels missing / blank entries as 'Neutral'.
df['Predicted_Sentiment'] = df['Review_Text'].apply(get_sentiment)

# Let's compare the Star Rating vs Our Prediction
# If Rate is 5 but Sentiment is Negative, that's interesting!
df[['Rate', 'Predicted_Sentiment', 'Summary']].head(10)

Unnamed: 0,Rate,Predicted_Sentiment,Summary
0,5,Positive,Great cooler.. excellent air flow and for this...
1,5,Positive,Best budget 2 fit cooler. Nice cooling
2,3,Positive,The quality is good but the power of air is de...
3,1,Negative,Very bad product it's a only a fan
4,3,Positive,Ok ok product
5,5,Positive,The cooler is really fantastic and provides go...
6,5,Positive,Very good product
7,3,Positive,Very nice
8,1,Negative,Very bad cooler
9,4,Positive,Very good
