<a href="https://colab.research.google.com/github/Indranil0603/meesho-dice-challenge-2025/blob/master/Review_Enhancement_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Importing libraries

In [None]:
import pandas as pd
import numpy as np
import re
from textblob import TextBlob
from collections import defaultdict
from sentence_transformers import SentenceTransformer

# Loading the data

In [None]:
# Path of the review CSV inside the Drive mount point (/content/drive).
csv_file_path = '/content/drive/MyDrive/review-data-sets/flipkart-product-review.csv'

In [None]:
# Read the review CSV into a DataFrame.
df = pd.read_csv(csv_file_path)

In [None]:
# Show column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205052 entries, 0 to 205051
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   product_name   205052 non-null  object
 1   product_price  205052 non-null  object
 2   Rate           205052 non-null  object
 3   Review         180388 non-null  object
 4   Summary        205041 non-null  object
 5   Sentiment      205052 non-null  object
dtypes: object(6)
memory usage: 9.4+ MB


In [None]:
# Per-column summary statistics of the loaded frame.
df.describe()

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
count,205052,205052,205052,180388,205041,205052
unique,958,525,8,1324,92923,3
top,cello Pack of 18 Opalware Cello Dazzle Lush Fi...,1299,5,wonderful,good,positive
freq,6005,9150,118765,9016,17430,166581


In [None]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,positive
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,positive
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,the quality is good but the power of air is de...,positive
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,1,useless product,very bad product its a only a fan,negative
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,ok ok product,neutral


# Product feature maps

In [None]:
# Maps each product type to the ordered list of features that are rated for it.
# The 'general' entry is the fallback feature set for unrecognised product types.
PRODUCT_FEATURES = {
    # Electronics
    "air_cooler": ["cooling", "airflow", "noise", "water_capacity", "build_quality"],
    "smartphone": ["battery", "camera", "display", "performance", "build_quality"],
    # Vehicles
    "motorcycle": ["looks_design", "gear", "brake", "mileage", "build_quality"],
    # Electronics (continued; key order is kept as originally declared)
    "laptop": ["performance", "battery", "display", "keyboard", "build_quality"],
    "headphones": ["sound_quality", "comfort", "battery", "noise_cancellation", "build_quality"],
    # Fashion
    "shirt": ["fit", "material", "color_accuracy", "size", "stitching"],
    "jeans": ["fit", "material", "size", "comfort", "durability"],
    "shoes": ["fit", "comfort", "size", "material", "durability"],
    "saree": ["material", "color_accuracy", "design", "length", "border_quality"],
    # Home & kitchen
    "cookware": ["material", "durability", "heating_efficiency", "size", "ease_of_cleaning"],
    "plates_bowls": ["material", "size", "durability", "design", "ease_of_cleaning"],
    "furniture": ["build_quality", "material", "size", "comfort", "assembly"],
    # Beauty
    "skincare": ["effectiveness", "skin_compatibility", "texture", "fragrance", "packaging"],
    # Fallback
    "general": ["quality", "value", "design", "durability", "build_quality"],
}

In [None]:
# Keyword mapping for feature detection: feature name -> lowercase words/phrases
# whose presence in a review counts as a mention of that feature.
# Every feature listed in PRODUCT_FEATURES should have an entry here; a feature
# without one falls back to its own identifier as the only keyword, which for
# underscore names such as 'border_quality' never occurs in review text.
FEATURE_KEYWORDS = {
    'cooling': ['cool', 'cooling', 'cold', 'temperature', 'chill', 'freeze', 'ice'],
    'airflow': ['air flow', 'airflow', 'air', 'wind', 'breeze', 'circulation', 'throw'],
    'noise': ['noise', 'noisy', 'sound', 'quiet', 'silent', 'loud', 'disturbing'],
    'water_capacity': ['water', 'tank', 'capacity', 'refill', 'small tank', 'large tank', 'litres'],
    'battery': ['battery', 'charge', 'charging', 'backup', 'lasts', 'drain', 'power', 'mah'],
    'camera': ['camera', 'photo', 'picture', 'selfie', 'video', 'clarity', 'megapixel', 'lens'],
    'display': ['display', 'screen', 'clear', 'bright', 'resolution', 'color', 'touch'],
    'performance': ['fast', 'slow', 'speed', 'performance', 'lag', 'smooth', 'processor', 'ram'],
    'looks_design': ['look', 'looks', 'design', 'beautiful', 'style', 'appearance', 'attractive'],
    'gear': ['gear', 'shifting', 'smooth', 'transmission', 'change', 'gearbox'],
    'brake': ['brake', 'braking', 'stop', 'stopping', 'brake quality'],
    'mileage': ['mileage', 'fuel', 'petrol', 'efficiency', 'consumption', 'kmpl'],
    'sound_quality': ['sound', 'audio', 'bass', 'treble', 'clear', 'music', 'volume'],
    'comfort': ['comfort', 'comfortable', 'fit', 'soft', 'cushion', 'ergonomic'],
    'noise_cancellation': ['noise cancel', 'ambient', 'isolation', 'quiet', 'outside noise'],
    'fit': ['fit', 'fitting', 'loose', 'tight', 'perfect fit', 'size fit', 'body fit'],
    'material': ['material', 'fabric', 'cloth', 'cotton', 'silk', 'leather', 'plastic', 'quality material'],
    'color_accuracy': ['color', 'colour', 'shade', 'bright', 'dark', 'faded', 'exact color', 'as shown'],
    'size': ['size', 'small', 'large', 'medium', 'xl', 'xxl', 'wrong size', 'perfect size'],
    'stitching': ['stitch', 'stitching', 'seam', 'thread', 'cut', 'finishing'],
    'durability': ['durable', 'lasting', 'strong', 'break', 'broken', 'fragile', 'sturdy'],
    'heating_efficiency': ['heat', 'heating', 'cook', 'cooking', 'hot', 'temperature', 'even heating'],
    'ease_of_cleaning': ['clean', 'cleaning', 'wash', 'dishwasher', 'stain', 'easy to clean'],
    'build_quality': ['build', 'construction', 'quality', 'sturdy', 'solid', 'cheap build'],
    'effectiveness': ['effective', 'work', 'result', 'improvement', 'change', 'difference'],
    'skin_compatibility': ['skin', 'irritation', 'allergy', 'sensitive', 'suitable', 'reaction'],
    'texture': ['texture', 'smooth', 'rough', 'creamy', 'thick', 'thin', 'consistency'],
    'fragrance': ['smell', 'fragrance', 'scent', 'odor', 'perfume', 'aroma'],
    'quality': ['quality', 'good', 'bad', 'excellent', 'poor', 'great', 'awful'],
    'value': ['value', 'money', 'price', 'worth', 'expensive', 'cheap', 'budget'],
    'design': ['design', 'look', 'style', 'appearance', 'beautiful', 'attractive'],
    # Entries below cover features that PRODUCT_FEATURES references but that
    # previously had no keyword list.
    'keyboard': ['keyboard', 'keys', 'typing', 'trackpad', 'touchpad', 'backlit'],
    'length': ['length', 'long', 'short', 'metre', 'meter'],
    'border_quality': ['border', 'zari', 'lace'],
    'assembly': ['assembly', 'assemble', 'install', 'installation', 'setup'],
    'packaging': ['packaging', 'package', 'packing', 'packed', 'box', 'seal'],
}

In [None]:
# Ordered (product_type, keywords) rules; the first rule with a matching keyword wins.
_PRODUCT_TYPE_RULES = (
    # Electronics
    ('air_cooler', ('cooler', 'air cooler')),
    ('smartphone', ('phone', 'smartphone', 'mobile', 'galaxy', 'iphone', 'redmi')),
    ('laptop', ('laptop', 'notebook', 'macbook', 'dell', 'hp')),
    ('headphones', ('headphone', 'earphone', 'earbud', 'airpods')),
    # Vehicles
    ('motorcycle', ('bike', 'motorcycle', 'splendor', 'hero', 'bajaj', 'yamaha')),
    # Fashion
    ('shirt', ('shirt', 'tshirt', 't-shirt', 'polo')),
    ('jeans', ('jeans', 'denim', 'trouser')),
    ('shoes', ('shoe', 'sneaker', 'boot', 'sandal')),
    ('saree', ('saree', 'sari')),
    # Home & Kitchen
    ('cookware', ('cookware', 'pan', 'pot', 'kadai')),
    ('plates_bowls', ('plate', 'bowl', 'dish', 'opalware')),
    ('furniture', ('chair', 'table', 'sofa', 'furniture')),
    # Beauty
    ('skincare', ('cream', 'lotion', 'serum', 'skincare')),
)

# Each keyword must begin at a word boundary but may be followed by anything,
# so plurals and text glued on after the keyword ("TableFree") still match,
# while a keyword buried inside another word does not ("phone" in "headphone",
# "table" in "portable", "pan" in "japan").
_PRODUCT_TYPE_PATTERNS = tuple(
    (product_type, re.compile(r'\b(?:' + '|'.join(re.escape(kw) for kw in keywords) + r')'))
    for product_type, keywords in _PRODUCT_TYPE_RULES
)


def classify_product_type(product_name):
    """Classify a product name into one of the PRODUCT_FEATURES product types.

    The lower-cased name is tested against the keyword rules in declaration
    order and the type of the first rule that matches is returned.

    Args:
        product_name: Product title. Non-string values (e.g. NaN or None)
            are treated as unclassifiable.

    Returns:
        str: The matched product type, or 'general' when nothing matches.
    """
    if not isinstance(product_name, str):
        return 'general'

    name_lower = product_name.lower()
    for product_type, pattern in _PRODUCT_TYPE_PATTERNS:
        if pattern.search(name_lower):
            return product_type

    # NOTE(review): prefix matching still lets 'dish' match "dishwasher", so
    # such appliances are classified as 'plates_bowls' — confirm whether an
    # explicit exclusion is wanted.
    return 'general'

In [None]:
# Load the sentence-transformers model whose `encode` method produces the embeddings used below.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Labels for sentiment classification: index 0 is the positive anchor sentence,
# index 1 the negative one.
label_sentences = ["I am very satisfied", "I am very disappointed"]
label_embeddings = model.encode(label_sentences)
# Divide each anchor embedding by its own L2 norm (row-wise) so that a dot
# product with another unit-length vector is their cosine similarity.
label_norm = label_embeddings / np.linalg.norm(label_embeddings, axis=1, keepdims=True)

In [None]:
# Extract sentiment for each feature from precomputed review embeddings
def extract_feature_sentiments_from_reviews(reviews_text, review_embeddings, features,
                                            feature_keywords=None, label_vectors=None):
    """Average a sentiment score per feature over the reviews that mention it.

    A review mentions a feature when one of the feature's keywords occurs in
    the lower-cased review text starting at a word boundary. Keywords are
    prefixes: 'cool' matches "cooling", but 'ice' no longer matches "nice" or
    "price" and 'air' no longer matches "fair".

    The sentiment of one review is (pos - neg) / (|pos| + |neg| + 1e-8), where
    pos / neg are the cosine similarities of its embedding with the first /
    second label vector.

    Args:
        reviews_text: Sequence of review strings (other values are str()-coerced).
        review_embeddings: Array-like with one embedding row per review, in the
            same order as `reviews_text`.
        features: Iterable of feature names to score.
        feature_keywords: Optional mapping feature -> list of keywords.
            Defaults to the notebook-level FEATURE_KEYWORDS. A feature absent
            from the mapping is searched by its own name with underscores
            replaced by spaces.
        label_vectors: Optional array of two unit-length rows (positive,
            negative). Defaults to the notebook-level label_norm.

    Returns:
        dict: feature -> mean sentiment (float) for features mentioned in at
        least one review; unmentioned features are omitted.
    """
    # Defaults are resolved at call time so the globals are only required
    # when the caller does not supply its own values.
    if feature_keywords is None:
        feature_keywords = FEATURE_KEYWORDS
    if label_vectors is None:
        label_vectors = label_norm

    feature_sentiments = {}
    if len(reviews_text) == 0:
        return feature_sentiments

    reviews_lower = [str(review).lower() for review in reviews_text]

    # The per-review sentiment does not depend on the feature, so compute it
    # once for all reviews instead of once per feature.
    embeddings = np.asarray(review_embeddings)
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # An all-zero embedding would divide by zero and poison the mean with NaN;
    # leaving it unscaled gives similarity 0 and therefore a neutral score.
    unit_embeddings = embeddings / np.where(norms == 0, 1.0, norms)

    # Cosine similarity with labels -> (n_reviews, 2)
    cosine_sim = np.dot(unit_embeddings, np.asarray(label_vectors).T)
    pos = cosine_sim[:, 0]
    neg = cosine_sim[:, 1]
    sentiments = (pos - neg) / (np.abs(pos) + np.abs(neg) + 1e-8)

    for feature in features:
        keywords = [kw.lower() for kw in feature_keywords.get(feature, [feature.replace('_', ' ')]) if kw]
        if not keywords:
            # An empty alternation would match every review.
            continue

        pattern = re.compile(r'\b(?:' + '|'.join(re.escape(kw) for kw in keywords) + r')')
        feature_indices = [i for i, text in enumerate(reviews_lower) if pattern.search(text)]

        if feature_indices:
            feature_sentiments[feature] = float(np.mean(sentiments[feature_indices]))

    return feature_sentiments

In [None]:
# Convert sentiment score (-1 to 1) to rating (1 to 5)
def convert_sentiment_to_rating(sentiment_score):
    """Map a sentiment score onto a whole-number rating between 1 and 5.

    The score is scaled linearly (0 -> 3, +1 -> 5, -1 -> 1), clamped to
    [1, 5] and rounded with the built-in `round`.

    Args:
        sentiment_score: Numeric sentiment, nominally in [-1, 1]. Values
            outside that range are clamped. NaN is treated as "no signal".

    Returns:
        int: Rating in 1..5; 3 for a NaN score.
    """
    # NaN compares False against everything, so min(5.0, nan) yields 5.0 and an
    # undefined score would otherwise be reported as the best rating.
    if sentiment_score != sentiment_score:
        return 3

    rating = 3 + (sentiment_score * 2)
    return int(round(max(1.0, min(5.0, rating))))

In [None]:
# data cleaning: first discard rows that repeat the same (product, review,
# summary) triple, then rows missing any field the analysis needs
df = (
    df
    .drop_duplicates(subset=['product_name', 'Review', 'Summary'])
    .dropna(subset=['product_name', 'Review', 'Summary', 'Rate'])
)

# Normalize text columns
def clean_text(text):
    """Return `text` as a string with whitespace runs collapsed to one space and the ends trimmed."""
    collapsed = re.sub(r'\s+', ' ', str(text))
    return collapsed.strip()

# Whitespace-normalise every free-text column
for text_column in ('product_name', 'Review', 'Summary'):
    df[text_column] = df[text_column].apply(clean_text)

# Rate may have been read as strings: coerce to numbers (unparseable values
# become NaN) and discard the rows where the conversion failed
df = (
    df
    .assign(Rate=pd.to_numeric(df['Rate'], errors='coerce'))
    .dropna(subset=['Rate'])
)


In [None]:
from tqdm import tqdm

In [None]:
# Per-product pipeline: classify the product, embed its reviews, score each
# relevant feature and collect one flat row (results) plus one detailed
# record (detailed_analysis) per product.
results = []
detailed_analysis = []

print(f"Analyzing {len(df)} reviews across {df['product_name'].nunique()} products")

for product_name, product_reviews in tqdm(df.groupby('product_name')):

    # Classify product type; unknown types fall back to the 'general' feature set
    product_type = classify_product_type(product_name)
    relevant_features = PRODUCT_FEATURES.get(product_type, PRODUCT_FEATURES['general'])

    # One text per review: the Review and Summary columns joined by a space
    all_reviews = (product_reviews['Review'] + ' ' + product_reviews['Summary']).tolist()

    # Convert reviews to embeddings
    review_embeddings = model.encode(all_reviews)

    # Mean sentiment per feature, only for features mentioned in some review
    feature_sentiments = extract_feature_sentiments_from_reviews(all_reviews, review_embeddings, relevant_features)

    # Product-level values do not depend on the feature, so compute them once
    total_reviews = len(product_reviews)
    avg_rating = round(product_reviews['Rate'].mean(), 1)

    # Convert sentiments to ratings (1-5 scale)
    feature_ratings = {}
    feature_details = {}

    for feature in relevant_features:
        if feature in feature_sentiments:
            # Feature mentioned in reviews - use sentiment-based rating
            sentiment_score = feature_sentiments[feature]
            rating = round(convert_sentiment_to_rating(sentiment_score), 1)
            feature_ratings[feature] = rating
            feature_details[feature] = {
                'rating': rating,
                'sentiment_score': round(sentiment_score, 3),
                'mentioned': True
            }
        else:
            # Feature not mentioned - use average overall rating as baseline
            feature_ratings[feature] = avg_rating
            feature_details[feature] = {
                'rating': avg_rating,
                'sentiment_score': 0.0,
                'mentioned': False
            }

    # Store results: one flat row with a '<feature>_rating' column per relevant feature
    results.append({
        'product_name': product_name,
        'product_type': product_type,
        'total_reviews': total_reviews,
        'avg_overall_rating': avg_rating,
        **{f'{feature}_rating': feature_ratings[feature] for feature in relevant_features}
    })

    detailed_analysis.append({
        'product_name': product_name,
        'product_type': product_type,
        'total_reviews': total_reviews,
        'relevant_features': relevant_features,
        'feature_details': feature_details,
        'sample_reviews': all_reviews[:2]  # Store sample reviews
    })

# detailed_analysis is already the finished list; only the flat rows need converting
results_df = pd.DataFrame(results)


Analyzing 153266 reviews across 841 products


100%|██████████| 841/841 [03:33<00:00,  3.94it/s]


In [None]:
# Write the per-product ratings table to CSV without the index column.
results_df.to_csv('product_feature_ratings.csv', index=False)

In [None]:
def formated_print(detailed_analysis, start=95, stop=100):
    """Print a readable summary for a slice of the per-product analysis records.

    Args:
        detailed_analysis: Sequence of dicts with the keys 'product_name',
            'product_type', 'total_reviews', 'relevant_features' (list of str)
            and 'feature_details' (feature -> dict containing 'rating').
        start: Index of the first record to print (default 95).
        stop: Index one past the last record to print (default 100).

    Returns:
        None. The summary is written to standard output.
    """
    for analysis in detailed_analysis[start:stop]:
        print(f"  Product : {analysis['product_name']}")
        print(f"  Type: {analysis['product_type']}")
        print(f"  Total Reviews: {analysis['total_reviews']}")
        print(f"  Relevant Features: {', '.join(analysis['relevant_features'])}")
        print("  Feature Ratings:")
        for feature, details in analysis['feature_details'].items():
            print(f"    {feature}: {details['rating']}")
        # Blank lines separating consecutive products
        print("\n\n")


In [None]:
# Print the formatted summary for a sample of the analysed products.
formated_print(detailed_analysis)

  Product : BOSCH SMS66GW01I Free Standing 13 Place Settings Intensive Kadhai Cleaning| No Pre-rinse Required Dishwasher
  Type: plates_bowls
  Total Reviews: 120
  Relevant Features: material, size, durability, design, ease_of_cleaning
  Feature Ratings:
    material: 3
    size: 3
    durability: 3
    design: 3
    ease_of_cleaning: 3



  Product : BROWNIE Engineered Wood Computer DeskStraight Finish Color Brown Preassembled
  Type: general
  Total Reviews: 99
  Relevant Features: quality, value, design, durability, build_quality
  Feature Ratings:
    quality: 3
    value: 3
    design: 3
    durability: 3
    build_quality: 3



  Product : BROWNIE Engineered Wood Study TableFree Standing Finish Color Brown Preassembled
  Type: furniture
  Total Reviews: 98
  Relevant Features: build_quality, material, size, comfort, assembly
  Feature Ratings:
    build_quality: 3
    material: 3
    size: 4.1
    comfort: 3
    assembly: 4.1



  Product : Baby Boys Baby Girls Casual Tshirt Pan