# GENERALIZED PRODUCT REVIEW SUMMARIZER

### import libraries

In [None]:
import pandas as pd
import re
import textwrap
import random
from collections import defaultdict
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

# STEP 1: LOAD & CLEAN DATASET

In [None]:

# Read the raw review dataset from disk
df = pd.read_csv("balanced_reviews_dataset.csv")

# Step 1.1: Restrict the frame to the columns used downstream
columns_to_keep = [
    "name",
    "brand",
    "categories",
    "primaryCategories",
    "rating",
    "review",
    "sentiment",
]
df_filtered = df[columns_to_keep].copy()

# Step 1.2: Where primaryCategories is missing, fall back to categories
category_fallback = df_filtered["categories"]
df_filtered["primaryCategories"] = df_filtered["primaryCategories"].fillna(category_fallback)

# Step 1.3: Discard rows that lack any essential field
essential_fields = ["name", "review", "rating", "primaryCategories"]
df_filtered.dropna(subset=essential_fields, inplace=True)

# Step 1.4: Reduce a comma-separated category list to its first entry
df_filtered["primaryCategories"] = df_filtered["primaryCategories"].apply(
    lambda raw: raw.partition(",")[0].strip()
)

# Show the first rows of the cleaned frame
print(df_filtered.head())

# STEP 2: RANK PRODUCTS BY CATEGORY

In [None]:
# Aggregate per (category, product): mean rating and number of reviews
grouped = (
    df_filtered
    .groupby(["primaryCategories", "name"])
    .agg(avg_rating=("rating", "mean"), review_count=("review", "count"))
    .reset_index()
)

# Products with fewer than 5 reviews are too noisy to rank
has_enough_reviews = grouped["review_count"] >= 5
filtered = grouped[has_enough_reviews]

# Both rankings sort on the same columns; only the rating direction differs
ranking_columns = ["primaryCategories", "avg_rating", "review_count"]

# Top 3 per category: highest average rating first, ties broken by more reviews
best_first = filtered.sort_values(by=ranking_columns, ascending=[True, False, False])
top3_by_category = best_first.groupby("primaryCategories").head(3)

# Worst per category: lowest average rating first, ties broken by more reviews
worst_first = filtered.sort_values(by=ranking_columns, ascending=[True, True, False])
worst_by_category = worst_first.groupby("primaryCategories").head(1)

# STEP 3: SUMMARIZE REVIEWS BY SENTIMENT

In [None]:
from transformers import pipeline

# Load the t5-small checkpoint as a text2text-generation pipeline.
# NOTE: despite the earlier "FLAN-T5" wording, the model actually loaded is plain "t5-small".
# device=-1 runs the pipeline on CPU.
summarizer = pipeline("text2text-generation", model="t5-small", device=-1)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# from transformers import pipeline

# # Load BART summarization model (use CPU)
# summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=-1)

`summarize_reviews`:

Generates a text summary of the reviews for a specific product, category, and sentiment (e.g. "positive" or "negative").

In [None]:
def summarize_reviews(df, product_name, category, sentiment, max_reviews=20):
    """Summarize the reviews of one product for a given sentiment.

    Args:
        df: DataFrame with at least the columns "name", "primaryCategories",
            "sentiment" and "review".
        product_name: Exact value to match in the "name" column.
        category: Exact value to match in the "primaryCategories" column.
        sentiment: Exact value to match in the "sentiment" column
            (for example "positive" or "negative").
        max_reviews: Upper bound on how many matching reviews, taken in
            DataFrame order, are fed to the model.

    Returns:
        The generated summary text, "No reviews available." when no usable
        review matches, or "(Summarization failed: ...)" when the model call
        raises.
    """
    # Select the reviews that match the given product, category, and sentiment
    matches = (
        (df["name"] == product_name)
        & (df["primaryCategories"] == category)
        & (df["sentiment"] == sentiment)
    )

    # Missing reviews are dropped and the rest coerced to str so that the
    # join below cannot fail on NaN or numeric cells.
    reviews = df.loc[matches, "review"].dropna().astype(str).tolist()[:max_reviews]

    # If no reviews are found, return a default message
    if not reviews:
        return "No reviews available."

    # Combine the selected reviews into one large text block
    text_block = " ".join(reviews)

    try:
        # truncation=True clips the tokenized input to the model's maximum
        # sequence length; without it, twenty concatenated reviews can exceed
        # that limit and the tokenizer warns about possible indexing errors.
        outputs = summarizer(
            text_block,
            max_length=60,
            min_length=20,
            do_sample=False,
            truncation=True,
        )
        return outputs[0]['generated_text']
    except Exception as e:
        # Best effort by design: one failing product must not abort the whole
        # run, so the error is reported in place of the summary.
        return f"(Summarization failed: {str(e)})"

## Generating summaries of customer reviews for both:


###  Top 3 products per category

### Worst-rated product per category

In [None]:
def _collect_feedback(ranked_products, reviews_df):
    """Build one feedback record per row of *ranked_products*.

    Args:
        ranked_products: DataFrame with "primaryCategories" and "name" columns;
            each row identifies one product to summarize.
        reviews_df: DataFrame of reviews handed to summarize_reviews.

    Returns:
        A list of dicts with the keys "category", "product",
        "positive_summary" and "negative_summary", in row order.
    """
    product_keys = zip(ranked_products["primaryCategories"], ranked_products["name"])
    return [
        {
            "category": category,
            "product": name,
            # Positive reviews are summarized before negative ones for each product
            "positive_summary": summarize_reviews(reviews_df, name, category, "positive"),
            "negative_summary": summarize_reviews(reviews_df, name, category, "negative"),
        }
        for category, name in product_keys
    ]


# Summaries for the top 3 products of each category
top3_feedback = _collect_feedback(top3_by_category, df_filtered)

# Summaries for the worst-rated product of each category
worst_feedback = _collect_feedback(worst_by_category, df_filtered)

Token indices sequence length is longer than the specified maximum sequence length for this model (1626 > 512). Running this sequence through the model will result in indexing errors


### examples for testing:

In [None]:
# Number of top-3 product summaries produced across all categories
print(len(top3_feedback))

# Peek at the first entry, or a placeholder when nothing was summarized
print("No data" if not top3_feedback else top3_feedback[0])

28
{'category': 'Amazon Devices & Accessories', 'product': 'Amazon 5W USB Official OEM Charger and Power Adapter for Fire Tablets and Kindle eReaders,,,\r\nAmazon 5W USB Official OEM Charger and Power Adapter for Fire Tablets and Kindle eReaders,,,', 'positive_summary': '- works It worked perfectly! - works It worked perfectly! f reat f reat f reat f reat f reat f reat f reat f reat f', 'negative_summary': "a 5W charger instead of a 5V 1A charger is a distinction without a difference . I will use my apple USB plug instead, for free! if you actually paid 20 for this adapter, you're out of your mind . I will"}


In [None]:
# Number of worst-rated product summaries produced across all categories
print(len(worst_feedback))

# Peek at the first entry, or a placeholder when nothing was summarized
print("No data" if not worst_feedback else worst_feedback[0])

14
{'category': 'Amazon Devices & Accessories', 'product': 'All-New Kindle E-reader - Black, 6 Glare-Free Touchscreen Display, Wi-Fi -  Includes Special Offers,,', 'positive_summary': '. Good purchase. Works good. Works good but, you can purchase a charger with same or better specs. on Amazon MUCH cheaper. I purchased a 5 port 30 watt charger for about 1/2 of the cost of this.', 'negative_summary': '. This is honestly a little expensive. And I can find other ways to charge my devices. this is way over priced this is way over priced Should have been included with the e-reader. Way too expensive I ordered the Amazon 5W USB Official OEM Charger and Power'}


In [None]:
import random

# Show one randomly chosen summary from each list. random.choice raises
# IndexError on an empty sequence, so an empty list falls back to the same
# "No data" placeholder the inspection cells above use.
print(" Sample Top Product Review Summary:")
print(random.choice(top3_feedback) if top3_feedback else "No data")

print("\n Sample Worst Product Review Summary:")
print(random.choice(worst_feedback) if worst_feedback else "No data")

 Sample Top Product Review Summary:
{'category': 'Walmart for Business', 'product': 'Kindle Voyage E-reader, 6 High-Resolution Display (300 ppi) with Adaptive Built-in Light, PagePress Sensors, Wi-Fi - Includes Special Offers,', 'positive_summary': '. I love the Kindle Voyage 6 and the Kindle Voyage 6 . I bought this as a gift for my wife. She loves it! I love it! I love it! I love it! I love it! I love it! I love it', 'negative_summary': 'worked. was great.. Not a happy customer. Not a happy customer. I realize it was my fault but thought they should do something more to minimize the loss and keep my business.'}

 Sample Worst Product Review Summary:
{'category': 'Tablets', 'product': 'Amazon 5W USB Official OEM Charger and Power Adapter for Fire Tablets and Kindle eReaders,,,\r\nAmazon 5W USB Official OEM Charger and Power Adapter for Fire Tablets and Kindle eReaders,,,', 'positive_summary': 'and so fast. and so fast. and so fast. and hold a charge for a very long time. and. a sudden

In [None]:
# Only keep top products with at least one usable summary (positive or negative)
top3_clean = [
    item for item in top3_feedback
    if item["positive_summary"] != "No reviews available." or item["negative_summary"] != "No reviews available."
]

# Keep a worst-rated product only when it has a negative summary to report.
# The former `item["category"] == item["category"]` clause compared a value
# with itself, so it never filtered anything and has been removed.
worst_clean = [
    item for item in worst_feedback
    if item["negative_summary"] != "No reviews available."
]

# STEP 4: CLEAN SUMMARIES

### Filtering phase

Keep only the products that have at least one meaningful summary.

This helps avoid generating articles with empty or irrelevant content.


In [None]:
# Only keep products with at least one usable summary

# Filter the top 3 product summaries
top3_clean = [
    item for item in top3_feedback
    # Keep if either the positive OR negative summary exists (not both missing)
    if item["positive_summary"] != "No reviews available." or item["negative_summary"] != "No reviews available."
]

# Filter the worst product summaries
worst_clean = [
    item for item in worst_feedback
    # Keep only if there's a valid negative summary. The former
    # `item["category"] == item["category"]` clause compared a value with
    # itself, so it never filtered anything and has been removed.
    if item["negative_summary"] != "No reviews available."
]

### Clean up the summaries and improve their readability

In [None]:
import re

# Candidate "sentences": a letter, then at least 10 characters that are not
# sentence terminators, then a full stop. Compiled once instead of per call.
_REPEATED_SENTENCE_RE = re.compile(r'([A-Z][^.?!]{10,}?\.)', re.IGNORECASE)


def clean_summary(text):
    """Tidy a generated summary for display.

    Removes repeated sentences, collapses runs of whitespace and capitalizes
    the first character.

    Args:
        text: The raw summary. Anything that is not a non-empty str yields "".

    Returns:
        The cleaned summary, or "" for empty or non-string input.
    """
    if not text or not isinstance(text, str):
        return ""

    # Remove exact sentence repetitions, keeping the first occurrence so the
    # remaining text stays in its original order. dict.fromkeys de-duplicates
    # the candidates while preserving the order in which they were found.
    for phrase in dict.fromkeys(_REPEATED_SENTENCE_RE.findall(text)):
        if text.count(phrase) > 1:
            keep_until = text.find(phrase) + len(phrase)
            text = text[:keep_until] + text[keep_until:].replace(phrase, "")

    # Normalize spaces (also closes the gaps left by removed repetitions)
    text = re.sub(r'\s+', ' ', text).strip()

    # Capitalize first letter
    return text[0].upper() + text[1:] if text else text

# STEP 5: GENERATE ARTICLE PER CATEGORY

In [None]:
# import textwrap
from collections import defaultdict

# wrapper = textwrap.TextWrapper(width=100)

# Group the top-product summaries by category
top3_by_cat = defaultdict(list)
for item in top3_feedback:
    top3_by_cat[item["category"]].append(item)

# Map each category to its worst-product summary. This is built once, outside
# the loop: when it sat inside the loop body it was rebuilt on every iteration
# and was never defined at all if top3_feedback was empty.
worst_by_cat = {item["category"]: item for item in worst_feedback}

# # Clean worsts: ensure product belongs to correct category
# worst_by_cat_clean = {}
# for cat, prod in worst_by_cat.items():
#     if prod["category"] == cat:
#         worst_by_cat_clean[cat] = prod

# # Generate articles
# final_articles = {}

# for category in top3_by_cat:
#     top_products = top3_by_cat[category]
#     worst_product = worst_by_cat_clean.get(category)

#     if not worst_product or len(top_products) < 3:
#         continue

#     # Build structured input for generation
#     prompt = f"In the category '{category}', customer reviews reveal the best and worst Amazon products.\n\n"
#     prompt += "Here are the top 3 recommended products:\n\n"

#     for i, product in enumerate(top_products, 1):
#         if product["positive_summary"] == "No reviews available." and product["negative_summary"] == "No reviews available.":
#            continue

#         prompt += f"{i}. **{product['product']}**\n"
#         prompt += f"Customers appreciated: {product['positive_summary']}\n"
#         prompt += f"Common complaints: {product['negative_summary']}\n\n"

#     if worst_product["positive_summary"] != "No reviews available." or worst_product["negative_summary"] != "No reviews available.":
#        prompt += f"The worst-rated product in this category was **{worst_product['product']}**.\n"
#        prompt += f"While a few users noted: {worst_product['positive_summary']}\n"
#        prompt += f"The most common complaints were: {worst_product['negative_summary']}\n\n"

#     prompt += "Now write a full blog-style article that introduces the category, highlights the top 3 products with their differences, includes top complaints, and ends with a warning about the worst product. Keep the tone friendly and informative."

#     # Generate article using BART
#     try:
#         article = summarizer(prompt, max_length=500, min_length=250, do_sample=False)[0]['generated_text']
#     except Exception as e:
#         article = f"(Summarization failed: {e})"

#     final_articles[category] = article

# # Display nicely formatted articles
# for category, article in final_articles.items():
#     print(f"\n Article for Category: {category}\n")
#     print(wrapper.fill(article))
#     print("\n" + "="*120 + "\n")

In [None]:
import textwrap
from collections import defaultdict

# Only define this if not already done earlier
#top3_by_cat = defaultdict(list)
#for item in top3_feedback:
    # top3_by_cat[item["category"]].append(item)

# Assemble one plain-text article per category from the summarized feedback,
# then print every article wrapped to 100 columns.
wrapper = textwrap.TextWrapper(width=100)
final_articles = {}


def _is_usable_summary(summary):
    """Return True if *summary* is real text: not empty, a placeholder, or an error message."""
    lowered = summary.lower()
    return (
        bool(summary)
        and "no reviews available" not in lowered
        and "summarization failed" not in lowered
    )


for category in top3_by_cat:
    top_products = top3_by_cat[category][:3]
    worst_product = worst_by_cat.get(category)

    # An article needs three top products and a worst product to compare against
    if not worst_product or len(top_products) < 3:
        continue

    parts = [
        f" **Article for Category: {category}**\n\n",
        f"The {category} category on Amazon features a range of devices suited for entertainment, productivity, and more. Based on customer feedback, here are the top recommendations — and one to avoid.\n\n",
    ]

    seen_names = set()
    count = 1
    for product in top_products:
        name = product["product"].strip()
        # The same product can be listed more than once; mention it only once
        if name in seen_names:
            continue
        seen_names.add(name)

        # clean_summary already capitalizes the first character
        pos = clean_summary(product["positive_summary"].strip())
        neg = clean_summary(product["negative_summary"].strip())

        if "summarization failed" in pos.lower() and "summarization failed" in neg.lower():
            continue

        parts.append(f"**{count}. {name}**\n")
        # Empty summaries are skipped too; indexing their first character
        # used to raise IndexError.
        if _is_usable_summary(pos):
            parts.append(f"Customers appreciated that {pos}.\n")
        if _is_usable_summary(neg):
            parts.append(f"However, some customers mentioned that {neg}.\n")
        # Always separate entries with a blank line, even when there is no negative summary
        parts.append("\n")

        count += 1

    wname = worst_product["product"].strip()
    wneg = clean_summary(worst_product["negative_summary"].strip())
    if _is_usable_summary(wneg):
        parts.append(f"🚫 **Worst Product: {wname}**\n")
        parts.append(f"While it might be budget-friendly or popular, many users warned that {wneg}.\n")

    final_articles[category] = "".join(parts)

# ✅ Print articles
for category, article in final_articles.items():
    print("\n" + "=" * 120 + "\n")
    # Wrap line by line: TextWrapper.fill on the whole article would turn every
    # newline into a space and merge headings and paragraphs into one block.
    print("\n".join(wrapper.fill(line) for line in article.splitlines()))



 **Article for Category: Electronics**  The Electronics category on Amazon features a range of
devices suited for entertainment, productivity, and more. Based on customer feedback, here are the
top recommendations — and one to avoid.  **1. Amazon - Echo Plus w/ Built-In Hub - Silver**
Customers appreciated that The Echo Plus. Great product my wife and I love it makes home automation
simple Easy to use no Complains at all. Easy to use great sound , easy to use, wife love it, connect
with wifi and all my phones. Great product my wife and I love it makes. However, some customers
mentioned that It and really had no use for it HUB is almost useless. Ended up using spare Philips
HUB. The HUB built in would only "discover" 2 out of 12 bulbs. 3 days of trying. HUE bridge 12 out
of 12 first try. Returned.  **2. Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes Special Offers,
Black** Customers appreciated that !! the kindle is super light weight, easy on the eyes, and allows
you to define words 

# STEP 6: SAVE SUMMARIZATION MODEL LOCALLY

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import os
import shutil
from google.colab import files

# Load the t5-small seq2seq checkpoint and its tokenizer
# (the same checkpoint name the summarization pipeline is built from)
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
tokenizer = AutoTokenizer.from_pretrained("t5-small")
# Create target folder under 'models/'; exist_ok=True tolerates a folder that already exists
save_path = "models/t5_small"
os.makedirs(save_path, exist_ok=True)

# Save model + tokenizer into save_path.
# The tokenizer call stays last: its return value is what the cell displays.
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)


('models/t5_small/tokenizer_config.json',
 'models/t5_small/special_tokens_map.json',
 'models/t5_small/spiece.model',
 'models/t5_small/added_tokens.json',
 'models/t5_small/tokenizer.json')

In [None]:
# Pack the contents of models/t5_small into a zip archive named t5_small.zip
shutil.make_archive(base_name="t5_small", format="zip", root_dir="models/t5_small")

'/content/t5_small.zip'

In [None]:
# Download the zip archive created above through the google.colab `files` helper
# NOTE(review): this only works inside a Colab session — confirm before running elsewhere
files.download("t5_small.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>