In [10]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer


In [11]:
# Read the merged, cleaned review data. Malformed rows are skipped by the
# parser (on_bad_lines="skip"), so the row count below may be smaller than
# the number of lines in the file.
# Only the product identifiers, the category and the review text are kept,
# and rows without any review text are discarded.
merged = (
    pd.read_csv("merged_clean.csv", engine="python", on_bad_lines="skip")
    .loc[:, ["product_id", "product_name_product", "secondary_category", "review_text"]]
    .dropna(subset=["review_text"])
)

print("Reviews used for sentiment:", len(merged))


Reviews used for sentiment: 1092967


In [12]:
# Create one analyzer instance that the scoring cell reuses for every review.
try:
    sid = SentimentIntensityAnalyzer()
except LookupError:
    # NLTK raises LookupError when the "vader_lexicon" resource is not
    # installed. Fetch it once so the notebook also runs top to bottom on a
    # fresh environment, then retry.
    import nltk

    nltk.download("vader_lexicon")
    sid = SentimentIntensityAnalyzer()


In [13]:
# Score every review with VADER and keep only the "compound" value.
# str() guards against non-string entries in the review column.
merged["sentiment_score"] = merged["review_text"].map(
    lambda text: sid.polarity_scores(str(text))["compound"]
)


In [16]:
# Label sentiment based on score
def label_sentiment(score, threshold=0.05):
    """Map a VADER compound score to a coarse sentiment label.

    The cut-offs follow the convention documented by the VADER authors,
    which is inclusive at the boundaries: a score of exactly ``threshold``
    counts as positive and a score of exactly ``-threshold`` as negative.

    Parameters
    ----------
    score : float
        Compound sentiment score.
    threshold : float, default 0.05
        Symmetric cut-off; scores strictly between ``-threshold`` and
        ``threshold`` are labelled neutral.

    Returns
    -------
    str
        ``"Positive"``, ``"Negative"`` or ``"Neutral"``.
    """
    if score >= threshold:
        return "Positive"
    if score <= -threshold:
        return "Negative"
    # Also reached for NaN, because every comparison with NaN is False.
    return "Neutral"

# Turn each numeric score into its Positive / Negative / Neutral label.
merged["sentiment_label"] = merged["sentiment_score"].map(label_sentiment)


In [17]:
# Summarise sentiment per secondary category: mean and median score, the
# share of reviews labelled "Positive", and the number of scored reviews.
# The mean is rounded to 3 decimals and the positive share is expressed as a
# percentage with 1 decimal; the median is left unrounded.
category_sentiment = (
    merged
    .groupby("secondary_category")
    .agg(
        avg_sentiment=("sentiment_score", "mean"),
        median_sentiment=("sentiment_score", "median"),
        positive_share=("sentiment_label", lambda labels: labels.eq("Positive").mean()),
        review_count=("sentiment_score", "count"),
    )
    .reset_index()
    .assign(
        avg_sentiment=lambda frame: frame["avg_sentiment"].round(3),
        positive_share=lambda frame: (frame["positive_share"] * 100).round(1),
    )
)

# Show the ten categories with the most reviews.
category_sentiment.sort_values("review_count", ascending=False).head(10)


Unnamed: 0,secondary_category,avg_sentiment,median_sentiment,positive_share,review_count
6,Moisturizers,0.693,0.8619,90.3,297114
10,Treatments,0.652,0.8392,87.9,221690
0,Cleansers,0.679,0.8516,89.7,200300
5,Mini Size,0.67,0.8481,89.2,85397
1,Eye Care,0.663,0.8439,88.4,74919
4,Masks,0.684,0.8585,89.6,70433
3,Lip Balms & Treatments,0.663,0.8271,88.9,61549
9,Sunscreen,0.67,0.8631,88.7,41100
11,Value & Gift Sets,0.698,0.886,89.3,12089
7,Self Tanners,0.666,0.8738,87.7,11926
