# Sentiment Analysis Workflow for Product Reviews

This notebook covers:
- Data Loading & Exploration
- Text Preprocessing
- Sentiment Analysis
- Exploratory Data Analysis (EDA) of Sentiments
- (Optional) Topic Modeling
- Insight Generation

In [None]:
# 1. Data Loading and Initial Exploration
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the review dataset, relative to the notebook's working directory.
DATA_PATH = 'Data/product_reviews_mock_data.csv'

# Read the reviews into the frame that every later cell works on.
df = pd.read_csv(DATA_PATH)

# Column names, dtypes and non-null counts first, then a preview of the first rows.
df.info()
df.head()

In [None]:
# Examine distribution of ratings.
# The frame and column are passed by keyword: seaborn 0.12+ interprets the first
# positional argument as `data`, so a bare `df['Rating']` is no longer read as x.
sns.countplot(data=df, x='Rating')
plt.title('Distribution of Ratings')
plt.show()

# Review text sample (seeded so the same rows are shown on every Restart & Run All)
df[['ReviewText', 'Rating']].sample(5, random_state=42)

## 2. Text Preprocessing
We'll clean the text, tokenize it, remove stopwords, and lemmatize the remaining tokens.

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora that the stopword list and the lemmatizer are built from.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Shared helpers read by preprocess(): a WordNet lemmatizer and the English
# stopwords held in a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalize one review into a space-joined string of lemmatized tokens.

    Steps: lowercase, drop every character that is not a-z or whitespace,
    split on whitespace, remove tokens found in ``stop_words``, and lemmatize
    what is left with ``lemmatizer``.

    Args:
        text: Raw review text. Non-string values are converted with ``str``.
            ``None`` and float NaN (an empty cell after ``pd.read_csv``) are
            treated as an empty review.

    Returns:
        str: The cleaned tokens joined by single spaces; ``''`` for a missing
        review or when no token survives.
    """
    # str(NaN) is the literal word 'nan', which would otherwise pass the regex
    # and show up as a bogus token in word counts, word clouds and topics.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)  # Remove punctuation & numbers
    tokens = [
        lemmatizer.lemmatize(w)
        for w in letters_only.split()
        if w not in stop_words
    ]
    return ' '.join(tokens)

# Clean every review and keep the result alongside the raw text.
cleaned_reviews = df['ReviewText'].apply(preprocess)
df['CleanedReview'] = cleaned_reviews

# Side-by-side preview of raw and cleaned text.
df[['ReviewText', 'CleanedReview']].head()

## 3. Sentiment Analysis
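- We'll use VADER (well suited to short, informal text such as product reviews)
- Categorize reviews as Positive, Negative, or Neutral based on the compound score (>= 0.05 is Positive, <= -0.05 is Negative, anything in between is Neutral)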

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
# Download the VADER lexicon, then build the analyzer instance that
# get_sentiment() reads as the module-level name `sia`.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

def get_sentiment(row):
    """Label one review as 'Positive', 'Negative' or 'Neutral' using VADER.

    The compound score is computed on the raw ``ReviewText`` whenever the row
    carries a non-empty string there. VADER's rules depend on negations,
    punctuation and capitalization, and ``preprocess`` removes all three
    (NLTK's English stopword list includes words such as "not" and "no"), so
    scoring ``CleanedReview`` turns "not good" into "good". ``CleanedReview``
    is used only as a fallback when the raw text is missing.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) with a
            ``CleanedReview`` entry and, normally, a ``ReviewText`` entry.

    Returns:
        str: 'Positive' when compound >= 0.05, 'Negative' when
        compound <= -0.05, otherwise 'Neutral'.
    """
    raw_text = row['ReviewText'] if 'ReviewText' in row else None
    if isinstance(raw_text, str) and raw_text.strip():
        text = raw_text
    else:
        # Missing or blank raw text: fall back to the cleaned column.
        text = row['CleanedReview']

    score = sia.polarity_scores(text)['compound']
    if score >= 0.05:
        return 'Positive'
    elif score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Score each row (axis=1 hands get_sentiment one full row at a time) and store the label.
sentiment_labels = df.apply(get_sentiment, axis=1)
df['Sentiment'] = sentiment_labels

# Preview raw text, cleaned text and the assigned label together.
df[['ReviewText', 'CleanedReview', 'Sentiment']].head()

## 4. Exploratory Data Analysis of Sentiments
- Distribution of sentiments
- Word clouds for positive/negative reviews
- Sentiment by product and over time

In [None]:
# Distribution of sentiment labels, in a fixed Positive / Neutral / Negative order.
# The frame and column are passed by keyword: seaborn 0.12+ interprets the first
# positional argument as `data`, so a bare `df['Sentiment']` is no longer read as x.
sns.countplot(data=df, x='Sentiment', order=['Positive','Neutral','Negative'])
plt.title('Sentiment Distribution')
plt.show()

# Sentiment by Product: grouped bars, products ordered by their review count
product_order = df['ProductID'].value_counts().index

plt.figure(figsize=(10,4))
sns.countplot(
    data=df,
    x='ProductID',
    hue='Sentiment',
    order=product_order,
)
plt.title('Sentiment by Product')
plt.show()

# Sentiment over time (only when the data actually has a ReviewDate column)
if 'ReviewDate' in df.columns:
    df['ReviewDate'] = pd.to_datetime(df['ReviewDate'])

    # Count reviews per calendar month and sentiment; fill_value=0 keeps the
    # counts as integers where a month has no review for some sentiment.
    # NOTE(review): pandas 2.2+ prefers the alias 'ME' over 'M' for month-end.
    monthly = pd.Grouper(key='ReviewDate', freq='M')
    sent_by_date = df.groupby([monthly, 'Sentiment']).size().unstack(fill_value=0)

    sent_by_date.plot(kind='line', figsize=(12,5), marker='o')
    plt.title('Sentiment Trend Over Time')
    plt.ylabel('Number of Reviews')
    plt.show()
else:
    # Without dates there is nothing to trend; say so instead of raising KeyError.
    print('No ReviewDate column found; skipping the sentiment trend plot.')

In [None]:
# Wordclouds
from wordcloud import WordCloud

# One word cloud per polarity, built from the cleaned text of the matching reviews.
for sentiment in ['Positive', 'Negative']:
    text = ' '.join(df[df['Sentiment']==sentiment]['CleanedReview'])

    # WordCloud.generate raises ValueError when it is given no words, which
    # happens if no review received this label (or all of them cleaned to '').
    if not text.strip():
        print(f'No {sentiment} review text available; skipping word cloud.')
        continue

    wc = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(8,4))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'{sentiment} Reviews WordCloud')
    plt.show()

## 5. Topic Modeling (Optional, on Negative Reviews)
Discover common complaints in the negative reviews using Latent Dirichlet Allocation (LDA).

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Bag-of-words over the negative reviews only; terms appearing in more than 95%
# of those reviews or in fewer than 2 of them are dropped by the vectorizer.
neg_reviews = df[df['Sentiment']=='Negative']['CleanedReview']
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = vectorizer.fit_transform(neg_reviews)

# Fit a 5-topic LDA model; the fixed random_state keeps the topics reproducible.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(dtm)

# Look the vocabulary up once instead of once per printed word.
feature_names = vectorizer.get_feature_names_out()

for idx, topic in enumerate(lda.components_):
    # argsort is ascending, so take the last 10 and reverse them to list the
    # highest-weighted word of the topic first.
    top_indices = topic.argsort()[-10:][::-1]
    print(f"Topic #{idx+1}")
    print([feature_names[i] for i in top_indices])
    print()

## 6. Insight Generation and Recommendations
- Summarize sentiment proportions
- List common positive/negative themes
- List main topics from LDA
- Actionable recommendations

In [None]:
# Example summary output: label shares, the ten most frequent cleaned words per
# polarity, and a short list of follow-up recommendations.
from collections import Counter

print('Sentiment breakdown:')
sentiment_share = df['Sentiment'].value_counts(normalize=True)
print(sentiment_share)

print('\nFrequent positive words:')
positive_mask = df['Sentiment'] == 'Positive'
pos_words = ' '.join(df.loc[positive_mask, 'CleanedReview']).split()
print(Counter(pos_words).most_common(10))

print('\nFrequent negative words:')
negative_mask = df['Sentiment'] == 'Negative'
neg_words = ' '.join(df.loc[negative_mask, 'CleanedReview']).split()
print(Counter(neg_words).most_common(10))

print('\nRecommendations:')
for recommendation in (
    '- Address major pain points (e.g., "broke easily", "poor quality", "customer service")',
    '- Highlight positive themes (e.g., "amazing features", "wonderful experience") in marketing',
    '- Review topics from LDA for product improvement focus',
):
    print(recommendation)