# Retail Semantic Analysis Demo (No Kaggle Required)
This notebook demonstrates the complete pipeline for retail semantic analysis using sample data that doesn't require external downloads.

In [1]:
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Import our custom modules
from data_loader import RetailDataLoader
from preprocessor import RetailTextPreprocessor
from sentiment_analyzer import RetailSentimentAnalyzer
from topic_modeling import RetailTopicModeler
from visualizations import RetailVisualizationGenerator

# NLTK setup (one-time, per machine): the text preprocessor relies on NLTK data
# packages. If a later cell raises a LookupError naming a missing resource,
# download it once before re-running the notebook, for example:
#
#   import nltk
#   for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4',
#                    'averaged_perceptron_tagger_eng'):
#       nltk.download(resource)
#
# Do not hard-code a user-specific download_dir / NLTK_DATA path in the
# notebook; it will not exist on other machines.

# Confirm the imports above succeeded and show where relative paths
# (e.g. '../src', '../figures') resolve from.
print(
    "All modules imported successfully!",
    f"Working directory: {os.getcwd()}",
    sep="\n",
)

All modules imported successfully!
Working directory: /Users/mayankdw/retail_semantic_analysis/notebooks


## 1. Data Loading and Preparation (Using Sample Data)

In [2]:
# Load the loader's built-in sample dataset and print a quick profile of it.
# `loader` is reused later in the notebook to persist results.
loader = RetailDataLoader()

# Create a more realistic sample dataset for demonstration
print("Creating sample dataset...")
df = loader.create_sample_dataset(size=2000)

print(f"Dataset loaded with {len(df)} reviews")
print("\nDataset info:")
# DataFrame.info() writes its report to stdout and returns None, so it must be
# called directly; wrapping it in print() appended a stray "None" line.
df.info()
print("\nFirst few rows:")
display(df.head())

# Per-column frequency tables as plain dicts for a compact one-line summary.
print("\nDataset statistics:")
print(f"Categories: {df['product_category'].value_counts().to_dict()}")
print(f"Rating distribution: {df['rating'].value_counts().sort_index().to_dict()}")
print(f"Sentiment distribution: {df['sentiment'].value_counts().to_dict()}")

Creating sample dataset...
Dataset loaded with 2000 reviews

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   review_text       2000 non-null   object
 1   sentiment         2000 non-null   object
 2   rating            2000 non-null   int64 
 3   product_category  2000 non-null   object
dtypes: int64(1), object(3)
memory usage: 62.6+ KB
None

First few rows:


Unnamed: 0,review_text,sentiment,rating,product_category
0,Outstanding customer service and product quality.,positive,4,Home
1,Would not recommend. Poor customer service.,negative,1,Electronics
2,"Perfect item, exactly as described. Five stars!",positive,4,Clothing
3,Outstanding customer service and product quality.,positive,5,Home
4,"Perfect item, exactly as described. Five stars!",positive,5,Clothing



Dataset statistics:
Categories: {'Clothing': 538, 'Electronics': 495, 'Books': 486, 'Home': 481}
Rating distribution: {1: 407, 2: 407, 4: 584, 5: 602}
Sentiment distribution: {'positive': 1186, 'negative': 814}


## 2. Enhanced Sample Data Creation
Let's create more realistic and diverse sample data for a better demonstration.

In [3]:
# Build a synthetic review dataset with three sentiment classes, ratings that
# agree with the sentiment, a skewed category mix, and review dates spread over
# the last two years. The result replaces `df` for the rest of the notebook.
np.random.seed(42)

# Number of synthetic reviews and the look-back window for their dates.
N_REVIEWS = 2000
DATE_WINDOW_DAYS = 730

# Enhanced review templates
positive_reviews = [
    "This product exceeded my expectations! The quality is outstanding and delivery was fast.",
    "Excellent value for money. The material feels premium and the design is beautiful.",
    "Perfect fit and great functionality. I would definitely recommend this to others.",
    "Amazing customer service and quick response. The product works exactly as described.",
    "Love the innovative features and user-friendly design. Five stars!",
    "Great packaging and the item arrived in perfect condition. Very satisfied.",
    "This is exactly what I was looking for. Good quality and reasonable price.",
    "Impressive build quality and attention to detail. Worth every penny.",
    "Fast shipping and excellent product. Will definitely buy from this seller again.",
    "Outstanding performance and reliability. Highly recommended for anyone."
]

negative_reviews = [
    "Poor quality materials and the product broke after just a few days of use.",
    "Very disappointed with this purchase. Not as described and overpriced.",
    "Terrible customer service and slow shipping. The item was damaged on arrival.",
    "Cheap construction and doesn't work properly. Waste of money.",
    "The fit is completely wrong and the material feels flimsy.",
    "Misleading product description. What I received was nothing like the pictures.",
    "Took forever to arrive and when it did, it was defective.",
    "Not worth the price at all. Very poor quality and design.",
    "Packaging was terrible and the product was damaged. Very unsatisfied.",
    "Would not recommend this product to anyone. Save your money."
]

neutral_reviews = [
    "The product is okay, nothing special but does what it's supposed to do.",
    "Average quality for the price. Could be better but not terrible.",
    "It's fine, meets basic expectations but nothing extraordinary.",
    "Decent product with some pros and cons. Mixed feelings about it.",
    "Works as expected, though there are some minor issues.",
    "The quality is acceptable for this price range. Nothing more, nothing less.",
    "Standard product with average performance. Does the job.",
    "It's an okay purchase. Not great but not bad either.",
    "Functional but could use some improvements in design.",
    "Fair value for money. Some good features, some not so good."
]

# Column accumulators for the enhanced dataset
reviews = []
sentiments = []
ratings = []
categories = []
dates = []

# Take the reference timestamp ONCE. Evaluating datetime.now() inside the loop
# repeated loop-invariant work and gave every row a slightly different
# start-of-window instant (microsecond drift between rows).
start_date = datetime.now() - timedelta(days=DATE_WINDOW_DAYS)

# Generate more realistic distribution.
# NOTE: the order of the np.random calls below is intentionally unchanged so
# the seeded draws stay identical.
for _ in range(N_REVIEWS):
    # Generate date within the look-back window
    random_days = np.random.randint(0, DATE_WINDOW_DAYS)
    review_date = start_date + timedelta(days=random_days)

    # Determine sentiment with realistic distribution
    rand = np.random.random()
    if rand < 0.6:  # 60% positive
        review = np.random.choice(positive_reviews)
        sentiment = 'positive'
        rating = np.random.choice([4, 5], p=[0.3, 0.7])
    elif rand < 0.85:  # 25% negative
        review = np.random.choice(negative_reviews)
        sentiment = 'negative'
        rating = np.random.choice([1, 2], p=[0.6, 0.4])
    else:  # 15% neutral
        review = np.random.choice(neutral_reviews)
        sentiment = 'neutral'
        rating = 3

    category = np.random.choice(['Electronics', 'Clothing', 'Books', 'Home & Kitchen'],
                                p=[0.35, 0.25, 0.20, 0.20])

    reviews.append(review)
    sentiments.append(sentiment)
    ratings.append(rating)
    categories.append(category)
    dates.append(review_date)

# Create enhanced DataFrame
df_enhanced = pd.DataFrame({
    'review_text': reviews,
    'sentiment': sentiments,
    'rating': ratings,
    'product_category': categories,
    'review_date': dates
})

print(f"Enhanced dataset created with {len(df_enhanced)} reviews")
print("\nEnhanced dataset statistics:")
print(f"Categories: {df_enhanced['product_category'].value_counts().to_dict()}")
print(f"Rating distribution: {df_enhanced['rating'].value_counts().sort_index().to_dict()}")
print(f"Sentiment distribution: {df_enhanced['sentiment'].value_counts().to_dict()}")
print(f"Date range: {df_enhanced['review_date'].min()} to {df_enhanced['review_date'].max()}")

# Use enhanced dataset for all downstream cells (this intentionally replaces
# the baseline sample frame loaded earlier).
df = df_enhanced

Enhanced dataset created with 2000 reviews

Enhanced dataset statistics:
Categories: {'Electronics': 699, 'Clothing': 511, 'Books': 412, 'Home & Kitchen': 378}
Rating distribution: {1: 282, 2: 197, 3: 292, 4: 376, 5: 853}
Sentiment distribution: {'positive': 1229, 'negative': 479, 'neutral': 292}
Date range: 2023-07-09 22:41:44.591750 to 2025-07-07 22:41:44.620479


## 3. Data Preprocessing

In [4]:
# Text preprocessing stage: clean the review text and collect text statistics.
#
# NOTE: the recorded run of this cell stopped with an NLTK LookupError for the
# resource 'averaged_perceptron_tagger_eng'. That resource must be installed
# (nltk.download('averaged_perceptron_tagger_eng')) before this cell can finish;
# `download_nltk=True` did not fetch it in the recorded run.
print("Initializing text preprocessor...")
preprocessor = RetailTextPreprocessor(download_nltk=True)

# Run the cleaning pipeline over the raw review column.
print("Preprocessing text data...")
df_processed = preprocessor.preprocess_dataframe(df, text_column='review_text')

print(f"Processed dataset has {len(df_processed)} reviews")
print("\nText statistics:")
stats = preprocessor.get_text_statistics(df_processed)
display(stats)

# Before/after view of the first three reviews (matched by position).
print("\nExample of preprocessing:")
for example_no, position in enumerate(range(3), start=1):
    print(f"\n--- Example {example_no} ---")
    print(f"Original: {df['review_text'].iloc[position]}")
    print(f"Processed: {df_processed['review_text_clean'].iloc[position]}")
    print(f"Word count: {df_processed['review_text_word_count'].iloc[position]}")

Initializing text preprocessor...
Preprocessing text data...


LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger_eng[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger_eng')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger_eng/[0m

  Searched in:
    - '/Users/mayankdw/nltk_data'
    - '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/nltk_data'
    - '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/share/nltk_data'
    - '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


## 4. Sentiment Analysis

In [None]:
# Sentiment stage: score every preprocessed review with the TextBlob-backed
# analyzer, then compare predictions against the synthetic ground-truth labels.
print("Initializing sentiment analyzer...")
sentiment_analyzer = RetailSentimentAnalyzer(model_type='textblob')

print("Analyzing sentiment...")
df_sentiment = sentiment_analyzer.analyze_dataframe(df_processed)

print("Sentiment analysis completed!")
print("\nSentiment distribution from analysis:")
sentiment_dist = sentiment_analyzer.get_sentiment_distribution(df_sentiment)
display(sentiment_dist)

# Rows: original label; columns: predicted label. Each column is normalised so
# it shows the label mix within one predicted class.
print("\nComparison with original sentiment labels:")
comparison = pd.crosstab(
    index=df_sentiment['sentiment'],
    columns=df_sentiment['sentiment_sentiment'],
    margins=True,
    normalize='columns',
)
display(comparison)

# Polarity is only reported when the analyzer produced that column.
show_polarity = 'sentiment_polarity' in df_sentiment.columns

print("\nExample sentiment analysis results:")
for position in range(5):
    print(f"\n--- Example {position+1} ---")
    print(f"Text: {df_sentiment['review_text_clean'].iloc[position][:80]}...")
    print(f"Original Sentiment: {df_sentiment['sentiment'].iloc[position]}")
    print(f"Predicted Sentiment: {df_sentiment['sentiment_sentiment'].iloc[position]}")
    print(f"Confidence: {df_sentiment['sentiment_confidence'].iloc[position]:.3f}")
    if show_polarity:
        print(f"Polarity: {df_sentiment['sentiment_polarity'].iloc[position]:.3f}")

## 5. Topic Modeling

In [None]:
# Topic stage: fit an LDA topic model on the cleaned review text and summarise
# the discovered topics.
print("Initializing topic modeler...")
topic_modeler = RetailTopicModeler(method='lda')

# The modeler takes a plain list of cleaned documents.
texts = df_sentiment['review_text_clean'].tolist()
print(f"Preparing {len(texts)} texts for topic modeling...")

# Pick the topic count with the best coherence score; on any failure fall back
# to a fixed default so the demo can continue (deliberate best-effort).
print("Finding optimal number of topics...")
try:
    coherence_scores = topic_modeler.find_optimal_topics(texts, max_topics=10, step=2)
    print("Coherence scores:", coherence_scores)
    optimal_topics = max(coherence_scores, key=coherence_scores.get)
    print(f"Optimal number of topics: {optimal_topics}")
except Exception as err:
    print(f"Coherence optimization failed: {err}")
    optimal_topics = 5
    print(f"Using default number of topics: {optimal_topics}")

print(f"Fitting topic model with {optimal_topics} topics...")
topic_modeler.fit_topic_model(texts, num_topics=optimal_topics)

# Top words per topic, printed one topic per line.
topic_words = topic_modeler.get_topic_words(num_words=10)
print("\nTopic words:")
for topic_idx, words in enumerate(topic_words):
    joined_words = ', '.join(words)
    print(f"Topic {topic_idx}: {joined_words}")

# Tabular summary; only the headline columns are displayed.
topic_summary = topic_modeler.create_topic_summary(texts)
print("\nTopic summary:")
summary_columns = ['topic_id', 'top_words', 'document_count', 'percentage']
display(topic_summary[summary_columns])

## 6. Visualization Generation

In [None]:
# Visualization setup: create the plot generator and make sure the output
# directory for saved figures exists.
print("Initializing visualization generator...")
viz_generator = RetailVisualizationGenerator()

# exist_ok=True makes this safe to re-run.
os.makedirs(name='../figures', exist_ok=True)

print("Generating visualizations...")

In [None]:
# Figure 1: distribution of predicted sentiment labels, saved under ../figures.
print("Creating sentiment distribution plot...")
sentiment_distribution_path = '../figures/sentiment_distribution.png'
fig1 = viz_generator.plot_sentiment_distribution(
    df_sentiment,
    sentiment_column='sentiment_sentiment',
    save_path=sentiment_distribution_path,
)
plt.show()

In [None]:
# Figure 2: sentiment by product category, saved under ../figures.
print("Creating sentiment by category plot...")
sentiment_by_category_path = '../figures/sentiment_by_category.png'
fig2 = viz_generator.plot_sentiment_by_category(
    df_sentiment,
    save_path=sentiment_by_category_path,
)
plt.show()

In [None]:
# Figure 3: topic distribution drawn from the topic summary table, saved under
# ../figures.
print("Creating topic distribution plot...")
topic_distribution_path = '../figures/topic_distribution.png'
fig3 = viz_generator.plot_topic_distribution(
    topic_summary,
    save_path=topic_distribution_path,
)
plt.show()

In [None]:
# Figure 4: sentiment confidence plot, saved under ../figures.
print("Creating sentiment confidence plot...")
sentiment_confidence_path = '../figures/sentiment_confidence.png'
fig4 = viz_generator.plot_sentiment_confidence(
    df_sentiment,
    save_path=sentiment_confidence_path,
)
plt.show()

## 7. Word Cloud Generation

In [None]:
# Word cloud over the whole corpus: all cleaned reviews are concatenated into a
# single space-separated string first.
print("Creating word cloud for all reviews...")
all_text = ' '.join(df_sentiment['review_text_clean'])
wordcloud = viz_generator.create_word_cloud(
    all_text,
    title="Customer Reviews Word Cloud",
    save_path='../figures/wordcloud_all.png',
)
plt.show()

In [None]:
# One word cloud per predicted sentiment class. Classes that are absent from
# the predictions, or whose combined text is blank, are skipped.
predicted_labels = df_sentiment['sentiment_sentiment']
for label in ('positive', 'negative', 'neutral'):
    if label not in predicted_labels.values:
        continue

    class_reviews = df_sentiment.loc[predicted_labels == label, 'review_text_clean']
    sentiment_text = ' '.join(class_reviews.tolist())
    if not sentiment_text.strip():
        continue

    print(f"Creating word cloud for {label} reviews...")
    wordcloud = viz_generator.create_word_cloud(
        sentiment_text,
        title=f"{label.capitalize()} Reviews Word Cloud",
        save_path=f'../figures/wordcloud_{label}.png',
    )
    plt.show()

## 8. Comprehensive Dashboard

In [None]:
# Combined dashboard built from the sentiment results and the topic summary;
# saved under ../figures.
print("Creating comprehensive dashboard...")
dashboard_path = '../figures/comprehensive_dashboard.png'
dashboard = viz_generator.create_comprehensive_dashboard(
    df_sentiment,
    topic_summary,
    save_path=dashboard_path,
)
plt.show()

## 9. Business Intelligence Insights

In [None]:
# Business-intelligence summary: gather headline metrics into `insights`
# (reused by later cells), then print and display them.
print("Generating business intelligence insights...")

predicted = df_sentiment['sentiment_sentiment']
by_category = df_sentiment.groupby('product_category')['sentiment_sentiment']
by_rating = df_sentiment.groupby('rating')['sentiment_sentiment']

insights = {
    'total_reviews': len(df_sentiment),
    'sentiment_distribution': predicted.value_counts().to_dict(),
    'avg_confidence': df_sentiment['sentiment_confidence'].mean(),
    # Keys of the two grouped dicts are (group, sentiment) tuples.
    'category_performance': by_category.value_counts().to_dict(),
    'top_topics': topic_summary[['topic_id', 'top_words', 'percentage']].to_dict('records'),
    'rating_sentiment_correlation': by_rating.value_counts().to_dict()
}

banner = "=" * 50
print(banner)
print("BUSINESS INTELLIGENCE INSIGHTS")
print(banner)
print(f"Total Reviews Analyzed: {insights['total_reviews']:,}")
print(f"Average Sentiment Confidence: {insights['avg_confidence']:.3f}")

print("\n📊 SENTIMENT DISTRIBUTION:")
for label, count in insights['sentiment_distribution'].items():
    percentage = (count / insights['total_reviews']) * 100
    print(f"  {label.capitalize()}: {count:,} ({percentage:.1f}%)")

print("\n🏷️ TOP TOPICS:")
for topic in insights['top_topics']:
    # Only the first 50 elements of `top_words` are shown to keep lines short.
    print(f"  Topic {topic['topic_id']}: {topic['top_words'][:50]}... ({topic['percentage']:.1f}%)")

# Row-normalised percentages: each category's rows sum to 100.
print("\n📈 CATEGORY PERFORMANCE:")
category_sentiment = (
    df_sentiment
    .groupby(['product_category', 'sentiment_sentiment'])
    .size()
    .unstack(fill_value=0)
)
category_totals = category_sentiment.sum(axis=1)
category_sentiment_pct = category_sentiment.div(category_totals, axis=0) * 100
display(category_sentiment_pct.round(1))

# Share of each predicted sentiment within every star rating.
print("\n⭐ RATING vs SENTIMENT CORRELATION:")
rating_sentiment = pd.crosstab(
    df_sentiment['rating'],
    df_sentiment['sentiment_sentiment'],
    normalize='index',
) * 100
display(rating_sentiment.round(1))

## 10. Actionable Business Recommendations

In [None]:
# Print a recommendation report driven by the overall sentiment shares and by
# per-category sentiment rates. Reads `insights` and `df_sentiment` from the
# earlier cells.
print("=" * 50)
print("ACTIONABLE BUSINESS RECOMMENDATIONS")
print("=" * 50)

# Overall shares of positive / negative predictions (0.0 when there are no
# reviews, instead of dividing by zero).
total_reviews = insights['total_reviews']
sentiment_counts = insights['sentiment_distribution']
positive_pct = (sentiment_counts.get('positive', 0) / total_reviews) * 100 if total_reviews else 0.0
negative_pct = (sentiment_counts.get('negative', 0) / total_reviews) * 100 if total_reviews else 0.0

# Per-category sentiment rates: each row (category) sums to 100%.
# The previous computation took the distribution of positive/negative reviews
# ACROSS categories, which mostly reflects how many reviews a category has
# rather than how well it performs, and the result was never used.
category_rates = pd.crosstab(
    df_sentiment['product_category'],
    df_sentiment['sentiment_sentiment'],
    normalize='index',
) * 100
no_reviews = pd.Series(0.0, index=category_rates.index)
category_positive = category_rates.get('positive', no_reviews)
category_negative = category_rates.get('negative', no_reviews)

print("\n🎯 IMMEDIATE ACTIONS (0-3 months):")
print(f"1. Monitor negative sentiment ({negative_pct:.1f}%) - implement real-time alerts")
print("2. Address quality issues in categories with high negative sentiment")
if not category_rates.empty:
    # Name the categories the recommendation refers to.
    worst_category = category_negative.idxmax()
    weakest_category = category_positive.idxmin()
    print(
        f"   Focus: '{worst_category}' has the highest negative share "
        f"({category_negative[worst_category]:.1f}%); '{weakest_category}' has the "
        f"lowest positive share ({category_positive[weakest_category]:.1f}%)"
    )
print(f"3. Leverage positive sentiment ({positive_pct:.1f}%) for marketing campaigns")
print("4. Set up automated sentiment monitoring dashboard")

print("\n📊 STRATEGIC INITIATIVES (3-12 months):")
print("1. Implement predictive analytics for customer satisfaction")
print("2. Develop category-specific improvement strategies")
print("3. Create sentiment-driven product development roadmap")
print("4. Establish competitive sentiment benchmarking")

print("\n🚀 LONG-TERM GOALS (12+ months):")
print("1. Achieve 70%+ positive sentiment across all categories")
print("2. Reduce negative sentiment to <15%")
print("3. Implement real-time personalization based on sentiment")
print("4. Establish market leadership in customer satisfaction")

print("\n💡 INNOVATION OPPORTUNITIES:")
print("1. AI-powered customer service based on sentiment patterns")
print("2. Dynamic pricing models incorporating sentiment data")
print("3. Predictive quality assurance using review analysis")
print("4. Sentiment-driven supply chain optimization")

## 11. Save Results and Generate Report

In [None]:
# Persist the analysis outputs through the data loader and print a closing
# summary of what was written and what to do next.
print("Saving analysis results...")

# Review-level results and the topic table.
loader.save_processed_data(df_sentiment, 'analyzed_reviews_demo.csv')
loader.save_processed_data(topic_summary, 'topic_summary_demo.csv')

# Headline metrics as a two-column (metric, value) table.
review_total = insights['total_reviews']
label_counts = insights['sentiment_distribution']
metric_names = ['total_reviews', 'avg_confidence', 'positive_pct', 'negative_pct', 'neutral_pct']
metric_values = [
    review_total,
    insights['avg_confidence'],
    (label_counts.get('positive', 0) / review_total) * 100,
    (label_counts.get('negative', 0) / review_total) * 100,
    (label_counts.get('neutral', 0) / review_total) * 100,
]
insights_df = pd.DataFrame({'metric': metric_names, 'value': metric_values})
loader.save_processed_data(insights_df, 'business_insights_demo.csv')

closing_lines = (
    "\n✅ ANALYSIS COMPLETE!",
    "\n📁 Results saved to:",
    "- data/processed/analyzed_reviews_demo.csv",
    "- data/processed/topic_summary_demo.csv",
    "- data/processed/business_insights_demo.csv",
    "- figures/ (all visualization plots)",
    "\n📋 Next steps:",
    "1. Review the research paper: paper/research_paper.md",
    "2. Explore the generated visualizations in figures/",
    "3. Implement recommendations based on business insights",
    "4. Set up real-time monitoring using this framework",
    "\n🎉 Thank you for using the Retail Semantic Analysis framework!",
)
for line in closing_lines:
    print(line)