# Quick Fix Demo - Retail Semantic Analysis
This notebook uses simplified preprocessing that doesn't require NLTK downloads

In [1]:
# First, let's try to download NLTK data
import sys
sys.path.append('../src')

try:
    import nltk
    print("Attempting to download NLTK data...")
    nltk.download('punkt')
    nltk.download('punkt_tab')
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('vader_lexicon')
    print("NLTK data downloaded successfully!")
    NLTK_AVAILABLE = True
except Exception as e:
    print(f"NLTK download failed: {e}")
    print("Using simple preprocessor instead...")
    NLTK_AVAILABLE = False

Attempting to download NLTK data...
NLTK data downloaded successfully!


[nltk_data] Downloading package punkt to /Users/mayankdw/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/mayankdw/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mayankdw/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mayankdw/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mayankdw/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mayankdw/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Import modules based on NLTK availability
from data_loader import RetailDataLoader
if NLTK_AVAILABLE:
    from preprocessor import RetailTextPreprocessor
    print("Using full NLTK preprocessor")
else:
    from simple_preprocessor import SimpleTextPreprocessor as RetailTextPreprocessor
    print("Using simplified preprocessor")

from sentiment_analyzer import RetailSentimentAnalyzer
from topic_modeling import RetailTopicModeler
from visualizations import RetailVisualizationGenerator

print("All modules imported successfully!")

Using full NLTK preprocessor
All modules imported successfully!


## 1. Create Enhanced Sample Data

In [3]:
# Initialize data loader
loader = RetailDataLoader()

# Create enhanced sample data
np.random.seed(42)

positive_reviews = [
    "This product exceeded my expectations! The quality is outstanding and delivery was fast.",
    "Excellent value for money. The material feels premium and the design is beautiful.",
    "Perfect fit and great functionality. I would definitely recommend this to others.",
    "Amazing customer service and quick response. The product works exactly as described.",
    "Love the innovative features and user-friendly design. Five stars!",
    "Great packaging and the item arrived in perfect condition. Very satisfied.",
    "This is exactly what I was looking for. Good quality and reasonable price.",
    "Impressive build quality and attention to detail. Worth every penny.",
    "Fast shipping and excellent product. Will definitely buy from this seller again.",
    "Outstanding performance and reliability. Highly recommended for anyone."
]

negative_reviews = [
    "Poor quality materials and the product broke after just a few days of use.",
    "Very disappointed with this purchase. Not as described and overpriced.",
    "Terrible customer service and slow shipping. The item was damaged on arrival.",
    "Cheap construction and doesn't work properly. Waste of money.",
    "The fit is completely wrong and the material feels flimsy.",
    "Misleading product description. What I received was nothing like the pictures.",
    "Took forever to arrive and when it did, it was defective.",
    "Not worth the price at all. Very poor quality and design.",
    "Packaging was terrible and the product was damaged. Very unsatisfied.",
    "Would not recommend this product to anyone. Save your money."
]

neutral_reviews = [
    "The product is okay, nothing special but does what it's supposed to do.",
    "Average quality for the price. Could be better but not terrible.",
    "It's fine, meets basic expectations but nothing extraordinary.",
    "Decent product with some pros and cons. Mixed feelings about it.",
    "Works as expected, though there are some minor issues.",
    "The quality is acceptable for this price range. Nothing more, nothing less.",
    "Standard product with average performance. Does the job.",
    "It's an okay purchase. Not great but not bad either.",
    "Functional but could use some improvements in design.",
    "Fair value for money. Some good features, some not so good."
]

# Generate dataset
reviews = []
sentiments = []
ratings = []
categories = []
dates = []

for i in range(1500):
    # Generate date within last 2 years
    start_date = datetime.now() - timedelta(days=730)
    random_days = np.random.randint(0, 730)
    review_date = start_date + timedelta(days=random_days)
    
    # Determine sentiment with realistic distribution
    rand = np.random.random()
    if rand < 0.6:  # 60% positive
        review = np.random.choice(positive_reviews)
        sentiment = 'positive'
        rating = np.random.choice([4, 5], p=[0.3, 0.7])
    elif rand < 0.85:  # 25% negative
        review = np.random.choice(negative_reviews)
        sentiment = 'negative'
        rating = np.random.choice([1, 2], p=[0.6, 0.4])
    else:  # 15% neutral
        review = np.random.choice(neutral_reviews)
        sentiment = 'neutral'
        rating = 3
    
    category = np.random.choice(['Electronics', 'Clothing', 'Books', 'Home & Kitchen'], 
                               p=[0.35, 0.25, 0.20, 0.20])
    
    reviews.append(review)
    sentiments.append(sentiment)
    ratings.append(rating)
    categories.append(category)
    dates.append(review_date)

# Create DataFrame
df = pd.DataFrame({
    'review_text': reviews,
    'sentiment': sentiments,
    'rating': ratings,
    'product_category': categories,
    'review_date': dates
})

print(f"Dataset created with {len(df)} reviews")
print(f"Categories: {df['product_category'].value_counts().to_dict()}")
print(f"Sentiment distribution: {df['sentiment'].value_counts().to_dict()}")
df.head()

Dataset created with 1500 reviews
Categories: {'Electronics': 507, 'Clothing': 388, 'Books': 324, 'Home & Kitchen': 281}
Sentiment distribution: {'positive': 915, 'negative': 362, 'neutral': 223}


Unnamed: 0,review_text,sentiment,rating,product_category,review_date
0,Not worth the price at all. Very poor quality ...,negative,1,Electronics,2023-10-19 22:55:36.485785
1,Impressive build quality and attention to deta...,positive,5,Electronics,2024-10-17 22:55:36.485995
2,Excellent value for money. The material feels ...,positive,5,Home & Kitchen,2023-11-16 22:55:36.486039
3,Love the innovative features and user-friendly...,positive,5,Books,2024-07-28 22:55:36.486068
4,This product exceeded my expectations! The qua...,positive,5,Clothing,2024-03-17 22:55:36.486093


## 2. Text Preprocessing

In [4]:
# Initialize preprocessor
if NLTK_AVAILABLE:
    preprocessor = RetailTextPreprocessor(download_nltk=True)
else:
    preprocessor = RetailTextPreprocessor()

# Preprocess the dataset
print("Preprocessing text data...")
df_processed = preprocessor.preprocess_dataframe(df, text_column='review_text')

print(f"Processed dataset has {len(df_processed)} reviews")
print("\nText statistics:")
stats = preprocessor.get_text_statistics(df_processed)
display(stats)

# Show examples
print("\nPreprocessing examples:")
for i in range(3):
    print(f"\n--- Example {i+1} ---")
    print(f"Original: {df['review_text'].iloc[i]}")
    print(f"Processed: {df_processed['review_text_clean'].iloc[i]}")

Preprocessing text data...


LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger_eng[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger_eng')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger_eng/[0m

  Searched in:
    - '/Users/mayankdw/nltk_data'
    - '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/nltk_data'
    - '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/share/nltk_data'
    - '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


## 3. Sentiment Analysis

In [None]:
# Initialize sentiment analyzer with TextBlob (most reliable)
sentiment_analyzer = RetailSentimentAnalyzer(model_type='textblob')

# Analyze sentiment
print("Analyzing sentiment...")
df_sentiment = sentiment_analyzer.analyze_dataframe(df_processed)

print("Sentiment analysis completed!")
sentiment_dist = sentiment_analyzer.get_sentiment_distribution(df_sentiment)
print("\nPredicted sentiment distribution:")
display(sentiment_dist)

# Compare with original labels
print("\nComparison: Original vs Predicted")
comparison = pd.crosstab(df_sentiment['sentiment'], df_sentiment['sentiment_sentiment'], 
                        margins=True, normalize='columns')
display(comparison.round(3))

## 4. Topic Modeling

In [None]:
# Initialize topic modeler
topic_modeler = RetailTopicModeler(method='lda')

# Prepare texts
texts = df_sentiment['review_text_clean'].tolist()
print(f"Preparing {len(texts)} texts for topic modeling...")

# Fit topic model with 5 topics
print("Fitting topic model with 5 topics...")
topic_modeler.fit_topic_model(texts, num_topics=5)

# Get topic words
topic_words = topic_modeler.get_topic_words(num_words=8)
print("\nIdentified topics:")
for i, words in enumerate(topic_words):
    print(f"Topic {i}: {', '.join(words)}")

# Create topic summary
topic_summary = topic_modeler.create_topic_summary(texts)
print("\nTopic summary:")
display(topic_summary[['topic_id', 'top_words', 'document_count', 'percentage']])

## 5. Create Visualizations

In [None]:
# Initialize visualization generator
viz_generator = RetailVisualizationGenerator()

# Create sentiment distribution plot
print("Creating sentiment distribution plot...")
fig1 = viz_generator.plot_sentiment_distribution(df_sentiment, sentiment_column='sentiment_sentiment')
plt.show()

In [None]:
# Create sentiment by category plot
print("Creating sentiment by category plot...")
fig2 = viz_generator.plot_sentiment_by_category(df_sentiment)
plt.show()

In [None]:
# Create topic distribution plot
print("Creating topic distribution plot...")
fig3 = viz_generator.plot_topic_distribution(topic_summary)
plt.show()

In [None]:
# Create word cloud
print("Creating word cloud...")
all_text = ' '.join(df_sentiment['review_text_clean'].tolist())
wordcloud = viz_generator.create_word_cloud(all_text, title="Customer Reviews Word Cloud")
plt.show()

## 6. Business Insights Summary

In [None]:
# Generate business insights
insights = {
    'total_reviews': len(df_sentiment),
    'sentiment_distribution': df_sentiment['sentiment_sentiment'].value_counts().to_dict(),
    'avg_confidence': df_sentiment['sentiment_confidence'].mean(),
}

print("=" * 50)
print("BUSINESS INTELLIGENCE INSIGHTS")
print("=" * 50)
print(f"Total Reviews Analyzed: {insights['total_reviews']:,}")
print(f"Average Sentiment Confidence: {insights['avg_confidence']:.3f}")

print("\n📊 SENTIMENT DISTRIBUTION:")
for sentiment, count in insights['sentiment_distribution'].items():
    percentage = (count / insights['total_reviews']) * 100
    print(f"  {sentiment.capitalize()}: {count:,} ({percentage:.1f}%)")

print("\n📈 CATEGORY PERFORMANCE:")
category_sentiment = df_sentiment.groupby(['product_category', 'sentiment_sentiment']).size().unstack(fill_value=0)
category_sentiment_pct = category_sentiment.div(category_sentiment.sum(axis=1), axis=0) * 100
display(category_sentiment_pct.round(1))

print("\n🎯 KEY RECOMMENDATIONS:")
positive_pct = (insights['sentiment_distribution'].get('positive', 0) / insights['total_reviews']) * 100
negative_pct = (insights['sentiment_distribution'].get('negative', 0) / insights['total_reviews']) * 100

print(f"1. Monitor negative sentiment ({negative_pct:.1f}%) for improvement opportunities")
print(f"2. Leverage positive sentiment ({positive_pct:.1f}%) for marketing")
print(f"3. Focus on quality improvements in Electronics category")
print(f"4. Implement real-time sentiment monitoring")
print(f"5. Use topic insights for product development")

print("\n✅ ANALYSIS COMPLETE!")
print("Check the research paper at: paper/research_paper.md")