# Sentiment Analysis and Topic Modeling for Yelp Business Success
# Author: Farzam Afzal

# In [20]:
# Standard library
import json
import re

# Data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Topic Modeling
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
#import pyLDAvis.sklearn

# BERT Sentiment Analysis
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from scipy.special import softmax

# In [21]:
# Fetch the NLTK resources this notebook relies on, one download call per resource
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Locations of the Yelp dataset files
review_path = 'yelp_academic_dataset_review.json'  # Update with actual path
business_path = 'yelp_academic_dataset_business.json'  # Update with actual path

# [nltk_data] Error loading punkt: <urlopen error [SSL:
# [nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
# [nltk_data]     unable to get local issuer certificate (_ssl.c:1002)>
# [nltk_data] Error loading stopwords: <urlopen error [SSL:
# [nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
# [nltk_data]     unable to get local issuer certificate (_ssl.c:1002)>
# [nltk_data] Error loading wordnet: <urlopen error [SSL:
# [nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
# [nltk_data]     unable to get local issuer certificate (_ssl.c:1002)>


# In [None]:
# Function for reading JSON files line by line
def read_json(file_path, max_records=None):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Args:
        file_path: Path of the UTF-8 encoded file to read.
        max_records: Maximum number of lines to read from the start of the
            file. ``None`` (the default) reads the whole file; ``0`` reads
            nothing.

    Returns:
        A pandas DataFrame with one row per parsed line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(tqdm(f)):
            # Compare against None explicitly so that max_records=0 is
            # honoured as "read nothing" instead of "no limit".
            if max_records is not None and i >= max_records:
                break
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record and would otherwise make json.loads raise.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return pd.DataFrame(records)

# Development run: only the first 50,000 review lines are read (raise max_records for the full dataset)
print("Loading review data...")
review_df = read_json(review_path, max_records=50000)  # May use a larger sample for actual implementation
print(f"Loaded {review_df.shape[0]} reviews.")

# Business records are read in full so they can be joined to the reviews
print("Loading business data...")
business_df = read_json(business_path)
print(f"Loaded {business_df.shape[0]} businesses.")

# In [None]:
# Attach each business's categories to its reviews; the left join keeps every review row
review_business_df = pd.merge(
    review_df,
    business_df.loc[:, ['business_id', 'categories']],
    how='left',
    on='business_id',
)

#--------------------------------------------------
# Text Preprocessing Functions
#--------------------------------------------------

# In [None]:
def preprocess_text(text):
    """Return a lowercased, letters-only version of ``text`` with single spaces.

    Non-string input yields an empty string. The cleaning steps run in a fixed
    order: lowercase, drop HTML tags, drop URLs, drop every character that is
    not an ASCII letter or whitespace, then collapse whitespace runs.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement) pairs, applied in order
    cleaning_steps = (
        (r'<.*?>', ''),                    # HTML tags
        (r'http\S+|www\S+|https\S+', ''),  # URLs
        (r'[^a-zA-Z\s]', ''),              # special characters and numbers
        (r'\s+', ' '),                     # runs of whitespace
    )

    cleaned = text.lower()
    for pattern, replacement in cleaning_steps:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def tokenize_and_lemmatize(text):
    """Split ``text`` into tokens, drop English stopwords, and lemmatize the rest.

    Returns the lemmatized tokens as a list, in their original order.
    """
    excluded = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()

    lemmas = []
    for word in word_tokenize(text):
        # Stopwords are filtered on the raw token, before lemmatization
        if word in excluded:
            continue
        lemmas.append(wordnet_lemmatizer.lemmatize(word))

    return lemmas

#--------------------------------------------------
# BERT-based Sentiment Analysis
#--------------------------------------------------

# In [None]:
class BERTSentimentAnalyzer:
    """Sentiment scorer built on a pre-trained BERT review-rating model.

    The model's predicted label starts with a 1-5 rating; that rating is
    rescaled to the 0-1 range and bucketed into 'negative', 'neutral' or
    'positive'.
    """

    MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment"

    # Score and label reported for inputs that contain no text to analyze
    EMPTY_TEXT_RESULT = (0.5, 'neutral')

    def __init__(self):
        # Load pre-trained BERT model for sentiment analysis
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.MODEL_NAME)
        self.sentiment_pipeline = pipeline("sentiment-analysis",
                                           model=self.model,
                                           tokenizer=self.tokenizer)

    @staticmethod
    def _interpret_label(label):
        """Convert a model label such as '4 stars' to (normalized_score, sentiment_label)."""
        # The rating is the first space-separated token of the label
        raw_score = int(label.split(' ')[0])

        # Normalizes to 0-1 range (model returns scores 1-5)
        normalized_score = (raw_score - 1) / 4

        # Ratings 1-2 are negative, 3 is neutral, 4-5 are positive
        if normalized_score < 0.4:
            sentiment_label = 'negative'
        elif normalized_score > 0.6:
            sentiment_label = 'positive'
        else:
            sentiment_label = 'neutral'

        return normalized_score, sentiment_label

    def analyze_sentiment(self, text, max_length=512):
        """
        Analyze sentiment of text using BERT.

        Args:
            text: The text to score.
            max_length: Maximum number of tokens fed to the model; longer
                inputs are truncated by the tokenizer.

        Returns:
            - sentiment_score: Normalized score between 0 and 1
            - sentiment_label: One of 'negative', 'neutral', or 'positive'
        """
        # Truncate at the token level inside the tokenizer. Slicing the raw
        # string by characters discards far more text than the model limit
        # requires and does not itself guarantee the token limit is respected.
        result = self.sentiment_pipeline(
            text, truncation=True, max_length=max_length
        )[0]

        return self._interpret_label(result['label'])

    def analyze_batch(self, texts, batch_size=32):
        """Analyze sentiment for a collection of texts.

        Texts are sent through the pipeline ``batch_size`` at a time. Entries
        that are not strings, or that contain only whitespace, are not sent to
        the model and receive the default (0.5, 'neutral').

        Args:
            texts: Iterable of texts to score.
            batch_size: Number of texts handed to the pipeline per call.

        Returns:
            (sentiment_scores, sentiment_labels): two lists aligned with
            ``texts``.
        """
        texts = list(texts)
        sentiment_scores = []
        sentiment_labels = []

        for start in tqdm(range(0, len(texts), batch_size), desc="Analyzing sentiment"):
            batch = texts[start:start + batch_size]
            batch_results = [self.EMPTY_TEXT_RESULT] * len(batch)

            # Positions within the batch that actually hold text to score
            scorable = [
                pos for pos, text in enumerate(batch)
                if isinstance(text, str) and text.strip()
            ]

            if scorable:
                predictions = self.sentiment_pipeline(
                    [batch[pos] for pos in scorable],
                    batch_size=batch_size,
                    truncation=True,
                    max_length=512,
                )
                for pos, prediction in zip(scorable, predictions):
                    batch_results[pos] = self._interpret_label(prediction['label'])

            for score, label in batch_results:
                sentiment_scores.append(score)
                sentiment_labels.append(label)

        return sentiment_scores, sentiment_labels

#--------------------------------------------------
# Topic Modeling with LDA
#--------------------------------------------------

# In [None]:
class ReviewTopicModeler:
    """LDA topic model over a bag-of-words representation of review text.

    Wraps a ``CountVectorizer`` and a ``LatentDirichletAllocation`` model and
    keeps the fitted vocabulary, the per-document topic distributions and the
    document-term matrix from the last ``fit_transform`` call.
    """

    def __init__(self, n_topics=10, max_features=5000, min_df=5, max_df=0.8):
        self.n_topics = n_topics
        self.max_features = max_features
        self.min_df = min_df
        self.max_df = max_df

        # Initializes vectorizer
        self.vectorizer = CountVectorizer(
            max_features=max_features,
            min_df=min_df,
            max_df=max_df,
            stop_words='english'
        )

        # Initializes LDA model
        self.lda_model = LatentDirichletAllocation(
            n_components=n_topics,
            random_state=42,
            max_iter=10,
            learning_method='online'
        )

        self.feature_names = None
        self.document_topics = None
        # Document-term matrix from the last fit_transform call; reused by
        # visualize_topics when no matrix is passed in.
        self._dtm = None

    def fit_transform(self, texts):
        """Fit LDA model and transform texts to topic distributions.

        Returns the document-topic matrix produced by the LDA model (one row
        per input text).
        """
        # Creates document-term matrix
        print("Creating document-term matrix...")
        dtm = self.vectorizer.fit_transform(texts)
        self.feature_names = self.vectorizer.get_feature_names_out()
        self._dtm = dtm

        # Fits LDA model
        print(f"Fitting LDA model with {self.n_topics} topics...")
        self.document_topics = self.lda_model.fit_transform(dtm)

        return self.document_topics

    def transform(self, texts):
        """Transform new texts to topic distributions using the fitted model."""
        dtm = self.vectorizer.transform(texts)
        return self.lda_model.transform(dtm)

    def get_topic_words(self, n_words=10):
        """Get the top words for each topic.

        Returns a list with one entry per topic; each entry is the list of
        that topic's ``n_words`` highest-weighted vocabulary terms, strongest
        first.
        """
        topic_words = []

        for topic in self.lda_model.components_:
            # argsort is ascending, so walk backwards from the end to take
            # the n_words largest weights in descending order.
            top_word_indices = topic.argsort()[:-n_words-1:-1]
            top_words = [self.feature_names[i] for i in top_word_indices]
            topic_words.append(top_words)

        return topic_words

    def visualize_topics(self, dtm=None):
        """Create an interactive visualization of topics.

        Args:
            dtm: Document-term matrix to visualize. When omitted, the matrix
                stored by the last ``fit_transform`` call is used.

        Returns:
            The prepared pyLDAvis data, which is also written to
            'lda_visualization.html'.

        Raises:
            RuntimeError: If no ``dtm`` is given and ``fit_transform`` has
                not been called.
        """
        if dtm is None:
            # Uses the document-term matrix from fit_transform
            dtm = getattr(self, '_dtm', None)
            if dtm is None:
                raise RuntimeError(
                    "No document-term matrix available: call fit_transform() "
                    "first or pass dtm explicitly."
                )

        # The scikit-learn adapter is a submodule that must be imported
        # explicitly; it is named lda_model in newer pyLDAvis releases and
        # sklearn in older ones.
        try:
            import pyLDAvis.lda_model as sklearn_adapter
        except ImportError:
            import pyLDAvis.sklearn as sklearn_adapter

        # Prepares visualization
        vis_data = sklearn_adapter.prepare(
            self.lda_model,
            dtm,
            self.vectorizer,
            mds='tsne'
        )

        # Saves visualization to HTML
        pyLDAvis.save_html(vis_data, 'lda_visualization.html')
        print("Visualization saved to 'lda_visualization.html'")

        return vis_data

#--------------------------------------------------
# Main Analysis Pipeline
#--------------------------------------------------

# In [None]:
def main():
    """Run the full analysis: sample, clean, score sentiment, model topics, export.

    Reads the module-level ``review_business_df`` and ``business_df`` frames
    and writes these files to the working directory:
    'processed_reviews_with_nlp.csv', 'business_sentiment_data.csv',
    'sentiment_distribution.png', 'topic_sentiment.png' and, through
    ``visualize_topics``, 'lda_visualization.html'.
    """
    # Sampling a subset of reviews for development (remove this limit for full implementation).
    # Clamp to the available rows: DataFrame.sample raises when asked for
    # more rows than exist (without replacement).
    sample_size = min(10000, len(review_business_df))
    print(f"Sampling {sample_size} reviews for analysis...")
    review_sample = review_business_df.sample(sample_size, random_state=42)

    # Preprocessing review text
    print("Preprocessing review text...")
    review_sample['processed_text'] = review_sample['text'].apply(preprocess_text)

    # Filters out reviews whose cleaned text is 20 characters or shorter.
    # .copy() makes the filtered frame independent so the column assignments
    # below do not write to a slice of the unfiltered sample.
    review_sample = review_sample[review_sample['processed_text'].str.len() > 20].copy()
    print(f"After preprocessing, {len(review_sample)} reviews remain.")

    # Initializes BERT sentiment analyzer
    print("Initializing BERT sentiment analyzer...")
    sentiment_analyzer = BERTSentimentAnalyzer()

    # Analyzes sentiment for reviews
    print("Analyzing sentiment...")
    sentiment_scores, sentiment_labels = sentiment_analyzer.analyze_batch(
        review_sample['processed_text'].tolist()
    )

    # Adds sentiment analysis results to dataframe
    review_sample['sentiment_score'] = sentiment_scores
    review_sample['sentiment_label'] = sentiment_labels

    # Initializes topic modeler
    print("Initializing topic modeler...")
    topic_modeler = ReviewTopicModeler(n_topics=10)

    # Fits topic model and get topic distributions
    topic_distributions = topic_modeler.fit_transform(review_sample['processed_text'])

    # Adds dominant topic (highest-probability topic) for each review
    dominant_topics = np.argmax(topic_distributions, axis=1)
    review_sample['dominant_topic'] = dominant_topics

    # Gets top words for each topic
    topic_words = topic_modeler.get_topic_words(n_words=15)

    # Prints topic keywords
    print("\nTopic Keywords:")
    for i, words in enumerate(topic_words):
        print(f"Topic {i}: {', '.join(words)}")

    # Analyzes relationship between sentiment and topics
    topic_sentiment = review_sample.groupby('dominant_topic')['sentiment_score'].mean()

    print("\nAverage Sentiment Score by Topic:")
    print(topic_sentiment)

    # Analyzes relationship between topics and review star ratings. Only
    # 'categories' was joined in from the business data, so 'stars' here is
    # the per-review rating.
    topic_stars = review_sample.groupby('dominant_topic')['stars'].mean()

    print("\nAverage Star Rating by Topic:")
    print(topic_stars)

    # Saves processed data for further analysis
    print("Saving processed data...")
    review_sample.to_csv('processed_reviews_with_nlp.csv', index=False)

    # Creates sentiment score distribution by business
    business_sentiment = review_sample.groupby('business_id')['sentiment_score'].agg(['mean', 'median', 'count'])
    business_sentiment = business_sentiment.rename(columns={'mean': 'avg_sentiment', 'count': 'review_count'})

    # Merges with business data.
    # NOTE(review): the business file presumably already carries its own
    # 'review_count' column - confirm. The explicit suffixes keep that
    # column's name intact and label the sampled count 'review_count_sampled'
    # instead of producing the opaque '_x'/'_y' pair.
    business_nlp_df = business_df.merge(
        business_sentiment,
        on='business_id',
        how='inner',
        suffixes=('', '_sampled'),
    )

    # Saves business-level sentiment data
    business_nlp_df.to_csv('business_sentiment_data.csv', index=False)

    print("Analysis complete. Results saved to CSV files.")

    # Creates visualizations; each figure is closed once saved so it does
    # not stay open in memory.
    plt.figure(figsize=(10, 6))
    sns.histplot(review_sample['sentiment_score'], bins=20, kde=True)
    plt.title('Distribution of Sentiment Scores')
    plt.xlabel('Sentiment Score (0-1)')
    plt.ylabel('Count')
    plt.savefig('sentiment_distribution.png')
    plt.close()

    # Topic-Sentiment relationship visualization
    plt.figure(figsize=(12, 6))
    topic_sentiment.plot(kind='bar')
    plt.title('Average Sentiment Score by Topic')
    plt.xlabel('Topic ID')
    plt.ylabel('Average Sentiment Score')
    plt.savefig('topic_sentiment.png')
    plt.close()

    # Generating interactive LDA visualization. The visualization needs the
    # document-term matrix of the modeled reviews, so build it with the
    # fitted vectorizer and pass it explicitly.
    review_dtm = topic_modeler.vectorizer.transform(review_sample['processed_text'])
    topic_modeler.visualize_topics(dtm=review_dtm)

if __name__ == "__main__":
    main()