<a href="https://colab.research.google.com/github/MaimunaSun/NLP/blob/main/PDF_Notebooks/Insights_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Restaurants receive thousands of customer reviews, but star ratings alone fail to explain why ratings increase or decline. Reviews often contain rich information about food quality, service, pricing, ambience, and operational factors, yet this information remains unstructured and difficult to analyze at scale. This limits restaurants’ ability to identify the drivers of customer satisfaction and dissatisfaction.

Notebook 3: Insight Analysis

This notebook performs exploratory and diagnostic analysis on a previously constructed review-level dataset (aspect_df) to explain why restaurant ratings vary.

By analyzing sentiment distributions across review topics (aspects) and linking them to restaurant operational attributes, this notebook identifies the key drivers of positive and negative customer sentiment. The goal is to translate unstructured review text into actionable insights that help explain changes in star ratings beyond the ratings themselves.

In [None]:
# Mount Google Drive at /content/drive; the dataset and saved model used below are read from paths under it.
from google.colab import drive
drive.mount('/content/drive')

Load Prepared Analytical Dataset

In [None]:
import pandas as pd
import pickle

# Location of the pickled review-level dataset on the mounted Drive.
PATH = "/content/drive/MyDrive/restaurant_sentiment_model/aspect_df.pkl"

# Load the dataset. NOTE: unpickling can execute arbitrary code, so only read
# pickle files that come from a trusted source.
aspect_df = pd.read_pickle(PATH)


In [None]:
# Preview the first rows of the loaded dataset.
aspect_df.head()

In [None]:
# Summarise the columns, their dtypes and non-null counts.
aspect_df.info()

In [None]:
pip install -U bertopic

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Sentence-embedding model handed to BERTopic when the saved topic model is loaded.
# NOTE(review): presumably the same model the topic model was fitted with — confirm.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Location of the saved BERTopic model on the mounted Drive.
MODEL_PATH = "/content/drive/MyDrive/restaurant_sentiment_model/aspect"
# Restore the previously saved topic model; it is used below to name topics.
topic_model = BERTopic.load(
    MODEL_PATH,
    embedding_model=embedding_model
)


Topic and Sentiment Label Mapping

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Human-readable names for the numeric sentiment codes.
sentiment_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}
aspect_df['sentiment_label'] = aspect_df['sentiment'].map(sentiment_mapping)

# Name every topic after its first three words; topic -1 is labelled as the
# outlier bucket instead.
topic_info = topic_model.get_topic_info()

topic_mapping = {
    topic_id: (
        "Other / Outliers"
        if topic_id == -1
        else " ".join(word for word, _ in topic_model.get_topic(topic_id)[:3])
    )
    for topic_id in topic_info['Topic']
}

aspect_df['topic_label'] = aspect_df['topic'].map(topic_mapping)

In [None]:
# Re-inspect the frame now that sentiment_label and topic_label have been added.
aspect_df.info()


In [None]:
# Preview the first rows, including the new label columns.
aspect_df.head()

In [None]:
# Keep only reviews assigned to a real topic (topic -1 is the outlier bucket).
filtered_df = aspect_df.loc[aspect_df["topic"].ne(-1)]


In [None]:
# Preview the first rows of the outlier-free frame.
filtered_df.head()

In [None]:
import pickle

# Location of the pickled outlier-free frame on the mounted Drive.
# NOTE: this rebinds PATH, which earlier pointed at aspect_df.pkl.
PATH = "/content/drive/MyDrive/restaurant_sentiment_model/filtered_df.pkl"

# Save the outlier-free frame.
filtered_df.to_pickle(PATH)

# Reload it immediately from the file just written, so later cells use the saved copy.
filtered_df = pd.read_pickle(PATH)


Most Discussed Restaurant Aspects

In [None]:
import matplotlib.pyplot as plt

# Review counts for the ten most frequently discussed aspects.
topic_counts = filtered_df["topic_label"].value_counts().head(10)

# Draw the counts as a bar chart on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 5))
topic_counts.plot(kind="bar", ax=ax)
ax.set_title("Top 10 Most Discussed Restaurant Aspects (Excluding Outliers)")
ax.set_xlabel("Aspect")
ax.set_ylabel("Number of Reviews")
plt.xticks(rotation=45, ha="right")
fig.tight_layout()
plt.show()



Sentiment Distribution by Restaurant Aspect

In [None]:
# Count reviews per (aspect, sentiment) for the ten most-discussed aspects.
# filtered_df is used (rather than aspect_df) so this cell works from the same
# outlier-free rows that produced topic_counts; the selected rows are identical.
sentiment_topic = (
    filtered_df
    .groupby(["topic_label", "sentiment_label"])
    .size()
    .unstack(fill_value=0)
    .loc[topic_counts.index]
)

# DataFrame.plot opens its own figure unless it is handed an Axes. Creating the
# Axes first and passing it in avoids a stray empty figure and makes the
# requested figure size actually apply to the chart.
fig, ax = plt.subplots(figsize=(10, 5))
sentiment_topic.plot(kind="bar", stacked=True, ax=ax)
ax.set_title("Sentiment Distribution by Restaurant Aspect")
ax.set_xlabel("Aspect")
ax.set_ylabel("Number of Reviews")
plt.xticks(rotation=45, ha="right")
fig.tight_layout()
plt.show()


Net Sentiment of Restaurant Topics

In [None]:
import matplotlib.pyplot as plt

# Ten most frequently discussed topics.
top_topics = filtered_df['topic_label'].value_counts().head(10).index

# Restrict the analysis to reviews about those topics.
filtered_top_df = filtered_df[filtered_df['topic_label'].isin(top_topics)]

# Net sentiment per topic = share of positive reviews minus share of negative
# reviews. With sentiment coded 0/1/2 (Negative/Neutral/Positive), the mean
# code minus 1 equals exactly that difference, so the values lie in [-1, 1]
# and match the axis label. (The raw mean lies in [0, 2] and is not a
# "positive minus negative" score.) Topic ranking is unaffected by the shift.
net_sentiment = (
    filtered_top_df
    .groupby("topic_label")["sentiment"]
    .mean()
    .sub(1)
    .sort_values()
)

# Horizontal bar chart; the vertical line at 0 separates net-negative topics
# from net-positive ones.
fig, ax = plt.subplots(figsize=(8, 6))
net_sentiment.plot(kind="barh", color="skyblue", ax=ax)
ax.axvline(0, color="grey", linewidth=0.8)
ax.set_title("Net Sentiment Score by Topic")
ax.set_xlabel("Net Sentiment (Positive − Negative)")
ax.set_ylabel("Topic")
fig.tight_layout()
plt.show()


Deep-Dive Analysis: Aspect-Specific Drivers

In [None]:
import matplotlib.pyplot as plt

# Define topic and how many attributes to show
topic = "burger fries burgers"
top_n = 10

# Filter for topic
topic_df = filtered_df[filtered_df["topic_label"] == topic]
if topic_df.empty:
    raise ValueError(f"No reviews found for topic '{topic}'")

# Every column that is not an identifier, text, topic or sentiment column is
# treated as a candidate operational attribute.
exclude_cols = ["business_id", "review", "topic", "topic_label", "sentiment", "sentiment_label"]
operational_attributes = [col for col in topic_df.columns if col not in exclude_cols]

# Score each object-dtype attribute by the share of negative reviews among the
# reviews that carry the attribute's most common value.
neg_attr_scores = {}

for attr in operational_attributes:
    if topic_df[attr].dtype != 'object':
        continue
    modes = topic_df[attr].mode()
    if modes.empty:
        # Attribute is entirely missing for this topic: nothing to score.
        continue
    most_common_value = modes.iloc[0]
    counts = topic_df.groupby(attr)["sentiment_label"].value_counts(normalize=True)
    neg_attr_scores[attr] = counts.get((most_common_value, "Negative"), 0)

if not neg_attr_scores:
    raise ValueError(f"No scorable operational attributes for topic '{topic}'")

# Sort descending and keep the top_n highest-scoring attributes
# (the variable name is historical; it holds top_n entries, not five).
top5_neg_attrs = sorted(neg_attr_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]

# Plot
attrs, scores = zip(*top5_neg_attrs)

fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(attrs, scores, color='salmon')
ax.set_xlabel("Share of Negative Reviews (at each attribute's most common value)")
ax.set_title(f"Top {len(attrs)} Operational Attributes Causing Negative Sentiment for '{topic}'")
ax.invert_yaxis()  # highest at top
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Define topic and how many attributes to show
topic = "burger fries burgers"
top_n = 10

# Filter for topic
topic_df = filtered_df[filtered_df["topic_label"] == topic]
if topic_df.empty:
    raise ValueError(f"No reviews found for topic '{topic}'")

# Every column that is not an identifier, text, topic or sentiment column is
# treated as a candidate operational attribute.
exclude_cols = ["business_id", "review", "topic", "topic_label", "sentiment", "sentiment_label"]
operational_attributes = [col for col in topic_df.columns if col not in exclude_cols]

# Score each object-dtype attribute by the share of positive reviews among the
# reviews that carry the attribute's most common value.
pos_attr_scores = {}

for attr in operational_attributes:
    if topic_df[attr].dtype != 'object':
        continue
    modes = topic_df[attr].mode()
    if modes.empty:
        # Attribute is entirely missing for this topic: nothing to score.
        continue
    most_common_value = modes.iloc[0]
    counts = topic_df.groupby(attr)["sentiment_label"].value_counts(normalize=True)
    pos_attr_scores[attr] = counts.get((most_common_value, "Positive"), 0)

if not pos_attr_scores:
    raise ValueError(f"No scorable operational attributes for topic '{topic}'")

# Sort descending and keep the top_n highest-scoring attributes
# (the variable name is historical; it holds top_n entries, not five).
top5_pos_attrs = sorted(pos_attr_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]

# Plot (green, so it is not mistaken for the salmon negative-sentiment chart)
attrs, scores = zip(*top5_pos_attrs)

fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(attrs, scores, color='mediumseagreen')
ax.set_xlabel("Share of Positive Reviews (at each attribute's most common value)")
ax.set_title(f"Top {len(attrs)} Operational Attributes Causing Positive Sentiment for '{topic}'")
ax.invert_yaxis()  # highest at top
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

def plot_top_negative_attributes(filtered_df, topic, top_n=10):
    """Plot the attributes with the highest negative-review share for one topic.

    Every object-dtype column that is not an identifier, text, topic or
    sentiment column is treated as an operational attribute. Its score is the
    share of "Negative" ``sentiment_label`` values among the topic's reviews
    whose attribute equals that attribute's most common value. The ``top_n``
    highest-scoring attributes are drawn as a horizontal bar chart.

    The score is an association within the topic, not evidence of causation.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Review-level frame with ``topic_label`` and ``sentiment_label`` columns.
    topic : str
        Value of ``topic_label`` to analyse.
    top_n : int, default 10
        Maximum number of attributes to display.

    Raises
    ------
    ValueError
        If no review matches ``topic`` or no attribute can be scored.
    """
    topic_df = filtered_df[filtered_df["topic_label"] == topic]
    if topic_df.empty:
        raise ValueError(f"No reviews found for topic '{topic}'")

    exclude_cols = {"business_id", "review", "topic", "topic_label", "sentiment", "sentiment_label"}

    neg_attr_scores = {}
    for attr in topic_df.columns:
        if attr in exclude_cols or topic_df[attr].dtype != 'object':
            continue
        modes = topic_df[attr].mode()
        if modes.empty:
            # Attribute is entirely missing for this topic: nothing to score.
            continue
        most_common_value = modes.iloc[0]
        # Sentiment shares within each attribute value; only the modal value is read.
        counts = topic_df.groupby(attr)["sentiment_label"].value_counts(normalize=True)
        neg_attr_scores[attr] = counts.get((most_common_value, "Negative"), 0)

    if not neg_attr_scores:
        raise ValueError(f"No scorable operational attributes for topic '{topic}'")

    # Highest negative share first; ties keep column order.
    top_neg_attrs = sorted(neg_attr_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    attrs, scores = zip(*top_neg_attrs)

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.barh(attrs, scores, color='salmon')
    ax.set_xlabel("Share of Negative Reviews (at each attribute's most common value)")
    ax.set_title(f"Top {len(attrs)} Operational Attributes Causing Negative Sentiment for '{topic}'")
    ax.invert_yaxis()  # highest at top
    fig.tight_layout()
    plt.show()


In [None]:
import matplotlib.pyplot as plt

def plot_top_positive_attributes(filtered_df, topic, top_n=10):
    """Plot the attributes with the highest positive-review share for one topic.

    Every object-dtype column that is not an identifier, text, topic or
    sentiment column is treated as an operational attribute. Its score is the
    share of "Positive" ``sentiment_label`` values among the topic's reviews
    whose attribute equals that attribute's most common value. The ``top_n``
    highest-scoring attributes are drawn as a horizontal bar chart.

    The score is an association within the topic, not evidence of causation.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Review-level frame with ``topic_label`` and ``sentiment_label`` columns.
    topic : str
        Value of ``topic_label`` to analyse.
    top_n : int, default 10
        Maximum number of attributes to display.

    Raises
    ------
    ValueError
        If no review matches ``topic`` or no attribute can be scored.
    """
    topic_df = filtered_df[filtered_df["topic_label"] == topic]
    if topic_df.empty:
        raise ValueError(f"No reviews found for topic '{topic}'")

    exclude_cols = {"business_id", "review", "topic", "topic_label", "sentiment", "sentiment_label"}

    pos_attr_scores = {}
    for attr in topic_df.columns:
        if attr in exclude_cols or topic_df[attr].dtype != 'object':
            continue
        modes = topic_df[attr].mode()
        if modes.empty:
            # Attribute is entirely missing for this topic: nothing to score.
            continue
        most_common_value = modes.iloc[0]
        # Sentiment shares within each attribute value; only the modal value is read.
        counts = topic_df.groupby(attr)["sentiment_label"].value_counts(normalize=True)
        pos_attr_scores[attr] = counts.get((most_common_value, "Positive"), 0)

    if not pos_attr_scores:
        raise ValueError(f"No scorable operational attributes for topic '{topic}'")

    # Highest positive share first; ties keep column order.
    top_pos_attrs = sorted(pos_attr_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    attrs, scores = zip(*top_pos_attrs)

    fig, ax = plt.subplots(figsize=(8, 5))
    # Green, so the chart is not mistaken for the salmon negative-sentiment chart.
    ax.barh(attrs, scores, color='mediumseagreen')
    ax.set_xlabel("Share of Positive Reviews (at each attribute's most common value)")
    ax.set_title(f"Top {len(attrs)} Operational Attributes Causing Positive Sentiment for '{topic}'")
    ax.invert_yaxis()  # highest at top
    fig.tight_layout()
    plt.show()


In [None]:
# Negative-sentiment attribute ranking for the burger topic.
plot_top_negative_attributes(filtered_df, "burger fries burgers")


In [None]:
# Positive-sentiment attribute ranking for the burger topic.
plot_top_positive_attributes(filtered_df, "burger fries burgers")


In [None]:
# Negative chart first, then positive chart, for the taco topic.
for plot_fn in (plot_top_negative_attributes, plot_top_positive_attributes):
    plot_fn(filtered_df, topic="tacos mexican salsa")

Operational Attribute Impact Analysis

In [None]:
# Topic and operational attribute to break down.
topic = "burger fries burgers"
attribute = "HappyHour"

topic_df = filtered_df[filtered_df["topic_label"] == topic]

# Review counts for every (attribute value, sentiment) pair.
sentiment_counts = topic_df.groupby([attribute, "sentiment_label"]).size().unstack(fill_value=0)

# Turn each row into proportions that sum to 1.
row_totals = sentiment_counts.sum(axis=1)
sentiment_dist = sentiment_counts.div(row_totals, axis=0)

# Stacked bars: one bar per attribute value, split by sentiment.
ax = sentiment_dist.plot(kind="bar", stacked=True, figsize=(8, 5), rot=0)
ax.set_title(f"Sentiment by {attribute} for '{topic}'")
ax.set_ylabel("Proportion of Reviews")
plt.tight_layout()
plt.show()


Pareto Analysis of Negative Sentiment Drivers

In [None]:
import matplotlib.pyplot as plt

# Count negative reviews per topic (value_counts sorts largest first).
neg_counts = (
    filtered_df.loc[filtered_df["sentiment_label"] == "Negative", "topic_label"]
    .value_counts()
)

# Cumulative share of all negative reviews, in percent.
cum_pct = neg_counts.cumsum() / neg_counts.sum() * 100

# Keep the topics needed to reach 80%: every topic whose *preceding* cumulative
# share is still below 80. This includes the topic that crosses the threshold,
# so the selection is never empty and the curve actually reaches the 80% line.
pareto_df = cum_pct[cum_pct.shift(fill_value=0) < 80].index

neg_counts_80 = neg_counts.loc[pareto_df]
cum_pct_80 = cum_pct.loc[pareto_df]

# Counts and percentages have different units, so they get separate y-axes:
# bars on the left axis, cumulative % and the 80% reference on the right axis.
positions = list(range(len(neg_counts_80)))

fig, ax_counts = plt.subplots(figsize=(10, 5))
ax_counts.bar(positions, neg_counts_80.values)
ax_counts.set_xticks(positions)
ax_counts.set_xticklabels(neg_counts_80.index, rotation=45, ha="right")
ax_counts.set_ylabel("Negative Reviews")
ax_counts.set_title("Pareto Analysis of Negative Sentiment Drivers (80% Threshold)")

ax_pct = ax_counts.twinx()
ax_pct.plot(positions, cum_pct_80.values, marker="o", color="tab:orange")
ax_pct.axhline(80, linestyle="--", color="grey")
ax_pct.set_ylim(0, 105)
ax_pct.set_ylabel("Cumulative % of Negative Reviews")

fig.tight_layout()
plt.show()


Heatmap Analysis of Operational Drivers

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

def plot_single_topic_attribute_heatmap(filtered_df, topic):
    """Draw a one-column heatmap of negative-review share per attribute for a topic.

    Every object-dtype column that is not an identifier, text, topic or
    sentiment column is treated as an operational attribute. Its value in the
    heatmap is the share of "Negative" ``sentiment_label`` values among the
    topic's reviews whose attribute equals that attribute's most common value.
    Rows are sorted from the highest share to the lowest.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Review-level frame with ``topic_label`` and ``sentiment_label`` columns.
    topic : str
        Value of ``topic_label`` to analyse.

    Raises
    ------
    ValueError
        If no review matches ``topic`` or no attribute can be scored.
    """
    topic_df = filtered_df[filtered_df["topic_label"] == topic]
    if topic_df.empty:
        raise ValueError(f"No reviews found for topic '{topic}'")

    exclude_cols = {"business_id", "review", "topic", "topic_label", "sentiment", "sentiment_label"}

    heatmap_data = []
    for attr in topic_df.columns:
        if attr in exclude_cols or topic_df[attr].dtype != "object":
            continue
        modes = topic_df[attr].mode()
        if modes.empty:
            # Attribute is entirely missing for this topic: nothing to score.
            continue
        most_common_value = modes.iloc[0]
        # Sentiment shares within each attribute value; only the modal value is read.
        counts = (
            topic_df
            .groupby(attr)["sentiment_label"]
            .value_counts(normalize=True)
        )
        heatmap_data.append({
            "attribute": attr,
            "negative_ratio": counts.get((most_common_value, "Negative"), 0)
        })

    if not heatmap_data:
        # Without this guard set_index("attribute") fails with an opaque KeyError.
        raise ValueError(f"No scorable operational attributes for topic '{topic}'")

    heatmap_df = (
        pd.DataFrame(heatmap_data)
        .set_index("attribute")
        .sort_values("negative_ratio", ascending=False)
    )

    # Figure height grows with the number of attributes so rows stay readable.
    fig, ax = plt.subplots(figsize=(4, max(6, len(heatmap_df) * 0.35)))
    sns.heatmap(
        heatmap_df,
        cmap="Reds",
        annot=True,
        fmt=".2f",
        cbar=True,
        ax=ax
    )

    ax.set_title(f"Negative Sentiment by Operational Attribute\nTopic: '{topic}'")
    ax.set_xlabel("")
    ax.set_ylabel("Operational Attributes")
    fig.tight_layout()
    plt.show()


In [None]:
# Negative-share heatmap for the burger topic.
plot_single_topic_attribute_heatmap(filtered_df, "burger fries burgers")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

def plot_single_topic_sentiment_heatmap(filtered_df, topic):
    """Draw a one-column heatmap of net sentiment per attribute for a topic.

    Every object-dtype column that is not an identifier, text, topic or
    sentiment column is treated as an operational attribute. Among the topic's
    reviews whose attribute equals that attribute's most common value, the net
    score is the share of "Positive" ``sentiment_label`` values minus the share
    of "Negative" ones. Rows are sorted from the lowest score to the highest.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Review-level frame with ``topic_label`` and ``sentiment_label`` columns.
    topic : str
        Value of ``topic_label`` to analyse.

    Raises
    ------
    ValueError
        If no review matches ``topic`` or no attribute can be scored.
    """
    topic_df = filtered_df[filtered_df["topic_label"] == topic]
    if topic_df.empty:
        raise ValueError(f"No reviews found for topic '{topic}'")

    exclude_cols = {"business_id", "review", "topic", "topic_label", "sentiment", "sentiment_label"}

    heatmap_data = []
    for attr in topic_df.columns:
        if attr in exclude_cols or topic_df[attr].dtype != "object":
            continue
        modes = topic_df[attr].mode()
        if modes.empty:
            # Attribute is entirely missing for this topic: nothing to score.
            continue
        most_common_value = modes.iloc[0]
        # Sentiment shares within each attribute value; only the modal value is read.
        counts = (
            topic_df
            .groupby(attr)["sentiment_label"]
            .value_counts(normalize=True)
        )
        neg_ratio = counts.get((most_common_value, "Negative"), 0)
        pos_ratio = counts.get((most_common_value, "Positive"), 0)

        heatmap_data.append({
            "attribute": attr,
            # Net sentiment score: positive share minus negative share.
            "net_sentiment": pos_ratio - neg_ratio
        })

    if not heatmap_data:
        # Without this guard set_index("attribute") fails with an opaque KeyError.
        raise ValueError(f"No scorable operational attributes for topic '{topic}'")

    heatmap_df = (
        pd.DataFrame(heatmap_data)
        .set_index("attribute")
        .sort_values("net_sentiment", ascending=True)
    )

    # Figure height grows with the number of attributes so rows stay readable.
    fig, ax = plt.subplots(figsize=(4, max(6, len(heatmap_df) * 0.35)))
    sns.heatmap(
        heatmap_df,
        cmap="RdYlGn",
        center=0,  # neutral colour at a net score of zero
        annot=True,
        fmt=".2f",
        cbar_kws={"label": "Net Sentiment (Positive − Negative)"},
        ax=ax
    )

    ax.set_title(f"Operational Attribute Sentiment Heatmap\nTopic: '{topic}'")
    ax.set_xlabel("")
    ax.set_ylabel("Operational Attributes")
    fig.tight_layout()
    plt.show()


In [None]:
# Net-sentiment heatmap for the burger topic.
plot_single_topic_sentiment_heatmap(filtered_df, "burger fries burgers")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Ten most frequently discussed topics.
top_topics = filtered_df['topic_label'].value_counts().head(10).index

# Restrict the analysis to reviews about those topics.
filtered_top_df = filtered_df[filtered_df['topic_label'].isin(top_topics)]

# Review counts per (topic, sentiment).
topic_sentiment = (
    filtered_top_df
    .groupby(["topic_label", "sentiment_label"])
    .size()
    .unstack(fill_value=0)
)

# Convert each topic's counts to proportions that sum to 1.
topic_sentiment_pct = topic_sentiment.div(topic_sentiment.sum(axis=1), axis=0)

# A sequential colour map on a fixed 0-1 scale is used on purpose: with a
# red-to-green diverging map, a large share of *Negative* reviews would be
# painted green and read as "good". Here darker simply means a larger share,
# whichever sentiment column the cell belongs to.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    topic_sentiment_pct,
    cmap="Blues",
    vmin=0,
    vmax=1,
    annot=True,
    fmt=".2f",
    cbar_kws={"label": "Share of Topic's Reviews"},
    ax=ax
)
ax.set_title("Sentiment Distribution by Restaurant Topic")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Topic")
fig.tight_layout()
plt.show()
