In [1]:
%pip install pandas numpy scikit-learn plotly matplotlib wordcloud
# This script installs necessary Python packages for data analysis and visualization.
# It includes packages for data manipulation (pandas, numpy) ...
# interactive plotting (plotly), static plotting (matplotlib) ...
# After running this script, you will have the required libraries.
# Do not run if you already have these packages installed.

Note: you may need to restart the kernel to use updated packages.


In [4]:
# Standard libraries
import pandas as pd

# Machine learning / NLP
from sklearn.feature_extraction.text import TfidfVectorizer

# Visualization
import plotly.express as px

import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [5]:
# Review themes covered by the per-theme pain-word analysis
THEMES_TO_ANALYZE = [
    "AI Performance & Quality",
    "Monetization & Value",
    "Technical Performance",
]

# Bar color per dataset name (RGBA strings)
COLORS = {
    # teal
    "Conversational": "rgba(0, 128, 128, 0.8)",
    # soft orange
    "Baseline": "rgba(255, 165, 0, 0.75)",
}

In [6]:
def get_top_tfidf_words(documents, top_n=15):
    """
    Rank terms by their mean TF-IDF weight across a collection of documents.

    The vectorizer considers unigrams and bigrams, removes English stop
    words, and keeps at most 5000 features.

    Parameters:
        documents (iterable): List or pandas Series of text documents.
        top_n (int): How many of the highest-scoring terms to keep.

    Returns:
        pandas.DataFrame: Columns ['term', 'tfidf'], ordered by 'tfidf'
        descending and truncated to the first `top_n` rows.
    """
    tfidf = TfidfVectorizer(
        stop_words="english", max_features=5000, ngram_range=(1, 2)
    )
    weights = tfidf.fit_transform(documents)

    # Column-wise mean gives one averaged weight per vocabulary term;
    # .A1 flattens the resulting 1-row matrix into a 1-D array.
    ranking = pd.DataFrame(
        {
            "term": tfidf.get_feature_names_out(),
            "tfidf": weights.mean(axis=0).A1,
        }
    )
    ranking = ranking.sort_values(by="tfidf", ascending=False)
    return ranking.head(top_n)

In [7]:
def plot_pain_words_bar(top_pain_words, theme, dataset_name, color):
    """
    Draw and display a horizontal Plotly Express bar chart of TF-IDF scores.

    The rows are sorted by ascending 'tfidf' before plotting. The figure is
    shown via fig.show(); the function returns nothing.

    Parameters:
        top_pain_words (pd.DataFrame): Frame with 'term' and 'tfidf' columns.
        theme (str): Theme name, used in the chart title.
        dataset_name (str): Dataset label (Conversational or Baseline), used
            in the chart title.
        color (str): Single color applied to all bars.
    """
    ordered = top_pain_words.sort_values("tfidf", ascending=True)
    chart_title = f"Top Pain Words in '{theme}' - {dataset_name} Dataset (Most Negative Reviews)"
    axis_labels = {"tfidf": "TF-IDF Score", "term": "Pain Word / Phrase"}

    fig = px.bar(
        ordered,
        x="tfidf",
        y="term",
        orientation="h",
        title=chart_title,
        labels=axis_labels,
        color_discrete_sequence=[color],
    )
    fig.update_layout(yaxis={"tickfont": {"size": 12}})
    fig.show()

In [8]:
# Read each themed-and-scored CSV, then discard rows that have no review text
conv_df = pd.read_csv("conversational_apps_themed_and_scored.csv")
conv_df = conv_df.dropna(subset=["review_text"])

base_df = pd.read_csv("baseline_app_themed_and_scored.csv")
base_df = base_df.dropna(subset=["review_text"])

# Row/column counts after the rows without review text were removed
print(f"Conversational dataset shape: {conv_df.shape}")
print(f"Baseline dataset shape: {base_df.shape}")

# Distinct theme labels present in each dataset
print("Conversational Themes:", conv_df["theme"].unique())
print("Baseline Themes:", base_df["theme"].unique())

Conversational dataset shape: (20178, 9)
Baseline dataset shape: (8506, 18)
Conversational Themes: ['Feature-Specific Issues' 'Monetization & Value' 'Other/Misc.'
 'Outliers / Generic' 'AI Performance & Quality' 'Technical Performance']
Baseline Themes: ['Monetization & Value' 'Outliers / Generic' 'Other/Misc.'
 'Content-Specific Issues' 'Technical Performance']


In [9]:
# For every configured theme, chart the top pain words found in each dataset's
# most negative reviews (sentiment score at or below the 25th percentile).
for theme in THEMES_TO_ANALYZE:
    print(f"\n=== Analyzing Theme: {theme} ===")

    for ds_name, dataset in (("Conversational", conv_df), ("Baseline", base_df)):
        # Keep only the rows tagged with the current theme
        theme_reviews = dataset.loc[dataset["theme"] == theme]
        if theme_reviews.empty:
            print(f"No reviews found for theme '{theme}' in {ds_name} dataset.")
            continue

        # The 25th-percentile sentiment score is the "most negative" boundary
        sentiment_cutoff = theme_reviews["sentiment_score"].quantile(0.25)
        neg_reviews = theme_reviews.loc[
            theme_reviews["sentiment_score"].le(sentiment_cutoff)
        ]
        if neg_reviews.empty:
            print(
                f"No negative reviews (bottom 25%) found for theme '{theme}' in {ds_name} dataset."
            )
            continue

        # Rank terms by TF-IDF, then draw them in the dataset's configured color
        top_pain_words = get_top_tfidf_words(neg_reviews["review_text"], top_n=15)
        plot_pain_words_bar(top_pain_words, theme, ds_name, COLORS[ds_name])


=== Analyzing Theme: AI Performance & Quality ===


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
def get_top_tfidf_words(docs, top_n=15):
    """
    Rank terms by their mean TF-IDF weight across the given documents.

    Unigrams and bigrams are considered, English stop words are removed,
    and the vocabulary is limited to 5000 features.

    NOTE(review): a function with this same name is also defined in an
    earlier cell; once this cell runs, this definition is the one in effect.
    Consider keeping only one of them.

    Parameters:
        docs (iterable): List or pandas Series of text documents.
        top_n (int): How many of the highest-scoring terms to keep.

    Returns:
        pandas.DataFrame: Columns ['term', 'tfidf'], ordered by 'tfidf'
        descending and truncated to the first `top_n` rows.
    """
    tfidf = TfidfVectorizer(
        stop_words="english", max_features=5000, ngram_range=(1, 2)
    )
    doc_term = tfidf.fit_transform(docs)
    # One averaged weight per term; .A1 flattens the 1-row matrix to 1-D
    mean_weights = doc_term.mean(axis=0).A1
    table = pd.DataFrame(
        {"term": tfidf.get_feature_names_out(), "tfidf": mean_weights}
    )
    ordered = table.sort_values("tfidf", ascending=False)
    return ordered.head(top_n)


# Compare the top pain words of the most negative 'Monetization & Value'
# reviews (sentiment score at or below the 25th percentile) between the
# Conversational and Baseline datasets in a single grouped bar chart.

# Conversational dataset: theme subset, then rows at/below its 25th percentile
conv_monet = conv_df[conv_df["theme"] == "Monetization & Value"]
conv_monet_neg = conv_monet[
    conv_monet["sentiment_score"] <= conv_monet["sentiment_score"].quantile(0.25)
]

# Baseline dataset: theme subset, then rows at/below its 25th percentile
base_monet = base_df[base_df["theme"] == "Monetization & Value"]
base_monet_neg = base_monet[
    base_monet["sentiment_score"] <= base_monet["sentiment_score"].quantile(0.25)
]

# Top TF-IDF terms per dataset, labelled with their source dataset.
# .assign() returns a new frame rather than adding a column in place to the
# frame returned by .head().
conv_pain = get_top_tfidf_words(conv_monet_neg["review_text"], top_n=15).assign(
    dataset="Conversational"
)
base_pain = get_top_tfidf_words(base_monet_neg["review_text"], top_n=15).assign(
    dataset="Baseline"
)

# Combine for plotting
combined = pd.concat([conv_pain, base_pain])

# Sort by score; the final y-axis order is set by categoryorder below
combined_sorted = combined.sort_values(by="tfidf", ascending=True)

# Plot with Plotly Express
fig = px.bar(
    combined_sorted,
    x="tfidf",
    y="term",
    color="dataset",
    orientation="h",
    barmode="group",
    title="Emotional Fingerprint Comparison: 'Monetization & Value'<br><sub>Conversational vs Baseline Apps</sub>",
    height=700,
    # Reuse the notebook-wide COLORS palette so this chart matches the
    # per-theme charts instead of duplicating the RGBA strings here.
    color_discrete_map=COLORS,
    labels={
        "tfidf": "TF-IDF Score (Pain Word Importance)",
        "term": "Pain Word / Phrase",
        "dataset": "App Type",
    },
)

fig.update_traces(
    marker_line_width=0.5,
    marker_line_color="rgba(0,0,0,0.1)",
    # %{fullData.name} resolves to the trace name (the dataset label);
    # "%{color}" is not a hovertemplate variable for bar traces.
    hovertemplate="<b>%{y}</b><br>%{x:.4f} TF-IDF<br>%{fullData.name}<extra></extra>",
)

fig.update_layout(
    font=dict(family="Segoe UI", size=14),
    title_font=dict(size=22),
    yaxis=dict(
        # Order terms by their summed bar length across both datasets
        categoryorder="total ascending",
        tickfont=dict(size=13),
        gridcolor="rgba(200,200,200,0.1)",
    ),
    xaxis=dict(tickfont=dict(size=13), gridcolor="rgba(200,200,200,0.2)"),
    # Legend placed just outside the plot area, top-right
    legend=dict(
        title=None,
        orientation="v",
        yanchor="top",
        y=1,
        xanchor="left",
        x=1.02,
        font=dict(size=13),
    ),
    plot_bgcolor="white",
    paper_bgcolor="white",
    margin=dict(l=100, r=120, t=90, b=60),
)

fig.show()