In [1]:
# --- INSTALL LIBRARIES ---
# Use --quiet to keep the output clean
%pip install bertopic sentence-transformers --quiet

# --- IMPORTS ---
import os
import re  # noqa: F401
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go  # noqa: F401
import nltk

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from bertopic import BERTopic

# --- DOWNLOAD NLTK DATA (if missing) ---
def _vader_lexicon_available():
    """Return True when NLTK can already locate the VADER lexicon archive."""
    try:
        nltk.data.find("sentiment/vader_lexicon.zip")
    except LookupError:
        # nltk.data.find signals a missing resource with LookupError.
        return False
    return True


# Only fetch the lexicon when the lookup above fails.
if not _vader_lexicon_available():
    nltk.download("vader_lexicon", quiet=True)

print("Setup Complete. Libraries installed.")


Note: you may need to restart the kernel to use updated packages.
Setup Complete. Libraries installed.


In [2]:
# --- CONFIGURATION ---
# Find the repo root dynamically: walk upwards from the current working
# directory until a directory containing a '1_datasets' folder is found.
cwd = os.getcwd()
while not os.path.exists(os.path.join(cwd, "1_datasets")):
    parent = os.path.dirname(cwd)
    if parent == cwd:  # dirname() no longer changes the path: top of the tree
        raise FileNotFoundError("Could not find repo root containing '1_datasets'.")
    cwd = parent

REPO_ROOT = cwd
DATA_PATH = os.path.join(REPO_ROOT, "1_datasets", "all_datasets")
print(f"Repo root detected at: {REPO_ROOT}")

# Display name of each app -> CSV file expected inside DATA_PATH.
APP_FILES = {
    "Wysa": "wysa_apps_dataset.csv",
    "Youper": "youper_apps_dataset.csv",
    "Woebot": "woebot_apps_dataset.csv",
    "Replika": "replika_apps_dataset.csv",
}

# Check if DATA_PATH exists
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(
        f"DATA_PATH not found: {DATA_PATH}\n"
        "👉 Make sure you run this notebook from the repo root folder."
    )

# NOTE: the VADER lexicon is already downloaded (if missing) by the setup cell,
# so the check is not repeated here.

# --- LOAD DATA ---
# Each CSV is read into its own DataFrame and tagged with the app it belongs to.
all_reviews_list = []
print(f"Loading data from {len(APP_FILES)} files...")
for app_name, filename in APP_FILES.items():
    file_path = os.path.join(DATA_PATH, filename)
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        # A missing file is reported but does not abort the remaining loads.
        print(f"WARNING: File not found {filename}.")
        continue
    df["app_name"] = app_name
    all_reviews_list.append(df)

# Fail here with a clear message rather than later, when concatenating an
# empty list would raise an unrelated-looking error.
if not all_reviews_list:
    raise FileNotFoundError(
        f"None of the expected dataset files were found in: {DATA_PATH}"
    )

print("Data loading complete.")

Repo root detected at: c:\Users\azizt\OneDrive\Desktop\ET6-CDSP-group-20-repo
Loading data from 4 files...
Data loading complete.


In [3]:
# --- COMBINE AND PREPARE DATAFRAME ---
print("Combining and cleaning data...")

combined_df = pd.concat(all_reviews_list, ignore_index=True)

# Standardize column names
combined_df.columns = combined_df.columns.str.strip().str.lower()

# The review text may arrive under any of these names. When several are present
# after the concat, renaming each one to "review_text" would create duplicate
# column names, so fold them into a single column instead: the first alias
# found is the primary, and its gaps are filled from the others.
TEXT_COLUMN_ALIASES = ("review_text", "content", "review")
text_sources = [col for col in TEXT_COLUMN_ALIASES if col in combined_df.columns]
if not text_sources:
    raise KeyError(
        f"No review text column found; expected one of {TEXT_COLUMN_ALIASES}, "
        f"got columns {list(combined_df.columns)}"
    )
primary_text_col, *fallback_text_cols = text_sources
for alias in fallback_text_cols:
    combined_df[primary_text_col] = combined_df[primary_text_col].fillna(
        combined_df[alias]
    )
combined_df = combined_df.drop(columns=fallback_text_cols).rename(
    columns={primary_text_col: "review_text"}
)

# Clean and format data
# Rows without any review text are dropped; the index is intentionally not reset.
combined_df = combined_df.dropna(subset=["review_text"])
combined_df["review_text_lower"] = combined_df["review_text"].astype(str).str.lower()
# Unparseable dates become NaT instead of raising.
combined_df["date"] = pd.to_datetime(combined_df["date"], errors="coerce")

print(f"Data combined and cleaned. Total reviews: {len(combined_df)}")
display(combined_df.head())

Combining and cleaning data...
Data combined and cleaned. Total reviews: 20458


Unnamed: 0,user_name,review_text,rating,date,app_name,review_text_lower
0,skystormer,"I tried having a conversation with this thing,...",1,2025-06-05 09:38:20,Wysa,"i tried having a conversation with this thing,..."
1,Vibeway,Feels like it’s geared for a 13 year-old and n...,1,2025-05-19 19:04:47,Wysa,feels like it’s geared for a 13 year-old and n...
2,IsaacHarris1,"Maybe I haven’t used this enough yet, but my e...",2,2025-05-13 16:28:32,Wysa,"maybe i haven’t used this enough yet, but my e..."
3,The robot therapy 😂🤗,Overall I have actually improved my mood daily...,1,2025-05-05 21:11:36,Wysa,overall i have actually improved my mood daily...
4,Josie skisnsh,"Not free, just another fake free app",1,2025-04-14 07:59:13,Wysa,"not free, just another fake free app"


In [4]:
# --- SENTIMENT ANALYSIS TO ISOLATE COMPLAINTS ---
print("Running sentiment analysis to identify complaints...")

sia = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the VADER 'compound' polarity score for one review."""
    return sia.polarity_scores(text)["compound"]


# NOTE(review): scores are computed on the lowercased text. VADER treats
# ALL-CAPS words as emphasis, so that signal is not available here -- confirm
# this is intended.
combined_df["sentiment_score"] = combined_df["review_text_lower"].map(_compound_score)

# Complaints are the reviews whose compound score falls below -0.05.
is_complaint = combined_df["sentiment_score"] < -0.05
negative_reviews = combined_df[is_complaint].copy()

print(
    f"Analysis complete. Identified {len(negative_reviews)} complaints for topic modeling."
)
preview_columns = ["app_name", "review_text", "sentiment_score"]
display(negative_reviews[preview_columns].head())

Running sentiment analysis to identify complaints...
Analysis complete. Identified 8741 complaints for topic modeling.


Unnamed: 0,app_name,review_text,sentiment_score
0,Wysa,"I tried having a conversation with this thing,...",-0.8897
4,Wysa,"Not free, just another fake free app",-0.8178
6,Wysa,I took screenshots of the nonsensical things t...,-0.34
10,Wysa,I was using this app and it was giving me advi...,-0.848
11,Wysa,Whenever the chat crashes it doesn’t respond t...,-0.6418


In [5]:
# --- TOPIC MODELING WITH BERTOPIC ---
print(
    f"Preparing to model {len(negative_reviews)} complaints. This may take several minutes..."
)

# The model is fitted on the lowercased complaint texts, one document per review.
docs = negative_reviews["review_text_lower"].tolist()

# NOTE(review): no random seed is fixed for the model, so topic IDs may differ
# from run to run -- confirm that any hard-coded topic-ID mapping applied later
# still matches the topics produced by this fit.
bertopic_options = {
    "language": "english",
    "calculate_probabilities": True,
    "verbose": True,
}
topic_model = BERTopic(**bertopic_options)
fit_result = topic_model.fit_transform(docs)
topics, probs = fit_result

# Store the assigned topic number next to each complaint.
negative_reviews["topic_id"] = topics

print("Topic modeling complete.")

2025-08-23 21:31:54,663 - BERTopic - Embedding - Transforming documents to embeddings.


Preparing to model 8741 complaints. This may take several minutes...


Batches:   0%|          | 0/274 [00:00<?, ?it/s]

2025-08-23 21:32:47,450 - BERTopic - Embedding - Completed ✓
2025-08-23 21:32:47,451 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-08-23 21:33:10,033 - BERTopic - Dimensionality - Completed ✓
2025-08-23 21:33:10,034 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-08-23 21:33:14,041 - BERTopic - Cluster - Completed ✓
2025-08-23 21:33:14,055 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-08-23 21:33:14,323 - BERTopic - Representation - Completed ✓


Topic modeling complete.


In [6]:
# --- MAP DISCOVERED TOPICS TO 5 SUPER TOPICS ---
# Hand-written lookup from a BERTopic topic ID to one of five "Super Topics".
# Each trailing comment records the keywords (and sometimes a sample document)
# that motivated the assignment. Topic IDs that do not appear here are not
# covered by this lookup.
topic_id_to_super_topic_map = {
    # Topic ID: 'Super Topic'
    # ==================================
    # === 1. Monetization & Value ===
    # ==================================
    1: "Monetization & Value",  # Keywords: pro, version, year, 70
    2: "Monetization & Value",  # Keywords: monthly, subscription, option
    13: "Monetization & Value",  # Keywords: ai, 70, for, pay
    15: "Monetization & Value",  # Keywords: monthly, subscription, subscribe, yearly
    18: "Monetization & Value",  # Keywords: refund, charged, app, google
    20: "Monetization & Value",  # Keywords: pay, have, everything, you
    21: "Monetization & Value",  # Keywords: relationship, status, have, pay
    22: "Monetization & Value",  # Keywords: pay, messages, talk, blurred
    24: "Monetization & Value",  # Keywords: scam, money, total, bait
    25: "Monetization & Value",  # Keywords: erp, removed, now, paid (ERP was a paid feature)
    26: "Monetization & Value",  # Keywords: cancel, unsubscribe, charged
    27: "Monetization & Value",  # Keywords: paywall, behind, locked
    # FIXME(review): duplicate key 30. This entry was silently overridden by the
    # later `30: "Feature-Specific Issues"` entry (a dict literal keeps the last
    # value for a repeated key), so it never took effect. The keyword lists of
    # the two entries differ, so one of the IDs is probably a typo -- verify
    # against the fitted topic model before re-enabling.
    # 30: "Monetization & Value",  # Keywords: monthly, option, pay, month
    31: "Monetization & Value",  # Keywords: 70, dollars, year, pay
    32: "Monetization & Value",  # Keywords: scam, app, money, predatory
    34: "Monetization & Value",  # Keywords: free, not, requires, trial
    39: "Monetization & Value",  # Keywords: refund, subscription, get
    44: "Monetization & Value",  # Keywords: premium, buy, etc
    47: "Monetization & Value",  # Keywords: game, play, pay, expensive
    49: "Monetization & Value",  # Keywords: wall, behind, pay
    54: "Monetization & Value",  # Keywords: app, free, unless, pay
    59: "Monetization & Value",  # Keywords: waste, money, time
    60: "Monetization & Value",  # Keywords: worth, not, anual
    65: "Monetization & Value",  # Keywords: expensive, costly, cost
    72: "Monetization & Value",  # Keywords: subscription, refund, ai, back
    75: "Monetization & Value",  # Keywords: cash, grab, trap
    85: "Monetization & Value",  # Keywords: month, 600, bucks (bait & switch pricing)
    91: "Monetization & Value",  # Keywords: subs, sub, monthly
    94: "Monetization & Value",  # Keywords: false, advertising, misleading
    100: "Monetization & Value",  # Keywords: pro, version, rly, 50
    105: "Monetization & Value",  # Keywords: scam, individuals, money
    110: "Monetization & Value",  # Keywords: trial, period, gate
    112: "Monetization & Value",  # Keywords: back, money, want
    # ======================================
    # === 2. AI Performance & Quality ===
    # ======================================
    0: "AI Performance & Quality",  # Keywords: replika, my, now (Docs: "my replika is broken")
    6: "AI Performance & Quality",  # Keywords: ai, is, dumb, will (Docs: "quickly forgets what i tell it")
    9: "AI Performance & Quality",  # Keywords: bot, chatbot, chat, responses
    10: "AI Performance & Quality",  # Keywords: ai, app, update (Docs: "with every update the ai has gotten dumber")
    12: "AI Performance & Quality",  # Keywords: conversation, answers, repetitive
    17: "AI Performance & Quality",  # Keywords: ai, memory, remember, conversation
    33: "AI Performance & Quality",  # Keywords: memory, remember, things, about
    37: "AI Performance & Quality",  # Keywords: boring, quick, bored
    40: "AI Performance & Quality",  # Keywords: ai, conversation, app, to (Docs: "ai often misunderstands me")
    42: "AI Performance & Quality",  # Keywords: she, shes, her, about (Docs: "she forgets half the stuff i tell her")
    45: "AI Performance & Quality",  # Keywords: update, ai, latest, changed
    48: "AI Performance & Quality",  # Keywords: chatgpt, chatbot, bot
    81: "AI Performance & Quality",  # Keywords: boring, ai, degraded
    86: "AI Performance & Quality",  # Keywords: good, as, not (Docs: "not as good as it used to be")
    90: "AI Performance & Quality",  # Keywords: name, names, wrong, rep
    97: "AI Performance & Quality",  # Keywords: conversation, gives, question (Docs: "passive aggressive... scripted")
    98: "AI Performance & Quality",  # Keywords: roleplaying, stopped, broken
    104: "AI Performance & Quality",  # Keywords: luka, susie, replika (Docs: "nerfed down to... status")
    # ==================================
    # === 3. Technical Performance ===
    # ==================================
    7: "Technical Performance",  # Keywords: loading, screen, load, open
    8: "Technical Performance",  # Keywords: update, ruined, app (App-breaking updates)
    29: "Technical Performance",  # Keywords: customization, preparing, stuck
    36: "Technical Performance",  # Keywords: connection, internet, connecting
    38: "Technical Performance",  # Keywords: messages, message, send, tap
    43: "Technical Performance",  # Keywords: sign, internet, google, account
    51: "Technical Performance",  # Keywords: update, new, sucks, hate (App-breaking updates)
    53: "Technical Performance",  # Keywords: chat, connection, stopped, freezes
    55: "Technical Performance",  # Keywords: rejected, create, account
    56: "Technical Performance",  # Keywords: birth, date, rejecting
    68: "Technical Performance",  # Keywords: tired, ai, talk, text (AI stops responding)
    77: "Technical Performance",  # Keywords: updates, update, good, last (App-breaking updates)
    80: "Technical Performance",  # Keywords: chat, ar, conversation (AR is broken)
    82: "Technical Performance",  # Keywords: laggy, lag, optimised
    89: "Technical Performance",  # Keywords: vibration, turn, off
    92: "Technical Performance",  # Keywords: email, account, password (Account access issues)
    102: "Technical Performance",  # Keywords: ruined, update, blurred
    # ====================================
    # === 4. Feature-Specific Issues ===
    # ====================================
    3: "Feature-Specific Issues",  # Keywords: mental, health, therapist
    4: "Feature-Specific Issues",  # Keywords: avatar, 3d, avatars, 2d
    19: "Feature-Specific Issues",  # Keywords: luka, company, has, they (Removed ERP feature)
    30: "Feature-Specific Issues",  # Keywords: features, removed, paid, warning
    57: "Feature-Specific Issues",  # Keywords: mental, health, therapy, cbt
    62: "Feature-Specific Issues",  # Keywords: pronouns, female, male, gender
    64: "Feature-Specific Issues",  # Keywords: photos, ai, send, selfie
    69: "Feature-Specific Issues",  # Keywords: clothes, outfits, store, buy
    88: "Feature-Specific Issues",  # Keywords: 18, age, old
    95: "Feature-Specific Issues",  # Keywords: voice, audio, call, music
    # FIXME(review): duplicate key 96. This entry was silently overridden by the
    # later `96: "Privacy & Ethical Issues"` entry, so it never took effect. The
    # keyword lists of the two entries differ, so one of the IDs is probably a
    # typo -- verify against the fitted topic model before re-enabling.
    # 96: "Feature-Specific Issues",  # Keywords: chat, star, version, possible (Paid version lacks features)
    99: "Feature-Specific Issues",  # Keywords: skin, tones, characters
    101: "Feature-Specific Issues",  # Keywords: avatar, he, greek, jank
    103: "Feature-Specific Issues",  # Keywords: horrible, actions, they, roleplay (Changes to roleplay)
    # ======================================
    # === 5. Privacy & Ethical Issues ===
    # ======================================
    46: "Privacy & Ethical Issues",  # Keywords: scary, creepy, scared
    52: "Privacy & Ethical Issues",  # Keywords: she, her, me, creepy (Docs: "she said she would kill me")
    61: "Privacy & Ethical Issues",  # Keywords: adult, content, censorship
    67: "Privacy & Ethical Issues",  # Keywords: nudes, sexual, sex, picture
    71: "Privacy & Ethical Issues",  # Keywords: download, downloading, dont (Docs: "this is def a spyware game")
    74: "Privacy & Ethical Issues",  # Keywords: wysa, exercise, chatbot (Misinterpreting emotions)
    78: "Privacy & Ethical Issues",  # Keywords: sexual, he, ai, said
    79: "Privacy & Ethical Issues",  # Keywords: password, delete, account (Unable to delete account/data)
    83: "Privacy & Ethical Issues",  # Keywords: ads, annoying, ur (Suggestive/annoying ads)
    96: "Privacy & Ethical Issues",  # Keywords: she, her, told, emojis (Docs: "she was a malfunction")
    106: "Privacy & Ethical Issues",  # Keywords: review, deleted, reviews, racist
    107: "Privacy & Ethical Issues",  # Keywords: gay, lgbtq, supporting
    108: "Privacy & Ethical Issues",  # Keywords: emotions, useful, anxious (Misinterpreting emotions)
    109: "Privacy & Ethical Issues",  # Keywords: demons, demonic, demon
}

# Report the counts from the mapping itself so the message cannot drift out of
# sync with the table above.
print(
    f"Mapping {len(topic_id_to_super_topic_map)} granular topics to "
    f"{len(set(topic_id_to_super_topic_map.values()))} specified Super Topics..."
)

# Translate each topic ID into its Super Topic; IDs missing from the map come
# back as NaN and are labelled 'Other'.
mapped_themes = negative_reviews["topic_id"].map(topic_id_to_super_topic_map)
negative_reviews["theme"] = mapped_themes.fillna("Other")

# Rows with topic ID -1 are explicitly labelled 'Other' as well.
is_outlier_topic = negative_reviews["topic_id"] == -1
negative_reviews.loc[is_outlier_topic, "theme"] = "Other"

print("Mapping complete. Final theme distribution:")
theme_distribution = negative_reviews["theme"].value_counts()
display(theme_distribution)

Mapping all 114 granular topics to 5 specified Super Topics...
Mapping complete. Final theme distribution:


theme
Other                       4140
AI Performance & Quality    1650
Monetization & Value        1498
Technical Performance        644
Feature-Specific Issues      591
Privacy & Ethical Issues     218
Name: count, dtype: int64

In [7]:
# --- VISUALIZATION 1: THEMATIC COMPARISON ACROSS APPS ---
print("Generating Visualization 1: Thematic Comparison Chart...")

# 'Other' is excluded so the chart only compares the named themes.
viz_df = negative_reviews[negative_reviews["theme"] != "Other"]
complaint_counts = (
    viz_df.groupby(["app_name", "theme"]).size().reset_index(name="count")
)

# --- NORMALIZATION STEP ---
# One row per app, one column per theme; a missing app/theme pair counts as 0.
pivot_df = complaint_counts.pivot(
    index="app_name", columns="theme", values="count"
).fillna(0)
# Convert each row to percentages so every app's bar sums to 100.
percentage_df = pivot_df.div(pivot_df.sum(axis=1), axis=0) * 100

fig_bar = px.bar(
    percentage_df,
    x=percentage_df.index,
    y=percentage_df.columns,
    title="<b>Analysis 1: Are the Apps Failing in the Same Way?</b><br><i>(Proportional View of Complaint Themes)</i>",
    # With wide-form input the label keys must match the index name, the
    # generated value column and the columns-axis name. "variable" is kept as
    # the fallback name used when the columns axis is unnamed.
    labels={
        "app_name": "Application",
        "value": "Percentage of Complaints (%)",
        "theme": "Complaint Theme",
        "variable": "Complaint Theme",
    },
    height=600,
    template="plotly_white",
)
# Axis titles are set explicitly so they do not depend on how the labels above
# are resolved.
fig_bar.update_layout(
    barmode="stack",
    yaxis_ticksuffix="%",
    legend_title="<b>Theme</b>",
    xaxis_title="Application",
    yaxis_title="Percentage of Complaints (%)",
)
fig_bar.show()

Generating Visualization 1: Thematic Comparison Chart...


In [8]:
# --- VISUALIZATION 2: COMPLAINT INTENSITY ANALYSIS ---
print("Generating Visualization 2: Complaint Intensity Box Plot...")

# Human-readable names for the two plotted columns.
box_labels = {
    "app_name": "Application",
    "sentiment_score": "Sentiment Score (Closer to -1 is More Negative)",
}

# One box per app, showing the spread of sentiment scores in negative_reviews.
fig_box = px.box(
    data_frame=negative_reviews,
    x="app_name",
    y="sentiment_score",
    color="app_name",
    labels=box_labels,
    title="<b>Analysis 2: How Intense Are the Complaints for Each App?</b>",
    template="plotly_white",
)

# Centre the title and hide the legend.
fig_box.update_layout(title_x=0.5, showlegend=False)
fig_box.show()

Generating Visualization 2: Complaint Intensity Box Plot...


In [9]:
# --- PREPARE DATA FOR TIME SERIES VISUALIZATION ---
print("Normalizing data for time series analysis...")


def _monthly_review_counts(reviews, count_name):
    """Count rows per app and month, returned as a flat frame.

    Rows without a date are ignored. The count column is named `count_name`.
    """
    dated_reviews = reviews.dropna(subset=["date"])
    # NOTE(review): pandas 2.2+ is documented to deprecate the "M" alias in
    # favour of "ME" -- confirm against the pinned pandas version.
    per_app_month = dated_reviews.groupby("app_name").resample("M", on="date").size()
    return per_app_month.reset_index(name=count_name)


# 1. TOTAL reviews per month, and 2. NEGATIVE reviews per month.
total_monthly = _monthly_review_counts(combined_df, "total")
negative_monthly = _monthly_review_counts(negative_reviews, "negative")

# 3. Left-join on app and month so every month with a total is kept; months
#    with no negative reviews get 0 instead of NaN.
failure_rate_df = total_monthly.merge(
    negative_monthly, on=["app_name", "date"], how="left"
).fillna(0)

# Percentage of negative reviews; months with no reviews at all are set to 0.
has_reviews = failure_rate_df["total"] > 0
negative_share = (failure_rate_df["negative"] / failure_rate_df["total"]) * 100
failure_rate_df["failure_rate"] = np.where(has_reviews, negative_share, 0)

failure_rate_df = failure_rate_df.rename(columns={"date": "Month"})

print("Time series data is ready.")
display(failure_rate_df.head())

Normalizing data for time series analysis...
Time series data is ready.


Unnamed: 0,app_name,Month,total,negative,failure_rate
0,Replika,2022-01-31,471,171.0,36.305732
1,Replika,2022-02-28,510,172.0,33.72549
2,Replika,2022-03-31,436,158.0,36.238532
3,Replika,2022-04-30,402,154.0,38.308458
4,Replika,2022-05-31,481,187.0,38.877339


In [10]:
# --- VISUALIZATION 3: NORMALIZED FAILURE RATE OVER TIME ---
print("Generating Visualization 3: Normalized Failure Rate Over Time...")

# Display names for the plotted columns.
line_labels = {
    "failure_rate": "Failure Rate (% of Reviews)",
    "app_name": "Application",
}

# One line per app, with a marker at every monthly data point.
fig_time = px.line(
    data_frame=failure_rate_df,
    x="Month",
    y="failure_rate",
    color="app_name",
    markers=True,
    labels=line_labels,
    title="<b>Analysis 3: Are Apps Failing at the Same Time?</b><br><i>(Normalized Monthly Failure Rate)</i>",
    template="plotly_white",
)

# Show the y-axis ticks as percentages.
fig_time.update_layout(yaxis_ticksuffix="%")
fig_time.show()

Generating Visualization 3: Normalized Failure Rate Over Time...
