In [44]:
# Data handling
import pandas as pd
# NLP tools
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Vectorization & Topic Modeling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
# Misc utilities
from collections import Counter
import re
import spacy
from tqdm.auto import tqdm
# Transformers for advanced summarization or classification
from transformers import pipeline

In [2]:
## Loading Fitbit review dataset into a pandas DataFrame
df = pd.read_csv('fitbit.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,source,review_id,user_name,review_title,review_description,rating,thumbs_up,review_date
0,0,Google Play,b52e6513-a955-438a-a048-4ae5e1e81abb,Marlena H.,,Horrible change not allowing group challenges,2,0,4/17/2023 15:55
1,1,Google Play,d88c0a81-2611-4f53-b0c4-8f1a2442b6ac,Jeremy Chandler,,Google already starting to sunset features and...,2,0,4/17/2023 15:39
2,2,Google Play,22205257-f36d-464d-ba47-4ef7283071a3,Stu McConnell,,Whatever you have done with the latest update ...,1,0,4/17/2023 15:15
3,3,Google Play,c975c4ab-1a72-4e0f-a579-021ffbbe2d50,Emer Waters,,"Challenges with friends/family were removed, s...",1,0,4/17/2023 14:59
4,4,Google Play,17ae54ba-9a79-4ba4-9ad2-a0b1020e6290,erica potts,,Helpful for me to keep track of my steps,5,0,4/17/2023 14:51


In [4]:
#drop unnecessary column or rows
df.drop(['source','review_title','review_date'],axis = 1, inplace = True)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,review_id,user_name,review_description,rating,thumbs_up
0,0,b52e6513-a955-438a-a048-4ae5e1e81abb,Marlena H.,Horrible change not allowing group challenges,2,0
1,1,d88c0a81-2611-4f53-b0c4-8f1a2442b6ac,Jeremy Chandler,Google already starting to sunset features and...,2,0
2,2,22205257-f36d-464d-ba47-4ef7283071a3,Stu McConnell,Whatever you have done with the latest update ...,1,0
3,3,c975c4ab-1a72-4e0f-a579-021ffbbe2d50,Emer Waters,"Challenges with friends/family were removed, s...",1,0
4,4,17ae54ba-9a79-4ba4-9ad2-a0b1020e6290,erica potts,Helpful for me to keep track of my steps,5,0


In [6]:
df.info() #checking the datatypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333491 entries, 0 to 333490
Data columns (total 6 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Unnamed: 0          333491 non-null  int64 
 1   review_id           333491 non-null  object
 2   user_name           333491 non-null  object
 3   review_description  333353 non-null  object
 4   rating              333491 non-null  int64 
 5   thumbs_up           333491 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 15.3+ MB


In [7]:
df.drop('Unnamed: 0', axis = 1, inplace = True)

In [8]:
df['review_id'].nunique() #getting all the unique values

333491

In [9]:
df.isnull().sum()

review_id               0
user_name               0
review_description    138
rating                  0
thumbs_up               0
dtype: int64

In [10]:
df.dropna(inplace = True)

In [11]:
df.shape

(333353, 5)

In [19]:
df.to_csv('FitBitReviews_updated.csv',index = False)

In [15]:
try:
    df = pd.read_csv('FitBitReviews_updated.csv')
except FileNotFoundError:
    print("Make sure 'FitBitReviews_updated.csv' is in the correct directory.")
    exit()

df.head()

Unnamed: 0,review_id,user_name,review_description,rating,thumbs_up
0,b52e6513-a955-438a-a048-4ae5e1e81abb,Marlena H.,Horrible change not allowing group challenges,2,0
1,d88c0a81-2611-4f53-b0c4-8f1a2442b6ac,Jeremy Chandler,Google already starting to sunset features and...,2,0
2,22205257-f36d-464d-ba47-4ef7283071a3,Stu McConnell,Whatever you have done with the latest update ...,1,0
3,c975c4ab-1a72-4e0f-a579-021ffbbe2d50,Emer Waters,"Challenges with friends/family were removed, s...",1,0
4,17ae54ba-9a79-4ba4-9ad2-a0b1020e6290,erica potts,Helpful for me to keep track of my steps,5,0


In [16]:
df.describe()

Unnamed: 0,rating,thumbs_up
count,333353.0,333353.0
mean,3.294265,1.226022
std,1.660247,13.244563
min,1.0,0.0
25%,1.0,0.0
50%,4.0,0.0
75%,5.0,0.0
max,5.0,1303.0


In [14]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Problem 1 - Most Common Issues faced 

In [17]:
# 1. Filter for reviews with rating 1 or 2
negative_reviews_df = df[df['rating'].isin([1, 2])].copy()

In [18]:
# 2. Preprocess the text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Add custom stop words if needed
custom_stop_words = ['fitbit', 'device', 'one', 'get', 'would', 'ive', 'im']
stop_words.update(custom_stop_words)

In [19]:
def preprocess_text(text):
    if not isinstance(text, str):# Ensure text is a string
        return []
    # Lowercase and remove non-alphanumeric characters
    text = re.sub(r'\W+', ' ', text.lower())
    # Tokenize
    tokens = word_tokenize(text)
    # Lemmatize and remove stop words
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 1]
    return lemmatized_tokens
negative_reviews_df['processed_description'] = negative_reviews_df['review_description'].apply(preprocess_text)

In [20]:
# 3. Generate and count N-grams
all_tokens = [token for sublist in negative_reviews_df['processed_description'] for token in sublist]

# Bigrams (pairs of words)
bigrams = list(nltk.bigrams(all_tokens))
bigram_counts = Counter(bigrams)

# Trigrams (sequences of three words)
trigrams = list(nltk.trigrams(all_tokens))
trigram_counts = Counter(trigrams)

In [21]:
print("Most Common Issues (Bigrams):")
for item, count in bigram_counts.most_common(15):
    print(f'{" ".join(item)}: {count}')

print("\nMost Common Issues (Trigrams):")
for item, count in trigram_counts.most_common(15):
    print(f'{" ".join(item)}: {count}')

Most Common Issues (Bigrams):
please fix: 4950
last update: 4862
latest update: 3310
sync phone: 2821
new update: 2773
since last: 2768
fit bit: 2741
app sync: 2546
heart rate: 2402
since update: 2354
sync app: 2251
stopped syncing: 2237
update app: 2227
customer service: 2143
every time: 2057

Most Common Issues (Trigrams):
since last update: 2237
bad bad bad: 816
uninstall reinstall app: 780
uninstalled reinstalled app: 628
since latest update: 621
since new update: 460
please fix app: 405
please fix asap: 402
take forever sync: 397
since recent update: 382
please fix issue: 361
uninstalling reinstalling app: 358
last update sync: 354
ever since update: 322
app keep crashing: 312


### the count suggest that there are some really serious issues face by customers

## Looking for issues face by our customers

In [24]:
# We will search for the top 5 distinct topics.
N_TOPICS = 5
# We will display the top 12 keywords for each topic.
N_TOP_WORDS = 12

In [27]:
# --- Function to display the topics (Corrected for readability) ---
def display_topics(model, feature_names, n_top_words):
    print("\n--- Top 5 Most Common Issues Discovered by the Model ---")
    for topic_idx, topic in enumerate(model.components_):
        # Joins the top words with a comma and space for clarity
        topic_words = ", ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(f"\nTopic {topic_idx + 1}:")
        print(topic_words)

In [28]:
# Filter for negative reviews (ratings 1 and 2) and drop any empty reviews
negative_reviews = df[df['rating'].isin([1, 2])]['review_description'].dropna().astype(str)
# --- 2. Preprocess the Text Data ---
# Add custom stop words: These are common words in the reviews that don't help identify specific issues.
custom_stop_words = ['fitbit', 'device', 'app', 'watch', 'tracker', 'get', 'use', 'ive', 'im', 'would', 'one', 'wa']
stop_words = list(stopwords.words('english'))
stop_words.extend(custom_stop_words)

In [30]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters and spaces
    tokens = word_tokenize(text)
    # Filter out stop words and short words
    filtered_tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    return " ".join(filtered_tokens)
print("Preprocessing text data...")
processed_reviews = [preprocess(review) for review in negative_reviews]


Preprocessing text data...


In [39]:
# --- 3. Vectorize Text with TF-IDF ---
# Using TF-IDF helps to highlight words that are important to a review, not just frequent overall.
# min_df=3 means a word must appear in at least 3 reviews to be considered.
vectorizer = TfidfVectorizer(max_df=0.90, min_df=3, stop_words='english')
tfidf = vectorizer.fit_transform(processed_reviews)
feature_names = vectorizer.get_feature_names_out()

# --- 4. Apply Non-Negative Matrix Factorization (NMF) ---
# NMF is excellent for finding additive, interpretable topics.
# random_state=0 ensures the results are the same each time you run the code.
print("Running NMF model to find topics...")
nmf_model = NMF(n_components=N_TOPICS, random_state=0, init='nndsvd')
nmf_model.fit(tfidf)

# --- 5. Display the Results ---
display_topics(nmf_model, feature_names, N_TOP_WORDS)

Running NMF model to find topics...

--- Top 5 Most Common Issues Discovered by the Model ---

Topic 1:
sync, charge, phone, able, doesnt, takes, unable, anymore, time, frustrating, issues, days

Topic 2:
syncing, phone, working, time, stopped, charge, fix, issues, versa, tried, notifications, days

Topic 3:
doesnt, work, properly, time, anymore, dont, used, sleep, steps, track, android, notifications

Topic 4:
wont, connect, let, download, phone, anymore, pair, tried, open, work, stay, updated

Topic 5:
update, latest, new, fix, longer, recent, worked, working, fine, firmware, useless, steps


In [40]:
import torch
from transformers import pipeline

In [41]:
# --- 1. Your Discovered Topics ---
# Paste the actual topic words you discovered from your NMF analysis.
discovered_topics = {
    "Topic 1": "sync, connect, wont, phone, bluetooth, connection, issues, tried, problem, pair, constantly, restart",
    "Topic 2": "stopped, working, notifications, properly, time, months, fine, ago, worked, charge, screen, longer",
    "Topic 3": "steps, tracking, track, sleep, heart, rate, data, count, accuracy, doesnt, accurate, calories",
    "Topic 4": "update, latest, new, version, firmware, after, since, broke, worked, fix, recent, useless",
    "Topic 5": "customer, service, support, bad, experience, premium, pay, useless, features, time, money, waste"
}

# --- 3. Set up the AI Model Pipeline ---
# The 'pipeline' function from the transformers library is the easiest way to use a model.
# We are using Google's Flan-T5 model, which is excellent at following instructions.
# The 'device=0' tells the pipeline to use the GPU if available, making it much faster.
print("Loading the AI model (Flan-T5-Large)... This may take a minute.")
try:
    # Check if a GPU is available and set the device accordingly
    device = 0 if torch.cuda.is_available() else -1
    pipe = pipeline(
        "text2text-generation",
        model="google/flan-t5-large",
        device=device
    )
    print("Model loaded successfully.")
except Exception as e:
    print(f"An error occurred during model loading: {e}")
    print("Please ensure you are in a GPU-enabled environment.")
    pipe = None

# --- 4. Interpret Each Topic ---
if pipe:
    print("\n--- Interpreting Topics Using Open-Source AI ---")
    for title, keywords in discovered_topics.items():
        # This is the instruction we give the model. It's clear and direct.
        prompt = f"""
        Analyze the following keywords discovered from negative customer reviews for a fitness tracker.
        Provide a short title for the main issue and a 2-sentence summary explaining the problem.

        Keywords: "{keywords}"
        """

        # Generate the interpretation
        result = pipe(prompt, max_length=150, num_beams=4, early_stopping=True)
        interpretation = result[0]['generated_text']

        print(f"\n--- {title} ---")
        print(f"Keywords: {keywords}")
        print(f"AI Interpretation: {interpretation}")

Loading the AI model (Flan-T5-Large)... This may take a minute.


Device set to use cpu


Model loaded successfully.

--- Interpreting Topics Using Open-Source AI ---


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



--- Topic 1 ---
Keywords: sync, connect, wont, phone, bluetooth, connection, issues, tried, problem, pair, constantly, restart
AI Interpretation: unable to sync with phone or bluetooth constantly disconnects and restarts


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



--- Topic 2 ---
Keywords: stopped, working, notifications, properly, time, months, fine, ago, worked, charge, screen, longer
AI Interpretation: a fitness tracker stopped working properly and notifications stopped working for months


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



--- Topic 3 ---
Keywords: steps, tracking, track, sleep, heart, rate, data, count, accuracy, doesnt, accurate, calories
AI Interpretation: a fitness tracker that doesn't count steps accurately


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



--- Topic 4 ---
Keywords: update, latest, new, version, firmware, after, since, broke, worked, fix, recent, useless
AI Interpretation: useless fitness tracker broke after latest firmware update


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



--- Topic 5 ---
Keywords: customer, service, support, bad, experience, premium, pay, useless, features, time, money, waste
AI Interpretation: bad customer service and support for a fitness tracker
