In [1]:
# Data handling
import pandas as pd
# NLP tools
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Vectorization & Topic Modeling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
# Misc utilities
from collections import Counter
import re
import spacy
from tqdm.auto import tqdm
# Transformers for advanced summarization or classification
from transformers import pipeline

In [2]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## PROBLEM 2- Most Common Strength

In [3]:
# --- 1. Load and Filter Data ---
try:
    df = pd.read_csv('FitBitReviews_updated.csv')
except FileNotFoundError:
    print("Error: 'FitBitReviews_updated.csv' not found. Please ensure the file is in the correct directory.")
    exit()

# We will search for the top 5 distinct topics.
N_TOPICS = 5
# We will display the top 12 keywords for each topic.
N_TOP_WORDS = 12

In [5]:
# Filter for negative reviews (ratings 1 and 2) and drop any empty reviews
positive_reviews = df[df['rating'].isin([4, 5])]['review_description'].dropna().astype(str)

In [6]:
# --- 2. Preprocess the Text Data ---
# Add custom stop words: These are common words in the reviews that don't help identify specific issues.
custom_stop_words = ['fitbit', 'device', 'app', 'watch', 'tracker', 'get', 'use', 'ive', 'im', 'would', 'one', 'wa']
stop_words = list(stopwords.words('english'))
stop_words.extend(custom_stop_words)

In [4]:
# --- Function to display the topics (Corrected for readability) ---
def display_topics(model, feature_names, n_top_words):
    print("\n--- Top 5 Most Common Strength Discovered by the Model ---")
    for topic_idx, topic in enumerate(model.components_):
        # Joins the top words with a comma and space for clarity
        topic_words = ", ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(f"\nTopic {topic_idx + 1}:")
        print(topic_words)

In [7]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters and spaces
    tokens = word_tokenize(text)
    # Filter out stop words and short words
    filtered_tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    return " ".join(filtered_tokens)

In [9]:
print("Preprocessing text data...")
processed_reviews = [preprocess(review) for review in positive_reviews]

Preprocessing text data...


In [10]:
# --- 3. Vectorize Text with TF-IDF ---
# Using TF-IDF helps to highlight words that are important to a review, not just frequent overall.
# min_df=3 means a word must appear in at least 3 reviews to be considered.
vectorizer = TfidfVectorizer(max_df=0.90, min_df=3, stop_words='english')
tfidf = vectorizer.fit_transform(processed_reviews)
feature_names = vectorizer.get_feature_names_out()

# --- 4. Apply Non-Negative Matrix Factorization (NMF) ---
# NMF is excellent for finding additive, interpretable topics.
# random_state=0 ensures the results are the same each time you run the code.
print("Running NMF model to find topics...")
nmf_model = NMF(n_components=N_TOPICS, random_state=0, init='nndsvd')
nmf_model.fit(tfidf)

# --- 5. Display the Results ---
display_topics(nmf_model, feature_names, N_TOP_WORDS)

Running NMF model to find topics...

--- Top 5 Most Common Strength Discovered by the Model ---

Topic 1:
love, keeps, track, sleep, fit, helps, steps, bit, motivated, day, absolutely, charge

Topic 2:
great, works, track, fitness, tracking, tool, sleep, way, far, motivator, product, steps

Topic 3:
good, far, really, works, pretty, like, track, sleep, health, tracking, overall, steps

Topic 4:
easy, track, keeps, helpful, really, like, set, excellent, understand, motivated, helps, sleep

Topic 5:
awesome, track, keeps, sleep, helps, fitness, works, really, motivated, steps, like, fit


In [11]:
import torch
from transformers import pipeline

In [12]:
# --- 2. Your Discovered Topics ---
# Paste the actual topic words you discovered from your NMF analysis.
discovered_topics = {
    "Topic 1": "love, keeps, track, sleep, fit, helps, steps, bit, motivated, day, absolutely, charge",
    "Topic 2": "great, works, track, fitness, tracking, tool, sleep, way, far, motivator, product, steps",
    "Topic 3": "good, far, really, works, pretty, like, track, sleep, health, tracking, overall, steps",
    "Topic 4": "easy, track, keeps, helpful, really, like, set, excellent, understand, motivated, helps, sleep",
    "Topic 5": "awesome, track, keeps, sleep, helps, fitness, works, really, motivated, steps, like, fit"
}

In [13]:
# --- 3. Set up the AI Model Pipeline ---
# The 'pipeline' function from the transformers library is the easiest way to use a model.
# We are using Google's Flan-T5 model, which is excellent at following instructions.
# The 'device=0' tells the pipeline to use the GPU if available, making it much faster.
print("Loading the AI model (Flan-T5-Large)... This may take a minute.")
try:
    # Check if a GPU is available and set the device accordingly
    device = 0 if torch.cuda.is_available() else -1
    pipe = pipeline(
        "text2text-generation",
        model="google/flan-t5-large",
        device=device
    )
    print("Model loaded successfully.")
except Exception as e:
    print(f"An error occurred during model loading: {e}")
    print("Please ensure you are in a GPU-enabled environment.")
    pipe = None


Loading the AI model (Flan-T5-Large)... This may take a minute.


Device set to use cpu


Model loaded successfully.


In [14]:
# --- 4. Interpret Each Topic ---
if pipe:
    print("\n--- Interpreting Topics Using Open-Source AI ---")
    for title, keywords in discovered_topics.items():
        # This is the corrected instruction for POSITIVE reviews.
        prompt = f"""
        Analyze the following keywords discovered from **positive customer reviews** for a fitness tracker.
        Provide a short title for the main **strength** and a 2-sentence summary explaining what customers **like** about the product.

        Keywords: "{keywords}"
        """

        # Generate the interpretation
        result = pipe(prompt, max_length=150, num_beams=4, early_stopping=True)
        interpretation = result[0]['generated_text']

        print(f"\n--- {title} ---")
        print(f"Keywords: {keywords}")
        print(f"AI Interpretation: {interpretation}")


--- Interpreting Topics Using Open-Source AI ---


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



--- Topic 1 ---
Keywords: love, keeps, track, sleep, fit, helps, steps, bit, motivated, day, absolutely, charge
AI Interpretation: keeps track of your steps, sleep, and sleep cycle to help you stay fit and motivated throughout the day


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



--- Topic 2 ---
Keywords: great, works, track, fitness, tracking, tool, sleep, way, far, motivator, product, steps
AI Interpretation: great tool for tracking your steps and sleep and a great way to motivate yourself


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



--- Topic 3 ---
Keywords: good, far, really, works, pretty, like, track, sleep, health, tracking, overall, steps
AI Interpretation: good far really works pretty like tracking sleep health overall


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



--- Topic 4 ---
Keywords: easy, track, keeps, helpful, really, like, set, excellent, understand, motivated, helps, sleep
AI Interpretation: easy to use and keeps track of your workouts and sleep


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



--- Topic 5 ---
Keywords: awesome, track, keeps, sleep, helps, fitness, works, really, motivated, steps, like, fit
AI Interpretation: awesome fitness tracker keeps you motivated and keeps you fit
