In [1]:
import pandas as pd
import json

from pathlib import Path
import sys
import os

# Add project root to Python path so the `src.*` imports below resolve.
# Assumes the kernel's working directory is a direct child of the project
# root (the root is taken as the parent of the current working directory).
project_root = Path.cwd().parent
project_root_str = str(project_root)
# Keep the root first in sys.path, but drop any earlier copy so re-running
# this cell does not pile up duplicate entries. Slice assignment mutates the
# existing list object rather than rebinding sys.path.
sys.path[:] = [project_root_str] + [p for p in sys.path if p != project_root_str]

from src.config import config
from src.data.etl import load_data
from src.model.bert import TextEmbedder, DimensionalityReducer, ClusteringModel, CTFIDFVectorizer
# from src.model.visualize import TopicModelVisualizer
from src.model.eval import TopicModelEvaluator
from src.model.label import LLMTopicNamer

from dotenv import load_dotenv

# Load environment variables from .env file.
# This is the last expression in the cell, so the notebook echoes the call's
# return value as the cell output (True in the recorded run).
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# Print the resolved settings object so the configuration used for this run
# is recorded next to the results.
# NOTE(review): the recorded output contains absolute local paths; consider
# clearing it before sharing the notebook.
print(config)

Settings Configuration:
  BASE_PATH: /Users/Z00GK5Z/Documents/Workspace/Project/topic_modeling
  DATA_FILE_PATH: /Users/Z00GK5Z/Documents/Workspace/Project/topic_modeling/data/raw/data.json
  SEED: 23
  LLM_CONFIG: {'model_name': 'Qwen/Qwen3-4B'}
  LOG_CONFIG: {'version': 1, 'disable_existing_loggers': False, 'formatters': {'detailed': {'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', 'datefmt': '%Y-%m-%d %H:%M:%S'}, 'simple': {'format': '%(levelname)s - %(message)s'}}, 'handlers': {'console': {'class': 'logging.StreamHandler', 'level': 'INFO', 'formatter': 'simple', 'stream': 'ext://sys.stdout'}, 'file': {'class': 'logging.handlers.RotatingFileHandler', 'level': 'DEBUG', 'formatter': 'detailed', 'filename': '/Users/Z00GK5Z/Documents/Workspace/Project/topic_modeling/output/topic_modeling.log', 'maxBytes': 10485760, 'backupCount': 5, 'encoding': 'utf-8'}}, 'root': {'level': 'INFO', 'handlers': ['console', 'file']}}
  embedding_model_config: {'model_name': 'BAAI/bge-larg

In [3]:
# Load the review dataset through the project's ETL helper.
data = load_data()

# Preview the first rows to sanity-check the columns (reviewerID, asin,
# reviewText, overall, ... in the recorded output).
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4.0,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5.0,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5.0,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4.0,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5.0,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [4]:
# ASIN of the single product whose reviews are topic-modelled in this notebook.
TARGET_ASIN = '6073894996'

# Keep only that product's reviews, and only rows that actually carry review
# text: a missing reviewText would otherwise reach the embedder as a
# non-string value. Filtering the frame (not just the list) keeps `data` and
# `docs` row-aligned. `data` is deliberately rebound so the rest of the
# notebook keeps seeing the filtered frame.
is_target_product = data['asin'] == TARGET_ASIN
has_review_text = data['reviewText'].notna()
data = data[is_target_product & has_review_text]
docs = data['reviewText'].tolist()

# Fail here, with a clear message, rather than deep inside embedding or
# clustering when the filter matches nothing (e.g. the ASIN was changed and
# this cell re-run on an already-filtered `data`).
assert docs, (
    f"No review text found for ASIN {TARGET_ASIN!r}; "
    "re-run the load_data() cell if `data` was already filtered."
)

print("Number of Docs:",len(docs))

print("Unique ASINs:",data['asin'].nunique())
print("Sample documents:")
docs[:8]

Number of Docs: 37
Unique ASINs: 1
Sample documents:


['Surprisingly, this inexpensive version works just as well and just as reliably as the expensive variety. It has been working for me for months now. No problem. Excellent value.',
 'I have tested this against the griffin dual output unit.I checked the charging current.This unit was charging my galaxy note battery with 70 ma.Griffin was charging with 40 ma! And the griffin was 4 times more expensive.I have not used these for very long. I bought 15 of them, because they are so cheap and because they actually do seem to provide high current.No idea how long they last. I assume they will work fine. I have not been using them much. I did the testing , just to stock up on a high current charger. This passed and I stocked up.',
 'It worked great for the first couple of weeks then it just stopped completely.. so basically a small waste of money.',
 'I love that it has two ports for my phone and ipod. Who wants to be putting too many things in one socket.  Sleek and convenient to store and I j

In [5]:
# Build the text embedder from the `embedding_model_config` settings and
# encode every review in `docs`.
embedder = TextEmbedder(**config.embedding_model_config)
embeddings = embedder.embed(docs)

# Sanity check: the recorded run shows one row per document, (37, 1024).
print("Embeddings shape:", embeddings.shape)

INFO - Use pytorch device_name: mps
INFO - Load pretrained SentenceTransformer: BAAI/bge-large-en-v1.5
Using device: Apple Silicon GPU (MPS)
  macOS: True


Batches: 100%|██████████| 5/5 [00:01<00:00,  4.86it/s]

Embeddings shape: (37, 1024)





In [6]:
# Project the embeddings into a lower-dimensional space before clustering,
# configured by `dr_config`. The recorded log reports UMAP reducing 1024 -> 5
# dimensions.
reducer = DimensionalityReducer(**config.dr_config)
reduced_embeddings = reducer.fit_transform(embeddings)

INFO - Initialized UMAP with n_components=5
INFO - Reducing dimensionality from 1024 to 5 using UMAP
INFO - Dimensionality reduction complete: (37, 5)


In [7]:
# Cluster the reduced embeddings, configured by `clustering_config`; the
# result holds one label per document. The recorded log reports HDBSCAN
# finding 5 topics and 4 outlier documents.
cluster_model = ClusteringModel(**config.clustering_config)
clusters_labels = cluster_model.fit_predict(reduced_embeddings)

INFO - Initialized HDBSCAN (auto-discovers topics)
INFO - Clustering 37 documents using HDBSCAN
INFO - Clustering complete: Found 5 topics
INFO - Outliers/Noise: 4 documents (10.8%)


In [8]:
# Bucket every review under its cluster label. Label -1 is skipped, so
# outlier documents never contribute to a topic.
docs_per_topic = {}
for doc, label in zip(docs, clusters_labels):
    if label != -1:
        docs_per_topic.setdefault(label, []).append(doc)

# One concatenated "document" per topic, in ascending topic-id order;
# topic_docs[k] is the joined text for topic_ids[k].
topic_ids = sorted(docs_per_topic)
topic_docs = [' '.join(docs_per_topic[topic_id]) for topic_id in topic_ids]


print("Number of topics (excluding outliers):", len(topic_docs))

Number of topics (excluding outliers): 5


In [14]:
# Inspect the raw reviews assigned to topic 0.
# NOTE(review): this cell's execution count (14) is out of sequence with the
# surrounding cells; do a Restart & Run All before sharing.
docs_per_topic[0]

['It worked great for the first couple of weeks then it just stopped completely.. so basically a small waste of money.',
 'does not have the need amps to charge things like ipads, or hp touchpads. but its super small and compact.',
 "I am disappointed that the 1A didn't work with my iPad.  That's what I get for buying a cheap adapter.",
 'This is a nice charger but you can tell it was made cheaply in China.  When it is charging the phone, the car radio gets LOTS of static.  Not so much that I have to stop charging but like when you are near power lines and the radio station is far away.So, no RF shielding.I gave it 4 stars because it works fine for me, but if you listen to the radio, you might consider it is more like 2 or 3 stars.',
 'After a week only one side works',
 "Only works one side at a time. When you connect two cables, one side stop working and also overheated burning the fuses. I purchased two of them and it's the same problem. Cheap and bad quality.",
 "Didn't last very l

In [15]:
# Inspect the raw reviews assigned to topic 1.
docs_per_topic[1]

['Surprisingly, this inexpensive version works just as well and just as reliably as the expensive variety. It has been working for me for months now. No problem. Excellent value.',
 'I bought this a little skeptical. After I tried it I bought two more. It works great and so far it has lasted for about 3 months. If that changes I will update this review.',
 "I've bough a munch of different things like this over the years. Most wouldn't stay in the jack, or would give out after a few days. This one is GREAT!",
 "I have several of these, from various retailers, all the same, each made in some anonymous Chinese factory, all exactly alike.They're great. They really do work, and they do deliver 2.1 and 1.0 power as they say they will do. They do the job.You wouldn't expect much QC for a $2 electrical product, but they each work just fine.They don't last forever-- about a year or two of active daily use seems about par, then they suddenly die. So do have some replacements on hand. At this pri

In [16]:
# Inspect the raw reviews assigned to topic 2.
docs_per_topic[2]

['Yo get exactly what you order in a timely fashion. And the item is just as described. Great buy if you ask me',
 'It came at last, good looking and the price was good and i believe it is worth the time I waited for it to come to me good job',
 'excellent product, works great , have easy handling, and good quality as it is announced. reached as is shown time and in very good condition thank you very much for everything',
 'I received this product before I expected. It looks pretty good and It works with my Iphone (3GS) and my phone (HTC Evo V 3D). It is a good deal because It is not easy to find something like this for this price',
 'good product at low price.purchased this looking for a smaller charger and I love Griffin products.Free shipping just took a little longer']

In [9]:
# Show the c-TF-IDF settings that get unpacked into CTFIDFVectorizer
# ({'ngram_range': (1, 2)} in the recorded run).
config.c_tfidf_config

{'ngram_range': (1, 2)}

In [10]:
# Fit c-TF-IDF on the per-topic concatenated documents. fit_transform returns
# the score matrix together with the vocabulary used to index its columns.
# The recorded run prints (5, 939) for this cell — presumably the matrix
# shape (topics x terms); confirm against CTFIDFVectorizer.
ctfidf_vectorizer = CTFIDFVectorizer(**config.c_tfidf_config)
c_tfidf_matrix, vocab = ctfidf_vectorizer.fit_transform(topic_docs)

(5, 939)


In [11]:

        
# How many of the highest-scoring terms to keep per topic.
top_n_words = 10

# topic id -> [(term, score), ...] ordered from highest to lowest score.
topic_words_ = {}
for row_idx, topic_id in enumerate(topic_ids):
    # Row `row_idx` of the matrix holds the scores for topic_ids[row_idx].
    row_scores = c_tfidf_matrix[row_idx]

    # argsort is ascending; reverse it and keep the leading entries to get
    # the best-scoring column indices first.
    best_columns = row_scores.argsort()[::-1][:top_n_words]

    topic_words_[topic_id] = [(vocab[col], row_scores[col]) for col in best_columns]

# Same mapping with the scores stripped: topic id -> [term, ...].
topics_ = {
    topic_id: [term for term, _ in ranked_terms]
    for topic_id, ranked_terms in topic_words_.items()
}


topics_

{np.int64(0): ['slot',
  'charge',
  'charging',
  'cheap',
  'design',
  'worked',
  'works',
  'died',
  'slot design',
  'usb'],
 np.int64(1): ['great',
  'wouldn',
  'work',
  'just',
  'travel',
  'helpful',
  'months',
  'little',
  've',
  'problem'],
 np.int64(2): ['good',
  'price',
  'product',
  'price good',
  'looking',
  'easy',
  'time',
  'great',
  'just',
  'works'],
 np.int64(3): ['blue',
  'clear',
  'plug',
  'blue light',
  'pros',
  'car',
  'love',
  'nice',
  'product',
  'light'],
 np.int64(4): ['great',
  'charge',
  'car',
  'devices',
  'works',
  'works great',
  'love',
  'charger',
  'phone',
  'gps']}

In [12]:
# Re-run the c-TF-IDF step with BM25 weighting switched on.
#
# The BM25 settings are built as a copy instead of writing "use_bm25" into
# config.c_tfidf_config: mutating the shared settings would make the earlier
# default c-TF-IDF cell silently run with BM25 the next time it is executed.
bm25_c_tfidf_config = {**config.c_tfidf_config, "use_bm25": True}

ctfidf_vectorizer = CTFIDFVectorizer(**bm25_c_tfidf_config)
c_tfidf_matrix, vocab = ctfidf_vectorizer.fit_transform(topic_docs)

# NOTE(review): in the recorded run the top words below are identical, word
# for word, to the default (non-BM25) run — confirm that CTFIDFVectorizer
# actually honours `use_bm25`.

# How many of the highest-scoring terms to keep per topic.
top_n_words = 10
# topic id -> [(term, score), ...] ordered from highest to lowest score.
topic_words_ = {}

for i, topic_id in enumerate(topic_ids):
    # Row i of the matrix holds the scores for topic_ids[i].
    topic_scores = c_tfidf_matrix[i]

    # argsort is ascending: take the last N indices and reverse them so the
    # best-scoring term comes first.
    top_indices = topic_scores.argsort()[-top_n_words:][::-1]

    topic_words_[topic_id] = [(vocab[j], topic_scores[j]) for j in top_indices]

# Same mapping with the scores stripped: topic id -> [term, ...].
topics_ = {
    topic: [word for word, score in words]
    for topic, words in topic_words_.items()
}


topics_

(5, 939)


{np.int64(0): ['slot',
  'charge',
  'charging',
  'cheap',
  'design',
  'worked',
  'works',
  'died',
  'slot design',
  'usb'],
 np.int64(1): ['great',
  'wouldn',
  'work',
  'just',
  'travel',
  'helpful',
  'months',
  'little',
  've',
  'problem'],
 np.int64(2): ['good',
  'price',
  'product',
  'price good',
  'looking',
  'easy',
  'time',
  'great',
  'just',
  'works'],
 np.int64(3): ['blue',
  'clear',
  'plug',
  'blue light',
  'pros',
  'car',
  'love',
  'nice',
  'product',
  'light'],
 np.int64(4): ['great',
  'charge',
  'car',
  'devices',
  'works',
  'works great',
  'love',
  'charger',
  'phone',
  'gps']}