In [1]:
import pandas as pd
import json

from pathlib import Path
import sys
import os

# Add project root to Python path
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

from src.config import config
from src.data.etl import load_data
from src.model.bert import TextEmbedder, DimensionalityReducer, ClusteringModel, CTFIDFVectorizer
# from src.model.visualize import TopicModelVisualizer
from src.model.eval import TopicModelEvaluator
from src.model.label import LLMTopicNamer

from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
print(config)

Settings Configuration:
  BASE_PATH: /Users/Z00GK5Z/Documents/Workspace/Project/topic_modeling
  DATA_FILE_PATH: /Users/Z00GK5Z/Documents/Workspace/Project/topic_modeling/data/raw/data.json
  SEED: 23
  LLM_CONFIG: {'model_name': 'Qwen/Qwen3-4B'}
  LOG_CONFIG: {'version': 1, 'disable_existing_loggers': False, 'formatters': {'detailed': {'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', 'datefmt': '%Y-%m-%d %H:%M:%S'}, 'simple': {'format': '%(levelname)s - %(message)s'}}, 'handlers': {'console': {'class': 'logging.StreamHandler', 'level': 'INFO', 'formatter': 'simple', 'stream': 'ext://sys.stdout'}, 'file': {'class': 'logging.handlers.RotatingFileHandler', 'level': 'DEBUG', 'formatter': 'detailed', 'filename': '/Users/Z00GK5Z/Documents/Workspace/Project/topic_modeling/output/topic_modeling.log', 'maxBytes': 10485760, 'backupCount': 5, 'encoding': 'utf-8'}}, 'root': {'level': 'INFO', 'handlers': ['console', 'file']}}
  embedding_model_config: {'model_name': 'sentence-tran

In [3]:
data = load_data()

data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4.0,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5.0,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5.0,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4.0,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5.0,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [None]:
data = data[data['asin'] == '6073894996']
docs = data['reviewText'].tolist()

print("Number of Docs:",len(docs))

print("Unique ASINs:",data['asin'].nunique())
print("Sample documents:")
docs[:8]

Number of Docs: 10
Unique ASINs: 1
Sample documents:


['it worked for the first week then it only charge my phone to 20%. it is a waste of money.',
 "Good case, solid build. Protects phone all around with good access to buttons. Battery charges with full battery lasts me a full day. I usually leave my house around 7am and return at 10pm. I'm glad that it lasts from start to end. 5/5",
 'This is a fantastic case. Very stylish and protects my phone. Easy access to all buttons and features, without any loss of phone reception. But most importantly, it double power, just as promised. Great buy',
 "this case fits perfectly on the s4 and keeps me powerd all day I can't complain! a+ recommend it to all",
 "This is the first battery case I have had for my Galaxy S4. The S4 fits very well, is slim and doesn't add much weight to the Galaxy S4. It doubles the battery life. You can charge either the battery, the phone or both. There is a handy on-off switch with leds to indicate the level of charge.The battery case came on time and was packaged well.

In [5]:
embedder = TextEmbedder(**config.embedding_model_config)
embeddings = embedder.embed(docs)

print("Embeddings shape:", embeddings.shape)

INFO - Use pytorch device_name: mps
INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
Using device: Apple Silicon GPU (MPS)
  macOS: True


Batches: 100%|██████████| 1/1 [00:00<00:00,  9.14it/s]

Embeddings shape: (10, 384)





In [6]:
reducer = DimensionalityReducer(**config.dr_config)
reduced_embeddings = reducer.fit_transform(embeddings)

INFO - Initialized UMAP with n_components=5
INFO - Reducing dimensionality from 384 to 5 using UMAP
INFO - Dimensionality reduction complete: (10, 5)


In [7]:
cluster_model = ClusteringModel(**config.clustering_config)
clusters_labels = cluster_model.fit_predict(reduced_embeddings)

INFO - Initialized HDBSCAN (auto-discovers topics)
INFO - Clustering 10 documents using HDBSCAN
INFO - Clustering complete: Found 0 topics
INFO - Outliers/Noise: 10 documents (100.0%)


In [8]:
docs_per_topic = {}
for doc, label in zip(docs, clusters_labels):
    if label == -1:  
        continue
    if label not in docs_per_topic:
        docs_per_topic[label] = []
    docs_per_topic[label].append(doc)

topic_docs = []
topic_ids = []
for topic_id in sorted(docs_per_topic.keys()):
    topic_docs.append(' '.join(docs_per_topic[topic_id]))
    topic_ids.append(topic_id)


print("Number of topics (excluding outliers):", len(topic_docs))

Number of topics (excluding outliers): 0


In [9]:
config.c_tfidf_config

{'ngram_range': (1, 2)}

In [10]:
ctfidf_vectorizer = CTFIDFVectorizer(**config.c_tfidf_config)
c_tfidf_matrix, vocab = ctfidf_vectorizer.fit_transform(topic_docs)

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:

        
top_n_words = 10
topic_words_ = {}

for i, topic_id in enumerate(topic_ids):
    # Get scores for this topic
    topic_scores = c_tfidf_matrix[i]
    
    # Get indices of top N words (sorted descending)
    top_indices = topic_scores.argsort()[-top_n_words:][::-1]
    
    # Get words and their scores
    words = [vocab[j] for j in top_indices]
    scores = [topic_scores[j] for j in top_indices]
    
    topic_words_[topic_id] = list(zip(words, scores))

# Create a simple topics_ dictionary (just words, no scores)
topics_ = {
    topic: [word for word, score in words]
    for topic, words in topic_words_.items()
}


topics_

{np.int64(0): ['yes',
  'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzi got',
  'final polish',
  'final outcome',
  'final outer',
  'final parting',
  'final piece',
  'final placement',
  'final plus',
  'final point'],
 np.int64(1): ['1575',
  '1604',
  '1608',
  '1610',
  '1607',
  '1605',
  '1606',
  '1578',
  'defectuoso',
  '1575 1604'],
 np.int64(2): ['el',
  'que',
  'es',
  'para',
  'muy',
  'lo',
  'la',
  'en',
  'excelente',
  'producto'],
 np.int64(3): ['tags',
  'nfc',
  'tag',
  'app',
  'tectile',
  'nfc tags',
  'phone',
  'work',
  'program',
  'use'],
 np.int64(4): ['gloves',
  'cold',
  'warm',
  'glove',
  'hands warm',
  'use',
  'finger',
  'touch',
  'hands',
  'agloves'],
 np.int64(5): ['qr',
  'qr code',
  'case',
  'code',
  'findables',
  'scan',
  'app',
  'phone',
  'information',
  'social'],
 np.int64(6): ['good good',
  'good',
  'verg',
  'good verg',
  'god bad',
  'verg good',
  'good god',
  'bad good',
  'god',
  'bad'],
 np.int64(7): ['armband',
  'arm',
  'p

In [None]:
config. c_tfidf_config["use_bm25"] = True

ctfidf_vectorizer = CTFIDFVectorizer(**config.c_tfidf_config)
c_tfidf_matrix, vocab = ctfidf_vectorizer.fit_transform(topic_docs)

        
top_n_words = 10
topic_words_ = {}

for i, topic_id in enumerate(topic_ids):
    # Get scores for this topic
    topic_scores = c_tfidf_matrix[i]
    
    # Get indices of top N words (sorted descending)
    top_indices = topic_scores.argsort()[-top_n_words:][::-1]
    
    # Get words and their scores
    words = [vocab[j] for j in top_indices]
    scores = [topic_scores[j] for j in top_indices]
    
    topic_words_[topic_id] = list(zip(words, scores))

# Create a simple topics_ dictionary (just words, no scores)
topics_ = {
    topic: [word for word, score in words]
    for topic, words in topic_words_.items()
}


topics_

{np.int64(0): ['yes',
  'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzi got',
  'final polish',
  'final outcome',
  'final outer',
  'final parting',
  'final piece',
  'final placement',
  'final plus',
  'final point'],
 np.int64(1): ['1575',
  '1604',
  '1608',
  '1610',
  '1607',
  '1605',
  '1606',
  '1578',
  'defectuoso',
  '1575 1604'],
 np.int64(2): ['el',
  'que',
  'es',
  'para',
  'muy',
  'lo',
  'la',
  'en',
  'excelente',
  'producto'],
 np.int64(3): ['tags',
  'nfc',
  'tag',
  'app',
  'tectile',
  'nfc tags',
  'phone',
  'work',
  'program',
  'use'],
 np.int64(4): ['gloves',
  'cold',
  'warm',
  'glove',
  'hands warm',
  'use',
  'finger',
  'touch',
  'hands',
  'agloves'],
 np.int64(5): ['qr',
  'qr code',
  'case',
  'code',
  'findables',
  'scan',
  'app',
  'phone',
  'information',
  'social'],
 np.int64(6): ['good good',
  'good',
  'verg',
  'good verg',
  'god bad',
  'verg good',
  'good god',
  'bad good',
  'god',
  'bad'],
 np.int64(7): ['armband',
  'arm',
  'p