## Install the required packages

In [1]:
!pip install bertopic



## Import the required packages

In [2]:
# Data manipulation
import pandas as pd

# GPU support and tensor operations
from torch import cuda

# Dimensionality reduction
from umap import UMAP

# Clustering
from hdbscan import HDBSCAN

# Sentence embeddings
from sentence_transformers import SentenceTransformer

# Text vectorization
from sklearn.feature_extraction.text import CountVectorizer

# Topic modeling
from bertopic import BERTopic

# Custom representation for topics
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech

# Custom vectorizer for class-based TF-IDF
from bertopic.vectorizers import ClassTfidfTransformer

# Attach Google Drive at /content/drive so files stored there can be read
from google.colab import drive

drive.mount("/content/drive")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data Import

In this section, we import data from a CSV file, combine relevant columns, and prepare the dataset for further analysis.

In [3]:
# Load the CSV, discard rows missing a title or body, and renumber the index
df = (
    pd.read_csv("/content/drive/Shareddrives/Jowa/CS180 Project/data/data_ai.csv")
    .dropna(subset=["body", "title"])
    .reset_index(drop=True)
)

# Add a "text" column holding the title and body joined by a single space
df = df.assign(text=lambda frame: frame["title"] + " " + frame["body"])

print(f"Data has been imported. There are {len(df)} rows.")

Data has been imported. There are 4272 rows.


In [4]:
# Documents to model (title + body) and the bare titles, taken from df
docs, titles = df["text"], df["title"]

Check which device (GPU or CPU) is available.

In [5]:
# Pick the compute device: "cpu" when no CUDA GPU is present, otherwise the
# index of the current CUDA device (after releasing cached GPU memory).
if not cuda.is_available():
    device = "cpu"
    print("GPU: Not available")
else:
    cuda.empty_cache()
    device = cuda.current_device()
    print(f"GPU: {cuda.get_device_name(device)}")


GPU: Tesla T4


### **Setup BERTopic Layers**

**Step 1 - Extract embeddings**

Other Sentence Transformer models can be found on the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard).

In [6]:
# Sentence-embedding model used to turn each document into a vector
embedding_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

Pre-calculate embeddings to save time.

In [7]:
# Encode every document once up front so the embeddings can be reused.
# `encode` is documented for a string or a list of strings; passing the pandas
# Series directly only works while its index is exactly 0..n-1, so hand it a
# plain list of the values instead.
embeddings = embedding_model.encode(list(docs), show_progress_bar=True)

Batches:   0%|          | 0/134 [00:00<?, ?it/s]

**Step 2 - Reduce dimensionality**

In [8]:
# Reduce the embeddings to 5 dimensions before clustering; the fixed
# random_state keeps the projection repeatable between runs.
umap_model = UMAP(
    n_components=5,
    n_neighbors=15,
    min_dist=0.0,
    metric="cosine",
    random_state=21522,
)

**Step 3 - Cluster reduced embeddings**

In [9]:
# Density-based clustering of the reduced embeddings (minimum cluster size 20)
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

**Step 4 - Tokenize topics**

In [10]:
# Bag-of-words tokenizer for topic terms: unigrams and bigrams, English stop
# words removed, terms kept only if they occur in at least 2 documents.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    min_df=2,
)

**Step 5 - Create topic representation**

In [11]:
# Class-based TF-IDF transformer, constructed with its default settings
# (no arguments are overridden here).
ctfidf_model = ClassTfidfTransformer()

**Step 6 - Fine-tune topic representations**

In [12]:
# KeyBERT-inspired representation, keeping 15 words per topic
keybert = KeyBERTInspired(top_n_words=15)

# Maximal Marginal Relevance representation with diversity set to 0.3
mmr = MaximalMarginalRelevance(diversity=0.3)

# Part-of-speech representation backed by the "en_core_web_sm" spaCy model
pos = PartOfSpeech(model="en_core_web_sm")

In [13]:
# Named alternative topic representations; each key becomes its own column
# in the topic-info table.
representation_model = dict(KeyBERT=keybert, MMR=mmr, POS=pos)

In [14]:
# Assemble the BERTopic pipeline from the components configured above.
topic_model = BERTopic(
    # General settings
    top_n_words=15,
    verbose=True,
    # Pipeline stages, in execution order
    embedding_model=embedding_model,            # Step 1 - embed the documents
    umap_model=umap_model,                      # Step 2 - reduce dimensionality
    hdbscan_model=hdbscan_model,                # Step 3 - cluster the reduced embeddings
    vectorizer_model=vectorizer_model,          # Step 4 - tokenize topics
    ctfidf_model=ctfidf_model,                  # Step 5 - extract topic words
    representation_model=representation_model,  # Step 6 - fine-tune topic representations
)

In [15]:
# Fit the topic model on the documents, reusing the pre-computed embeddings.
# BERTopic documents `fit_transform` as taking a list of strings; a pandas
# Series only behaves correctly while its index is exactly 0..n-1, so pass
# the values as a plain list.
topics, probs = topic_model.fit_transform(list(docs), embeddings=embeddings)

2024-05-18 08:20:05,588 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-18 08:20:40,750 - BERTopic - Dimensionality - Completed ✓
2024-05-18 08:20:40,752 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-18 08:20:40,904 - BERTopic - Cluster - Completed ✓
2024-05-18 08:20:40,911 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-18 08:20:56,051 - BERTopic - Representation - Completed ✓


In [16]:
# Display the fitted model's topic overview table as the cell output.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,POS,Representative_Docs
0,-1,1433,-1_https_model_amp_ai,"[https, model, amp, ai, data, learning, models...","[neural, tensorflow, models, ai, images, train...","[https, amp, ai, models, using, com, training,...","[model, data, learning, models, like, use, tra...",[Can dalle2 perform real reconstruction? Hey t...
1,0,535,0_ai_com_2024_https,"[ai, com, 2024, https, www, https www, news, i...","[ai news, daily ai, ai, new ai, ai models, art...","[ai, 2024, https www, news, intelligence, com ...","[ai, news, intelligence, human, artificial, go...",[One-Minute Daily AI News 3/6/2024 1. **Micros...
2,1,340,1_gpt_4o_gpt 4o_chatgpt,"[gpt, 4o, gpt 4o, chatgpt, turbo, openai, chat...","[gpt 4o, gpt4o, gpt4, chat gpt, use gpt, chatg...","[gpt, gpt 4o, chatgpt, turbo, openai, gpt turb...","[gpt, 4o, turbo, openai, chat, model, new, gpt...",[Lot of questions about the point of the subsc...
3,2,322,2_voice_audio_ai_speech,"[voice, audio, ai, speech, video, text, music,...","[ai voice, text speech, ai tool, voice, transc...","[voice, audio, ai, speech, text, https, song, ...","[voice, audio, ai, speech, video, text, music,...",[I made a simple voice AI assistant using Open...
4,3,297,3_ml_learning_machine learning_machine,"[ml, learning, machine learning, machine, cour...","[learn ml, computer science, ai ml, courses, m...","[ml, learning, machine learning, machine, cour...","[learning, machine, course, research, courses,...",[[D] Yet another post about the necessity of P...
5,4,216,4_ai_tool_chatgpt_chat,"[ai, tool, chatgpt, chat, tools, like, api, us...","[ai tools, ai tool, ai, tools, chatbots, chatb...","[ai, chatgpt, chat, tools, api, pdf, text, app...","[ai, tool, chatgpt, chat, tools, use, pdf, wri...",[Top 6 Fast Essay Writing Tools Hi Everyone! I...
6,5,191,5_rag_llm_llms_language,"[rag, llm, llms, language, model, retrieval, d...","[retrieval, rag, language models, retrieval au...","[rag, llm, llms, language, model, retrieval, d...","[rag, llms, language, model, retrieval, data, ...",[Cognita : A Truly Unified RAG Framework : Par...
7,6,167,6_self_loss_torch_model,"[self, loss, torch, model, epoch, nn, accuracy...","[keras, torch tensor, torch nn, overfitting, d...","[self, loss, torch, model, epoch, nn, training...","[loss, model, epoch, nn, accuracy, def, traini...",[What's wrong with this model? Try to find the...
8,7,123,7_gpu_rtx_gpus_cloud,"[gpu, rtx, gpus, cloud, learning, cpu, machine...","[gpu, gpus, geforce rtx, vram, rtx 4090, ram, ...","[gpu, rtx, gpus, cloud, ram, super, vram, 4090...","[gpu, gpus, cloud, learning, cpu, machine, ram...",[[D] NVIDIA GPU Benchmarks &amp; Comparison [h...
9,8,89,8_learning_deep_deep learning_neural,"[learning, deep, deep learning, neural, networ...","[deep learning, learning deep, neural networks...","[deep, deep learning, networks, neural network...","[learning, deep, deep learning, neural, networ...",[When Theoretical ML will catch up to Practica...
