## **Install packages**

---



In [None]:
!pip install openai
!pip install bertopic
!pip install tiktoken
!pip install cudf-cu12 dask-cudf-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install cuml-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install cugraph-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install --upgrade cupy-cuda12x -f https://pip.cupy.dev/aarch64

Collecting openai
  Downloading openai-1.30.1-py3-none-any.whl (320 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/320.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m194.6/320.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.6/320.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-

Run the cell below only if the installation above fails with dependency errors.

In [None]:
# !pip uninstall -y cupy-cuda11x
# !pip uninstall -y cupy-cuda12x
# !pip uninstall -y cuda-python
# !pip install --upgrade cuda-python

##  **Import libraries**

---



In [None]:
# Data manipulation
import pandas as pd

# OpenAI Library
import openai

# Graphs
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# GPU support and tensor operations
from torch import cuda

## Dimensionality reduction
# from umap import UMAP

# Dimensionality reduction using GPU
from cuml.manifold import UMAP

## Clustering
# from hdbscan import HDBSCAN

# Clustering using GPU
from cuml.cluster import HDBSCAN

# Sentence embeddings
from sentence_transformers import SentenceTransformer

# Text vectorization
from sklearn.feature_extraction.text import CountVectorizer

# Topic modeling
from bertopic import BERTopic

# Custom representation for topics
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech, OpenAI

# Document tokenizer for ai representation
import tiktoken

# Custom vectorizer for class-based TF-IDF
from bertopic.vectorizers import ClassTfidfTransformer

# Make Google Drive available inside the Colab file system
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Access colab env
from google.colab import userdata

## **Data Import and Environment Setup**

---



Import data from a CSV file, combine relevant columns.

In [None]:
# Read the CSV, discard rows missing a body or a title, renumber the rows,
# and add a "text" column holding "<title> <body>".
df = (
    pd.read_csv("/content/drive/Shareddrives/Jowa/CS180 Project/data/data_ai.csv")
    .dropna(subset=["body", "title"])
    .reset_index(drop=True)
    .assign(text=lambda frame: frame["title"] + " " + frame["body"])
)

print(f"Data has been imported. There are {len(df)} rows.")

Check if the environment is using a GPU

In [None]:
# Report which device is available; `device` ends up as the current CUDA
# device index, or the string "cpu" when no GPU is visible.
if not cuda.is_available():
    device = "cpu"
    print("GPU: Not available")
else:
    # Release cached GPU memory before starting the heavy work
    cuda.empty_cache()
    device = cuda.current_device()
    print(f"GPU: {cuda.get_device_name(device)}")

Initialize OpenAI Client

In [None]:
# Build the API client from the key stored in the Colab user data
client = openai.OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

# Send a single greeting to confirm that the key and the model respond
greeting_messages = [
    {"role": "system", "content": "You are a helpful, respectful and honest assistant for labeling topics."},
    {"role": "user", "content": "Hello!"},
]
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=greeting_messages,
)

print(response.choices[0].message.content)

In [None]:
# Prompt template passed to the OpenAI representation model (`prompt=prompt`).
# It contains one worked example followed by the actual request, and asks for
# the answer in the form "topic: <topic label>".
# NOTE(review): [DOCUMENTS] and [KEYWORDS] are placeholders, presumably filled
# in per topic by BERTopic before the request is sent; confirm in its docs.
prompt = """
You are a helpful, respectful and honest assistant for labeling topics.

Given this example input:
I have a topic that contains the following documents:
 - Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
 - Meat, but especially beef, is the word food in terms of emissions.
 - Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.

Your reply as the assistant is:
topic: Environmental impacts of eating meat.

Given the example above, now do this:
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 10 words. Make sure it is in the following format:
topic: <topic label>
"""

## **Data Preprocessing**

---



Remove links from text

In [None]:
# Strip URLs from the text.
# The dot after "www" is escaped so that only a literal "www." starts a match.
# Unescaped, "." matches any character, so ordinary text such as
# "awww so cute" would lose "www so".
df["text"] = df["text"].str.replace(r"http\S+", "", regex=True)
df["text"] = df["text"].str.replace(r"www\.\S+", "", regex=True)

Remove extra spaces, leading and trailing spaces

In [None]:
# Collapse runs of spaces into a single space, then trim both ends.
# regex=True is required: without it recent pandas versions treat " +" as the
# literal two-character string (space, plus) and nothing gets collapsed.
df["text"] = df["text"].str.replace(r" +", " ", regex=True)
df["text"] = df["text"].str.strip()

We will use `docs` as our main working dataset.

In [None]:
# Hand plain Python lists to the embedding model and to BERTopic, which
# document their text inputs as lists of strings. A pandas Series is indexed
# by label, so it only behaves like a list while its index is exactly 0..n-1.
docs = df["text"].tolist()
titles = df["title"].tolist()

## **Setup BERTopic Layers**

---



**Step 1 - Extract embeddings**

Other Sentence Transformer models can be found on this [leaderboard](https://huggingface.co/spaces/mteb/leaderboard).

In [None]:
# Sentence-embedding model for the documents.
# Alternative kept for reference:
#   embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embedding_model = SentenceTransformer(
    "thenlper/gte-small",
    trust_remote_code=True,
)

Pre-calculate embeddings to save time.

In [None]:
# Encode every document once up front; the result is passed to the topic
# model's fit step and reused for the 2-D document plot.
embeddings = embedding_model.encode(
    docs,
    show_progress_bar=True,
)

**Step 2 - Reduce dimensionality**

In [None]:
# Dimensionality reduction to 5 components; seeded so runs are repeatable
umap_model = UMAP(
    n_neighbors=5,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=21522,
)

**Step 3 - Cluster reduced embeddings**

In [None]:
# Density-based clustering of the reduced embeddings
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    min_samples=1,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

**Step 4 - Tokenize topics**

In [None]:
# Bag-of-words over 1- to 3-grams, dropping English stop words and any term
# that appears in fewer than 2 documents
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 3),
)

**Step 5 - Create topic representation**

In [None]:
# Class-based TF-IDF weighting with the frequent-word reduction switched on
ctfidf_model = ClassTfidfTransformer(
    reduce_frequent_words=True,
)

**Step 6 - Fine-tune topic representations**

In [None]:
# Keyword-style representations, each configured for 20 words per topic
keybert_model = KeyBERTInspired(top_n_words=20)
mmr_model = MaximalMarginalRelevance(top_n_words=20, diversity=0.3)
pos_model = PartOfSpeech(top_n_words=20, model="en_core_web_sm")

# Chat-model labels: the tokenizer and the limits below are handed to the
# representation model together with the prompt template
openai_tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
openai_model = OpenAI(
    client,
    model="gpt-3.5-turbo",
    exponential_backoff=True,
    chat=True,
    prompt=prompt,
    nr_docs=20,
    doc_length=500,
    tokenizer=openai_tokenizer,
)

# One entry per aspect; the "OpenAI" aspect is a two-step list
# (KeyBERT model first, then the chat model)
representation_model = {
    "KeyBERT": keybert_model,
    "MMR": mmr_model,
    "POS": pos_model,
    "OpenAI": [keybert_model, openai_model],
}

## **Create the Model**

---



In [None]:
topic_model = BERTopic(
    # General settings
    top_n_words=20,
    verbose=True,

    # Pipeline components, numbered as in the setup sections above
    embedding_model=embedding_model,            # Step 1 - Extract embeddings
    umap_model=umap_model,                      # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,                # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,          # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,                  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - Fine-tune topic representations
)

## **Start Model Training**

---



In [None]:
# Fit on the documents, reusing the pre-computed embeddings
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

## **Results**

Get number of topics

In [None]:
# Count real topics only. Outliers are reported as topic -1, and that row is
# present only when some documents were left unassigned, so filter on the
# topic id instead of assuming there is always exactly one extra row.
topic_info = topic_model.get_topic_info()
topic_length = int((topic_info["Topic"] != -1).sum())

Replace the default topic labels with the ChatGPT-generated ones

In [None]:
def _first_items(pairs):
    """Return the first element of every tuple in `pairs`, as one tuple."""
    return list(zip(*pairs))[0]


# Map each topic id to the " | "-joined first elements of its "OpenAI" aspect
openai_aspect = topic_model.topic_aspects_["OpenAI"]
chatgpt_topic_labels = {
    topic_id: " | ".join(_first_items(pairs))
    for topic_id, pairs in openai_aspect.items()
}

# Topic -1 gets a fixed, human-readable name
chatgpt_topic_labels[-1] = "Outlier Topic"
topic_model.set_topic_labels(chatgpt_topic_labels)

In [None]:
# Overview table of all topics (rendered as the cell output)
topic_overview = topic_model.get_topic_info()
topic_overview

In [None]:
def create_wordcloud(model, topic):
    """Draw a word cloud for a single topic of a fitted topic model.

    Args:
        model: Object providing ``get_topic(topic)`` (an iterable of
            ``(word, weight)`` pairs) and ``get_topic_info()`` (a DataFrame
            with a "Topic" column and a "CustomName" and/or "Name" column).
        topic: Id of the topic to draw.

    The figure title is looked up on ``model`` by topic id. Looking it up by
    row position (``topic + 1``) is only correct while an outlier row sits at
    position 0, and reading a global model ignores the ``model`` argument.
    """
    frequencies = dict(model.get_topic(topic))
    wc = WordCloud(background_color="white", max_words=100)
    wc.generate_from_frequencies(frequencies)

    # Prefer the custom label; fall back to the default name, then to the id
    info = model.get_topic_info()
    label_column = "CustomName" if "CustomName" in info.columns else "Name"
    labels = info.loc[info["Topic"] == topic, label_column]
    title = labels.iloc[0] if len(labels) else f"Topic {topic}"

    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(title)
    plt.show()

# One word cloud per topic id 0 .. topic_length - 1; the printed heading is
# 1-based, i.e. one higher than the topic id being drawn
for topic_id in range(topic_length):
    print("Topic", topic_id + 1)
    create_wordcloud(topic_model, topic=topic_id)
    print("\n")

In [None]:
# Topic overview plot, using the custom topic labels
topics_figure = topic_model.visualize_topics(custom_labels=True)
topics_figure

In [None]:
# Project the embeddings to 2-D once and pass the result to the plot; this
# step is optional but makes re-drawing much faster.
# random_state is set (same seed as the pipeline UMAP) so the layout is
# reproducible between runs instead of changing on every execution.
reduced_embeddings = UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=21522,
).fit_transform(embeddings)
topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, custom_labels=True, hide_annotations=True)

In [None]:
# Hierarchy plot of the topics, using the custom topic labels
hierarchy_figure = topic_model.visualize_hierarchy(custom_labels=True)
hierarchy_figure

In [None]:
# Print the different representations side by side for every real topic.
#  - Topic ids are read from the model (outlier topic -1 excluded), so topic 0
#    is included and no assumption is made about an outlier row existing.
#  - The per-topic result is stored in `topic_aspects`, not `topics`, so the
#    per-document assignments returned by fit_transform are not overwritten.
#  - Slicing ([:MAX_WORDS]) tolerates representations with fewer words, where
#    fixed indexing would raise IndexError.
MAX_WORDS = 15
topic_ids = [tid for tid in topic_model.get_topic_info()["Topic"] if tid != -1]

for topic_id in topic_ids:
    topic_aspects = topic_model.get_topic(topic_id, full=True)
    res_keybert = [entry[0] for entry in topic_aspects["KeyBERT"][:MAX_WORDS]]
    res_mmr = [entry[0] for entry in topic_aspects["MMR"][:MAX_WORDS]]
    res_pos = [entry[0] for entry in topic_aspects["POS"][:MAX_WORDS]]
    res_openai = topic_aspects["OpenAI"][0][0]
    print("Topic ", topic_id)
    print("KeyBERT: ",res_keybert)
    print("MMR: ",res_mmr)
    print("POS: ",res_pos)
    print("OpenAI: ",res_openai)
    print("=" * 50,"\n")