## **Install packages**

---



In [None]:
# %pip install openai
# %pip install bertopic
# %pip install tiktoken
# %pip install cudf-cu12 dask-cudf-cu12 --extra-index-url=https://pypi.nvidia.com
# %pip install cuml-cu12 --extra-index-url=https://pypi.nvidia.com
# %pip install cugraph-cu12 --extra-index-url=https://pypi.nvidia.com
# %pip install --upgrade cupy-cuda12x -f https://pip.cupy.dev/aarch64
# %pip install --upgrade firebase-admin --user
# %pip install torch --user

Run the cell below if the installs above fail with dependency errors.

In [None]:
# !pip uninstall -y cupy-cuda11x
# !pip uninstall -y cupy-cuda12x
# !pip uninstall -y cuda-python
# !pip install --upgrade cuda-python

##  **Import libraries**

---



In [1]:
# Data manipulation
import pandas as pd

# Regex
import re

# OpenAI Library
import openai

# Firebase
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
from firebase_admin.firestore import ArrayUnion

# Graphs
import matplotlib.pyplot as plt
# from wordcloud import WordCloud

# GPU support and tensor operations
import torch

## Dimensionality reduction
from umap import UMAP

# Dimensionality reduction using GPU
# from cuml.manifold import UMAP

## Clustering
from hdbscan import HDBSCAN

# Clustering using GPU
# from cuml.cluster import HDBSCAN

# Read environment variables
from os import getenv

# Sentence embeddings
from sentence_transformers import SentenceTransformer

# Text vectorization
from sklearn.feature_extraction.text import CountVectorizer

# Topic modeling
from bertopic import BERTopic

# Custom representation for topics
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech, OpenAI

# Document tokenizer for ai representation
import tiktoken

# Custom vectorizer for class-based TF-IDF
from bertopic.vectorizers import ClassTfidfTransformer

# Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

# Access colab env
# from google.colab import userdata

  from .autonotebook import tqdm as notebook_tqdm


Firebase setup

In [3]:
# Service-account key used to authenticate the Firebase Admin SDK.
cred = credentials.Certificate('../credentials/trenddit-5f2a1-firebase-adminsdk-2u4g4-c6d33d026b.json')

app_name='trenddit'

# initialize_app raises ValueError when an app with this name is already
# registered, which made this cell fail on every re-run in a live kernel.
# Reuse the registered app when there is one; only initialize the first time.
try:
    app = firebase_admin.get_app(app_name)
except ValueError:
    app = firebase_admin.initialize_app(cred, name=app_name)

In [4]:
# Firestore client for the Firebase app registered under app_name
db = firestore.client(firebase_admin.get_app(app_name))

## **Data Import and Environment Setup**

---



Import data from a CSV file, combine relevant columns.

In [5]:
# Read the CSV, drop every row that is missing either the body or the title,
# and renumber the remaining rows from 0.
df = (
    pd.read_csv("../data/data_ai.csv")
    .dropna(subset=["body", "title"])
    .reset_index(drop=True)
)

# Add a "text" column: title and body joined by a single space
df = df.assign(text=lambda frame: frame["title"] + " " + frame["body"])

row_count = len(df)
print(f"Data has been imported. There are {row_count} rows.")

Data has been imported. There are 4272 rows.


Initialize OpenAI Client

In [7]:
# The API key is read from the OPENAI_API_KEY environment variable
client = openai.OpenAI(api_key=getenv('OPENAI_API_KEY'))

# Smoke test: send one short chat request and print the assistant's reply
smoke_test_messages = [
    {"role": "system", "content": "You are a helpful, respectful and honest assistant for labeling topics."},
    {"role": "user", "content": "Hello!"},
]
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=smoke_test_messages,
)

first_choice = response.choices[0]
print(first_choice.message.content)

Hello! How can I assist you today?


In [8]:
# Few-shot labelling prompt; it is passed as `prompt=` to the OpenAI
# representation model further down in this notebook.
# [DOCUMENTS] and [KEYWORDS] are placeholder tokens — presumably filled in per
# topic by BERTopic's OpenAI representation; confirm against the BERTopic docs.
# NOTE(review): the example text contains typos ("the word food", "Make sure
# you to only return"). They are left untouched here because editing the prompt
# changes what is sent to the model.
prompt = """
You are a helpful, respectful and honest assistant for labeling topics.

Given this example input:
I have a topic that contains the following documents:
 - Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
 - Meat, but especially beef, is the word food in terms of emissions.
 - Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.

Your reply as the assistant is:
topic: Environmental impacts of eating meat.

Given the example above, now do this:
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 10 words. Make sure it is in the following format:
topic: <topic label>
"""

## **Data Preprocessing**

---



Remove links from text

In [9]:
# Strip anything that starts with "http" up to the next whitespace.
df["text"] = df["text"].str.replace(r"http\S+", "", regex=True)
# Strip bare "www." links. The dot is escaped and the match is anchored on a
# word boundary: the previous pattern r"www.\S+" treated "." as "any character"
# and could start mid-word, so ordinary text such as "awww cute" lost "www cute".
df["text"] = df["text"].str.replace(r"\bwww\.\S+", "", regex=True)

Remove extra spaces, leading and trailing spaces

In [10]:
# Collapse runs of spaces into one. regex=True is required: without it,
# pandas >= 2.0 treats the pattern as the literal two-character string " +"
# and no run of spaces is ever collapsed.
df["text"] = df["text"].str.replace(r" +", " ", regex=True)
# Drop leading and trailing whitespace
df["text"] = df["text"].str.strip()

Replace consecutive zeroes with a single zero.

In [11]:
def remove_consecutive_zeros(text):
    """Return ``text`` with every run of '0' characters collapsed to one '0'.

    Note that this applies to digits inside numbers as well, e.g. "1000"
    becomes "10".
    """
    zero_run = re.compile(r'0+')
    return zero_run.sub('0', text)

# Run the zero-collapsing helper on every row of the text column
df['text'] = df['text'].apply(remove_consecutive_zeros)

We will use `docs` as our main working dataset.

In [12]:
# Cleaned title + body text, one entry per post
docs = df["text"]
# Post titles on their own
titles = df["title"]



## **Setup BERTopic Layers**

---



**Step 1 - Extract embeddings**

Other Sentence Transformer models can be found on this [leaderboard](https://huggingface.co/spaces/mteb/leaderboard).

In [13]:
# embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# trust_remote_code is deliberately not set: it allows Python code shipped in
# the Hub repository to run locally, and this model loads without it.
embedding_model = SentenceTransformer("thenlper/gte-small")
# embedding_model = SentenceTransformer("avsolatorio/NoInstruct-small-Embedding-v0")

Pre-calculate embeddings to save time.

In [14]:
# Encode all documents once; the result is passed to topic_model.fit_transform
# and reused for the 2-D document plot later in the notebook.
embeddings = embedding_model.encode(docs, show_progress_bar=True)

Batches:   6%|▌         | 8/134 [04:45<1:14:55, 35.68s/it]


KeyboardInterrupt: 

**Step 2 - Reduce dimensionality**

In [None]:
# UMAP settings for the dimensionality-reduction step; random_state is fixed
umap_model = UMAP(
    n_neighbors=5,
    n_components=3,
    min_dist=0.0,
    metric='cosine',
    random_state=21522,
)

**Step 3 - Cluster reduced embeddings**

In [None]:
# HDBSCAN settings for the clustering step
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    min_samples=1,
    metric='euclidean',
    cluster_selection_method='leaf',
    prediction_data=True,
)

**Step 4 - Tokenize topics**

In [None]:
# Bag-of-words settings: English stop words removed, unigrams and bigrams,
# terms must appear in at least two documents
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 2),
)

**Step 5 - Create topic representation**

In [None]:
# Class-based TF-IDF transformer with reduce_frequent_words switched on
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

**Step 6 - Fine-tune topic representations**

In [None]:
# Keyword-style representations, each limited to 10 words per topic
keybert_model = KeyBERTInspired(top_n_words=10)
mmr_model = MaximalMarginalRelevance(top_n_words=10, diversity=0.3)
pos_model = PartOfSpeech(top_n_words=10, model="en_core_web_sm")

# LLM-generated label: tokenizer for the chat model, then the representation
# itself, configured with the custom prompt defined earlier in the notebook
openai_tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
openai_model = OpenAI(
    client,
    model="gpt-3.5-turbo",
    exponential_backoff=True,
    chat=True,
    prompt=prompt,
    nr_docs=10,
    doc_length=500,
    tokenizer=openai_tokenizer,
)

# One entry per aspect; the "OpenAI" aspect is a two-step chain
# (KeyBERT-inspired keywords first, then the OpenAI model)
representation_model = dict(
    KeyBERT=keybert_model,
    MMR=mmr_model,
    POS=pos_model,
    OpenAI=[keybert_model, openai_model],
)

## **Create the Model**

---



In [None]:
# Assemble the BERTopic pipeline from the components configured above
topic_model = BERTopic(
  # Pipeline models
  embedding_model=embedding_model,            # Step 1 - Extract embeddings
  umap_model=umap_model,                      # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,                # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,          # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                  # Step 5 - Extract topic words
  representation_model=representation_model,  # Step 6 - Fine-tune topic representations

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

## **Start Model Training**

---



In [None]:
# Fit the model on the documents, supplying the pre-computed embeddings;
# the two returned values are bound to topics and probs
topics, probs = topic_model.fit_transform(docs, embeddings)

## **Results**

Get number of topics

In [None]:
# Count only real topics, i.e. rows whose id is not the outlier id -1.
# Subtracting a fixed 1 assumed an outlier row is always present and
# undercounted by one whenever clustering produced no outliers.
topic_length = int((topic_model.get_topic_info()["Topic"] != -1).sum())

Update topic label to ChatGPT result

In [None]:
# Build {topic id: label} from the "OpenAI" aspect. Each value is a list of
# (text, score) pairs; the non-empty texts are joined with " | ".
# Compared with list(zip(*values))[0], this no longer raises IndexError when a
# topic has an empty representation, and blank entries no longer produce
# dangling " | " separators in the label.
chatgpt_topic_labels = {
    topic: " | ".join(text for text, _score in values if text)
    for topic, values in topic_model.topic_aspects_["OpenAI"].items()
}
# Give the outlier bucket (-1) a fixed, human-readable name
chatgpt_topic_labels[-1] = "Outlier Topic"
topic_model.set_topic_labels(chatgpt_topic_labels)

In [None]:
# Display the topic overview table as this cell's output
topic_model.get_topic_info()

Extract the top 10 representative documents

In [None]:
# One row per document: its text, a running integer id, and its assigned topic
documents = pd.DataFrame(
    {
        "Document": docs,
        "ID": range(len(docs)),
        "Topic": topic_model.topics_,
    }
)

# NOTE: _extract_representative_docs is a private BERTopic method (leading
# underscore). Later cells index its result as repr_docs[0][topic_id].
repr_docs = topic_model._extract_representative_docs(
    c_tf_idf=topic_model.c_tf_idf_,
    documents=documents,
    topics=topic_model.topic_representations_,
    nr_repr_docs=10,
)

Check whether the clusters and their generated topic names make sense

In [None]:
def wrap_text(text, max_length):
    """Split ``text`` into consecutive chunks of ``max_length`` characters.

    The chunks are joined with newlines. The split is purely positional, so
    words may be cut in the middle. An empty ``text`` yields an empty string.
    """
    chunk_starts = range(0, len(text), max_length)
    return '\n'.join(text[start:start + max_length] for start in chunk_starts)


# Look labels up by topic id instead of by row position. The old positional
# lookup ([i+1]) silently showed the next topic's label whenever the info
# table had no outlier row (-1) in front.
topic_info_by_id = topic_model.get_topic_info().set_index("Topic")

for topic_id in range(1):
  print("Topic", topic_id+1, ": ", topic_info_by_id.loc[topic_id, "CustomName"])
  print(topic_model.get_topic(topic_id))
  # repr_docs[0] is indexed by topic id and holds that topic's documents
  for rank, doc in enumerate(repr_docs[0][topic_id], start=1):
    print("Representative document ", rank, ": ", doc, "\n")



Add topics data to Firebase

In [None]:
base_url = "https://www.reddit.com"

# Upper bound on how many topics are exported
MAX_TOPICS = 10
# The Firestore write was commented out in this cell; it is now behind a flag
# that defaults to off, so running the cell still writes nothing unless the
# flag is switched to True.
UPLOAD_TO_FIRESTORE = False

# Labels are looked up by topic id, not by row position, so the result does not
# depend on whether an outlier row (-1) exists.
topic_info_by_id = topic_model.get_topic_info().set_index("Topic")

# Only export topics that actually exist. The fixed range(10) failed as soon as
# the model produced fewer than 10 topics.
export_topic_ids = [tid for tid in topic_info_by_id.index if tid != -1][:MAX_TOPICS]

# Text -> (title, permalink) table built once. The first row wins for duplicate
# texts, matching the previous `.values[0]` behaviour, without rescanning the
# whole text column for every representative document.
post_by_text = df.drop_duplicates(subset="text").set_index("text")[["title", "permalink"]]

for rank, topic_id in enumerate(export_topic_ids, start=1):
    label = topic_info_by_id.loc[topic_id, "CustomName"]
    keywords = [keyword for keyword, _score in topic_model.get_topic(topic_id)]
    print("\nTopic", rank, ":", label)
    print(keywords)

    rep_docs = []
    for doc in repr_docs[0][topic_id]:
        # Representative documents with no matching post are skipped
        if doc not in post_by_text.index:
            continue
        post = post_by_text.loc[doc]
        rep_docs.append({
            'content': post["title"],
            'url': base_url + post["permalink"]
        })

    if UPLOAD_TO_FIRESTORE:
        data = {"label": label,
                "rank": rank,
                "rep_docs": rep_docs,
                "keywords": keywords}
        db.collection('topics').add(data)

# Report what actually happened; the old message claimed success even though
# nothing was written.
if UPLOAD_TO_FIRESTORE:
    print('Documents updated successfully!')
else:
    print('Dry run only: nothing was written to Firestore.')


In [None]:
# Display the figure returned by visualize_term_rank() as this cell's output
topic_model.visualize_term_rank()

In [None]:
def create_wordcloud(model, topic):
    """Draw a word cloud for one topic of ``model`` and show it.

    Args:
        model: Fitted topic model exposing ``get_topic(topic)`` (iterable of
            (word, weight) pairs) and ``get_topic_info()`` (DataFrame with
            "Topic" and "CustomName" columns).
        topic: Id of the topic to draw.

    The figure title is the topic's "CustomName".
    """
    # NOTE(review): WordCloud is only referenced by a commented-out import at
    # the top of this notebook; it must be imported before this function runs.
    frequencies = dict(model.get_topic(topic))
    wc = WordCloud(background_color="white", max_words=100)
    wc.generate_from_frequencies(frequencies)

    # Use the `model` argument (the old code read the global topic_model) and
    # find the label by topic id, not by row position, so the title is right
    # even when the info table has no outlier row.
    topic_info = model.get_topic_info()
    title = topic_info.loc[topic_info["Topic"] == topic, "CustomName"].iloc[0]

    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(title)
    plt.show()

# Draw one word cloud per topic, numbering the printed headers from 1
for topic_id in range(topic_length):
    print("Topic", topic_id + 1)
    create_wordcloud(topic_model, topic=topic_id)
    print("\n")

In [None]:
# Display the figure returned by visualize_topics(), using the custom labels
topic_model.visualize_topics(custom_labels=True)

In [None]:
# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
# random_state is fixed so the 2-D layout is identical on every run; without it
# the plot changed between executions. The seed matches the one used for umap_model.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine', random_state=21522).fit_transform(embeddings)
topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, custom_labels=True, hide_annotations=True)

In [None]:
# Display the figure returned by visualize_hierarchy(), using the custom labels
topic_model.visualize_hierarchy(custom_labels=True)

In [None]:
# Print every representation side by side for each topic.
# - The per-topic result is stored in `aspects`, not `topics`, so the topic
#   assignments returned by fit_transform are no longer overwritten.
# - Words are taken with a slice instead of indexing range(10), so a
#   representation with fewer than 10 entries no longer raises IndexError.
for topic_id in range(0,topic_length):
    aspects = topic_model.get_topic(topic_id, full=True)
    res_keybert = [word for word, _score in aspects["KeyBERT"][:10]]
    res_mmr = [word for word, _score in aspects["MMR"][:10]]
    res_pos = [word for word, _score in aspects["POS"][:10]]
    # The OpenAI aspect carries the generated label as its first entry
    res_openai = aspects["OpenAI"][0][0]
    print("Topic ", topic_id+1)
    print("KeyBERT: ",res_keybert)
    print("MMR: ",res_mmr)
    print("POS: ",res_pos)
    print("OpenAI: ",res_openai)
    print("=" * 50,"\n")