In [1]:
%pip install bertopic
%pip install accelerate
%pip install bitsandbytes
%pip install xformers
%pip install adjustText


In [15]:
import pandas as pd
import bitsandbytes
from torch import cuda
from torch import bfloat16
import transformers
import accelerate
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration


In [16]:
df = pd.read_csv("../data/data.csv")
data = pd.DataFrame()
data["title"] = df["title"].fillna("")
data["body"] = data["title"] + " " + df["body"].fillna("")
length = len(data)
print(f"Data has been imported. There are {length} rows.")


Data has been imported. There are 5280 rows.


In [17]:
hf_token = "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"

In [18]:

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
print(device)

cpu


In [19]:
# model to use
model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'

# quantization config
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_quant_type='nf4',  # Normalized float 4
    bnb_4bit_use_double_quant=True,  # Second quantization after the first
    bnb_4bit_compute_dtype=bfloat16  # Computation type
)

# Llama 3 Tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, token = hf_token, trust_remote_code=True)

# Llama 3 Model
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
    token = hf_token,
)
model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


ImportError: Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes: `pip install -i https://pypi.org/simple/ bitsandbytes`

In [9]:
# Our text generator
generator = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task='text-generation',
    temperature=0.1,
    max_new_tokens=500,
    repetition_penalty=1.1
)

In [11]:
prompt = """
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are a helpful, respectful and honest assistant for labeling topics.
<|eot_id|>

<|start_header_id|>user<|end_header_id|>
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
Based on the information about the topic above, please create a short label of this topic. Then, create a longer description of 2 sentence. Make sure you to only return the label and description and nothing more.
<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>
Environmental impacts of eating meat. Description: This topic explores the environmental consequences of meat consumption, particularly focusing on the emissions associated with beef production. It also delves into the broader discourse surrounding meat consumption, including its cultural, ethical, and health dimensions.
<|eot_id|>

<|start_header_id|>user<|end_header_id|>
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: '[KEYWORDS]'.
Based on the information about the topic above, please create a short label of this topic. Then, create a longer description of 2 sentence. Make sure you to only return the label and description and nothing more.
<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>
"""

In [13]:
# Pre-calculate embeddings
embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5", trust_remote_code=True)
embeddings = embedding_model.encode(data, show_progress_bar=True)

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [15]:


# KeyBERT
keybert = KeyBERTInspired()

# Text generation with Llama 2
llama3 = TextGeneration(generator, prompt=prompt)

# All representation models
representation_model = {
    "KeyBERT": keybert,
    "Llama3": llama3,
}

In [16]:

topic_model = BERTopic(

  # Sub-models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Train model
topics, probs = topic_model.fit_transform(abstracts, embeddings)

2024-05-16 21:22:57,491 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-16 21:23:10,095 - BERTopic - Dimensionality - Completed ✓
2024-05-16 21:23:10,098 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-16 21:23:10,447 - BERTopic - Cluster - Completed ✓
2024-05-16 21:23:10,473 - BERTopic - Representation - Extracting topics from clusters using representation models.
  0%|          | 0/6 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 17%|█▋        | 1/6 [00:41<03:28, 41.71s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 33%|███▎      | 2/6 [01:00<01:52, 28.05s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 50%|█████     | 3/6 [01:40<01:41, 33.74s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 67%|██████▋   | 4/6 [02:24<01:15, 37.88s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end ge

In [17]:
# Show topics
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Llama3,Representative_Docs
0,-1,2197,-1_the_to_and_of,"[the, to, and, of, in, for, is, it, that, with]","[learning, ai, llm, model, research, using, op...","[Ego Bias and Its Impact on Human Cognition, ,...",[Thoughts on DSPy \nI have been tinkering with...
1,0,1183,0_the_to_and_in,"[the, to, and, in, of, for, is, learning, my, it]","[learning, tensorflow, ai, ml, learn, training...",[Artificial Intelligence Education and Trainin...,[Help: Unstable training/not learning at all H...
2,1,1017,1_ai_the_to_and,"[ai, the, to, and, of, in, that, https, it, for]","[ai, openai, google, artificial, intelligence,...","[AI Ethics and Philosophy, , , , , , , , , ]",[Why Live Awareness is one of the biggest AI c...
3,2,437,2_gpt_the_it_to,"[gpt, the, it, to, 4o, and, chatgpt, is, of, t...","[chatgpt, openai, gpt, gpt4, chat, ai, memory,...","[AI Development and Applications, , , , , , , ...",[Building Multimodal Apps with GPT-4O I'm sure...
4,3,346,3_the_to_and_of,"[the, to, and, of, for, in, is, llm, with, model]","[nlp, learning, context, model, arxiv, llms, l...",[Technical Analysis of Large Language Models' ...,[[D] LLMs: Why does in-context learning work? ...
5,4,100,4_voice_to_the_speech,"[voice, to, the, speech, and, audio, of, that,...","[voice, speech, voices, tts, audio, ai, pronun...",[Voice Technology and Processing\n\nPlease not...,"[Text-To-Speech with Tone and Mannerism, How d..."


In [18]:
topic_model.get_topic(1, full=True)["Llama3"]

[('AI Ethics and Philosophy', 1),
 ('', 0),
 ('', 0),
 ('', 0),
 ('', 0),
 ('', 0),
 ('', 0),
 ('', 0),
 ('', 0),
 ('', 0)]

In [23]:
for i in range(1,5):
    btopics = topic_model.get_topic(i, full=True)["KeyBERT"]
    bert = [btopics[j][0] for j in range(10)]
    topic = topic_model.get_topic(i, full=True)["Llama3"][0][0]
    print(bert)
    print(topic)
    print("")

['ai', 'openai', 'google', 'artificial', 'intelligence', 'nvidia', 'think', 'it', 'technology', 'www']
AI Ethics and Philosophy

['chatgpt', 'openai', 'gpt', 'gpt4', 'chat', 'ai', 'memory', 'questions', 'user', 'prompt']
AI Development and Applications

['nlp', 'learning', 'context', 'model', 'arxiv', 'llms', 'llm', 'models', 'text', 'tokens']
Technical Analysis of Large Language Models' Contextual Learning Mechanisms

This topic explores the underlying mechanisms behind the improvement in accuracy observed when providing contextual information to large language models (LLMs) during inference. The author seeks a detailed, technical explanation of the process, rather than relying on anthropomorphic descriptions.

['voice', 'speech', 'voices', 'tts', 'audio', 'ai', 'pronunciation', 'speaker', 'recording', 'elevenlabs']
Voice Technology and Processing

Please note that the topic is quite broad and encompasses various subtopics related to voice technology, processing, and manipulation.



In [20]:
llama3_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["Llama3"].values()]
topic_model.set_topic_labels(llama3_labels)

In [21]:
# Pre-reduce embeddings for visualization purposes
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)