Too big would have to separate these somehow

In [1]:
import json
with open("test.json", "r") as f:
    dataset = json.load(f)

In [2]:
# Extract abstracts to train on and corresponding titles
all_texts= [d["text"] for d in dataset]
titles = [d["file_path"] for d in dataset]

texts = all_texts[:20]

In [3]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = str(2)
device = 0

In [4]:
import transformers
model_id = "microsoft/Phi-3.5-mini-instruct"

tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map='auto',
)
model.eval()
generator = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    temperature=0.1,
    max_new_tokens=500,
    repetition_penalty=1.1
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# System prompt describes information given to all conversations
system_prompt = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics.
<</SYS>>
"""

# Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags
main_prompt = """
[INST]
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
[/INST]
"""

In [6]:
# Example prompt demonstrating the output we are looking for
example_prompt = """
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.

The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.

[/INST] Environmental impacts of eating meat
"""

prompt = system_prompt + example_prompt + main_prompt

In [7]:
from sentence_transformers import SentenceTransformer

# Pre-calculate embeddings
embedding_model = SentenceTransformer("BAAI/bge-small-en")
embeddings = embedding_model.encode(texts, show_progress_bar=True)

from umap import UMAP
from hdbscan import HDBSCAN

umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Pre-reduce embeddings for visualization purposes
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)

from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration

# KeyBERT
keybert = KeyBERTInspired()

# MMR
mmr = MaximalMarginalRelevance(diversity=0.3)

# Text generation with Llama 2
llama2 = TextGeneration(generator, prompt=prompt)

# All representation models
representation_model = {
    "KeyBERT": keybert,
    "Llama2": llama2,
    "MMR": mmr,
}

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [8]:
from bertopic import BERTopic

topic_model = BERTopic(

  # Sub-models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Train model
topics, probs = topic_model.fit_transform(texts, embeddings)



2024-10-02 16:52:56,616 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-10-02 16:52:57,323 - BERTopic - Dimensionality - Completed ✓
2024-10-02 16:52:57,324 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-10-02 16:52:57,326 - BERTopic - Cluster - Completed ✓
2024-10-02 16:52:57,328 - BERTopic - Representation - Extracting topics from clusters using representation models.
  0%|                                                                                          | 0/1 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1330708 > 131072). Running this sequence through the model will result in indexing errors
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
  0%|                                                                                          | 0/1 [00:04<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 6596.69 GiB. GPU 0 has a total capacity of 47.32 GiB of which 17.29 GiB is free. Including non-PyTorch memory, this process has 29.99 GiB memory in use. Of the allocated memory 29.64 GiB is allocated by PyTorch, and 47.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Show topics
topic_model.get_topic_info()

In [None]:
topic_model.get_topic(1, full=True)["KeyBERT"]

In [None]:
llama2_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["Llama2"].values()]
topic_model.set_topic_labels(llama2_labels)

In [None]:
topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)