In [None]:
%%capture
# Install the third-party packages this notebook relies on; %%capture hides pip's output.
# NOTE(review): versions are unpinned, so a fresh run may resolve different releases — consider pinning.
!pip install bertopic datasets accelerate bitsandbytes xformers adjustText

In [None]:
%%capture
# Pin torch to version 2.2.0 (pip output is hidden by %%capture).
# NOTE(review): presumably pinned for compatibility with the other installed packages — confirm.
!pip3 install torch==2.2.0

In [None]:
import pandas as pd
import numpy as np
import pickle
import os

import torch
from torch import cuda
from torch import bfloat16
import transformers


# 1. Load dataset

In [None]:
# Base directory on Google Drive for the input data and every artifact this notebook writes
# (dataset pickle, HTML plot, saved topic models).
# NOTE(review): relative path — presumably only resolves when the working directory is /content
# and Drive is mounted at /content/drive; confirm.
loading_dir='./drive/MyDrive/Topic Mining Project/LLM/'

In [None]:
# 1. load data
class Dataset(torch.utils.data.Dataset):
    """News Category dataset exposed as ``(text, label, timestamp)`` items.

    The text of an item is the ``headline`` and ``short_description`` columns
    joined by ``' | '``; the label is the ``category`` column and the timestamp
    is the ``date`` column.

    Args:
        path: JSON-lines file to read. When omitted, the file
            ``loading_dir + "data/News_Category_Dataset_v3.json"`` is used, so
            ``Dataset()`` keeps working exactly as before.
    """

    def __init__(self, path=None):
        if path is None:
            # Resolved at call time so the default still follows the notebook's `loading_dir`.
            path = loading_dir + "data/News_Category_Dataset_v3.json"
        self.raw_data = pd.read_json(path, lines=True)

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.raw_data)

    def __getitem__(self, i):
        """Return ``(text, label, timestamp)`` for position(s) ``i``.

        ``i`` is forwarded to ``.iloc``: an integer yields scalar values, while a
        slice (e.g. ``dataset[:]``) yields three pandas Series.
        """
        frame = self.raw_data
        text = frame['headline'].iloc[i] + ' | ' + frame['short_description'].iloc[i]
        label = frame['category'].iloc[i]
        timestamp = frame['date'].iloc[i]
        return text, label, timestamp


# Build the dataset (this reads the JSON file from Drive) and display its row count.
print("loading data...")
dataset = Dataset()
len(dataset)

loading data...


209527

In [None]:
# Serialize the dataset object to `dataset.pickle` under `loading_dir`.
# NOTE(review): pickle stores the class by reference, so loading this file later
# requires the notebook's `Dataset` class to be defined first.
pickle_path = loading_dir+'dataset.pickle'
with open(pickle_path, 'wb') as out_file:
    pickle.dump(dataset, out_file)

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): this cell appears after the cells that already read from and write to
# `loading_dir`; on a fresh runtime it presumably has to run before them — confirm and
# consider moving it to the top of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# sample data
# Slicing the whole dataset returns (text, label, timestamp); element 0 holds every
# "headline | short_description" text (a pandas Series, because __getitem__ forwards the slice to .iloc).
documents=dataset[:][0]

In [None]:
# Element 2 of the full slice is the `date` column; it is passed to topics_over_time further down.
timestamp=dataset[:][2]

# 2. Load Llama2-7B model from Huggingface
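- Requires a GPU (e.g. V100/T4)
- Load the model in 4-bit precision with the `bitsandbytes` library to reduce GPU memory usage
- The first run downloads the model weights from the Hugging Face Hub
- Wrap the model and tokenizer in a `transformers.pipeline` for text generation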

In [None]:
# Hugging Face access token used to download the Llama 2 tokenizer and weights.
# SECURITY: the token must never be hardcoded in the notebook. It is read from the
# HF_TOKEN environment variable instead; any token that was previously committed here
# has to be treated as leaked and revoked at https://huggingface.co/settings/tokens.
# When HF_TOKEN is unset the value is None, which lets transformers fall back to
# locally cached credentials (e.g. from `huggingface-cli login`).
my_token = os.environ.get('HF_TOKEN') or None
model_id = 'meta-llama/Llama-2-7b-chat-hf'
# Use the current CUDA device when a GPU is available, otherwise fall back to the CPU.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
print(device)

cuda:0


In [None]:
# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_quant_type='nf4',  # Normalized float 4
    bnb_4bit_use_double_quant=True,  # Second quantization after the first
    # NOTE(review): the section notes mention V100/T4 GPUs — confirm bfloat16 is the
    # intended compute dtype on that hardware.
    bnb_4bit_compute_dtype=bfloat16  # Computation type
)
# Llama 2 Tokenizer and model
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, token=my_token)
# `trust_remote_code=True` was removed: the checkpoint loads as the built-in
# LlamaForCausalLM class, so allowing the Hub repository to execute its own Python
# code is unnecessary and only widens the attack surface.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map='auto',
    token=my_token,
)
# Inference only: put the model in evaluation mode (the returned module is displayed as the cell output).
model.eval()

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Lla

In [None]:
# Text-generation pipeline wrapping the quantized model and its tokenizer.
# The generation settings are grouped so they can be tuned in one place.
generation_settings = dict(
    temperature=0.1,
    max_new_tokens=500,
    repetition_penalty=1.1,
)
generator = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

# 3. Prompt Settings
- The prompt includes the `[DOCUMENTS]` and `[KEYWORDS]` placeholders (the keywords come from c-TF-IDF)

In [None]:
# System prompt describes information given to all conversations
system_prompt = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics.
<</SYS>>
"""
# Example prompt demonstrating the output we are looking for (one-shot example).
# Typos in the example were fixed ("word food" -> "worst food", "Make sure you to" ->
# "Make sure to") so the model is not shown malformed text.
example_prompt = """
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the worst food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.

The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.

Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.

[/INST] Environmental impacts of eating meat
"""
# Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags;
# the wording of the instruction matches the example above.
main_prompt = """
[INST]
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.
[/INST]
"""
# Final template: system prompt, then the worked example, then the prompt to be filled in.
prompt = system_prompt + example_prompt + main_prompt

# 4. Other Submodels
- vectorizer_model
- embedding_model
- dimensionality reduction (UMAP)
- clustering (HDBSCAN / KMeans)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Count-based vectorizer configured with English stop words; passed to BERTopic as `vectorizer_model`.
vectorizer_model=CountVectorizer(stop_words="english")

In [None]:
from sentence_transformers import SentenceTransformer
# Pre-calculate embeddings
# The embeddings are computed once here and later handed to fit_transform together with the
# documents — presumably so BERTopic does not have to embed them again.
embedding_model = SentenceTransformer("BAAI/bge-small-en")
embeddings = embedding_model.encode(documents,show_progress_bar=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/6548 [00:00<?, ?it/s]

In [None]:
from umap import UMAP
# Dimensionality-reduction sub-model for BERTopic: 5 output components, cosine metric,
# fixed random_state so the reduction is repeatable.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

In [None]:
# Pre-reduce embeddings for visualization purposes
# A separate 2-component projection, fitted directly on the document embeddings.
viz_umap = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
reduced_embeddings = viz_umap.fit_transform(embeddings)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [None]:
from hdbscan import HDBSCAN
# Density-based clustering option.
# NOTE(review): `hdbscan_model` is not referenced anywhere else in the visible notebook
# (BERTopic is given `kmeans_model` instead) — confirm whether it is still needed.
hdbscan_model = HDBSCAN(min_cluster_size=100, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

from sklearn.cluster import KMeans
# KMeans with a fixed number of clusters. `random_state` is set to the same seed as the
# UMAP models so that cluster assignments are reproducible across runs.
kmeans_model = KMeans(n_clusters=100, random_state=42)

# 5. Fit model

In [None]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration

# Two topic representations are computed: KeyBERT-inspired keywords and a label
# generated by the Llama 2 pipeline from the prompt template (20 documents per topic).
# MaximalMarginalRelevance is imported but currently not enabled.
keybert = KeyBERTInspired()
llama2 = TextGeneration(generator, prompt=prompt, nr_docs=20)

# All representation models, keyed by the column name they produce.
representation_model = dict(KeyBERT=keybert, Llama2=llama2)

In [None]:
from bertopic import BERTopic

# Assemble the topic model from the sub-models defined in the earlier cells.
topic_model = BERTopic(

  # Sub-models
  vectorizer_model=vectorizer_model,
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=kmeans_model,  # the KMeans model is supplied in the clustering slot
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True,
  # Number of topics to reduce to after clustering.
  # NOTE(review): with 100 KMeans clusters the run log reports "from 100 to 100", i.e. a
  # no-op; topics are presumably only merged when nr_topics < number of clusters — confirm.
  nr_topics=100,
)

# Train model
# The pre-computed embeddings are passed alongside the documents.
topics, probs = topic_model.fit_transform(documents, embeddings)

2024-04-03 23:06:00,981 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-04-03 23:12:11,042 - BERTopic - Dimensionality - Completed ✓
2024-04-03 23:12:11,050 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-04-03 23:12:54,150 - BERTopic - Cluster - Completed ✓
2024-04-03 23:12:54,151 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 100/100 [14:02<00:00,  8.43s/it]
2024-04-03 23:27:12,130 - BERTopic - Representation - Completed ✓
2024-04-03 23:27:12,136 - BERTopic - Topic reduction - Reducing number of topics
2024-04-03 23:27:12,139 - BERTopic - Topic reduction - Reduced number of topics from 100 to 100


In [None]:
# Show topics
# Displays one row per topic with its count, name and the representations
# (default keywords, KeyBERT, Llama2) plus representative documents.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Llama2,Representative_Docs
0,0,926,0_autism_child_autistic_son,"[autism, child, autistic, son, syndrome, disab...","[autism, asperger, autistic, asd, parenting, t...","[Autism, , , , , , , , , ]",[Seven Years Ago Today: A Story About Suicide ...
1,1,2196,1_trump_comey_fbi_russia,"[trump, comey, fbi, russia, mueller, probe, ho...","[comey, trump, reportedly, bannon, wikileaks, ...","[Russia Investigation, , , , , , , , , ]",[Trump Now Insists He Didn't Fire James Comey ...
2,2,3209,2_music_song_songs_singer,"[music, song, songs, singer, band, rapper, alb...","[music, song, musician, songs, spotify, album,...","[Music, , , , , , , , , ]",[The Beautiful Things Music Does To Your Brain...
3,3,2127,3_travel_vacation_traveling_trip,"[travel, vacation, traveling, trip, road, trav...","[traveling, travel, travelers, travels, travel...","[Travel, , , , , , , , , ]",[Hassle-free Holiday Air Travel Tips | Now tha...
4,4,1437,4_abortion_court_parenthood_planned,"[abortion, court, parenthood, planned, supreme...","[abortion, abortions, scotus, republicans, jus...","[Reproductive Rights and Access to Abortion, ,...",[Protests Held Outside Supreme Court Following...
...,...,...,...,...,...,...,...
95,95,3123,95_cancer_health_mental_care,"[cancer, health, mental, care, patients, illne...","[cancer, health, care, medicine, patients, che...","[Cancer Care and Treatment, , , , , , , , , ]",[Young Adult Cancer: Year in Review | Although...
96,96,2411,96_church_confederate_charlottesville_god,"[church, confederate, charlottesville, god, je...","[charlottesville, christians, confederate, chr...","[Religion and Social Justice, , , , , , , , , ]",[Go and Learn What This Means: I Desire Honest...
97,97,1918,97_netflix_thrones_season_game,"[netflix, thrones, season, game, walking, trai...","[hbo, thrones, spoilers, tv, spoiler, episodes...","[Game of Thrones, , , , , , , , , ]",[Must-See Photos Of The 'Game Of Thrones' Cast...
98,98,1845,98_cruz_rubio_ted_marco,"[cruz, rubio, ted, marco, jeb, carson, bush, c...","[rubio, cruz, republicans, nonpartisan, gop, t...","[GOP Primary Race, , , , , , , , , ]",[This Is How Ted Cruz And Marco Rubio Handle T...


In [None]:
# saving and loading methods:
# pickle is not supported
# The model is saved to and loaded from the SAME directory. Previously it was saved to
# `loading_dir` itself but loaded from `loading_dir + "saved_model"`, so the load looked
# in a directory the save never wrote to.
saved_model_dir = loading_dir + "saved_model"
# Hub id of the embedding model, stored with the topic model. A separate name is used so
# the `embedding_model` SentenceTransformer object is not overwritten by a string.
embedding_model_id = "BAAI/bge-small-en"
topic_model.save(saved_model_dir, serialization="pytorch", save_ctfidf=True, save_embedding_model=embedding_model_id)
loaded_model = BERTopic.load(saved_model_dir) # load model by visiting the dir: saved_model


# 6. Visualization

In [None]:
# Document scatter plot on the pre-computed 2-D embeddings.
# `documents` is reused instead of rebuilding the identical `dataset[:][0]` (which re-concatenates
# every headline/description), and the redundant `[:]` on the embeddings is dropped.
# NOTE(review): custom_labels=True presumably only takes effect after labels have been registered
# with topic_model.set_topic_labels(...); none are set in the visible cells — confirm.
res=topic_model.visualize_documents(documents, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)

In [None]:
# Display the document visualization as the cell output.
res

In [None]:
# Export the document visualization to `plot.html` under `loading_dir`.
res.write_html(loading_dir+"plot.html")

In [None]:
# Compute topic representations over time from the documents and their `date` values,
# grouped into 20 bins, then plot the 10 top topics.
topics_over_time = topic_model.topics_over_time(documents, timestamp, nr_bins=20)
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)

In [None]:
# Topic representations per news category: element 1 of the full dataset slice is the
# `category` column, used here as the class of each document.
class_labels = dataset[:][1]
topics_per_class = topic_model.topics_per_class(documents, classes=class_labels)
topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=10)

In [None]:
from scipy.cluster import hierarchy as sch


# Hierarchical topics
def linkage_function(x):
    """Single-linkage hierarchical clustering of `x` with optimal leaf ordering."""
    return sch.linkage(x, 'single', optimal_ordering=True)


hierarchical_topics = topic_model.hierarchical_topics(
    documents,
    linkage_function=linkage_function,
)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
# Display the topics-over-time result computed earlier.
topics_over_time

In [None]:
# Save the topic model (safetensors format, including c-TF-IDF) to `Llama2HF` under `loading_dir`.
# The embedding model is recorded by its Hugging Face Hub id, which must match the model
# actually used for encoding ("BAAI/bge-small-en"). The previous value
# "sentence-transformers/BAAI/bge-small-en" is not a valid repository id (it has two
# namespace segments).
embedding_model = "BAAI/bge-small-en"
topic_model.save(loading_dir+"Llama2HF", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)