In [1]:
from bertopic import BERTopic
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 
os.environ["CUDA_VISIBLE_DEVICES"] = "0" 

In [58]:
file1 = open('../datasets/Amazon-531/llama_label_50.txt', 'r')
documents = file1.readlines()[:31920]
len(documents)

31920

In [59]:
abstracts = []
for row in documents:
    row_list = row.strip().split(" ")[1:]
    newrow = " ".join(row_list)
    abstracts.append(newrow)

In [60]:
from sentence_transformers import SentenceTransformer

# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(abstracts, show_progress_bar=True)

Batches:   0%|          | 0/998 [00:00<?, ?it/s]

In [5]:
from umap import UMAP

umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

In [6]:
from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(min_cluster_size=20, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

In [8]:
model_id = 'meta-llama/Llama-2-13b-chat-hf'

In [9]:
from torch import bfloat16
import transformers

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_quant_type='nf4',  # Normalized float 4
    bnb_4bit_use_double_quant=True,  # Second quantization after the first
    bnb_4bit_compute_dtype=bfloat16  # Computation type
)

In [10]:
# Llama 2 Tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# Llama 2 Model
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
)
model.eval()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 5120)
    (layers): ModuleList(
      (0-39): 40 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (o_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=13824, bias=False)
          (down_proj): Linear4bit(in_features=13824, out_features=5120, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm

In [11]:
# Our text generator
generator = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    temperature=0.1,
    max_new_tokens=500,
    repetition_penalty=1.1
)

In [12]:
# System prompt describes information given to all conversations
system_prompt = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics.
<</SYS>>
"""

In [13]:
# Example prompt demonstrating the output we are looking for
example_prompt = """
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.

The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.

[/INST] Environmental impacts of eating meat
"""

In [14]:
# Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags
main_prompt = """
[INST]
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.
[/INST]
"""

In [15]:
prompt = system_prompt + example_prompt + main_prompt

In [16]:
from bertopic.representation import TextGeneration


# Text generation with Llama 2
llama2 = TextGeneration(generator, prompt=prompt)

# All representation models
representation_model = {
    "Llama2": llama2,
}

In [61]:
topic_model = BERTopic(

  # Sub-models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True,
  calculate_probabilities=True,
)

# Train model
topics, probs = topic_model.fit_transform(abstracts, embeddings)


2024-01-18 01:30:57,985 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


2024-01-18 01:31:09,283 - BERTopic - Dimensionality - Completed ✓
2024-01-18 01:31:09,286 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false

In [62]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Llama2,Representative_Docs
0,-1,9181,-1_toys_dolls_customer_service_food_ingredient...,"[toys_dolls, customer_service, food_ingredient...","[Consumer goods and services, , , , , , , , , ]","[customer_service, online_shopping, frustratio..."
1,0,1141,0_home_furnishings_decor_home_furniture_decor_...,"[home_furnishings_decor, home_furniture_decor,...","[Home Furnishings and Baby Products, , , , , ,...","[furniture_baby_kids, home_furnishings_decor, ..."
2,1,779,1_fragrances_perfumes_fragrances_fashion_fragr...,"[fragrances_perfumes, fragrances, fashion_frag...","[Fashion Fragrances, , , , , , , , , ]","[beauty_personal_care, fragrances_perfumes, ta..."
3,2,438,2_baby_gear_accessories_baby_gear_equipment_pa...,"[baby_gear_accessories, baby_gear_equipment, p...","[Parenting Essentials, , , , , , , , , ]","[parenting_baby_care, household_safety_product..."
4,3,332,3_pet_toys_dog_toys_pet_supplies toys_games_pe...,"[pet_toys, dog_toys, pet_supplies toys_games, ...","[Pet Toys and Accessories, , , , , , , , , ]","[pet_toys, dog_toys, interactive_toys, pet_toy..."
...,...,...,...,...,...,...
393,392,20,392_food_vegetarian_food_vegan_vegetarian_food...,"[food_vegetarian, food_vegan, vegetarian_food,...","[Vegetarian and Vegan Food and Recipes, , , , ...","[food_vegetarian, grocery_pantry_staples, tast..."
394,393,20,393_sexual_wellness_adult_products_personal_ca...,"[sexual_wellness, adult_products, personal_car...","[Adult Products and Sexual Wellness, , , , , ,...","[adult_products, sexual_wellness, personal_car..."
395,394,20,394_cleaning_supplies home_maintenance_home_ma...,"[cleaning_supplies home_maintenance, home_main...","[Home maintenance and cleaning supplies, , , ,...","[home_maintenance, cleaning_supplies, dvd_vide..."
396,395,20,395_entertainment leisure_activities_leisure_a...,"[entertainment leisure_activities, leisure_act...","[Recreational gaming and social activities, , ...","[games, entertainment, leisure_activities, gam..."


In [63]:
df = topic_model.get_topic_info()
df.to_csv("../datasets/Amazon-531/bertopic_result/50topic_info_14000.csv", index=False)

In [64]:
topic_model.get_document_info(documents)

Unnamed: 0,Document,Topic,Name,Representation,Llama2,Representative_Docs,Top_n_words,Probability,Representative_document
0,"0: health_personal_care, medical_supplies_equi...",34,34_health_monitors_medical_supplies_equipment ...,"[health_monitors, medical_supplies_equipment h...","[Wearable Health Technology, , , , , , , , , ]","[health_personal_care, medical_supplies_equipm...",health_monitors - medical_supplies_equipment h...,0.067349,False
1,"1: food_beverages, dietary_supplements, packag...",-1,-1_toys_dolls_customer_service_food_ingredient...,"[toys_dolls, customer_service, food_ingredient...","[Consumer goods and services, , , , , , , , , ]","[customer_service, online_shopping, frustratio...",toys_dolls - customer_service - food_ingredien...,0.700633,False
2,"2: food_snacks, shopping_retail, coffee\n",-1,-1_toys_dolls_customer_service_food_ingredient...,"[toys_dolls, customer_service, food_ingredient...","[Consumer goods and services, , , , , , , , , ]","[customer_service, online_shopping, frustratio...",toys_dolls - customer_service - food_ingredien...,0.036208,False
3,"2: shopping_retail, food_beverages, online_sho...",55,55_online_shopping food_snacks_online_shopping...,"[online_shopping food_snacks, online_shopping,...","[Online shopping for food and candy, , , , , ,...","[food_candy, shopping_retail, customer_service...",online_shopping food_snacks - online_shopping ...,0.121055,False
4,"2: shipping_delivery, logistics_supply_chain, ...",182,182_shipping_delivery_customer_service shippin...,"[shipping_delivery, customer_service shipping_...","[E-commerce logistics and customer service, , ...","[customer_service, shipping_delivery, online_s...",shipping_delivery - customer_service shipping_...,0.232849,False
...,...,...,...,...,...,...,...,...,...
31915,"13998: weight_loss, health_personal_care, diet...",161,161_weight_loss_dietary_supplements_weight_los...,"[weight_loss, dietary_supplements, weight_loss...","[Weight Loss Supplements and Nutrition, , , , ...","[weight_loss, dietary_supplements, appetite_su...",weight_loss - dietary_supplements - weight_los...,0.743971,False
31916,"13999: toys_games, lego, movies_tv_shows_juras...",112,112_toys_action_figures_star_wars_action_figur...,"[toys_action_figures, star_wars, action_figure...",[Action figures and collectibles from popular ...,"[toys_action_figures, entertainment_marvel, co...",toys_action_figures - star_wars - action_figur...,0.240691,False
31917,"13999: toys_games_puzzles, building_constructi...",374,374_toys_games_puzzles building_sets_building_...,"[toys_games_puzzles building_sets, building_se...","[Building and Educational Toys, , , , , , , , , ]","[toys_games_puzzles, building_sets, educationa...",toys_games_puzzles building_sets - building_se...,1.000000,False
31918,"14000: toys_kids, musical_instruments, active_...",246,246_musical_instruments_music_instruments_toys...,"[musical_instruments, music_instruments, toys_...","[Kids' Musical Interests, , , , , , , , , ]","[toys_kids, musical_instruments, entertainment...",musical_instruments - music_instruments - toys...,1.000000,False


In [65]:
df = topic_model.get_document_info(documents)
df.to_csv("../datasets/Amazon-531/bertopic_result/50label_predict_14000.csv", index=False)

In [66]:
import numpy as np
with open('../datasets/Amazon-531/bertopic_result/50label_prob_14000.npy', 'wb') as f:
    np.save(f, probs)