## Install the required packages

In [None]:
# Package installation commands, left commented out so this cell does nothing when run.
# Remove the leading "#" from a line to execute that install command.
#!pip install bertopic
#!pip install accelerate
#!pip install bitsandbytes
#!pip install xformers
#!pip install adjustText
#!pip install openai

## Import the required libraries

In [None]:
# Standard library: environment access and interactive secret prompts
import os
from getpass import getpass

# Data manipulation
import pandas as pd

# GPU support and tensor operations
from torch import cuda, bfloat16

# OpenAI API
from openai import OpenAI

# Pre-trained language models
import transformers

# Sentence embeddings
from sentence_transformers import SentenceTransformer

# Dimensionality reduction
from umap import UMAP

# Hierarchical density-based clustering
from hdbscan import HDBSCAN

# Topic modeling
from bertopic import BERTopic

# Advanced topic representations
from bertopic.representation import KeyBERTInspired, TextGeneration

# Attach Google Drive at /content/drive so files stored there can be read
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Check whether a GPU is available

In [None]:
# Select the compute device and report it.
# Without CUDA the device is the string "cpu"; with CUDA the cached GPU memory
# is released first and the device is the current CUDA device index.
if not cuda.is_available():
    device = "cpu"
    print("GPU: Not available")
else:
    cuda.empty_cache()
    device = cuda.current_device()
    print(f"GPU: {cuda.get_device_name(device)}")

## Data Import

In this section, we import data from a CSV file, combine relevant columns, and prepare the dataset for further analysis.

In [None]:
# Location of the source CSV on the mounted Google Drive
DATA_PATH = "/content/drive/Shareddrives/Jowa/CS180 Project/data/data_ai.csv"

# Import data from CSV file
df = pd.read_csv(DATA_PATH)

# Drop rows where the title or the body is missing, then renumber the rows
df = df.dropna(subset=["body", "title"]).reset_index(drop=True)

# Add a "text" column holding the title and body joined by a single space
df["text"] = df["title"] + " " + df["body"]

# Inputs consumed by later cells: `dataset` is embedded and passed to BERTopic,
# `titles` is passed to the document visualization. Both have one entry per
# row of df, in row order, so they stay aligned with the embeddings.
dataset = df["text"].tolist()
titles = df["title"].tolist()

print(f"Data has been imported. There are {len(df)} rows.")

## Data cleaning and preparation

In [None]:
# TODO: add the data cleaning and preparation code here

## Set up LLM (OpenAI)

In [None]:

# Create the OpenAI client.
# Credentials must never be hard-coded in the notebook: the API key is read from
# the OPENAI_API_KEY environment variable, with an interactive prompt as fallback.
# Organization and project are not passed explicitly; the client falls back to
# the OPENAI_ORG_ID / OPENAI_PROJECT_ID environment variables when they are set.
# NOTE(review): the API key, organization ID and project ID that were previously
# stored in this cell should be treated as leaked and rotated.
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
client = OpenAI(api_key=openai_api_key)

# Smoke test: request a single chat completion to confirm the client works
completion = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},
    {"role": "user", "content": "Compose a poem that explains the concept of recursion in programming."}
  ]
)

print(completion.choices[0].message)

In [None]:
# Hugging Face access token used to download the model.
# It is read from the HF_TOKEN environment variable, with an interactive prompt
# as fallback, so the secret is never stored in the notebook.
# NOTE(review): the token previously stored in this cell should be treated as
# leaked and revoked.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")

# Hugging Face model repository to load
model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'

In [None]:
# Hugging Face model repository to load
model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'

# bitsandbytes settings: 4-bit NF4 weights, a second quantization pass on top of
# the first, and bfloat16 as the computation dtype
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16,
)

# Tokenizer matching the model; the token authenticates the download
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    token=hf_token,
    trust_remote_code=True,
)

# Causal language model loaded with the quantization settings above;
# device_map='auto' delegates device placement to the library
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    token=hf_token,
    device_map='auto',
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# Switch to evaluation mode (also the cell's displayed output)
model.eval()

In [None]:
# Text-generation pipeline wrapping the loaded model and tokenizer.
# Generation settings: temperature 0.1, at most 500 new tokens per call,
# and a repetition penalty of 1.1.
generator = transformers.pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=500,
    temperature=0.1,
    repetition_penalty=1.1,
)

In [None]:
# Smoke-test the text generator with a free-form prompt.
# NOTE: `prompt` is reassigned to the topic-labeling template in a later cell.
prompt = "Whats the best way to loss weight, give me 5 steps"
res = generator(prompt)
# Print the first element of the returned result
print(res[0])

In [None]:
# The topic-labeling prompt is assembled from three parts written in the
# Llama 3 chat markup: a system instruction, one worked example (user turn
# plus assistant answer), and the real request whose [DOCUMENTS] and
# [KEYWORDS] placeholders are left in the text for later substitution.

# System turn: role of the assistant
system_prompt = """
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are a helpful, respectful and honest assistant for labeling topics.
<|eot_id|>
"""

# One-shot example: a sample topic and the expected label/description answer
example_prompt = """
<|start_header_id|>user<|end_header_id|>
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
Based on the information about the topic above, please create a short label of this topic. Then, create a longer description of 2 sentence. Make sure you to only return the label and description and nothing more.
<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>
Label: Environmental impacts of eating meat.
Description: This topic explores the environmental consequences of meat consumption, particularly focusing on the emissions associated with beef production. It also delves into the broader discourse surrounding meat consumption, including its cultural, ethical, and health dimensions.
<|eot_id|>
"""

# Actual request: ends with an open assistant header so the model writes the answer
main_prompt = """
<|start_header_id|>user<|end_header_id|>
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: '[KEYWORDS]'.
Based on the information about the topic above, please create a short label of this topic. Then, create a longer description of 2 sentence. Make sure you to only return the label and description and nothing more.
<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>
"""

prompt = system_prompt + example_prompt + main_prompt

In [None]:
# Sentence-embedding model; the documents are embedded once up front so the
# vectors can be reused by later cells
embedding_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    trust_remote_code=True,
)
embeddings = embedding_model.encode(dataset, show_progress_bar=True)

# Dimensionality reduction: 5 components, cosine metric, fixed random seed
umap_model = UMAP(
    n_components=5,
    n_neighbors=15,
    min_dist=0.0,
    metric='cosine',
    random_state=1,
)

# Clustering: minimum cluster size of 20, euclidean metric, 'eom' cluster
# selection, with prediction data generated
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

In [None]:
# Keyword representation: KeyBERT-inspired, 15 words per topic
keybert = KeyBERTInspired(top_n_words=15)

# Generated representation: the Llama 3 text generator driven by the labeling prompt
llama3 = TextGeneration(generator, prompt=prompt)

# Representations handed to BERTopic, keyed by the name used to look them up later
representation_model = dict(KeyBERT=keybert, Llama3=llama3)

In [None]:

# Assemble BERTopic from the sub-models configured above.
# Hyperparameters: 15 words per topic, unigrams and bigrams, verbose logging.
topic_model = BERTopic(
    top_n_words=15,
    n_gram_range=(1, 2),
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    verbose=True,
)

# Fit on the documents, passing the pre-computed embeddings
topics, probs = topic_model.fit_transform(dataset, embeddings=embeddings)

In [None]:
# Show topics
# Last expression of the cell, so the notebook displays whatever get_topic_info() returns
topic_model.get_topic_info()

In [None]:
# Display the "KeyBERT" representation of topic 1; with full=True the result is
# indexed by representation name
topic_model.get_topic(1, full=True)["KeyBERT"]

In [None]:
# Print the Llama 3 label generated for every topic with id >= 1.
# Topic ids are taken from the fitted model instead of a hard-coded range, so
# the loop keeps working when clustering yields a different number of topics.
# NOTE(review): topic 0 and negative ids (BERTopic uses -1 for outliers) are
# skipped, matching the original range(1, 29) — confirm that skipping topic 0
# is intended.
topic_ids = sorted(topic_id for topic_id in topic_model.get_topics() if topic_id >= 1)
for i in topic_ids:
    # Fetch all representations of the topic once
    representations = topic_model.get_topic(i, full=True)
    # The first entry of the "Llama3" representation carries the generated text
    topic = representations["Llama3"][0][0]
    print("Topic ", i)
    print("")
    print(topic)
    print("")

In [None]:
# Pre-reduce embeddings to 2 dimensions for visualization purposes.
# random_state is fixed (same seed as the clustering UMAP) so the layout is
# reproducible across runs.
reduced_embeddings = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=1,
).fit_transform(embeddings)

# Plot the documents using the pre-reduced 2-D coordinates; as the last
# expression of the cell, the returned figure is displayed by the notebook
topic_model.visualize_documents(
    titles,
    reduced_embeddings=reduced_embeddings,
    hide_annotations=True,
    hide_document_hover=False,
    custom_labels=True,
)