In [None]:
# %pip install -Uq bertopic sentence-transformers accelerate multiprocess bitsandbytes

In [1]:
import pandas as pd
import warnings
from datetime import datetime
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
from umap import UMAP
import transformers
from torch import cuda, bfloat16

# Suppress warnings
warnings.filterwarnings("ignore")

2025-01-15 11:56:23.905467: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-15 11:56:23.947186: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9360] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-15 11:56:23.947228: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-15 11:56:23.947268: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1537] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-15 11:56:23.956520: I tensorflow/core/platform/cpu_feature_g

### Load Data

In [109]:
# Load the subreddit dump. Only the last expression of a cell is rendered, so
# the raw frame is not previewed here; inspect `df.head()` in its own cell.
df = pd.read_csv("./dataset/mental_support_subreddit.csv")

# Keep only posts whose flair category is "Support Requests".
# .copy() makes df_spp an independent frame rather than a slice of df, so later
# column assignments cannot raise SettingWithCopyWarning or leak back into df.
df_spp = df[df["flair_category"] == "Support Requests"].copy()
df_spp.head()

Unnamed: 0,title,selftext,timestamps,subreddit,link_flair_text,text,flair_category,cleaned_text
12,Almost no one cares about me,My parents don’t care my brother don’t care an...,1708614000,MentalHealthSupport,Venting,Almost no one cares about me : My parents don...,Emotional Expression,almost one care parent dont care brother dont ...
15,Vent,My whole life since I was a child I struggled ...,1708614000,MentalHealthSupport,Venting,Vent : My whole life since I was a child I str...,Emotional Expression,vent whole life since child struggle regulate ...
16,i dont know what to do,i feel like everyone is about to leave me. im ...,1708614000,MentalHealthSupport,Venting,i dont know what to do : i feel like everyone ...,Emotional Expression,dont know everyone leave im problematic im dif...
23,I want to get better but I don’t at the same time,Idk what’s wrong with me but I always spiral f...,1708614000,MentalHealthSupport,Venting,I want to get better but I don’t at the same t...,Emotional Expression,get well dont time idk whats wrong always spir...
24,is it trauma???? or just regular shit im whiny...,I would consider my childhood pleasant and bor...,1708614000,MentalHealthSupport,Venting,is it trauma???? or just regular shit im whiny...,Emotional Expression,trauma regular shit im whiny would consider ch...


In [110]:
# Pick the active CUDA device when one is available, otherwise fall back to CPU
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'
print(f"Using device: {device}")

Using device: cuda:0


### Setup LLaMA model

In [118]:
# 4-bit quantization settings, gathered in one place and handed to
# BitsAndBytesConfig so the model can be loaded with a smaller memory footprint
quantization_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16,
)
bnb_config = transformers.BitsAndBytesConfig(**quantization_options)

In [None]:
# Hugging Face Hub authentication helper.
# NOTE(review): the meta-llama checkpoint loaded further down is presumably
# gated, so a login may be required before downloading it — confirm.
from huggingface_hub import login
# Uncomment to authenticate. Never commit a real access token in the notebook;
# read it from an environment variable or an interactive prompt instead.
#login("token")

In [120]:
# Load the Llama 2 chat checkpoint and its tokenizer.
# The checkpoint id is named once so tokenizer and model always come from the
# same repository.
MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = transformers.AutoTokenizer.from_pretrained(
    MODEL_ID,
    use_fast=True
)

# trust_remote_code is deliberately NOT enabled: the checkpoint resolves to the
# LlamaForCausalLM class that ships with transformers, so there is no reason to
# permit execution of Python code downloaded from the Hub.
model = transformers.AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,  # 4-bit settings defined in bnb_config
    device_map='auto'                # automatic device placement
)
# Switch to inference mode; as the last expression this also renders the
# module summary.
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096

In [121]:
# Text-generation pipeline around the loaded model and tokenizer.
# Generation settings are grouped so they can be read (and tuned) at a glance.
generation_settings = {
    "temperature": 0.1,
    "max_new_tokens": 500,
    "repetition_penalty": 1.1,
}
generator = transformers.pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

Device set to use cuda:0


In [122]:
# Quick sanity check: run one free-form prompt through the pipeline and print
# the text of the first returned generation
prompt = "Could you explain to me how 4-bit quantization works as if I am 5?"
res = generator(prompt)
first_generation = res[0]
print(first_generation["generated_text"])

Could you explain to me how 4-bit quantization works as if I am 5?
 nobody likes a know-it-all, but I do like to learn.

So, imagine you have a big box of candy, and inside that box, there are lots of different types of candy, like gummies, peanut butter cups, and lollipops. Now, instead of having all those different types of candy in the box, let's say we only want to have four types of candy: blue, red, yellow, and green.

We can do this by taking some of the candy from the big box and putting it into smaller boxes. Let's say we take one blue gummy, one red peanut butter cup, one yellow lollipop, and one green gummy from the big box and put them into their own little boxes. Now, instead of having a big box of many different types of candy, we have four small boxes with just one type of candy each!

This is kind of like what happens when we use 4-bit quantization. We take a number that has many different values (like a big box of candy) and break it down into smaller parts (like the f

### Prompt Engineering

In [144]:
# System section of the prompt: opens the instruction block and sets the
# assistant's role as a topic labeler. Written line by line; the resulting
# string starts and ends with a newline.
system_prompt = (
    "\n"
    "<s>[INST] <<SYS>>\n"
    "You are a helpful, respectful and honest assistant for labeling topics.\n"
    "<</SYS>>\n"
)

In [145]:
# Instruction template for labeling one topic. The [DOCUMENTS] and [KEYWORDS]
# placeholders are left verbatim in the string; the closing [/INST] tag ends the
# instruction block. Empty first/last items give the leading/trailing newline.
main_prompt = "\n".join([
    "",
    "I have a topic that contains the following documents:",
    "[DOCUMENTS]",
    "The topic is described by the following keywords: '[KEYWORDS]'.",
    "Based on the information above, please create a concise label for the topic. Only return the label, and nothing else.",
    "[/INST]",
    "",
])

In [146]:
# Worked example of the labeling task: three sample documents, their keywords,
# the instruction, and the expected answer after the closing [/INST] tag.
# NOTE(review): this string is defined but never concatenated into `prompt`
# (which is built as system_prompt + main_prompt), so it currently has no
# effect on the generated labels. It also opens its own [INST] tag, so it
# cannot simply be inserted after system_prompt as-is — confirm the intent.
example_prompt = """
[INST]
I have a topic that contains the following documents:
- I don’t feel valued in my relationship anymore. It’s like we’re just going through the motions, and I feel so alone.
- We keep arguing over the smallest things. I’m scared we’re drifting apart, but I don’t know how to fix it.
- Sometimes I wonder if we’re even meant to be together. It’s exhausting feeling this way.
The topic is described by the following keywords: 'relationship, arguments, loneliness, drifting apart, feeling unloved, exhaustion'.
Based on the information above, please create a concise label for the topic. Only return the label, and nothing else.
[/INST] Struggles in a Relationship
"""

In [147]:
# Final labeling prompt: the system section immediately followed by the
# instruction template (the worked example is not part of it)
prompt = "".join((system_prompt, main_prompt))

### Setup BERTopic representation models

In [148]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration

# One representation object per aspect
keybert = KeyBERTInspired()                          # KeyBERT
mmr = MaximalMarginalRelevance(diversity=0.3)        # MMR
llama2 = TextGeneration(generator, prompt=prompt)    # Text generation with Llama 2

# Aspect name -> representation model; these names become the extra columns
# shown in the topic info table
representation_model = dict(KeyBERT=keybert, Llama2=llama2, MMR=mmr)

### Train BERTopic Model

In [149]:
# Create the BERTopic model (fitting happens in the next cell).
#
# NOTE(review): this rebinds the name `model`, which earlier cells used for the
# Llama 2 network. The `generator` pipeline was built before this point, but
# re-running earlier cells out of order will pick up the wrong object —
# consider a distinct name such as `topic_model` across the notebook.

# UMAP is stochastic, so an unseeded run produces different topics each time.
# These settings are intended to mirror BERTopic's default reducer
# (TODO confirm against the installed version); the fixed random_state is the
# only intended addition.
umap_reducer = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    low_memory=False,
    random_state=42,
)

model = BERTopic(
    verbose=True,
    embedding_model='all-MiniLM-L6-v2',
    umap_model=umap_reducer,
    representation_model=representation_model,
    min_topic_size=10
)

In [150]:
# Fit the topic model on the cleaned support-request texts
support_docs = df_spp['cleaned_text'].tolist()
topics, probabilities = model.fit_transform(support_docs)

# One row per topic in the info table; report the row count and preview the
# first eleven rows
topic_results = model.get_topic_info()
n_topic_rows = len(topic_results)
print(f"Number of topics: {n_topic_rows}")
topic_results.head(11)

2025-01-15 12:49:53,896 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/86 [00:00<?, ?it/s]

2025-01-15 12:49:57,123 - BERTopic - Embedding - Completed ✓
2025-01-15 12:49:57,124 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-01-15 12:50:02,076 - BERTopic - Dimensionality - Completed ✓
2025-01-15 12:50:02,077 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-01-15 12:50:02,164 - BERTopic - Cluster - Completed ✓
2025-01-15 12:50:02,168 - BERTopic - Representation - Extracting topics from clusters using representation models.
100%|██████████| 32/32 [00:43<00:00,  1.35s/it]
2025-01-15 12:50:46,824 - BERTopic - Representation - Completed ✓


Number of topics: 32


Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Llama2,MMR,Representative_Docs
0,-1,1338,-1_im_get_dont_go,"[im, get, dont, go, know, ive, time, life, peo...","[depression, anxiety, health, struggle, life, ...",[-\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n2-\n\n\n\n\n...,"[life, make, friend, cant, year, say, day, tal...",[thought hi anyone chooses read start say ive ...
1,0,293,0_service_team_health_mental,"[service, team, health, mental, say, get, gp, ...","[therapist, therapy, psychiatrist, med, disord...","[Mental Health Services, , , , , , , , , ]","[service, health, mental, gp, call, appointmen...",[im upset service let sorry post may upset don...
2,1,109,1_job_work_im_get,"[job, work, im, get, mental, health, ive, dont...","[depression, anxiety, health, mental, work, qu...","[Job Stress and Mental Health, , , , , , , , , ]","[work, mental, health, go, week, interview, ca...",[job ruin mental health mood place week today ...
3,2,106,2_school_im_life_dont,"[school, im, life, dont, get, year, job, go, s...","[depression, stress, life, live, pursue, livin...","[Struggling with School and Life, , , , , , , ...","[school, life, study, grade, fail, college, un...",[stuck age life basically finish one goal life...
4,3,94,3_medication_take_mg_psychiatrist,"[medication, take, mg, psychiatrist, ive, week...","[antidepressant, ssri, psychiatrist, mirtazapi...","[Medication Management, , , , , , , , , ]","[medication, take, mg, psychiatrist, sleep, ap...",[cant get rush thought stop f diagnose bipolar...
5,4,60,4_anxiety_attack_panic_im,"[anxiety, attack, panic, im, anxious, go, day,...","[anxiety, anxious, panic, stress, depression, ...","[Anxiety Attack, , , , , , , , , ]","[anxiety, panic, im, anxious, day, start, ches...",[advice mental health bad mental health ever a...
6,5,58,5_mom_dad_parent_father,"[mom, dad, parent, father, mother, get, year, ...","[family, parent, mother, father, dad, mom, hea...","[Family Dynamics and Emotional Struggles, , , ...","[mom, dad, parent, family, would, mum, home, t...",[go crazy past weekend first go happen let exp...
7,6,49,6_love_relationship_never_dont,"[love, relationship, never, dont, know, someon...","[lonely, love, end, single, talk, relationship...","[Love and Relationship Drama, , , , , , , , , ]","[love, relationship, life, time, date, ever, m...",[time wonder love sometimes wonder love ex rea...
8,7,48,7_suicide_suicidal_dont_im,"[suicide, suicidal, dont, im, ive, know, go, h...","[suicidal, suicide, suffer, die, self, kill, h...","[Suicide Risk, , , , , , , , , ]","[suicide, suicidal, dont, harm, commit, die, t...",[treat im suicide risk instead wait become one...
9,8,43,8_friend_talk_dont_people,"[friend, talk, dont, people, group, really, me...","[friendship, talk, conversation, chat, lonely,...","[Toxic Friendship, , , , , , , , , ]","[talk, group, message, care, conversation, say...",[im ignore friend dont know friend ive friend ...


In [151]:
# Create the per-topic bar chart and render it: a bare assignment displays
# nothing, so the figure is repeated as the cell's last expression.
barchart = model.visualize_barchart()
barchart

In [152]:
# Fetch every representation of topic 1, then show only the Llama 2 aspect
topic_one_aspects = model.get_topic(1, full=True)
topic_one_aspects["Llama2"]

[('Job Stress and Mental Health', 1),
 ('', 0),
 ('', 0),
 ('', 0),
 ('', 0),
 ('', 0),
 ('', 0),
 ('', 0),
 ('', 0),
 ('', 0)]

#### Document embedding visualization

In [153]:
# Embed the cleaned documents with the topic model's own embedding backend
docs_to_embed = df_spp['cleaned_text'].tolist()
embeddings = model.embedding_model.embed_documents(docs_to_embed)

# Project the embeddings to two dimensions for plotting (seeded)
umap_model = UMAP(metric='cosine', n_components=2, random_state=42)
reduced_embeddings = umap_model.fit_transform(embeddings)

In [154]:
# Build one custom label per topic from the Llama 2 aspect.
# Each aspect value is a list of (text, score) pairs; the first pair carries the
# generated text. Only its first line is kept. Surrounding whitespace is
# stripped first, because a generation that begins with a newline or space
# would otherwise produce an empty or padded label.
llama2_labels = [
    aspect[0][0].strip().split("\n")[0].strip()
    for aspect in model.get_topics(full=True)["Llama2"].values()
]
model.set_topic_labels(llama2_labels)

In [155]:
# Scatter plot of documents in the 2D embedding space: post titles are passed
# as the document texts, and topics use the custom (Llama 2) labels
post_titles = df_spp['title'].tolist()
model.visualize_documents(
    post_titles,
    reduced_embeddings=reduced_embeddings,
    hide_annotations=True,
    hide_document_hover=False,
    custom_labels=True,
)

#### Topics Over Time

In [160]:
def convert_timestamp_to_year_month(timestamp):
    """Convert a Unix timestamp (seconds since the epoch) to a datetime.

    Despite the function's name, the full date and time are returned, not just
    the year and month.

    The conversion is done in UTC so the result does not depend on the timezone
    of the machine running the notebook. The returned object is naive
    (``tzinfo`` is ``None``), as before.

    Args:
        timestamp: Seconds since 1970-01-01 00:00:00 UTC (int or float).

    Returns:
        A naive ``datetime`` holding the UTC wall-clock time.
    """
    from datetime import timezone

    return datetime.fromtimestamp(timestamp, tz=timezone.utc).replace(tzinfo=None)

In [161]:
# Turn every raw timestamp of the support-request posts into a datetime
raw_timestamps = df_spp['timestamps'].tolist()
converted_timestamps = list(map(convert_timestamp_to_year_month, raw_timestamps))

In [162]:
# Compute topic representations per time bin.
# The documents must be the ones the model was fitted on (`cleaned_text`), so
# they line up with the stored topic assignments and the fitted vocabulary;
# the raw `text` column is tokenized differently.
# No datetime_format is passed: the timestamps are already datetime objects,
# and the previous "%b%M" was wrong anyway (%M is minutes, not month).
topics_over_time = model.topics_over_time(
    df_spp['cleaned_text'].tolist(),
    converted_timestamps,
    nr_bins=20
)

18it [00:01, 14.41it/s]


In [163]:
# Plot topic frequency across the time bins, using the custom topic labels
model.visualize_topics_over_time(
    topics_over_time,
    custom_labels=True,
)