In [None]:
pip install bertopic datasets accelerate bitsandbytes xformers adjustText

In [None]:
from gensim.test.utils import common_corpus
from gensim import corpora, models, similarities
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import numpy as np
import nltk
# Fetch the NLTK stop-word corpus; `stopwords.words('english')` is read from it further down.
nltk.download('stopwords')

In [None]:
import pandas as pd
import gensim

# Read the female and male CSV files and tag every row with the group it came from.
df1 = pd.read_csv("Female_aligned_data.csv").assign(group='female')
df2 = pd.read_csv("Male_aligned_data.csv").assign(group='male')

# Stack both groups into a single frame with a fresh 0..n-1 index.
df = pd.concat([df1, df2], ignore_index=True)

In [None]:
from nltk.corpus import stopwords

# English stop words as a set, so membership checks during cleaning are O(1).
stop = {word for word in stopwords.words('english')}

In [None]:
import re
# Markup tags, either raw (<b>) or HTML-entity-escaped (&lt;b&gt;).
_TAG_RE = re.compile(r"(?:<|&lt;)/?.*?(?:>|&gt;)")
# Any run of characters that are not ASCII letters (digits, punctuation, whitespace, ...).
_NON_LETTER_RE = re.compile(r"[^a-zA-Z]+")


def clean_description(text, stop_words):
    """Normalise one raw document into lower-case, space-separated letter-only tokens.

    Steps, in order:
      1. lower-case the text;
      2. strip markup tags -- this has to run *before* punctuation is removed,
         otherwise the angle brackets are already gone and the pattern never matches;
      3. drop stop words while contractions such as "don't" are still intact;
      4. replace every run of non-letters with a single space;
      5. drop stop words again, because tokens like "the," only become "the"
         after step 4 and would otherwise slip through.

    Parameters
    ----------
    text : object
        Raw cell value. Anything that is not a ``str`` (e.g. NaN for a missing
        cell) yields an empty document instead of raising.
    stop_words : collection of str
        Lower-case tokens to remove.

    Returns
    -------
    str
        Cleaned text with single spaces between tokens and no leading or
        trailing whitespace.
    """
    if not isinstance(text, str):
        return ""
    desc = _TAG_RE.sub(" ", text.lower())
    desc = " ".join(tok for tok in desc.split() if tok not in stop_words)
    desc = _NON_LETTER_RE.sub(" ", desc)
    return " ".join(tok for tok in desc.split() if tok not in stop_words)


# One cleaned string per row of df, in row order (the length must stay equal to len(df)).
# Iterating over the column avoids label-based chained indexing (df['0'][w]).
clean_txt = [clean_description(text, stop) for text in df['0']]

In [None]:
# Keep the cleaned text next to the raw rows.
df['clean'] = clean_txt
# Whitespace-tokenised view of every cleaned document (list of token lists).
clean_t = list(map(str.split, clean_txt))

In [None]:
# Build the gensim token <-> id dictionary from the tokenised documents.
id2word = corpora.Dictionary(clean_t)

# NOTE(review): `del_ids` and `del_ids2` are not defined anywhere in the visible notebook,
# so on a fresh kernel these two calls raise NameError. They presumably hold token ids to
# exclude -- TODO restore the cell that computes them (or remove these calls).
id2word.filter_tokens(bad_ids=del_ids) # remove unwanted word ids from the dictionary in place
id2word.filter_tokens(bad_ids=del_ids2)

# Bag-of-words corpus: one doc2bow result per document, built with the filtered dictionary.
texts = clean_t
corpus = [id2word.doc2bow(text) for text in texts]

### BERTopic and Llama 2

In [None]:
from huggingface_hub import notebook_login
# Hugging Face login for this notebook session.
# NOTE(review): presumably required because the Llama 2 checkpoint loaded below is gated -- confirm.
notebook_login()

In [None]:
from torch import cuda

# Model identifier handed to the `from_pretrained` calls further down.
model_id = 'meta-llama/Llama-2-13b-chat-hf'

# Use the currently selected CUDA device when one is usable, otherwise fall back to the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

print(device)

In [None]:
from torch import bfloat16
import transformers

# Quantization settings so the large model can be loaded with less GPU memory.
# Requires the `bitsandbytes` library.
quantization_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16,
)
bnb_config = transformers.BitsAndBytesConfig(**quantization_options)

In [None]:
# Llama 2 tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# Llama 2 model, loaded with the quantization settings in `bnb_config`; `device_map='auto'`
# leaves device placement to the library.
# `trust_remote_code=True` is deliberately not passed: it allows Python code shipped in the
# model repository to run locally, and the Llama architecture is implemented in
# `transformers` itself, so the flag only widened the attack surface.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map='auto',
)
# Switch to inference mode (the model is only used for generation here).
model.eval()

In [None]:
# Text-generation pipeline around the loaded model and tokenizer; it is later handed to
# BERTopic's TextGeneration representation.
generation_settings = {
    'task': 'text-generation',
    'temperature': 0.1,
    'max_new_tokens': 500,
    'repetition_penalty': 1.1,
}
generator = transformers.pipeline(model=model, tokenizer=tokenizer, **generation_settings)

In [None]:
# System section of the prompt: opens the Llama 2 instruction block and sets the
# assistant's role. It is prepended to every request.
system_prompt = (
    "\n"
    "<s>[INST] <<SYS>>\n"
    "You are a helpful, respectful and honest assistant for labeling topics.\n"
    "<</SYS>>\n"
)

In [None]:
# One worked example (documents + keywords -> label) showing the model the expected output.
# The text is a runtime prompt and is kept verbatim, line by line.
example_prompt = "\n".join([
    "",
    "I have a topic that contains the following documents:",
    "- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat",
    "production and factory farming, meat has become a staple food.",
    "- Meat, but especially beef, is the word food in terms of emissions.",
    "- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.",
    "",
    "The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.",
    "",
    "Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.",
    "",
    "[/INST] Environmental impacts of eating meat",
    "",
])

In [None]:
# Request template; the [DOCUMENTS] and [KEYWORDS] tags are the placeholders for each
# topic's documents and keywords.
main_prompt = "\n".join([
    "",
    "[INST]",
    "I have a topic that contains the following documents:",
    "[DOCUMENTS]",
    "",
    "The topic is described by the following keywords: '[KEYWORDS]'.",
    "",
    "Based on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.",
    "[/INST]",
    "",
])

In [None]:
# Full prompt: system section, then the worked example, then the request template.
prompt = "".join([system_prompt, example_prompt, main_prompt])

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; the same object is later passed to BERTopic as `embedding_model`.
# Alternatives left over from earlier experiments: "all-distilroberta-v1",
# "all-MiniLM-L6-v2", "distilroberta-base", "paraphrase-distilroberta-base-v1".
embedding_model = SentenceTransformer("BAAI/bge-small-en")

# Encode the cleaned documents once up front so the embeddings can be reused
# (dimensionality reduction below and BERTopic's fit_transform).
embeddings = embedding_model.encode(clean_txt, show_progress_bar=True)

In [None]:
#submodels for bertopic

from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer


# One seed shared by both UMAP projections so reruns give the same layout.
UMAP_SEED = 20000

# 5-dimensional projection used for clustering inside BERTopic.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=UMAP_SEED,
)

# Density-based clustering of the reduced embeddings.
# (Earlier experiments: AgglomerativeClustering(n_clusters=20) as the cluster model,
#  CountVectorizer(stop_words="english") as the vectorizer.)
hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Separate 2-dimensional projection of the same embeddings.
projector_2d = UMAP(
    n_neighbors=25,
    n_components=2,
    min_dist=0.1,
    metric='cosine',
    random_state=UMAP_SEED,
)
reduced_embeddings = projector_2d.fit_transform(embeddings)

In [None]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration

# KeyBERT-inspired keyword representation
keybert = KeyBERTInspired()

# Maximal Marginal Relevance representation
mmr = MaximalMarginalRelevance(diversity=0.3)

# Labels generated by Llama 2 through the text-generation pipeline and the assembled prompt
llama2 = TextGeneration(generator, prompt=prompt)

# Every additional representation BERTopic should compute, keyed by the name it is reported under.
representation_model = dict(KeyBERT=keybert, Llama2=llama2, MMR=mmr)

In [None]:
#training

In [None]:
from bertopic import BERTopic

# NOTE: `min_topic_size` is deliberately not passed. BERTopic uses it only to build its
# default HDBSCAN model; because a custom `hdbscan_model` is supplied here, the value was
# ignored and merely suggested a minimum size (100) that was never in effect. The effective
# minimum is the `min_cluster_size` set on `hdbscan_model`.
topic_model = BERTopic(

  # Sub-models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  # vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True,

  # nr_topics = 10,
  # Needed so that `topic_model.probabilities_` holds a per-document, per-topic matrix,
  # which the analysis cells below rely on.
  calculate_probabilities=True,
)

# Train the model on the cleaned documents, reusing the pre-computed embeddings.
topics, probs = topic_model.fit_transform(clean_txt, embeddings)

In [None]:
# Per-topic summary table; the bare name on the last line displays it in the notebook.
freq = topic_model.get_topic_info()
freq

In [None]:
import numpy as np


# Document-topic probability matrix: one row per document, one integer-labelled column per
# topic (presumably excluding the -1 outlier topic -- confirm against BERTopic's docs).
prob = pd.DataFrame(topic_model.probabilities_)

# Number of topic columns, captured before the bookkeeping columns are appended below.
n_topics = prob.shape[1]

# Most probable topic for every document.
top = prob.idxmax(axis=1)

df['top'] = top
conc = pd.concat([df, prob], axis=1)
male = conc[conc['group'] == 'male']
female = conc[conc['group'] == 'female']

prob['top'] = top
prob['group'] = df['group']

# For every topic: the mean probability of that topic among the documents for which it is
# the most probable one, split by group.
# Iterate over *all* topic ids. Looping over range(len(top.unique())) breaks as soon as one
# topic is never the arg-max: the range then no longer matches the real topic ids and the
# resulting lists are shorter than the topic table they are attached to afterwards.
# A topic with no matching documents in a group yields NaN.
topic_distmale = []
topic_distfemale = []

for topic_id in range(n_topics):
    dominant = prob[prob['top'] == topic_id]
    topic_distmale.append(dominant.loc[dominant['group'] == 'male', topic_id].mean())
    topic_distfemale.append(dominant.loc[dominant['group'] == 'female', topic_id].mean())

x = np.arange(n_topics)

df_ = pd.DataFrame({'topicno': x,
                    'dist_male_predominant': topic_distmale,
                    'dist_female_predominant': topic_distfemale})

In [None]:
# Attach the per-group means to the topic table, aligned on the topic id.
# The previous `freq[col][1:] = values` was a chained assignment (it may write to a temporary
# copy and is a silent no-op under pandas copy-on-write), and it relied on row 0 being the
# outlier topic and on the lists matching the remaining rows exactly.
# Position k in each list belongs to topic id k, so a default-indexed Series is the lookup.
male_by_topic = pd.Series(topic_distmale, dtype='float64')
female_by_topic = pd.Series(topic_distfemale, dtype='float64')

freq['dist_male_predominant'] = freq['Topic'].map(male_by_topic)
freq['dist_female_predominant'] = freq['Topic'].map(female_by_topic)

# The outlier topic (-1) has no entry in the lists; keep the former placeholder value 0.
is_outlier = freq['Topic'] == -1
freq.loc[is_outlier, ['dist_male_predominant', 'dist_female_predominant']] = 0

In [None]:
# Word list of every row in the topic table.
# Note: this rebinds `topics`, which until now held the per-document assignments returned
# by fit_transform.
topics = freq['Representation'].tolist()

from gensim.models.coherencemodel import CoherenceModel

# UMass coherence of those word lists, computed against the bag-of-words corpus.
cm = CoherenceModel(
    topics=topics,
    corpus=corpus,
    dictionary=id2word,
    coherence='u_mass',
)
# Aggregate coherence; the value is not stored.
cm.get_coherence()

# One score per topic, in the same order as the rows of `freq`.
coh = cm.get_coherence_per_topic()
freq['coherence'] = coh

In [None]:
# Write the topic table and the document-topic probabilities to CSV files.
for frame, path in ((freq, "llama_bertopic.csv"), (prob, "topic_probs_llama.csv")):
    frame.to_csv(path)