In [None]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Load your CSV
df = pd.read_csv("/home/liorkob/M.Sc/thesis/data/5k/gpt/processed_verdicts_with_gpt_2.csv")  # make sure there's a 'facts' column
texts = df["extracted_gpt_facts"].dropna().astype(str).tolist()
# model_name = '/home/liorkob/M.Sc/thesis/similarity-model/hebert-mlm-verdicts/final'

# Load a Hebrew-capable embedding model
model_name = "avichr/heBERT"  # or use "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
embedding_model = SentenceTransformer(model_name)

# Create and fit BERTopic
topic_model = BERTopic(language="multilingual", embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(texts)

# Add topics back to your dataframe
df["topic"] = topics

# Show top 5 topics
print(topic_model.get_topic_info().head())

# (Optional) Save results
df.to_csv("verdicts_with_topics.csv", index=False)


2025-05-08 12:47:41.643856: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746697709.924724 1926795 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746697723.214751 1926795 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746697901.415114 1926795 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746697901.415156 1926795 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746697901.415161 1926795 computation_placer.cc:177] computation placer alr

In [3]:
pip install umap

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [12]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from tqdm import tqdm
import torch
import os
# ========== Check CUDA ==========
print("✅ CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("🎯 Using GPU:", torch.cuda.get_device_name(0))

# ========== Load Data ==========
df = pd.read_csv("/home/liorkob/M.Sc/thesis/data/5k/gpt/processed_verdicts_with_gpt_2.csv")
texts = df["extracted_gpt_facts"].dropna().astype(str).tolist()


# ========== Load Fast Multilingual Model ==========
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
model_name = '/home/liorkob/M.Sc/thesis/similarity-model/hebert-mlm-verdicts/final'

# model_name = "avichr/heBERT" 
embedding_model = SentenceTransformer(model_name)

# ========== Prepare Saving ==========
results = []
save_path = "verdicts_with_topics.csv"

if os.path.exists(save_path):
    print("🔁 Resuming from previous save...")
    results = pd.read_csv(save_path).to_dict("records")

# ========== Process in Batches ==========
# ========== Batch Processing ==========
batch_size = 100
start_idx = len(results)
print("🔍 Starting topic modeling with intermediate saves...\n")

for i in tqdm(range(start_idx, len(texts), batch_size)):
    batch_texts = texts[i:i + batch_size]
    
    # Handle edge case for last small batch
    if len(batch_texts) < 10:
        print(f"⏭️ Skipping small batch of size {len(batch_texts)} at index {i}")
        continue  # not break

    # Get embeddings
    batch_embeddings = embedding_model.encode(batch_texts, show_progress_bar=False)

    # Custom UMAP to avoid k >= N error
    custom_umap = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric='cosine')

    # Create BERTopic model with custom UMAP
    batch_model = BERTopic(
        language="multilingual",
        embedding_model=embedding_model,
        umap_model=custom_umap
    )

    # Fit model on batch
    batch_topics, _ = batch_model.fit_transform(batch_texts, batch_embeddings)

    # Save results
    for j, topic in enumerate(batch_topics):
        results.append({"text": batch_texts[j], "topic": topic})

    pd.DataFrame(results).to_csv(save_path, index=False)
    print(f"💾 Saved up to index {i + batch_size}")

# ========== Merge with Original DataFrame ==========
df_with_topics = pd.DataFrame(results)
df_final = df.head(len(df_with_topics)).copy()
df_final["topic"] = df_with_topics["topic"]

# ========== Final Save ==========
df_final.to_csv("verdicts_with_topics_final.csv", index=False)
print("\n✅ Done! Saved 'verdicts_with_topics_final.csv'")


✅ CUDA available: True
🎯 Using GPU: NVIDIA GeForce RTX 4090


No sentence-transformers model found with name /home/liorkob/M.Sc/thesis/similarity-model/hebert-mlm-verdicts/final. Creating a new one with mean pooling.


Some weights of BertModel were not initialized from the model checkpoint at /home/liorkob/M.Sc/thesis/similarity-model/hebert-mlm-verdicts/final and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔍 Starting topic modeling with intermediate saves...



  3%|▎         | 1/35 [00:01<00:43,  1.29s/it]

💾 Saved up to index 100


  6%|▌         | 2/35 [00:01<00:26,  1.22it/s]

💾 Saved up to index 200


  9%|▊         | 3/35 [00:02<00:20,  1.53it/s]

💾 Saved up to index 300


 11%|█▏        | 4/35 [00:02<00:18,  1.69it/s]

💾 Saved up to index 400


 14%|█▍        | 5/35 [00:03<00:16,  1.82it/s]

💾 Saved up to index 500


 17%|█▋        | 6/35 [00:03<00:15,  1.88it/s]

💾 Saved up to index 600


 20%|██        | 7/35 [00:04<00:15,  1.78it/s]

💾 Saved up to index 700


 23%|██▎       | 8/35 [00:04<00:14,  1.90it/s]

💾 Saved up to index 800


 26%|██▌       | 9/35 [00:05<00:12,  2.04it/s]

💾 Saved up to index 900


 29%|██▊       | 10/35 [00:05<00:11,  2.16it/s]

💾 Saved up to index 1000


 31%|███▏      | 11/35 [00:06<00:10,  2.22it/s]

💾 Saved up to index 1100


 34%|███▍      | 12/35 [00:06<00:10,  2.22it/s]

💾 Saved up to index 1200


 37%|███▋      | 13/35 [00:06<00:10,  2.12it/s]

💾 Saved up to index 1300


 40%|████      | 14/35 [00:07<00:09,  2.11it/s]

💾 Saved up to index 1400


 43%|████▎     | 15/35 [00:07<00:09,  2.04it/s]

💾 Saved up to index 1500


 46%|████▌     | 16/35 [00:08<00:09,  1.99it/s]

💾 Saved up to index 1600


 49%|████▊     | 17/35 [00:09<00:09,  1.97it/s]

💾 Saved up to index 1700


 51%|█████▏    | 18/35 [00:09<00:08,  1.98it/s]

💾 Saved up to index 1800


 54%|█████▍    | 19/35 [00:10<00:08,  1.86it/s]

💾 Saved up to index 1900


 57%|█████▋    | 20/35 [00:10<00:07,  1.88it/s]

💾 Saved up to index 2000


 60%|██████    | 21/35 [00:11<00:07,  1.89it/s]

💾 Saved up to index 2100


 63%|██████▎   | 22/35 [00:11<00:06,  1.86it/s]

💾 Saved up to index 2200


 66%|██████▌   | 23/35 [00:12<00:06,  1.90it/s]

💾 Saved up to index 2300


 69%|██████▊   | 24/35 [00:12<00:05,  1.91it/s]

💾 Saved up to index 2400


 71%|███████▏  | 25/35 [00:13<00:05,  1.88it/s]

💾 Saved up to index 2500


 74%|███████▍  | 26/35 [00:13<00:04,  1.88it/s]

💾 Saved up to index 2600


 77%|███████▋  | 27/35 [00:14<00:04,  1.87it/s]

💾 Saved up to index 2700


 80%|████████  | 28/35 [00:14<00:03,  1.87it/s]

💾 Saved up to index 2800


 83%|████████▎ | 29/35 [00:15<00:04,  1.45it/s]

💾 Saved up to index 2900


 86%|████████▌ | 30/35 [00:16<00:03,  1.51it/s]

💾 Saved up to index 3000


 89%|████████▊ | 31/35 [00:17<00:02,  1.57it/s]

💾 Saved up to index 3100


 91%|█████████▏| 32/35 [00:17<00:01,  1.61it/s]

💾 Saved up to index 3200


 94%|█████████▍| 33/35 [00:18<00:01,  1.45it/s]

💾 Saved up to index 3300


100%|██████████| 35/35 [00:19<00:00,  1.83it/s]

💾 Saved up to index 3400
⏭️ Skipping small batch of size 5 at index 3400






✅ Done! Saved 'verdicts_with_topics_final.csv'


In [1]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Load your data
df = pd.read_csv("verdicts_with_topics_final.csv")
texts = df["extracted_gpt_facts"].dropna().astype(str).tolist()

# Reload the model
embedding_model = SentenceTransformer(model_name)

# Encode everything (this will take some time!)
embeddings = embedding_model.encode(texts, show_progress_bar=True)

# Fit a single topic model
topic_model = BERTopic(language="multilingual", embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(texts, embeddings)

# Now you can explore!
print(topic_model.get_topic(0))


2025-05-13 11:51:40.459944: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747126300.480363 1243532 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747126300.486701 1243532 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747126300.503042 1243532 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747126300.503059 1243532 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747126300.503061 1243532 computation_placer.cc:177] computation placer alr

NameError: name 'pd' is not defined

In [16]:
df[df["topic"] == 0].head()
def get_keywords(topic_id):
    words = topic_model.get_topic(topic_id)
    return ", ".join([w for w, _ in words[:5]])  # top 5 words

df["topic_keywords"] = df["topic"].apply(get_keywords)


In [None]:
topic_model.visualize_barchart(top_n_topics=10)
fig = topic_model.visualize_barchart(top_n_topics=10)
fig.show()  # show in notebook or GUI
