In [None]:
# pip install umap-learn


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Load HeBERT: the tokenizer and encoder are both resolved from the same
# "avichr/heBERT" checkpoint name and are used as globals by get_embedding().
tokenizer = AutoTokenizer.from_pretrained("avichr/heBERT")
model = AutoModel.from_pretrained("avichr/heBERT")

def get_embedding(text):
    """Embed text with HeBERT by mean-pooling the last hidden layer.

    Args:
        text: A single string, or a list of strings that is tokenized as one
            padded batch.

    Returns:
        numpy.ndarray: a 1-D vector (the hidden dimension) when ``text`` is a
        single string, or a 2-D array with one row per input when ``text`` is
        a list. The list case always stays 2-D, even for a one-element list.

    Note:
        Inputs are truncated to at most 512 tokens, so only the beginning of a
        long paragraph contributes to its embedding.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling over every position the attention mask marks as a real token.
    # The mask only zeroes out padding, so the tokenizer's special tokens
    # (e.g. [CLS]/[SEP]) ARE part of the average.
    attention = inputs['attention_mask'].unsqueeze(-1)
    embedding = (outputs.last_hidden_state * attention).sum(1) / attention.sum(1)
    # Drop only the batch axis, and only for a single string; a bare .squeeze()
    # would make the output rank depend on how many texts were passed.
    if isinstance(text, str):
        embedding = embedding.squeeze(0)
    return embedding.numpy()


In [None]:
import pandas as pd
# Load the processed verdicts table.
# NOTE(review): this cell reads from ".../data/wep/..." while the Word2Vec cell
# further down reads ".../data/processed_verdicts_with_gpt.csv" (no "wep") —
# confirm both are meant to point at the same file.
df = pd.read_csv("/home/liorkob/M.Sc/thesis/data/wep/processed_verdicts_with_gpt.csv")
# Rows with a missing "extracted_gpt_facts" value are dropped, so positions in
# this list do not necessarily line up with row positions in `df`.
verdict_paragraphs = df["extracted_gpt_facts"].dropna().tolist()


In [None]:
# One HeBERT vector per paragraph, collected into a single array (one row per text).
embeddings = np.array(list(map(get_embedding, verdict_paragraphs)))


: 

In [None]:
from sklearn.cluster import KMeans

# Partition the paragraph embeddings into 5 groups; the fixed random_state
# keeps the centroid initialisation repeatable.
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(embeddings)
# labels[j] is the cluster id assigned to verdict_paragraphs[j].
labels = kmeans.labels_


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Summarise each paragraph cluster by its 10 highest mean TF-IDF terms.
# Cluster ids are read from `labels` rather than a hard-coded range(5), so the
# loop stays in sync with whatever n_clusters KMeans was fitted with.
for i in np.unique(labels):
    cluster_texts = [text for text, label in zip(verdict_paragraphs, labels) if label == i]
    # The vocabulary (capped at 1000 terms) is refitted per cluster, so scores
    # are relative to the documents inside that cluster only.
    vectorizer = TfidfVectorizer(max_features=1000)
    X = vectorizer.fit_transform(cluster_texts)
    terms = vectorizer.get_feature_names_out()
    # Average TF-IDF weight of every term across the cluster's documents.
    mean_scores = np.asarray(X.mean(axis=0)).flatten()
    # argsort is ascending: take the last 10 and reverse for best-first order.
    top_indices = mean_scores.argsort()[-10:][::-1]
    top_words = [terms[ind] for ind in top_indices]
    print(f"Cluster {i} top words:", top_words)


In [None]:
import umap
import matplotlib.pyplot as plt

# Project the embeddings down to two dimensions for plotting
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, random_state=42)
embedding_2d = reducer.fit_transform(embeddings)

# Scatter plot of the projection, one point per paragraph, coloured by cluster id
fig, ax = plt.subplots(figsize=(10, 7))
scatter = ax.scatter(embedding_2d[:, 0], embedding_2d[:, 1], c=labels, cmap='tab10', s=50)
ax.set_title("UMAP projection of clustered verdicts", fontsize=14)
ax.set_xlabel("UMAP-1")
ax.set_ylabel("UMAP-2")
fig.colorbar(scatter, ax=ax, label='Cluster')
ax.grid(True)
plt.show()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Per-cluster TF-IDF summary: print the 10 strongest terms of each cluster.
k = 5
for i in range(k):
    # Paragraphs that KMeans assigned to cluster i.
    cluster_texts = [
        text
        for text, label in zip(verdict_paragraphs, labels)
        if label == i
    ]
    vectorizer = TfidfVectorizer(max_features=1000)
    X = vectorizer.fit_transform(cluster_texts)
    terms = vectorizer.get_feature_names_out()
    # Mean weight of each vocabulary term over the cluster's documents.
    mean_scores = np.asarray(X.mean(axis=0)).ravel()
    # Indices ordered from highest to lowest mean score, truncated to ten.
    top_indices = np.argsort(mean_scores)[::-1][:10]
    top_words = [terms[ind] for ind in top_indices]
    print(f"Cluster {i} top words:", top_words)


In [None]:
# pip install gensim


### Attempt 2: word-level clustering (Word2Vec + KMeans)

In [None]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from collections import defaultdict
import nltk
from transformers import AutoTokenizer
import re

# Rebinds the global `tokenizer` to the "avichr/heBERT" tokenizer (the same
# checkpoint name the embedding cell loads); here it is only used to split
# paragraphs into WordPiece tokens.
tokenizer = AutoTokenizer.from_pretrained("avichr/heBERT")
def get_full_words(tokens):
    """Merge WordPiece tokens back into whole words.

    A token starting with "##" is a continuation: its text (without the "##"
    marker) is glued onto the word being built. Any other token closes the
    word in progress and starts a new one. Empty words are never emitted.

    Args:
        tokens: Iterable of token strings, in order.

    Returns:
        list[str]: the reconstructed words, in order of appearance.
    """
    merged = []
    pieces = []  # fragments of the word currently being assembled

    def flush():
        # Emit the word in progress, skipping it when it is empty.
        joined = "".join(pieces)
        if joined:
            merged.append(joined)

    for token in tokens:
        if not token.startswith("##"):
            flush()
            pieces = [token]
        else:
            pieces.append(token[2:])
    flush()
    return merged

# Hand-written set of Hebrew function words that are removed before training
# Word2Vec. Membership is tested on whole words rebuilt by get_full_words().
# Alternative left disabled (would need a `get_stop_words` helper that this
# cell does not import):
# hebrew_stopwords = set(get_stop_words('he'))  # 'he' = Hebrew
hebrew_stopwords = {
    'של', 'על', 'את', 'כי', 'עם', 'זה', 'גם', 'אם', 'או', 'היה', 'היא', 'הוא', 'הם',
    'אבל', 'אני', 'אנחנו', 'אתם', 'אתן', 'אין', 'כל', 'לא', 'כן', 'יש', 'מה', 'מי', 'בו',
    'כך', 'לפי', 'ללא', 'וכן', 'עד', 'רק', 'כמו', 'מאוד', 'זאת', 'הזו', 'אותו', 'אותה'
}
def is_meaningful(word):
    """Return True when ``word`` is worth keeping for word-level clustering.

    A word qualifies when it contains at least one character in the range
    א-ת, is not made up purely of digits, and is at least two characters long.
    """
    if re.search(r'[א-ת]', word) is None:
        return False
    if word.isdigit():
        return False
    return len(word) >= 2

# 1. Load and tokenize
df = pd.read_csv("/home/liorkob/M.Sc/thesis/data/processed_verdicts_with_gpt.csv")
verdict_paragraphs = df["extracted_gpt_facts"].dropna().tolist()

# Each paragraph becomes one "sentence": WordPiece tokens are merged back into
# whole words and stopwords are discarded.
tokenized_sentences = []
for paragraph in verdict_paragraphs:
    full_words = get_full_words(tokenizer.tokenize(paragraph))
    tokenized_sentences.append(
        [word for word in full_words if word not in hebrew_stopwords]
    )

# Second pass: additionally keep only words that pass is_meaningful().
filtered_sentences = []
for sentence in tokenized_sentences:
    filtered_sentences.append(
        [word for word in sentence if word not in hebrew_stopwords and is_meaningful(word)]
    )

# 2. Train Word2Vec
# 100-dimensional vectors, context window of 5, ignoring words seen fewer than
# 5 times. An explicit seed plus a single worker thread make training
# repeatable: with gensim's default multi-threaded training the vectors vary
# from run to run, which would make the KMeans word clusters below
# irreproducible despite their fixed random_state. Trade-off: single-threaded
# training is slower. (Across interpreter restarts PYTHONHASHSEED must also be
# fixed for full determinism.)
w2v_model = Word2Vec(
    sentences=filtered_sentences,
    vector_size=100,
    window=5,
    min_count=5,
    seed=42,
    workers=1,
)

# 3. Choose words to cluster
# Take at most the first 500 vocabulary entries (you can choose another top-N).
vocabulary = list(w2v_model.wv.index_to_key)
words = vocabulary[:500]
word_vectors = []
for word in words:
    word_vectors.append(w2v_model.wv[word])

# 4. Cluster
# NOTE: this rebinds the globals `kmeans` and `labels`, which earlier cells use
# for the paragraph-level clustering.
kmeans = KMeans(n_clusters=7, random_state=42)
kmeans.fit(word_vectors)
labels = kmeans.labels_

# 5. Group words by cluster
# Maps cluster id -> member words, preserving the order of `words`.
clusters = defaultdict(list)
for position, word in enumerate(words):
    clusters[labels[position]].append(word)

# 6. Print top words per cluster
# Clusters are reported in the order their first word was encountered;
# only the first 10 members of each are shown.
for cluster_id in clusters:
    word_list = clusters[cluster_id]
    print(f"Cluster {cluster_id} top words: {word_list[:10]}")
