In [None]:
# Dependency for the UMAP cell below: run `%pip install umap-learn` once, then restart the kernel.


Note: you may need to restart the kernel to use updated packages.


: 

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Load the HeBERT tokenizer and encoder from the same checkpoint id so the
# two can never drift apart.
HEBERT_CHECKPOINT = "avichr/heBERT"
tokenizer = AutoTokenizer.from_pretrained(HEBERT_CHECKPOINT)
model = AutoModel.from_pretrained(HEBERT_CHECKPOINT)

def get_embedding(text):
    """Return a mask-weighted mean of the encoder's last hidden states for `text`.

    The text is tokenized (with truncation and padding enabled) and passed
    through the module-level `model` without gradient tracking. Hidden states
    are then averaged over the sequence axis, weighting each position by the
    tokenizer's ``attention_mask``.

    Note: only the attention mask is used for weighting; no separate
    special-token mask is applied, so any special tokens whose mask value is 1
    contribute to the mean.

    Args:
        text: Input passed straight to the tokenizer.

    Returns:
        The pooled embedding as a NumPy array with singleton dimensions removed.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Add a trailing axis so the mask broadcasts over the hidden dimension.
    mask = encoded['attention_mask'].unsqueeze(-1)
    summed = (hidden_states * mask).sum(1)
    token_counts = mask.sum(1)
    pooled = summed / token_counts
    return pooled.squeeze().numpy()


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# pandas is not imported by any earlier cell, so `pd` would raise NameError on
# a fresh kernel; import it here, next to its only use.
import pandas as pd

# Read the processed verdicts and keep the non-null GPT-extracted fact
# paragraphs as a plain list of texts for embedding.
df = pd.read_csv("/home/liorkob/thesis/data/processed_verdicts_with_gpt.csv")
verdict_paragraphs = df["extracted_gpt_facts"].dropna().tolist()


In [None]:
# Embed every verdict paragraph on its own, then collect the resulting vectors
# into a single NumPy array for clustering.
paragraph_vectors = list(map(get_embedding, verdict_paragraphs))
embeddings = np.array(paragraph_vectors)


In [None]:
from sklearn.cluster import KMeans

# Cluster the paragraph embeddings into five groups; the fixed random_state
# seeds the centroid initialisation.
kmeans = KMeans(n_clusters=5, random_state=42)
# fit(...).labels_ yields the same per-sample assignments as fit_predict(...).
labels = kmeans.fit(embeddings).labels_


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Summarise each cluster by its ten terms with the highest mean TF-IDF score.
# The number of clusters is read from the fitted estimator instead of repeating
# the literal 5, so this cell stays correct if n_clusters is changed above.
for i in range(kmeans.n_clusters):
    cluster_texts = [text for text, label in zip(verdict_paragraphs, labels) if label == i]
    # A fresh vectorizer per cluster: vocabulary and IDF weights are fitted on
    # that cluster's texts only.
    vectorizer = TfidfVectorizer(max_features=1000)
    X = vectorizer.fit_transform(cluster_texts)
    terms = vectorizer.get_feature_names_out()
    # Column-wise mean over the cluster's documents -> one score per term.
    mean_scores = np.asarray(X.mean(axis=0)).flatten()
    # argsort is ascending: take the last ten and reverse for best-first order.
    top_indices = mean_scores.argsort()[-10:][::-1]
    top_words = [terms[ind] for ind in top_indices]
    print(f"Cluster {i} top words:", top_words)


In [None]:
import umap
import matplotlib.pyplot as plt

# Project the embeddings down to two dimensions with UMAP (seeded).
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, random_state=42)
embedding_2d = reducer.fit_transform(embeddings)

# Scatter the 2D projection, colouring each point by its cluster label.
fig, ax = plt.subplots(figsize=(10, 7))
scatter = ax.scatter(
    embedding_2d[:, 0],
    embedding_2d[:, 1],
    c=labels,
    cmap='tab10',
    s=50,
)
ax.set_title("UMAP projection of clustered verdicts", fontsize=14)
ax.set_xlabel("UMAP-1")
ax.set_ylabel("UMAP-2")
fig.colorbar(scatter, ax=ax, label='Cluster')
ax.grid(True)
plt.show()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# `k` was never defined in this notebook, so the loop raised NameError.
# Bind it from the fitted estimator so it always matches the clustering.
k = kmeans.n_clusters

# Print, for each cluster, the ten terms with the highest mean TF-IDF score.
for i in range(k):
    cluster_texts = [text for text, label in zip(verdict_paragraphs, labels) if label == i]
    # Vocabulary and IDF weights are fitted on this cluster's texts only.
    vectorizer = TfidfVectorizer(max_features=1000)
    X = vectorizer.fit_transform(cluster_texts)
    terms = vectorizer.get_feature_names_out()
    # Average each term's score across the cluster's documents.
    mean_scores = np.asarray(X.mean(axis=0)).flatten()
    # Ascending argsort: last ten entries reversed gives best-first order.
    top_indices = mean_scores.argsort()[-10:][::-1]
    top_words = [terms[ind] for ind in top_indices]
    print(f"Cluster {i} top words:", top_words)
