In [1]:
import numpy as np

In [24]:
# SBERT: reduced embeddings plus two alternative cluster assignments
# (the second file name suggests a k-means run).
sbert_embeddings, sbert_clusters, sbert_clusters_kmeans = (
    np.load(npy_file)
    for npy_file in ("sbert_umap_25d.npy", "sbert_clusters.npy", "sbert_clusters_kmeans.npy")
)

# TF-IDF: reduced embeddings and their cluster assignment.
tfidf_embeddings, tfidf_clusters = (
    np.load(npy_file) for npy_file in ("tfidf_umap50d.npy", "tfidf_clusters.npy")
)

# SloBERTa: reduced embeddings and their cluster assignment
# (file name suggests DBSCAN).
sloberta_embeddings, sloberta_clusters = (
    np.load(npy_file)
    for npy_file in ("sloberta_umap_25d.npy", "sloberta_clusters_dbscan.npy")
)

# Reference labels used for the ground-truth comparison metrics.
true_labels = np.load("true_labels.npy")

In [9]:
# Sanity check: one embedding row per cluster label is expected.
print(sbert_embeddings.shape, sbert_clusters.shape, sep="\n")

(29493, 25)
(29493,)


In [15]:
import json

def load_combined_jsonl(filename="preprocessed_combined.jsonl"):
    """Read a JSON Lines file and return its decoded records as a list.

    Args:
        filename: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.
        Lines that are empty or contain only whitespace are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(filename, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            payload = raw_line.strip()
            if payload:  # ignore blank / whitespace-only lines
                records.append(json.loads(payload))
    return records

# Load the preprocessed corpus; `tokenized_texts` is consumed later by the
# coherence evaluation, so this is more than an example call.
preprocessed_texts = load_combined_jsonl()
print(f"✅ Loaded {len(preprocessed_texts)} preprocessed articles.")
# Whitespace tokenisation; assumes every JSONL record is a plain string of
# space-separated tokens — TODO confirm against the preprocessing step.
tokenized_texts = [text.split() for text in preprocessed_texts]


✅ Loaded 29493 preprocessed articles.


In [16]:
# Spot-check the tokenisation by printing the token list of the first article.
print(tokenized_texts[0])

['skupina', 'brics', 'nov', 'polnopraven', 'članica', 'egipt', 'etiopija', 'iran', 'savdski', 'arabija', 'združen', 'arabski', 'emirat', 'polnopraven', 'član', 'skupina', 'velik', 'gospodarstvo', 'vzpon', 'brics', 'članica', 'voditelj', 'dozdajšnji', 'članica', 'brics', 'brazilija', 'rusija', 'indija', 'kitajska', 'južen', 'afrika', 'sprejetje', 'nov', 'članica', 'vrh', 'skupina', 'avgust', 'johannesburg', 'skupina', 'peterica', 'skupina', 'argentina', 'dan', 'nov', 'argentinski', 'predsednik', 'javier', 'milea', 'pismo', 'voditelj', 'brics', 'stališče', 'nov', 'vlada', 'številen', 'pogled', 'predhoden', 'oblast', 'pravi', 'čas', 'pridružitev', 'skupina', 'argentina', 'članica', 'skupina', 'omrežje', 'november', 'nov', 'argentinski', 'zunanji', 'ministrica', 'diana', 'mondino', 'milea', 'predvolilen', 'kampanja', 'simpatija', 'zda', 'izrael', 'prekinitev', 'stik', 'brazilija', 'kitajska', 'zanimanje', 'pridružitev', 'skupina', 'navedba', 'južnoafriški', 'zunanji', 'mi

In [25]:
import numpy as np
from sklearn.metrics import (
    silhouette_score,
    davies_bouldin_score,
    adjusted_rand_score,
    normalized_mutual_info_score,
)
from sklearn.metrics.pairwise import cosine_similarity
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
import pandas as pd
from tqdm import tqdm

def embedding_alignment_score(embeddings, labels):
    """Mean within-cluster pairwise cosine similarity, averaged over clusters.

    For every cluster (label ``-1`` is treated as noise and ignored) the mean
    cosine similarity over all distinct pairs of its members is computed; the
    function returns the unweighted mean of those per-cluster values.

    The pairwise mean is obtained in closed form from the sum of the
    unit-normalised rows, so no ``n x n`` similarity matrix is materialised:

        sum_{i<j} u_i . u_j = (||sum_i u_i||^2 - sum_i ||u_i||^2) / 2

    Args:
        embeddings: 2-D array-like with one row per sample. Objects exposing
            ``toarray()`` (e.g. sparse matrices) are densified one cluster at
            a time.
        labels: 1-D array-like of cluster labels aligned with ``embeddings``.

    Returns:
        A float, or ``None`` when no cluster has at least two members.
    """
    # A plain list would make `labels == label` a scalar comparison.
    labels = np.asarray(labels)
    if not hasattr(embeddings, "shape"):
        embeddings = np.asarray(embeddings)

    per_cluster_means = []
    for label in np.unique(labels):
        if label == -1:
            continue  # noise points do not form a cluster
        members = embeddings[labels == label]
        if hasattr(members, "toarray"):
            members = members.toarray()
        # float64 keeps the subtraction below accurate for float32 inputs.
        members = np.asarray(members, dtype=np.float64)
        n_members = members.shape[0]
        if n_members < 2:
            continue  # a single point has no pairs

        norms = np.linalg.norm(members, axis=1, keepdims=True)
        norms[norms == 0.0] = 1.0  # all-zero rows contribute similarity 0
        unit_rows = members / norms

        row_sum = unit_rows.sum(axis=0)
        # Remove the self-similarity terms, then halve to count each pair once.
        pair_total = (row_sum @ row_sum - np.sum(unit_rows * unit_rows)) / 2.0
        n_pairs = n_members * (n_members - 1) / 2.0
        per_cluster_means.append(pair_total / n_pairs)

    return float(np.mean(per_cluster_means)) if per_cluster_means else None


def _top_words_per_cluster(tokenized_texts, clusters, top_n=10):
    """Return the ``top_n`` most frequent tokens of every non-noise cluster."""
    topics = []
    for cluster_id in np.unique(clusters):
        if cluster_id == -1:
            continue  # Skip noise
        # Vectorised membership lookup instead of scanning all documents in Python.
        docs = [tokenized_texts[i] for i in np.flatnonzero(clusters == cluster_id)]
        cluster_dictionary = Dictionary(docs)
        word_freq = {}
        for doc in docs:
            for word_id, freq in cluster_dictionary.doc2bow(doc):
                word_freq[word_id] = word_freq.get(word_id, 0) + freq
        ranked = sorted(word_freq.items(), key=lambda item: -item[1])
        topic_words = [cluster_dictionary[word_id] for word_id, _ in ranked[:top_n]]
        if topic_words:  # a cluster made only of empty documents has no topic
            topics.append(topic_words)
    return topics


def evaluate_all(embeddings, clusters, label_name, tokenized_texts=None, true_labels=None):
    """Compute a dictionary of clustering quality metrics for one configuration.

    Args:
        embeddings: 2-D array with one row per sample.
        clusters: 1-D array-like of cluster labels aligned with ``embeddings``;
            label ``-1`` is treated as noise.
        label_name: Name stored under the ``"Embedding"`` key of the result.
        tokenized_texts: Optional list of token lists, one per sample. When
            given, the ten most frequent tokens of each cluster are scored
            with gensim's ``c_npmi`` coherence against the whole corpus.
        true_labels: Optional reference labels used for ARI and NMI.

    Returns:
        dict with the keys ``Embedding``, ``Silhouette``, ``Davies-Bouldin``,
        ``ARI``, ``NMI``, ``Num Clusters``, ``Noise Points``,
        ``Embedding Alignment`` and ``Avg NPMI``. A metric that cannot be
        computed for the given input is reported as ``None``.

    Raises:
        ValueError: If ``embeddings`` or ``tokenized_texts`` do not have one
            entry per cluster label.
    """
    # A plain list would turn `clusters != -1` into a scalar comparison.
    clusters = np.asarray(clusters)
    n_total = clusters.shape[0]
    if np.shape(embeddings)[0] != n_total:
        raise ValueError(
            f"embeddings has {np.shape(embeddings)[0]} rows but clusters has {n_total} labels"
        )
    if tokenized_texts is not None and len(tokenized_texts) != n_total:
        raise ValueError(
            f"tokenized_texts has {len(tokenized_texts)} documents but clusters has {n_total} labels"
        )

    mask = clusters != -1
    n_clustered = int(np.sum(mask))
    n_clusters = len(np.unique(clusters[mask]))
    # silhouette_score / davies_bouldin_score raise unless
    # 2 <= n_labels <= n_samples - 1, so guard on the label count as well.
    can_score = 2 <= n_clusters <= n_clustered - 1

    # NPMI Coherence (based on tokenized_texts)
    avg_npmi = None
    if tokenized_texts is not None:
        cluster_topics = _top_words_per_cluster(tokenized_texts, clusters)
        if cluster_topics:
            cm = CoherenceModel(
                topics=cluster_topics,
                texts=tokenized_texts,
                dictionary=Dictionary(tokenized_texts),
                coherence='c_npmi'
            )
            avg_npmi = cm.get_coherence()

    metrics = {
        "Embedding": label_name,
        # Internal metrics are computed on non-noise points only.
        "Silhouette": silhouette_score(embeddings[mask], clusters[mask]) if can_score else None,
        "Davies-Bouldin": davies_bouldin_score(embeddings[mask], clusters[mask]) if can_score else None,
        # External metrics use all points; noise (-1) counts as its own label.
        "ARI": adjusted_rand_score(true_labels, clusters) if true_labels is not None else None,
        "NMI": normalized_mutual_info_score(true_labels, clusters) if true_labels is not None else None,
        "Num Clusters": n_clusters,
        "Noise Points": int(np.sum(clusters == -1)),
        "Embedding Alignment": embedding_alignment_score(embeddings, clusters),
        "Avg NPMI": avg_npmi
    }
    return metrics

# Evaluate every embedding / clustering combination with identical settings.
evaluation_runs = [
    ("SLOBERTA-DBSCAN", sloberta_embeddings, sloberta_clusters),
    ("SBERT", sbert_embeddings, sbert_clusters),
    ("TFIDF", tfidf_embeddings, tfidf_clusters),
    ("SBERT-KMEANS", sbert_embeddings, sbert_clusters_kmeans),
]
for run_name, run_embeddings, run_clusters in evaluation_runs:
    print(evaluate_all(
        embeddings=run_embeddings,
        clusters=run_clusters,
        label_name=run_name,
        tokenized_texts=tokenized_texts,  # list of token lists for each doc
        true_labels=true_labels,  # optional
    ))


{'Embedding': 'SLOBERTA-DBSCAN', 'Silhouette': 0.31070593, 'Davies-Bouldin': 1.026054296956046, 'ARI': 0.29383417882004975, 'NMI': 0.5639818237269971, 'Num Clusters': 28, 'Noise Points': 0, 'Embedding Alignment': 0.9986762, 'Avg NPMI': 0.09113513693637}
{'Embedding': 'SBERT', 'Silhouette': 0.31920457, 'Davies-Bouldin': 1.0093945344303572, 'ARI': 0.2322979492079684, 'NMI': 0.49488931728741375, 'Num Clusters': 26, 'Noise Points': 0, 'Embedding Alignment': 0.9991281, 'Avg NPMI': 0.08708324263406085}
{'Embedding': 'TFIDF', 'Silhouette': 0.32474416, 'Davies-Bouldin': 1.0185792309972688, 'ARI': 0.17818023641450698, 'NMI': 0.42428634930358, 'Num Clusters': 27, 'Noise Points': 0, 'Embedding Alignment': 0.9994208, 'Avg NPMI': 0.09520921248178979}
{'Embedding': 'SBERT-KMEANS', 'Silhouette': 0.3659727, 'Davies-Bouldin': 0.9945940220041651, 'ARI': 0.2323831969718441, 'NMI': 0.49544835436713974, 'Num Clusters': 26, 'Noise Points': 0, 'Embedding Alignment': 0.9991866, 'Avg NPMI': 0.09023254501353344}

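Embedding Alignment asks: "Do points within the same cluster have higher embedding similarity than across clusters?" Note that the score computed here is only the mean within-cluster cosine similarity; it is not compared against similarity across clusters.
Caveat: this measure works poorly on TF-IDF representations, which are sparse and high-dimensional.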


Silhouette Score: Separation and cohesion of clusters.
Davies-Bouldin Index: Lower is better.
ARI / NMI: Ground truth comparison (if labels are available).
Embedding Alignment: Average cosine similarity within clusters.
Avg NPMI: Semantic coherence of extracted words per cluster (requires tokenized texts).

🟩 Summary:
Embedding Alignment is nearly perfect (≈ 0.999) for all four configurations (which is expected).