In [1]:
import yaml
import json
from tqdm import tqdm
from sklearn.cluster import KMeans
from umap.umap_ import UMAP
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import numpy as np
import plotly.colors as pc
from itertools import cycle
from sklearn.decomposition import PCA
from keybert import KeyBERT
import yake
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import os

# Load YAML data
def load_articles(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        articles = yaml.safe_load(file)
    return articles

# Extract representative texts from articles
def get_article_text(article):
    text_parts = [
        article.get('title', ''),
        article.get('lead', ''),
        ' '.join(article.get('paragraphs', [])),
    ]
    return ' '.join(text_parts)

articles = load_articles('articles.yaml')
texts = [get_article_text(article) for article in tqdm(articles, desc="Processing articles")]


  from .autonotebook import tqdm as notebook_tqdm
Processing articles: 100%|██████████| 29493/29493 [00:00<00:00, 86943.56it/s] 


In [6]:
import random
os.environ["PYTHONHASHSEED"] = "42"
random.seed(42)
np.random.seed(42)

In [2]:
   


def cluster_with_kmeans(embeddings, n_clusters=20, random_state=42):
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init='auto')
    clusters = kmeans.fit_predict(embeddings)
    return clusters

def load_preprocessed(filename="preprocessed_combined.jsonl"):
    with open(filename, "r", encoding="utf-8") as f:
        return [json.loads(line.strip()) for line in f if line.strip()]


##############
# Keyword extraction
##############

# YAKE Keyword Extraction
def extract_yake_keywords(text, top_n=20, language='sl'):
    kw_extractor = yake.KeywordExtractor(lan=language, top=top_n)
    keywords = kw_extractor.extract_keywords(text)
    return [kw for kw, _ in keywords]

# KeyBERT Keyword Extraction
def extract_keybert_keywords(keybert_model, text, stop_words, top_n=20):
    keywords = keybert_model.extract_keywords(text, top_n=top_n, stop_words=stop_words)
    return [kw for kw, _ in keywords]

# TF-IDF Keyword Extraction
def extract_tfidf_keywords(cluster_texts, stop_words, top_n=20, ngram_range=(1, 2)):
    # TF-IDF with stopword filtering and n-grams
    vectorizer = TfidfVectorizer(
        max_features=1000,
        stop_words=stop_words,
        ngram_range=ngram_range,
        token_pattern=r'\b\w{3,}\b',  # only tokens with 3+ letters
    )
    tfidf_matrix = vectorizer.fit_transform(cluster_texts)
    feature_array = np.array(vectorizer.get_feature_names_out())
    tfidf_scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()

    # Top N terms by TF-IDF score
    top_indices = tfidf_scores.argsort()[::-1][:top_n]
    top_keywords = feature_array[top_indices]
    return list(top_keywords)

def extract_cluster_keywords(texts_tokenized, clusters, stop_words, keybert_model):
    """
    Extracts top n-grams (unigrams + bigrams) from preprocessed texts by cluster.
    """
    unique_clusters = np.unique(clusters[clusters >= 0])
    keywords_per_cluster = {}

    for cluster_label in unique_clusters:
        cluster_texts = [preprocessed_texts[i] for i in range(len(texts_tokenized)) if clusters[i] == cluster_label]

        tfidf_keywords = extract_tfidf_keywords(cluster_texts, stop_words)
        yake_keywords = extract_yake_keywords(" ".join(cluster_texts))
        keybert_keywords = extract_keybert_keywords(keybert_model, " ".join(cluster_texts), stop_words)

        print("cluster ", cluster_label, " done")
        # print("tfidf: ", tfidf_keywords)
        # print("yake: ", yake_keywords)
        # print("keybert: ", keybert_keywords)

        keywords_per_cluster[cluster_label] = {
            "tfidf": tfidf_keywords,
            "keybert": keybert_keywords,
            "yake": yake_keywords
        }

    return keywords_per_cluster

def draw_plot(
    umap_embeddings,
    clusters,
    titles=None,
    keywords_per_cluster=None,
    show_group_outlines=True,
    title="Vizualizacija novic rtvslo.si"
):
    import re

    def hex_to_rgb(color):
        if color.startswith("rgb"):
            return tuple(map(int, re.findall(r'\d+', color)))
        color = color.lstrip('#')
        lv = len(color)
        return tuple(int(color[i:i + lv // 3], 16) for i in range(0, lv, lv // 3))

    def get_filtered_points(points, percentile=95):
        center = np.median(points, axis=0)
        dists = np.linalg.norm(points - center, axis=1)
        threshold = np.percentile(dists, percentile)
        return points[dists <= threshold]

    def fit_ellipse_to_points(points, scale=2.4477):
        pca = PCA(n_components=2)
        pca.fit(points)
        center = pca.mean_
        width, height = np.sqrt(pca.explained_variance_) * scale
        angle = np.degrees(np.arctan2(pca.components_[0, 1], pca.components_[0, 0]))

        t = np.linspace(0, 2 * np.pi, 100)
        ellipse_x = center[0] + width * np.cos(t) * np.cos(np.radians(angle)) - height * np.sin(t) * np.sin(np.radians(angle))
        ellipse_y = center[1] + width * np.cos(t) * np.sin(np.radians(angle)) + height * np.sin(t) * np.cos(np.radians(angle))
        return ellipse_x, ellipse_y

    def summarize_keywords(keywords_dict, short=True, max_words=5, size=None, cluster_id=None):
        tfidf = keywords_dict.get("tfidf", [])
        tfidf_preview = ", ".join(tfidf[:max_words])
        if short:
            return tfidf_preview
        else:
            lines = [
                f"Skupina <b>{cluster_id}</b>",
                f"velikost: <b>{size}</b> novic",
                "―" * 20,
                f"<b>tfidf</b>: {', '.join(tfidf[:10])}",
                "―" * 20,
                f"<b>keybert</b>: {', '.join(keywords_dict.get('keybert', [])[:10])}",
                f"<b>yake</b>: {', '.join(keywords_dict.get('yake', [])[:5])}",
            ]
            return "<br>".join(lines)

    # Create main dataframe
    df = pd.DataFrame({
        'x': umap_embeddings[:, 0],
        'y': umap_embeddings[:, 1],
        'cluster': clusters.astype(str),
        'title': titles,
    })

    # Generate short and full label maps
    cluster_sizes = df['cluster'].value_counts().to_dict()
    
    if keywords_per_cluster:
        short_names = {
            cluster_id: summarize_keywords(
                keywords,
                short=True,
                size=cluster_sizes.get(str(cluster_id), 0),
                cluster_id=cluster_id
            )
            for cluster_id, keywords in keywords_per_cluster.items()
        }
        hover_texts = {
            cluster_id: summarize_keywords(
                keywords,
                short=False,
                size=cluster_sizes.get(str(cluster_id), 0),
                cluster_id=cluster_id
            )
            for cluster_id, keywords in keywords_per_cluster.items()
        }
    else:
        short_names = {}
        hover_texts = {}

    
    # Assign group label
    if keywords_per_cluster:
        df['group'] = [f"Skupina {int(c)}: {short_names.get(int(c), 'Unknown')}" for c in clusters]
    else:
        df['group'] = df['cluster']

    unique_groups = sorted(df['group'].unique(), key=lambda g: int(g.split(":")[0].replace("Skupina", "").strip()))

    group_to_cluster = {
        g: int(g.split(":")[0].replace("Skupina", "").strip())
        for g in unique_groups
    }

    color_pool = (
        pc.qualitative.Plotly +
        pc.qualitative.D3 +
        pc.qualitative.Set1 +
        pc.qualitative.Set2 +
        pc.qualitative.Set3 +
        pc.qualitative.Pastel1 +
        pc.qualitative.Pastel2 +
        pc.qualitative.Dark24 +
        pc.qualitative.Alphabet
    )
    color_cycle = cycle(color_pool)
    group_to_color = {group: next(color_cycle) for group in unique_groups}

    fig = go.Figure()

    for group in unique_groups:
        sub_df = df[df['group'] == group]
        fig.add_trace(
            go.Scattergl(
                x=sub_df['x'],
                y=sub_df['y'],
                mode='markers',
                name=group,
                marker=dict(size=4, color=group_to_color[group]),
                legendgroup=group,
                showlegend=True,
                hoverinfo='skip',
            )
        )

        if show_group_outlines and len(sub_df) >= 5:
            points = sub_df[['x', 'y']].to_numpy(dtype=np.float32)
            filtered_points = get_filtered_points(points, percentile=80)

            if len(filtered_points) >= 5:
                ellipse_x, ellipse_y = fit_ellipse_to_points(filtered_points)

                rgb = hex_to_rgb(group_to_color[group])
                rgba_fill = f'rgba({rgb[0]},{rgb[1]},{rgb[2]},0.1)'

                fig.add_trace(
                    go.Scattergl(
                        x=ellipse_x,
                        y=ellipse_y,
                        mode='lines',
                        line=dict(color=group_to_color[group], width=1),
                        fill='toself',
                        fillcolor=rgba_fill,
                        name=group,
                        legendgroup=group,
                        hoverinfo='text',
                        text=[hover_texts.get(group_to_cluster[group], group)] * len(ellipse_x),
                        showlegend=False
                    )
                )

                # Add cluster ID label at center
                cluster_id = group_to_cluster[group]
                fig.add_annotation(
                    x=np.mean(filtered_points[:, 0]),
                    y=np.mean(filtered_points[:, 1]),
                    text=f"Skupina {cluster_id}",
                    showarrow=False,
                    font=dict(size=12, color="black"),
                    align="center",
                    bordercolor="black",
                    borderwidth=1,
                    borderpad=4,
                    bgcolor="white",
                    opacity=0.9
                )

    fig.update_layout(
        title=title,
        width=1200,
        height=800,
        xaxis_title="UMAP 1",
        yaxis_title="UMAP 2",
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.40,
            xanchor="center",
            x=0.5,
            title=None,
            font=dict(size=10)
        ),
        margin=dict(t=80, b=80)
    )

    fig.show()

In [3]:
sbert_embeddings = np.load('sbert_embeddings.npy')

print("UMAP on SBERT embeddings...")
umap_25d = UMAP(n_components=25, metric='cosine', random_state=42).fit_transform(sbert_embeddings)
umap_2d = UMAP(n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42).fit_transform(sbert_embeddings)

UMAP on SBERT embeddings...


  warn(
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
  warn(


In [9]:
cached_all = False
cached = True

if not cached_all:
    print("downloading necessary stuff...")
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')

    stop_words = set(stopwords.words('slovene'))

print("loading articles...")

sbert_clusters = cluster_with_kmeans(umap_25d, n_clusters=15, random_state=46)

preprocessed_texts = load_preprocessed()
tokenized_texts = [text.split() for text in preprocessed_texts]

print("extracting keywords...")
if cached_all:
    keywords_per_cluster = np.load("extracted_keywords_final.npy", allow_pickle=True).item()
    sbert_clusters = np.load("sbert_clusters_final.npy")

elif cached:
    keywords_per_cluster = np.load("extracted_keywords_final.npy", allow_pickle=True).item()

    # verify if cluster sequence is the same as the one expected in imported dictionary
    unique_clusters = np.unique(sbert_clusters[sbert_clusters >= 0])
    for cluster_label in unique_clusters:
        cluster_texts = [preprocessed_texts[i] for i in range(len(tokenized_texts)) if sbert_clusters[i] == cluster_label]
        tfidf_keywords = extract_tfidf_keywords(cluster_texts, list(stop_words))

        if keywords_per_cluster[cluster_label]['tfidf'][:2] == tfidf_keywords[:2]:
            print("cluster ", cluster_label, " done")
        else:
            print("cluster ", cluster_label, " didn't match")
else:
    print("downloading keybert...")
    keybert_model = KeyBERT(model='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')


    keywords_per_cluster = extract_cluster_keywords(tokenized_texts, sbert_clusters, list(stop_words), keybert_model)

# cached all
np.save("sbert_clusters_final.npy", sbert_clusters)

draw_plot(
umap_embeddings=umap_2d,
clusters=sbert_clusters,
keywords_per_cluster=keywords_per_cluster,
show_group_outlines=True
)

downloading necessary stuff...
loading articles...


[nltk_data] Downloading package punkt to /Users/matjeez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matjeez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/matjeez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


extracting keywords...
cluster  0  didn't match
cluster  1  didn't match
cluster  2  didn't match
cluster  3  didn't match
cluster  4  didn't match
cluster  5  didn't match
cluster  6  didn't match
cluster  7  didn't match
cluster  8  didn't match
cluster  9  didn't match
cluster  10  didn't match
cluster  11  didn't match
cluster  12  didn't match
cluster  13  didn't match
cluster  14  didn't match
