In [1]:
import yaml
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from umap.umap_ import UMAP
import hdbscan
from sklearn.metrics import silhouette_score, davies_bouldin_score
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load YAML data
def load_yaml_articles(filepath):
    """Read a YAML file and return its parsed content.

    Args:
        filepath: Path of the YAML file; it is opened as UTF-8 text.

    Returns:
        Whatever ``yaml.safe_load`` yields for the document. The rest of the
        notebook iterates over it and indexes it like a list of article dicts.
    """
    with open(filepath, mode='r', encoding='utf-8') as yaml_file:
        return yaml.safe_load(yaml_file)


# Parse the article file once; later cells iterate over and index into `articles`.
articles = load_yaml_articles('articles.yaml')

In [3]:
from urllib.parse import urlparse
import re

def extract_topics_from_url(url, skip_last_n=2):
    """Derive topic names from the leading path segments of a URL.

    The URL path is split on "/", empty segments are discarded and the last
    ``skip_last_n`` segments are dropped. In every remaining segment each run
    of hyphens/underscores is replaced by a single space. Digits are left
    untouched.

    Args:
        url (str): URL whose path is inspected.
        skip_last_n (int): Number of trailing path segments to ignore.
            ``0`` keeps every segment.

    Returns:
        list[str]: Cleaned segments in path order; empty when the path has no
        more than ``skip_last_n`` segments.

    Raises:
        ValueError: If ``skip_last_n`` is negative.
    """
    if skip_last_n < 0:
        raise ValueError(f"skip_last_n must be >= 0, got {skip_last_n}")

    path = urlparse(url).path
    segments = [segment for segment in path.strip("/").split("/") if segment]

    # Slice with an explicit end index: a negative-offset slice would turn
    # skip_last_n == 0 into segments[:0], i.e. drop everything.
    kept = segments[:max(len(segments) - skip_last_n, 0)]

    return [re.sub(r'[-_]+', ' ', segment) for segment in kept]


In [4]:
# Assuming articles are loaded like this:
# articles = load_yaml_articles('articles.yaml')

# Attach URL-derived topics to every article. This mutates the dicts inside
# `articles` in place by setting (or overwriting) their 'url_topics' key.
for article in articles:
    article['url_topics'] = extract_topics_from_url(article['url'])

# Spot-check the result on the first article.
print(articles[0]['url_topics'])

['svet', 'preberite tudi']


In [5]:
# Extract representative texts from articles
def get_article_text(article):
    """Concatenate an article's title, lead and paragraphs into one string.

    Args:
        article (dict): Article record. The keys 'title', 'lead' and
            'paragraphs' are all optional; a missing key and a key whose value
            is ``None`` (e.g. an empty YAML field) both contribute an empty
            string instead of raising.

    Returns:
        str: Title, lead and the space-joined paragraphs, joined by single
        spaces. Empty parts are not removed, so their separators remain.
    """
    # `or` also covers keys that exist but hold None, which dict.get's
    # default argument alone would pass through to str.join.
    title = article.get('title') or ''
    lead = article.get('lead') or ''
    paragraphs = article.get('paragraphs') or []
    return ' '.join([title, lead, ' '.join(paragraphs)])
    
# Prepare data: one concatenated text per article, in the same order as
# `articles`, so list positions can be matched with per-article labels later.
texts = [get_article_text(article) for article in tqdm(articles, desc="Processing articles")]


Processing articles: 100%|██████████| 29493/29493 [00:00<00:00, 167513.84it/s]


In [6]:
# ----------- TF-IDF Embeddings ----------- #
print("Generating TF-IDF embeddings...")
# Only max_features is set; no stop-word list is passed to this vectorizer.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features
# Fit on the raw concatenated article texts and keep the resulting matrix.
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

Generating TF-IDF embeddings...


In [9]:
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

# TODO: report
# Earlier variant, kept for reference: densify the TF-IDF matrix, then PCA.
# tfidf_dense = tfidf_matrix.toarray()

# # Reduce dimensionality to something manageable (e.g. 50D)
# pca = PCA(n_components=50, random_state=42)
# tfidf_pca = pca.fit_transform(tfidf_dense)

# Current variant: TruncatedSVD is fitted on `tfidf_matrix` directly (no
# .toarray() step) and keeps 100 components; random_state is fixed.
svd = TruncatedSVD(n_components=100, random_state=42)
tfidf_reduced = svd.fit_transform(tfidf_matrix)

In [10]:
from sklearn.manifold import TSNE

# Project the SVD-reduced TF-IDF vectors to 2-D for plotting.
# The iteration count is intentionally left at the estimator default of 1000
# (the value that used to be passed explicitly). The keyword was renamed from
# `n_iter` to `max_iter` in scikit-learn 1.5 and `n_iter` was removed later,
# so naming neither keeps this cell working on old and new releases alike.
tsne = TSNE(n_components=2, perplexity=40, random_state=42)
tfidf_tsne_2d = tsne.fit_transform(tfidf_reduced)



### FILL EVERYTHING

In [8]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

def fix_cluster_noise_with_knn(embeddings, cluster_labels, n_neighbors=5):
    """
    Reassigns HDBSCAN noise points (-1 labels) using KNN fallback.

    A KNeighborsClassifier is fitted on the points that already carry a
    cluster label and then predicts a label for every point labelled -1.

    Args:
        embeddings (array-like): One feature row per point (e.g. reduced
            embeddings). Converted with ``np.asarray`` so lists work too.
        cluster_labels (array-like): Cluster label per point; -1 marks noise.
        n_neighbors (int): Number of neighbors for KNN. Capped at the number
            of labelled points so small inputs do not make KNN fail.

    Returns:
        np.ndarray: A new label array in which every -1 is replaced by a
        predicted cluster. The labels are returned unchanged when there is no
        noise, and also when *every* point is noise (there is nothing to
        learn from, so the -1 labels remain).
    """
    embeddings = np.asarray(embeddings)
    cluster_labels = np.array(cluster_labels)  # copy: the caller's array is never modified
    noise_mask = cluster_labels == -1

    if not np.any(noise_mask):
        print("✅ No noise points to reassign. Returning original labels.")
        return cluster_labels

    if np.all(noise_mask):
        # No labelled point exists, so a classifier cannot be trained.
        print("⚠️ Every point is noise; nothing to learn from. Returning original labels.")
        return cluster_labels

    print(f"🔄 Reassigning {np.sum(noise_mask)} noise points using KNN...")

    # Prepare training data (only confidently labeled points)
    X_train = embeddings[~noise_mask]
    y_train = cluster_labels[~noise_mask]

    # KNN cannot query more neighbours than it has training samples.
    effective_neighbors = min(n_neighbors, len(X_train))

    # Train KNN and predict noise points
    knn = KNeighborsClassifier(n_neighbors=effective_neighbors)
    knn.fit(X_train, y_train)

    reassigned = knn.predict(embeddings[noise_mask])

    # Merge reassigned labels back into full label array
    final_labels = cluster_labels.copy()
    final_labels[noise_mask] = reassigned

    print("✅ All points now have a cluster assignment.")
    return final_labels


In [18]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.preprocessing import LabelEncoder

# ----------- Clustering with HDBSCAN ----------- #
print("Clustering with HDBSCAN...")
# NOTE(review): TruncatedSVD is already imported in an earlier cell and is not
# used in this cell; this re-import looks redundant.
from sklearn.decomposition import TruncatedSVD

# Cluster the SVD-reduced TF-IDF vectors (`tfidf_reduced`), not the 2-D t-SNE
# projection. min_cluster_size is the only setting passed explicitly.
clusterer = hdbscan.HDBSCAN(min_cluster_size=30)
# One label per row of `tfidf_reduced`; other code in this notebook treats
# the label -1 as noise.
tfidf_clusters = clusterer.fit_predict(tfidf_reduced)

# ----------- Evaluation Metrics ----------- #
def evaluate_clusters(embeddings_2d, labels, embedding_type):
    """Print and return internal cluster-quality scores, ignoring noise points.

    Args:
        embeddings_2d: Feature rows, one per point, indexable with a boolean
            mask (e.g. a NumPy array).
        labels (array-like): Cluster label per point. Points labelled -1 are
            treated as noise and excluded from both scores.
        embedding_type (str): Prefix used in the printed messages.

    Returns:
        tuple: ``(silhouette, davies_bouldin)``, or ``(None, None)`` when the
        scores cannot be computed: fewer than two clustered points, fewer
        than two clusters, or as many clusters as clustered points.
    """
    labels = np.asarray(labels)  # allows plain lists; needed for the mask below
    mask = labels != -1  # Exclude noise points for fair evaluation
    n_points = int(np.sum(mask))
    if n_points < 2:
        print(f"{embedding_type}: Not enough clustered points for evaluation.")
        return None, None

    clustered_points = embeddings_2d[mask]
    clustered_labels = labels[mask]

    # Both metrics require 2 <= n_clusters <= n_points - 1 and raise a
    # ValueError otherwise, so check the cluster count, not just the points.
    n_clusters = len(np.unique(clustered_labels))
    if n_clusters < 2 or n_clusters >= n_points:
        print(
            f"{embedding_type}: Found {n_clusters} cluster(s) for {n_points} points; "
            "scores need 2 <= clusters < points."
        )
        return None, None

    silhouette = silhouette_score(clustered_points, clustered_labels)
    davies_bouldin = davies_bouldin_score(clustered_points, clustered_labels)
    print(f"{embedding_type} Silhouette Score: {silhouette:.3f}")
    print(f"{embedding_type} Davies-Bouldin Index: {davies_bouldin:.3f}")
    return silhouette, davies_bouldin

#print("\nEvaluating TF-IDF clusters:")
#evaluate_clusters(tfidf_embeddings, tfidf_clusters, "TF-IDF")

#print("\nEvaluating SBERT clusters:")
#evaluate_clusters(sbert_embeddings, sbert_clusters, "SBERT")

# Reference labels: each article's URL topics joined with "/" ("no-topic" when
# the list is empty), integer-encoded for comparison with the cluster ids.
topic_strings = [
    "/".join(article['url_topics']) if article['url_topics'] else "no-topic"
    for article in articles
]
le = LabelEncoder()
topic_labels = le.fit_transform(topic_strings)

# Evaluate clustering
# NOTE(review): `tfidf_clusters` is passed unfiltered, so points labelled -1
# are scored as if they formed a cluster of their own, whereas
# evaluate_clusters masks them out. Confirm this is intended.
ari_score = adjusted_rand_score(topic_labels, tfidf_clusters)
nmi_score = normalized_mutual_info_score(topic_labels, tfidf_clusters)
print(ari_score, nmi_score)

Clustering with HDBSCAN...
-0.0008520930430855831 0.010570438559168286


In [13]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import alphashape
from shapely.geometry import Polygon, MultiPolygon

def add_alpha_shape_outline(fig, points, group_label, col, alpha=0.2, opacity=0.1):
    """Add the alpha-shape boundary of ``points`` to row 1 of a subplot figure.

    The shape returned by ``alphashape.alphashape(points, alpha)`` is drawn
    only when it is a Polygon or a MultiPolygon; one dotted, lightly filled
    trace is added per polygon exterior. Any exception raised along the way
    is caught and reported with a printed warning instead of propagating.

    Args:
        fig: Figure the outline traces are added to.
        points: Point coordinates handed to ``alphashape.alphashape``.
        group_label (str): Used as trace name, legend group and hover text.
        col (int): Subplot column to draw into (the row is always 1).
        alpha: Alpha parameter forwarded to ``alphashape.alphashape``.
        opacity: Alpha channel of the black fill colour.

    Returns:
        None
    """
    try:
        hull = alphashape.alphashape(points, alpha)
        if hull is None:
            return

        # Normalise to a list of polygons; other geometry types are skipped.
        if isinstance(hull, MultiPolygon):
            pieces = list(hull.geoms)
        elif isinstance(hull, Polygon):
            pieces = [hull]
        else:
            return

        for piece in pieces:
            xs, ys = (list(axis) for axis in piece.exterior.coords.xy)
            outline = go.Scattergl(
                x=xs,
                y=ys,
                mode='lines',
                line=dict(color='black', width=1, dash='dot'),
                fill='toself',
                fillcolor=f'rgba(0,0,0,{opacity})',
                name=group_label,
                legendgroup=group_label,
                hoverinfo='text',
                text=[group_label] * len(xs),
                showlegend=False
            )
            fig.add_trace(outline, row=1, col=col)
    except Exception as err:
        print(f"⚠️ Could not draw outline for group '{group_label}': {err}")

def draw_points(
    fig,
    subplot_title,
    col,
    umap_embeddings,
    clusters,
    titles,
    categories=None,
    rename_map=None,
    show_group_outlines=True,
    outline_opacity=0.05,
    outline_alpha=5,
    axis_label="UMAP",
    max_label_len=20,
):
    """Scatter 2-D points into one column of a single-row subplot figure.

    Points are split into groups and every group becomes one marker trace
    (hover disabled). The group of a point is chosen as follows:

    * ``rename_map`` given and non-empty: the mapped name of the point's
      cluster id, truncated to ``max_label_len`` characters; ids missing from
      the map become "Other / Noise". Names that are identical after
      truncation end up in the same group.
    * otherwise ``categories`` given: the category converted to ``str``.
    * otherwise: the cluster id converted to ``str``.

    Args:
        fig: Subplot figure to draw into; row 1 is always used.
        subplot_title (str): Text written to ``fig.layout.annotations[col - 1]``.
        col (int): 1-based subplot column.
        umap_embeddings (array-like): Coordinates of shape (n, 2). Despite the
            parameter name, any 2-D projection can be passed.
        clusters (array-like): Cluster id per point.
        titles (sequence): Title per point; stored with the points but not
            shown, because hover is disabled on the marker traces.
        categories (sequence, optional): Alternative grouping per point.
        rename_map (dict, optional): Maps integer cluster id to display name.
        show_group_outlines (bool): Draw an alpha-shape outline around every
            group that has at least three points.
        outline_opacity (float): Fill opacity of the outlines.
        outline_alpha: Alpha parameter of the outline shapes.
        axis_label (str): Axis title prefix; axes read "<axis_label> 1" and
            "<axis_label> 2". Pass e.g. "t-SNE" for t-SNE coordinates.
        max_label_len (int or None): Maximum length of ``rename_map`` names;
            ``None`` disables truncation.

    Returns:
        None
    """
    # Accept plain lists as well as arrays for the positional data.
    umap_embeddings = np.asarray(umap_embeddings)
    clusters = np.asarray(clusters)

    df = pd.DataFrame({
        'x': umap_embeddings[:, 0],
        'y': umap_embeddings[:, 1],
        'cluster': clusters.astype(str),
        'title': titles,
    })

    if rename_map:
        df['group'] = [
            rename_map.get(int(c), "Other / Noise")[:max_label_len]
            for c in clusters
        ]
    elif categories is not None:
        df['group'] = pd.Series(categories).astype(str)
    else:
        df['group'] = df['cluster']

    # Sorted so trace and legend order do not depend on row order.
    unique_groups = sorted(df['group'].unique())

    for group in unique_groups:
        sub_df = df[df['group'] == group]
        fig.add_trace(
            go.Scattergl(
                x=sub_df['x'],
                y=sub_df['y'],
                mode='markers',
                name=group,
                marker=dict(size=4),
                legendgroup=group,
                showlegend=True,
                hoverinfo='skip',  # ⛔ disables hover for points
            ),
            row=1,
            col=col
        )

        # Outlines are only attempted for groups with at least three points.
        if show_group_outlines and len(sub_df) >= 3:
            points = sub_df[['x', 'y']].to_numpy(dtype=np.float32)
            add_alpha_shape_outline(
                fig, points, group, col, alpha=outline_alpha, opacity=outline_opacity
            )

    fig.update_xaxes(title_text=f"{axis_label} 1", row=1, col=col)
    fig.update_yaxes(title_text=f"{axis_label} 2", row=1, col=col)
    # The subplot title lives in the figure's annotation list, one per column.
    fig.layout.annotations[col - 1].update(text=subplot_title)


In [19]:
# Number of distinct labels in `tfidf_clusters`; if any point carries the
# label -1, that label is counted as well.
print(len(set(tfidf_clusters)))

4


In [15]:
import json

def load_combined_jsonl(filename="preprocessed_combined.jsonl"):
    """Parse a JSON Lines file into a list.

    Args:
        filename (str): Path of the file, read as UTF-8 text.

    Returns:
        list: One decoded JSON value per non-blank line, in file order.
        Lines that contain only whitespace are skipped.
    """
    records = []
    with open(filename, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            payload = raw_line.strip()
            if payload:
                records.append(json.loads(payload))
    return records

# Load the preprocessed texts from the default JSONL file and report the count.
# NOTE(review): a later cell indexes this list with positions taken from the
# cluster labels, so it is presumably aligned 1:1 with `articles` — confirm.
preprocessed_texts = load_combined_jsonl()
print(f"✅ Loaded {len(preprocessed_texts)} preprocessed articles.")


✅ Loaded 29493 preprocessed articles.


In [16]:
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# You should pass in a stopword list — ideally Slovene
import nltk
from nltk.corpus import stopwords

nltk.download('punkt')

# Set up stop words.
# NOTE(review): no lemmatizer is created in this cell, and of the three
# downloads only 'stopwords' is used by the code visible here — confirm
# whether 'punkt' and 'wordnet' are still needed.
nltk.download('stopwords')
nltk.download('wordnet')

# Slovene stop-word list from NLTK, kept as a set.
stop_words = set(stopwords.words('slovene'))
def extract_cluster_keywords(texts, clusters, stop_words=list(stop_words), top_n=20, ngram_range=(1, 2), titles=None):
    """
    Extracts top n-grams (unigrams + bigrams) from preprocessed texts by cluster.

    For every non-negative cluster label a separate TfidfVectorizer is fitted
    on the documents of that cluster and the terms with the highest mean
    TF-IDF weight are kept. The keywords and up to five sample titles are
    printed for each cluster.

    Args:
        texts (sequence of str): The documents that are vectorized, aligned by
            position with ``clusters``. Pass the preprocessed texts here; the
            argument itself is used, not a notebook-level variable.
        clusters (array-like of int): Cluster label per document. Negative
            labels are skipped.
        stop_words (list[str]): Terms excluded from the vocabulary. The
            default is a snapshot of the module-level ``stop_words`` taken
            when this function is defined. Sets and tuples are converted to
            lists.
        top_n (int): Number of keywords kept per cluster.
        ngram_range (tuple): n-gram range forwarded to the vectorizer.
        titles (sequence of str, optional): Title per document for the sample
            printout. When omitted, ``articles[i]['title']`` of the
            notebook-level ``articles`` list is used (legacy behaviour).

    Returns:
        dict[int, list[str]]: Cluster label -> keywords, best first.

    Raises:
        ValueError: If ``texts`` and ``clusters`` differ in length.
    """
    clusters = np.asarray(clusters)  # plain lists would break the masks below
    if len(texts) != len(clusters):
        raise ValueError(
            f"texts ({len(texts)}) and clusters ({len(clusters)}) must have the same length"
        )
    if isinstance(stop_words, (set, frozenset, tuple)):
        stop_words = list(stop_words)

    keywords_per_cluster = {}

    for cluster_label in np.unique(clusters[clusters >= 0]):
        # Positions of the documents that belong to this cluster.
        member_idx = np.flatnonzero(clusters == cluster_label)
        cluster_texts = [texts[i] for i in member_idx]

        # TF-IDF with stopword filtering and n-grams
        vectorizer = TfidfVectorizer(
            max_features=1000,
            stop_words=stop_words,
            ngram_range=ngram_range,
            token_pattern=r'\b\w{3,}\b',  # only tokens with 3+ letters
        )
        tfidf_matrix = vectorizer.fit_transform(cluster_texts)
        feature_array = np.array(vectorizer.get_feature_names_out())
        tfidf_scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()

        # Top N terms by TF-IDF score
        top_indices = tfidf_scores.argsort()[::-1][:top_n]
        top_keywords = feature_array[top_indices]

        # Plain int keys instead of NumPy scalars.
        keywords_per_cluster[int(cluster_label)] = list(top_keywords)
        print(f"🧠 Cluster {cluster_label} → {', '.join(top_keywords)}")
        for i in member_idx[:5]:
            sample_title = titles[i] if titles is not None else articles[i]['title']
            print("----", sample_title)

    return keywords_per_cluster

# Label from keywords
def keywords_to_sentence(keywords, lang='sl'):
    """Turn a keyword list into a short human-readable label.

    Args:
        keywords (sequence of str): Keywords, most important first. May be
            empty or ``None``.
        lang (str): ``'sl'`` selects Slovene wording; any other value selects
            English wording.

    Returns:
        str: A fallback label when there are no keywords, "Tematika: <kw>" /
        "Topic: <kw>" for a single keyword, otherwise the keywords joined as
        "a, b in c" (Slovene) or "a, b and c" (English).
    """
    slovene = lang == 'sl'

    # len() instead of truthiness so NumPy arrays are accepted as well.
    if keywords is None or len(keywords) == 0:
        return "Drugo / Brez teme" if slovene else "Other / No topic"

    if len(keywords) == 1:
        return f"Tematika: {keywords[0]}" if slovene else f"Topic: {keywords[0]}"

    # "in" is the Slovene word for "and"; use it only for Slovene output.
    conjunction = "in" if slovene else "and"
    return f"{', '.join(keywords[:-1])} {conjunction} {keywords[-1]}"



[nltk_data] Downloading package punkt to /Users/matjeez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matjeez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/matjeez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
# categories = [
#     article['url_topics'][0] if len(article['url_topics']) >= 1 else "no-topic"
#     for article in articles
# ]
# categories = [
#     "/".join(article['url_topics']) if article['url_topics'] else "no-topic"
#     for article in articles
# ]

from plotly.subplots import make_subplots

# One row, two panels. Only column 1 is drawn in this cell: the draw_points
# calls for column 2 are commented out below, so the right panel stays empty.
fig = make_subplots(rows=1, cols=2, subplot_titles=("Clusters", "Categories"))

#outliers = detect_2d_outliers(sbert_umap, generalized_topic_labels, max_percentile=80)

# Filter or annotate
# filtered_points = ~outliers
# titles_filtered = np.array([article['title'] for article in articles])[filtered_points]
# clusters_filtered = np.array(sbert_clusters)[filtered_points]
# categories_filtered = np.array(generalized_topic_labels)[filtered_points]
# sbert_umap_filtered = sbert_umap[filtered_points]


# 1. Extract keywords per cluster
# The keywords are mined from the preprocessed texts, so pass those explicitly
# rather than the raw `texts` that were used to build the TF-IDF embedding.
# Both lists hold one entry per article.
keywords_per_cluster = extract_cluster_keywords(preprocessed_texts, tfidf_clusters)

# 2. Summarize them: one display name per cluster id, built from its keywords.
cluster_name_map = {
    cluster_id: keywords_to_sentence(keywords)  # or keywords_to_summary()
    for cluster_id, keywords in keywords_per_cluster.items()
}

# Left: Clusters
# NOTE(review): the coordinates passed as `umap_embeddings` are the t-SNE
# projection (`tfidf_tsne_2d`), while draw_points titles the axes "UMAP 1" /
# "UMAP 2" by default.
draw_points(fig, "Clusters", col=1, 
            umap_embeddings=tfidf_tsne_2d,
            #umap_embeddings=tsne_2d,
            clusters=tfidf_clusters,
            titles=[article['title'] for article in articles],
            rename_map=cluster_name_map,#cluster_name_map,
            show_group_outlines=True)

# Right: Categories
# draw_points(fig, "Categories", col=2, 
#             umap_embeddings=sbert_umap,
#             #umap_embeddings=tsne_2d,
#             clusters=sbert_clusters,  # still required for shape
#             titles=[article['title'] for article in articles],
#             categories=generalized_topic_labels,
#             show_group_outlines=False)
# draw_points(fig, "Clusters", col=2, 
#             umap_embeddings=sbert_umap,
#             #umap_embeddings=tsne_2d,
#             clusters=sbert_clusters,
#             titles=[article['title'] for article in articles],
#             categories=None,
#             show_group_outlines=True)

# Figure-level layout: fixed 1000x500 canvas, overall title and legend heading.
fig.update_layout(
    height=500,
    width=1000,
    title_text="Clusters vs. Categories",
    legend_title_text="Legend",
)

# Render the figure in the notebook output.
fig.show()


🧠 Cluster 0 → športen, idol, športen idol, hobi, klubski, družben omrežje, velik uspeh, velik, reprezentančen klubski, reprezentančen, uspeh reprezentančen, uspeh, družben, omrežje, prisotnost, prisotnost družben, instagram, rokomet, rokomet hobi, omrežje instagram
---- Tamara Mavsar
---- Tilen Kodrin
---- Jani Kovačič
---- Borut Mačkovšek
---- Alen Pajenk
🧠 Cluster 1 → leto, tekma, mesto, slovenija, dober, nov, slovenski, velik, minuta, človek, država, točka, igra, sezona, dan, zadnji, film, čas, delo, svet
---- Skupina BRICS odslej z novimi polnopravnimi članicami
---- Po odstopih Urevc in Ličefa edina slovenska predstavnica ostaja Mandeljc 
---- V veljavi podražitve napotitev delavcev
---- Prenehala je obstajati separatistična republika Gorski Karabah
---- Boston za vodilnimi Rangersi zaostaja samo še točko
🧠 Cluster 2 → evro, liter, evro liter, cena, olje, kurilen olje, kurilen, bencin, gorivo, centa, naften, derivat, naften derivat, dizelski gorivo, dizelski, oktanski, oktanski be

