In [3]:
import yaml
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from umap.umap_ import UMAP
import hdbscan
from sklearn.metrics import silhouette_score, davies_bouldin_score
import numpy as np

In [4]:
# Load YAML data
def load_yaml_articles(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        articles = yaml.safe_load(file)
    return articles


articles = load_yaml_articles('articles.yaml')

In [5]:
print(articles[0])

{'authors': ['A. P. J.'], 'date': datetime.datetime(2024, 1, 1, 8, 15, 3), 'lead': 'Egipt, Etiopija, Iran, Savdska Arabija in Združeni arabski emirati so postali polnopravni člani skupine največjih gospodarstev v vzponu (BRICS). Ta se je tako razširila na deset članic.', 'n_comments': 2, 'paragraphs': ['Voditelji dozdajšnjih članic BRICS-a – Brazilije, Rusije, Indije, Kitajske in Južne Afrike – so se o sprejetju novih članic dogovorili na 15. vrhu skupine avgusta lani v Johannesburgu. V skupino so jih povabili šest – poleg peterice, ki se je skupini pridružila danes, še Argentino, ki je pred dnevi uradno sporočila, da se ne bo včlanila.', 'Novi argentinski predsednik Javier Milei je v pismu voditeljem BRICS-a pojasnil, da se stališča nove vlade v številnih pogledih razlikujejo od predhodnih oblasti, tako da trenutno ni pravi čas za pridružitev skupini.', 'Da Argentina ne bo članica skupine, je sicer na omrežju X že konec novembra zapisala nova argentinska zunanja ministrica Diana Mondi

In [6]:
# Extract representative texts from articles
def get_article_text(article):
    text_parts = [
        article.get('title', ''),
        article.get('lead', ''),
        ' '.join(article.get('paragraphs', [])),
    ]
    return ' '.join(text_parts)
    
# Prepare data
texts = [get_article_text(article) for article in tqdm(articles, desc="Processing articles")]


Processing articles: 100%|██████████| 29493/29493 [00:00<00:00, 279994.95it/s]


In [7]:
# ----------- TF-IDF Embeddings ----------- #
print("Generating TF-IDF embeddings...")
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features
tfidf_embeddings = tfidf_vectorizer.fit_transform(texts)

Generating TF-IDF embeddings...


In [8]:
# ----------- SBERT Embeddings ----------- #
#print("Generating SBERT embeddings...")
#sbert_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
#sbert_embeddings = sbert_model.encode(texts, batch_size=64, show_progress_bar=True)

#np.save('sbert_embeddings.npy', sbert_embeddings)

In [9]:
sbert_embeddings = np.load('sbert_embeddings.npy')

In [None]:
# ----------- UMAP Reduction ----------- #
print("Reducing dimensionality with UMAP...")
umap_reducer = UMAP(n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42)

print("UMAP on TF-IDF embeddings...")
tfidf_umap = umap_reducer.fit_transform(tfidf_embeddings)

print("UMAP on SBERT embeddings...")
sbert_umap = umap_reducer.fit_transform(sbert_embeddings)


Reducing dimensionality with UMAP...
UMAP on TF-IDF embeddings...


  warn(
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


UMAP on SBERT embeddings...


: 

t-SNE tends to form tight, visually appealing clusters. It emphasizes local neighborhood relationships strongly. While visually clearer, it can over-emphasize local neighborhoods, potentially fragmenting larger semantic clusters into artificially small subgroups.
UMAP achieves a balance between local and global structure, providing a slightly more faithful semantic representation at the expense of somewhat less visually dramatic cluster separation compared to t-SNE. UMAP embeddings typically preserve semantic continuity and relationships between clusters better than t-SNE.

In [None]:
from sklearn.metrics.pairwise import cosine_distances

# ----------- Clustering with HDBSCAN ----------- #
print("Clustering with HDBSCAN...")
clusterer_tfidf = hdbscan.HDBSCAN(min_cluster_size=70, metric='euclidean')
tfidf_clusters = clusterer_tfidf.fit_predict(tfidf_umap)

cosine_distance_matrix = cosine_distances(sbert_umap)

# Cluster with precomputed cosine distances
clusterer_sbert = hdbscan.HDBSCAN(min_cluster_size=70, metric='precomputed')
sbert_clusters = clusterer_sbert.fit_predict(cosine_distance_matrix)

# ----------- Evaluation Metrics ----------- #
def evaluate_clusters(embeddings_2d, labels, embedding_type):
    mask = labels != -1  # Exclude noise points for fair evaluation
    if np.sum(mask) < 2:
        print(f"{embedding_type}: Not enough clustered points for evaluation.")
        return None, None

    silhouette = silhouette_score(embeddings_2d[mask], labels[mask])
    davies_bouldin = davies_bouldin_score(embeddings_2d[mask], labels[mask])
    print(f"{embedding_type} Silhouette Score: {silhouette:.3f}")
    print(f"{embedding_type} Davies-Bouldin Index: {davies_bouldin:.3f}")
    return silhouette, davies_bouldin

print("\nEvaluating TF-IDF clusters:")
evaluate_clusters(tfidf_umap, tfidf_clusters, "TF-IDF")

print("\nEvaluating SBERT clusters:")
evaluate_clusters(sbert_umap, sbert_clusters, "SBERT")

Clustering with HDBSCAN...


In [40]:
import os
import pandas as pd
import plotly.express as px

def draw_points(fig, subplot_title, col, umap_embeddings, clusters, titles):
    df = pd.DataFrame({
        'x': umap_embeddings[:, 0],
        'y': umap_embeddings[:, 1],
        'cluster': clusters.astype(str),  # Convert clusters to string for categorical color mapping
        'title': titles,
    })

    # Create Plotly interactive visualization
    scatter = px.scatter(
        df, x='x', y='y',
        color='cluster',
        hover_data=['title']
    )

    for trace in scatter.data:
        fig.add_trace(trace, row=1, col=col)

    fig.update_xaxes(title_text="UMAP 1", row=1, col=col)
    fig.update_yaxes(title_text="UMAP 2", row=1, col=col)
    fig.layout.annotations[col - 1].update(text=subplot_title)


In [45]:
print(len(set(sbert_clusters)))
print(len(set(tfidf_clusters)))

52
69


In [44]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

titles = [article.get('title', 'No title') for article in articles]


fig = make_subplots(rows=1, cols=2, subplot_titles=("Embedding 1", "Embedding 2"))

draw_points(fig, "sbert", 1, sbert_umap, sbert_clusters, titles)
draw_points(fig, "tfidf", 2, tfidf_umap, tfidf_clusters, titles)

fig.update_layout(height=500, width=1000, title_text="Comparison of Two Embeddings")


fig.show()

In [48]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import numpy as np

# Setup stopwords & lemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('slovene'))  # change language if necessary
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = re.findall(r'\b\w+\b', text.lower())
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words and len(t) > 2]
    return ' '.join(tokens)

def extract_clean_keywords(texts, clusters, top_n=10):
    unique_clusters = np.unique(clusters[clusters >= 0])

    keywords_per_cluster = {}

    for cluster_label in unique_clusters:
        cluster_texts = [preprocess_text(texts[i]) for i in range(len(texts)) if clusters[i] == cluster_label]

        # TF-IDF on processed texts (stopwords already removed)
        vectorizer = TfidfVectorizer(max_features=top_n)
        tfidf_matrix = vectorizer.fit_transform(cluster_texts)
        keywords = vectorizer.get_feature_names_out()

        keywords_per_cluster[cluster_label] = keywords
        print(f"Cluster {cluster_label} keywords: {', '.join(keywords)}")

    return keywords_per_cluster

# Example Usage:
extract_clean_keywords(texts, sbert_clusters, top_n=10)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matjeez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/matjeez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Cluster 0 keywords: lahko, minuti, slovenija, tekme, tekmi, tekmo, veliko, več, zdaj, zelo
Cluster 1 keywords: dallas, dončić, igre, new, podaj, skokov, tekme, točk, točkami, zadel
Cluster 2 keywords: angeles, boston, edmonton, florida, kopitar, los, minuti, new, nhl, tekme
Cluster 3 keywords: dejal, gaze, gazi, hamas, izrael, izraelska, izraelske, ljudi, več, vojska
Cluster 4 keywords: delnic, delnice, dolarjev, lahko, leta, leto, odstotka, odstotkov, rast, več
Cluster 5 keywords: dosegel, elektrike, kilometrov, kjer, ljudi, najmanj, oblasti, orkan, več, žrtev
Cluster 6 keywords: dirke, dirki, dirko, formule, mesto, norris, red, sezone, verstappen, več
Cluster 7 keywords: lahko, leta, luni, nasa, polet, spacex, starship, vesoljsko, več, video
Cluster 8 keywords: dirke, dirki, franciji, lahko, pogačar, roglič, sekund, več, zdaj, zmago
Cluster 9 keywords: finalu, lahko, mesto, mestu, prevc, svetovnega, tekmi, več, zdaj, zelo
Cluster 10 keywords: fra, igri, krogu, niz, nizu, ru, turnirja

{0: array(['lahko', 'minuti', 'slovenija', 'tekme', 'tekmi', 'tekmo',
        'veliko', 'več', 'zdaj', 'zelo'], dtype=object),
 1: array(['dallas', 'dončić', 'igre', 'new', 'podaj', 'skokov', 'tekme',
        'točk', 'točkami', 'zadel'], dtype=object),
 2: array(['angeles', 'boston', 'edmonton', 'florida', 'kopitar', 'los',
        'minuti', 'new', 'nhl', 'tekme'], dtype=object),
 3: array(['dejal', 'gaze', 'gazi', 'hamas', 'izrael', 'izraelska',
        'izraelske', 'ljudi', 'več', 'vojska'], dtype=object),
 4: array(['delnic', 'delnice', 'dolarjev', 'lahko', 'leta', 'leto',
        'odstotka', 'odstotkov', 'rast', 'več'], dtype=object),
 5: array(['dosegel', 'elektrike', 'kilometrov', 'kjer', 'ljudi', 'najmanj',
        'oblasti', 'orkan', 'več', 'žrtev'], dtype=object),
 6: array(['dirke', 'dirki', 'dirko', 'formule', 'mesto', 'norris', 'red',
        'sezone', 'verstappen', 'več'], dtype=object),
 7: array(['lahko', 'leta', 'luni', 'nasa', 'polet', 'spacex', 'starship',
        've

In [None]:
from transformers import pipeline

# Summarization pipeline
generator = pipeline('text2text-generation', model='google/flan-t5-base')

def keywords_to_summary(keywords):
    prompt = "Summarize the topic: " + ', '.join(keywords)
    summary = generator(prompt, max_length=30, do_sample=False)[0]['generated_text']
    return summary

# Example usage:
keywords = ['energy', 'climate', 'renewable', 'solar', 'investment']
summary = keywords_to_summary(keywords)
print(summary)
