In [2]:
import polars as pl
from pathlib import Path
from loguru import logger
from bertopic import BERTopic
# import umap.umap_ as umap
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
import numpy as np
from src.utils import StopwordCollector
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Load the pre-computed embeddings for every embedding model under comparison.
# Paths are built with pathlib so they resolve on every OS; the previous raw
# strings with backslashes only worked on Windows.
model_embeddings = {
    name: {'data': pl.read_parquet(Path('data') / f'embeddings_{name}.parquet')}
    for name in ('text-embedding-3-large', 'all-mpnet-base-v2', 'all-MiniLM-L12-v2')
}

# Long format: one row per chunk instead of one row per document.
for k, v in model_embeddings.items():
    v['data_v'] = v['data'].select(['id', 'chunks', 'chunks_embeddings']).explode(['chunks', 'chunks_embeddings'])

# Stopwords are derived from the raw documents and their authors; they are only
# consumed by the CountVectorizer when a model has to be fitted below.
stopwords = StopwordCollector(
    texts=model_embeddings['text-embedding-3-large']['data']['content'].to_list(),
    authors=model_embeddings['text-embedding-3-large']['data']['author'].to_list()).get_stopwords()

# Load each topic model from disk when it exists, otherwise fit and save it.
# Previously BERTopic.load() ran unconditionally, so a missing file raised
# before the fitting branch could ever be reached, and the guard only looked
# at a single model.
for k, v in model_embeddings.items():
    model_path = Path(f'models/{k}_topic_model.pkl')

    if model_path.exists():
        # NOTE(review): BERTopic.load unpickles the file - only load models you created yourself.
        v['topic_model'] = BERTopic.load(model_path)
        continue

    logger.info(f'Fitting BERTopic model for {k}...')

    # Fresh sub-models per topic model: BERTopic keeps a reference to the
    # objects it is given, so sharing one UMAP/HDBSCAN/vectorizer instance
    # would let a later fit overwrite the state of an earlier in-memory model.
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
    hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    vectorizer = CountVectorizer(
        stop_words=stopwords,
        ngram_range=(1, 3),
        min_df=3,
        max_df=0.8)
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

    temp_topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model, vectorizer_model=vectorizer, ctfidf_model=ctfidf_model)
    docs = v['data_v']['chunks'].to_list()
    embeddings = np.array([np.array(sub) for sub in v['data_v']['chunks_embeddings'].to_list()], dtype=np.float64)
    topics, probs = temp_topic_model.fit_transform(docs, embeddings)
    v['topics'] = topics
    v['probs'] = probs
    v['topic_model'] = temp_topic_model

    model_path.parent.mkdir(parents=True, exist_ok=True)
    v['topic_model'].save(model_path)
    logger.info(f'Model saved to {model_path}')

In [48]:
def assign_topic_per_doc(df:pl.DataFrame): 
    """
    Assign one dominant topic per document id based on chunk-level topic probabilities.

    For every document the probabilities of its chunks are summed per topic and
    the topic with the highest total wins (on a tie, the topic seen first wins).
    Chunks assigned to topic -1 are always skipped, as are rows whose topic or
    probability is null. A document whose chunks are all skipped does not
    appear in the result.

    Parameters:
    - df: polars.DataFrame with columns ["id", "topic", "topic_probability"]

    Returns:
    - res: polars.DataFrame with columns ["id", "dominant_topic"]
      (cast to Int64 and Int32 respectively)
    """
    topic_scores = defaultdict(lambda: defaultdict(float))  # id -> topic -> summed score

    # iter_rows keeps the native Python types; going through to_numpy() would
    # coerce the integer ids/topics to float64 alongside the probabilities.
    for doc_id, topic, prob in df.select(["id", "topic", "topic_probability"]).iter_rows():
        if topic is None or prob is None or topic == -1:
            continue
        topic_scores[doc_id][topic] += prob

    # Select the topic with max summed score for each doc_id
    dominant_topics = {
        doc_id: max(scores, key=scores.get)
        for doc_id, scores in topic_scores.items()
    }

    res = pl.DataFrame({
        "id": list(dominant_topics.keys()),
        "dominant_topic": list(dominant_topics.values())
    }).cast({"id": pl.Int64, "dominant_topic": pl.Int32})
    return res

In [8]:
# assign_topic_per_doc needs "topic" and "topic_probability" columns, but data_v
# only carries id/chunks/chunks_embeddings. Attach the chunk-level assignments
# stored on the fitted model first.
# NOTE(review): assumes the topic model was fitted on exactly these chunks in this
# order and without calculate_probabilities=True (so probabilities_ is 1-D) - confirm.
mpnet = model_embeddings['all-mpnet-base-v2']
chunk_topics = mpnet['data_v'].select('id').with_columns([
    pl.Series('topic', mpnet['topic_model'].topics_),
    pl.Series('topic_probability', mpnet['topic_model'].probabilities_),
])
dominant_topic_per_doc = assign_topic_per_doc(chunk_topics)

NameError: name 'assign_topic_per_doc' is not defined

In [61]:
# Document-level frame for the all-mpnet-base-v2 model: attach each document's
# dominant topic (left join keeps documents without one) and store the date
# column as a millisecond-precision datetime.
df = (
    model_embeddings['all-mpnet-base-v2']['data']
    .join(dominant_topic_per_doc, on='id', how='left')
    .with_columns([pl.col("date").cast(pl.Datetime("ms")).alias("date")])
)

In [None]:
import polars as pl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from collections import Counter

date_col: str = "date"
topic_col: str = "dominant_topic"
k: int = 5

# Step 1: Find the top-k most common topics.
# Documents without a dominant topic (null after the left join) are excluded so
# that "None" cannot take one of the k slots and show up as a flat zero line.
topk_topics = [
    topic for topic, _ in Counter(df[topic_col].drop_nulls().to_list()).most_common(k)
]

# Step 2: Add month column (dates truncated to the first of their month)
df = df.with_columns([
    pl.col(date_col).dt.truncate("1mo").alias("month")
])

# Step 3: Compute total document count per month (all documents, any topic)
# NOTE(review): newer polars releases deprecate pl.count() in favour of pl.len();
# switch once the minimum supported polars version is known.
monthly_total = (
    df
    .group_by("month")
    .agg(pl.count().alias("total_docs"))
)

# Step 4: Compute topic count per month for top-k topics
filtered_df = (
    df
    .filter(pl.col(topic_col).is_in(topk_topics))
    .group_by(["month", topic_col])
    .agg(pl.count().alias("topic_count"))
)

# Step 5: Join total counts and compute percentage
joined = (
    filtered_df
    .join(monthly_total, on="month")
    .with_columns((
        (pl.col("topic_count") / pl.col("total_docs") * 100).alias("percentage")
    ))
    .sort(["month", topic_col])
)

# Step 6: Build month -> topic -> percentage dictionary.
# Rows are read by column name so the lookup does not depend on column order.
date_topic_percent = {}
for row in joined.iter_rows(named=True):
    date_topic_percent.setdefault(row["month"], {})[row[topic_col]] = row["percentage"]

# All unique sorted months
all_months = sorted(date_topic_percent.keys())

# Prepare one series per topic; months in which a topic never occurs count as 0 %
topic_series = {topic: [] for topic in topk_topics}
for month in all_months:
    topic_counts = date_topic_percent.get(month, {})
    for topic in topk_topics:
        topic_series[topic].append(topic_counts.get(topic, 0))

# Step 7: Plot
fig, ax = plt.subplots(figsize=(12, 6))
for topic, percentages in topic_series.items():
    ax.plot(all_months, percentages, label=f"Topic {topic}")

ax.set_xlabel("Month")
ax.set_ylabel("Relative Frequency (%)")
ax.set_title(f"Top-{k} Topics as % of Monthly Docs")
ax.legend(title="Topic")
ax.grid(True)
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m"))
ax.tick_params(axis="x", labelrotation=45)
# tight_layout runs last so it accounts for the rotated tick labels
fig.tight_layout()
plt.show()


In [68]:
# Per-topic overview (count, name, representation, representative docs) of the
# all-mpnet-base-v2 topic model; the row with Topic == -1 is the noise/outlier group.
model_embeddings['all-mpnet-base-v2']['topic_model'].get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,17233,-1_emu_pandemic_mandate_mediumterm,"[emu, pandemic, mandate, mediumterm, pact, sho...","[ By contrast, we can observe a lot of mutual ..."
1,0,1083,0_rd_education_workers_ict,"[rd, education, workers, ict, lisbon, employme...",[ The last decades have seen an enormous incre...
2,1,1038,1_basically_qe_answer_actually,"[basically, qe, answer, actually, eurozone, yo...","[, the transparent and consistent implementati..."
3,2,826,2_green_climaterelated_carbon_emissions,"[green, climaterelated, carbon, emissions, env...","[ part, those transition paths require carbon ..."
4,3,745,3_mr_translation_translation mr_chairman,"[mr, translation, translation mr, chairman, ma...",[ that our monetary policy strategy is of a fo...
...,...,...,...,...,...
234,233,15,233_contagion_tarp_unpalatable_taxpayers ultim...,"[contagion, tarp, unpalatable, taxpayers ultim...",[ these questions. These issues were the focus...
235,234,15,234_bulgarian_bulgaria_governor iskrov_iskrov,"[bulgarian, bulgaria, governor iskrov, iskrov,...",[ forms a good example of what such cooperatio...
236,235,15,235_broad aggregates_ongoing tensions_m1_borro...,"[broad aggregates, ongoing tensions, m1, borro...",[ Regarding risks to price stability over the ...
237,236,15,236_hiking cycles_hiking_cumulative percentage...,"[hiking cycles, hiking, cumulative percentage,...",[ decisions affect the wider economy – a proce...


In [11]:
import re
import string
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np

# Step 1: Text preprocessing
def preprocess_texts(texts, stopword_list):
    """
    Lowercase, strip punctuation from, tokenize and filter a collection of texts.

    Parameters:
    - texts: iterable of strings
    - stopword_list: iterable of tokens to drop; tokens are lowercased before
      the comparison, so entries are expected in lowercase

    Returns:
    - list of token lists, one per input text, containing only alphabetic
      tokens that are not stopwords
    """
    # Loop-invariant work is done once: the punctuation pattern is compiled up
    # front and the stopwords are held in a set for constant-time membership
    # tests even when the caller passes a list.
    punctuation_re = re.compile(rf"[{re.escape(string.punctuation)}]")
    if isinstance(stopword_list, (set, frozenset)):
        stopword_set = stopword_list
    else:
        stopword_set = set(stopword_list)

    cleaned_texts = []
    for text in texts:
        # Lowercase, then replace punctuation with spaces so words stay separated
        text = punctuation_re.sub(" ", text.lower())
        # Tokenize
        tokens = word_tokenize(text)
        # Remove stopwords and non-alphabetic tokens
        cleaned_texts.append([t for t in tokens if t.isalpha() and t not in stopword_set])
    return cleaned_texts

# Step 2: Train Word2Vec model
from gensim.models import Word2Vec

def train_word2vec(cleaned_texts, vector_size=100, window=5, min_count=2, workers=4, epochs=20):
    """
    Train a skip-gram Word2Vec model on pre-tokenized texts.

    Parameters:
    - cleaned_texts: sized collection of token lists (one list per text)
    - vector_size, window, min_count, workers: forwarded to the Word2Vec constructor
    - epochs: number of training passes over the corpus

    Returns:
    - the trained Word2Vec model
    """
    w2v = Word2Vec(vector_size=vector_size, window=window, min_count=min_count, workers=workers, sg=1)  # sg=1 -> skip-gram
    w2v.build_vocab(cleaned_texts)

    # Corpus size, passed to train() both as sentence count and as token count
    n_sentences = len(cleaned_texts)
    n_tokens = sum(map(len, cleaned_texts))
    w2v.train(cleaned_texts, total_examples=n_sentences, total_words=n_tokens, epochs=epochs)

    return w2v


# Step 3: Generate text embeddings by averaging word vectors
def get_text_embeddings(texts, model):
    """
    Embed each tokenized text as the mean of its known word vectors.

    Parameters:
    - texts: iterable of token lists
    - model: object exposing `wv` (supports `in` and item lookup by token)
      and `vector_size`

    Returns:
    - list with one vector per text; a text without any token known to
      `model.wv` gets a zero vector of length `model.vector_size`
    """
    def _embed(tokens):
        known = [model.wv[tok] for tok in tokens if tok in model.wv]
        if not known:
            # fallback if no known words
            return np.zeros(model.vector_size)
        return np.mean(known, axis=0)

    return [_embed(tokens) for tokens in texts]

# === Example Usage ===
# Download the NLTK resources this cell relies on: 'punkt' for word_tokenize and
# 'stopwords' for stopwords.words(); without the latter a fresh environment
# raises LookupError.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' - confirm
# against the installed version.
import nltk
nltk.download('punkt')
nltk.download('stopwords')

texts = model_embeddings['all-mpnet-base-v2']['data']['content'].to_list()

# Define your stopword list
custom_stopwords = set(stopwords.words("english"))  # or provide your own list

# Clean & tokenize
cleaned = preprocess_texts(texts, custom_stopwords)

# Train model
w2v_model = train_word2vec(cleaned, vector_size=100)

# Get embeddings
embeddings = get_text_embeddings(cleaned, w2v_model)

# embeddings is now a list with one np.ndarray of length vector_size per text



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\povhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
# Sanity check: a document embedding has vector_size (= 100) dimensions
embeddings[0].shape

(100,)

In [None]:
from pathlib import Path
import polars as pl
from loguru import logger
from src.utils import StopwordCollector
from src.evaluation import topic_quality

# Directory of the project's `src` folder; the data and models folders are resolved
# relative to its parent.
# NOTE(review): machine-specific absolute path - this only works on the original
# author's computer. Inside a module use the commented-out Path(__file__).parent;
# in a notebook make it configurable.
FILE_DIR = Path(r'C:\Users\povhi\OneDrive\01_Uni\03_TU Wien\MSc Data Science\09_Interdisciplinary_Project\ECB_Topic_Chronicles\src') # Path(__file__).parent

class BERTopicPipeline(): 
    """
    Load pre-computed chunk embeddings for one embedding model and fit, load and
    evaluate a BERTopic model on them.

    Attributes set in __init__:
    - embedding_model_name: name used to locate the embedding and model files
    - data_dir_path: `<project root>/data`
    - data: document-level frame read from the embeddings parquet file
    - data_long: chunk-level frame (one row per chunk)

    Attributes set later:
    - topic_model: set by create_bert_model() or load_existing_model()
    - docs, embeddings: set by create_bert_model() or get_topic_quality()
    - topics, probs: set by create_bert_model()
    - stopwords: cached by get_stopwords()
    """
    def __init__(self, embedding_model_name:str): 
        self.embedding_model_name = embedding_model_name
        self.data_dir_path = FILE_DIR.parent / 'data'
        self.data = self._load_embedding_data()
        self.data_long = self._long_format(self.data)

    def _long_format(self, 
                     data: pl.DataFrame, 
                     select_cols:list= ['id', 'chunks', 'chunks_embeddings'],
                     explode_columns:list = ['chunks', 'chunks_embeddings']) -> pl.DataFrame:
        """
        Convert the data to long format, which means the explode columns are expanded into multiple rows.

        The default lists are only read, never mutated.
        """
        return data.select(select_cols).explode(explode_columns)
    
    def _load_embedding_data(self) -> pl.DataFrame:
        """
        Load the embedding data for the specified model.

        Also stores the resolved file location in `self.data_path`.

        Raises:
        - FileNotFoundError: if the parquet file does not exist.
        """
        self.data_path = self.data_dir_path / f'embeddings_{self.embedding_model_name}.parquet'

        if self.data_path.exists(): 
            logger.info(f"Loading embedding data for {self.embedding_model_name} from {self.data_path.as_posix()}")
            data = pl.read_parquet(self.data_path)
            logger.info(f"Loaded {len(data)} rows of embedding data for {self.embedding_model_name}.")
            # was `logger.ingo`, which raised AttributeError on every successful load
            logger.info(f"Data columns: {data.columns}")
            return data
        else: 
            raise FileNotFoundError(f"Embedding data for {self.embedding_model_name} not found in {self.data_path}. Please create the embeddings first using embdding.py script.")

    def _get_embeddings(self): 
        """
        Return the chunk embeddings of `data_long` as a float64 numpy array.
        """
        return np.array([np.array(sub) for sub in self.data_long['chunks_embeddings'].to_list()], dtype=np.float64)

    def _model_path(self) -> Path:
        """
        Location of the saved BERTopic model for this embedding model.
        """
        return FILE_DIR.parent / 'models' / f'{self.embedding_model_name}_topic_model.pkl'

    def load_existing_model(self) -> BERTopic:
        """
        Load the existing BERTopic model for the specified embedding model.

        The model is stored in `self.topic_model` and also returned.

        Raises:
        - FileNotFoundError: if no saved model exists.
        """
        model_path = self._model_path()
        if model_path.exists():
            logger.info(f"Loading existing BERTopic model from {model_path.as_posix()}")
            # NOTE(review): BERTopic.load unpickles the file - only load trusted models.
            self.topic_model = BERTopic.load(model_path)
            return self.topic_model
        else:
            raise FileNotFoundError(f"BERTopic model for {self.embedding_model_name} not found in {model_path}. Please create the model first using topic_modeling.py script.")

    def get_stopwords(self):
        """
        Return the stopwords derived from the documents' content and authors.

        The result is computed on first use and cached in `self.stopwords`.
        Previously the first call returned None because only the cached branch
        had a return statement.
        """
        if not hasattr(self, 'stopwords'):
            self.stopwords = StopwordCollector(
                texts=self.data['content'].to_list(),
                authors=self.data['author'].to_list()).get_stopwords()
        return self.stopwords

    def create_bert_model(self, 
                          stopwords=None, 
                          umap_model=None, 
                          hdbscan_model=None, 
                          vectorizer_model=None, 
                          ctfidf_model=None, 
                          save = False):
        """
        Create a BERTopic model with the specified parameters.
        If no parameters are provided, default models will be used.

        Parameters:
        - stopwords: stopwords for the default CountVectorizer; defaults to
          get_stopwords(). Not used when `vectorizer_model` is given.
        - umap_model, hdbscan_model, vectorizer_model, ctfidf_model: sub-models
          handed to BERTopic; each falls back to the default built below.
        - save: when True, the fitted model is written to the models folder.

        Sets `topic_model`, `docs`, `embeddings`, `topics` and `probs`.
        """
        if stopwords is None: 
            stopwords = self.get_stopwords()

        if umap_model is None: 
            umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

        if hdbscan_model is None: 
            hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

        if vectorizer_model is None: 
            vectorizer_model = CountVectorizer(
                stop_words=stopwords,
                ngram_range=(1, 3),
                min_df=3,
                max_df=0.8)

        if ctfidf_model is None: 
            ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

        logger.info(f'Fitting BERTopic model for {self.embedding_model_name}...')
        self.topic_model = BERTopic(
            umap_model=umap_model, 
            hdbscan_model=hdbscan_model, 
            vectorizer_model=vectorizer_model, 
            ctfidf_model=ctfidf_model)

        self.docs = self.data_long['chunks'].to_list()
        self.embeddings = self._get_embeddings()
        self.topics, self.probs = self.topic_model.fit_transform(self.docs, self.embeddings)
        
        if save: 
            model_path = self._model_path()
            model_path.parent.mkdir(parents=True, exist_ok=True)
            self.topic_model.save(model_path)
            logger.info(f'Model saved to {model_path}')

    def get_topic_quality(self, top_n_words = 10): 
        """
        Get the topic quality metrics for the current BERTopic model.

        `docs` and `embeddings` are built from `data_long` on demand, so this
        also works after load_existing_model().

        Raises:
        - ValueError: if no topic model has been created or loaded yet.
        """
        if not hasattr(self, 'topic_model'):
            raise ValueError("BERTopic model has not been created yet. Please call create_bert_model()/load_existing_model() first.")
        if not hasattr(self, 'docs'):
            self.docs = self.data_long['chunks'].to_list()
        if not hasattr(self, 'embeddings'): 
            self.embeddings = self._get_embeddings()

        return topic_quality(
            model = self.topic_model, 
            documents = self.docs,
            embeddings = self.embeddings,
            top_n_words=top_n_words)