In [1]:
import os
# Environment flags are applied here, before the heavy third-party libraries
# are imported in the following cell.
env_vars = dict(TOKENIZERS_PARALLELISM="true")
os.environ.update(env_vars)

In [2]:
import pandas as pd
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
import spacy

from pathlib import Path

In [3]:
# The project root is the parent of the current working directory; the
# cleaned input files are looked up in its "cleaned_data" sub-folder.
root = Path("..").resolve()
cleaned_data = root.joinpath("cleaned_data")

In [4]:
# Read the combined per-paragraph dataset from the cleaned-data folder.
data = pd.read_parquet(
    cleaned_data.joinpath("combined_data_2013_2022_per_paragraph.parquet")
)

In [5]:
# Keep only the rows that actually have paragraph text. The mask is computed
# once and reused for both columns, so the documents and their years are
# selected from exactly the same rows and stay index-aligned.
has_text = data["text_split"].notna()
text_split = data.loc[has_text, "text_split"].to_list()
# NOTE(review): `time` shadows the stdlib module name; the name is kept because
# later cells may refer to it.
time = data.loc[has_text, "year"].to_list()

In [6]:
# UMAP is stochastic: without a fixed seed every run yields a different
# low-dimensional embedding and therefore different topics. Pinning the seed
# makes "Restart & Run All" reproduce the same topic model.
# NOTE(review): per the UMAP docs a fixed random_state limits parallelism, so
# the reduction step may run slower than before.
RANDOM_STATE = 42

# Dimensionality reduction applied to the document embeddings.
umap_model = UMAP(
    n_neighbors=5,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=RANDOM_STATE,
)

# Density-based clustering of the reduced embeddings.
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

# Bag-of-words step used for the topic representations: 1- to 4-grams with
# English stop words removed.
vectorizer_model = CountVectorizer(ngram_range=(1, 4), stop_words="english")

# Additional named topic representations computed alongside the default one.
representation_model = {
    "KeyBERT": KeyBERTInspired(),
    "MMR": MaximalMarginalRelevance(diversity=0.3),
}

# Assemble the full pipeline from the components configured above.
topic_model = BERTopic(
    embedding_model="multi-qa-mpnet-base-dot-v1",
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
)

In [None]:
# Fit the topic model on the paragraph texts and keep the two values it
# returns (`topics` and `probs`) for the documents in `text_split`.
# NOTE(review): presumably the long-running step of the notebook — consider
# timing or caching it.
topics, probs = topic_model.fit_transform(text_split)

In [None]:
# Collect the per-topic overview produced by the fitted model.
topics_info = topic_model.get_topic_info()

In [None]:
# Show the topic overview as this cell's rich output.
topics_info

In [None]:
# Per-document topic assignment details for the fitted paragraphs.
# `get_document_info` already returns a DataFrame (see the BERTopic API
# reference), so it is used directly instead of being re-wrapped.
document_info = topic_model.get_document_info(text_split)
# Backwards-compatible alias: later cells may still refer to `df2`.
df2 = document_info