In [None]:
from bertopic import BERTopic
from flair.embeddings import TransformerDocumentEmbeddings
from hdbscan import HDBSCAN
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP


In [None]:
# Folder that holds the input CSV and receives the files this notebook writes
# (the saved topic model and the tagged metadata CSV).
base_folder = "data"


# TOPIC VARIETY

In this notebook, every article is assigned a single topic that corresponds to a news story (e.g. COVID-19, elections, ...).
To find these topics, we train a BERTopic topic model on the article texts.

## Input Data

* `item_metadata.csv` - Metadata of all recommended articles, read from the `base_folder`
    * item (int) - Item identifier
    * text (str) - Article text

## Output

This notebook will write 2 files to the `base_folder`:

* `bertopic_base_model` - The trained BERTopic model
* `item_metadata_w_tags.csv` - Item metadata augmented with a tag column containing a Topic ID.
    * item (int)
    * text (str)
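    * tag (int) - Topic ID assigned by the topic model (`-1` marks articles that were left unclustered as outliers)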

In [None]:

# Load only the two columns this notebook needs. `usecols` stops pandas from
# parsing (and holding in memory) any other metadata columns in the file.
# `usecols` returns columns in file order, so the trailing selection pins the
# column order to ['item', 'text'] exactly as before.
recommended_articles = pd.read_csv(
    f"{base_folder}/item_metadata.csv",
    usecols=['item', 'text'],
)[['item', 'text']]
recommended_articles.head()

In [None]:
# Language-specific stop words from spaCy. For a non-English corpus, import
# STOP_WORDS from the matching spaCy language module instead of `spacy.lang.en`.
# NOTE(review): STOP_WORDS is not referenced in any cell visible in this
# notebook — the CountVectorizer is configured with stop_words='english'.
# Confirm whether this list was meant to be passed to the vectorizer.
from spacy.lang.en import STOP_WORDS


In [None]:

# Clustering stage: HDBSCAN groups the reduced document embeddings into topics.
# min_cluster_size sets the smallest group of articles that may form a topic.
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    min_samples=10,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True
)

# Dimensionality-reduction stage. UMAP is stochastic, so without a fixed seed
# the topic IDs written to the output CSV change on every run of the notebook.
# The parameters mirror the defaults BERTopic documents for its internal UMAP;
# only `random_state` is added. Fixing the seed makes UMAP run single-threaded,
# trading some speed for reproducible topics.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    low_memory=False,
    random_state=42
)

topic_model = BERTopic(
    language='english', # Change this to multilingual when text is non-english
    # Per BERTopic's documentation, min_topic_size is not used when a custom
    # hdbscan_model is supplied; it is kept equal to min_cluster_size above.
    min_topic_size=10,
    # stop_words='english' is scikit-learn's built-in English list; change it
    # together with `language` when the corpus is not English.
    vectorizer_model=CountVectorizer(stop_words='english', ngram_range=(1,2)),
    umap_model=umap_model,
    hdbscan_model=hdbscan_model
)


In [None]:


# Fit BERTopic on the article texts. The returned topic IDs are stored
# positionally as the "tag" column of the metadata frame.
docs = recommended_articles.loc[:, "text"].values
topics, probs = topic_model.fit_transform(documents=docs)
recommended_articles["tag"] = topics


In [None]:
# Persist the fitted model inside base_folder so it can be reused later
# without retraining.
model_name = "bertopic_base_model"
model_path = f"{base_folder}/{model_name}"
topic_model.save(model_path)


In [None]:
# Write the tagged metadata (item, text, tag) to base_folder without the index.
recommended_articles.to_csv(
    f"{base_folder}/item_metadata_w_tags.csv",
    columns=["item", "text", "tag"],
    index=False,
)