In [None]:
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer

from octis.dataset.dataset import Dataset
from octis.models.model import AbstractModel
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.models.CTM import CTM

from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.data_preparation import bert_embeddings_from_file
from contextualized_topic_models.evaluation.measures import CoherenceCV

from gensim import corpora
from gensim.models import LdaModel
import gensim.downloader as api
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora

import os
import numpy as np
import pandas as pd

from skopt.space.space import Real, Categorical, Integer

import warnings

warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", message=" `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above. and should_run_async(code)")

In [None]:
# Read the article table, drop exact duplicate rows and renumber the index
# from 0 so later positional lookups line up with the row labels.
# NOTE: 'Load your data!' is a placeholder; replace it with the CSV path.
df = (
    pd.read_csv('Load your data!')
    .drop_duplicates()
    .reset_index(drop=True)
)
df.head(3)

In [None]:
# Export the articles as the two plain-text files an OCTIS dataset is built
# from: corpus.tsv (one document per line) and vocabulary.txt (one word per line).
import re

# Cast every cell to str so non-string values (e.g. NaN) survive the string
# clean-up below instead of raising.
doc_list = [str(article) for article in df['Article']]

print(len(doc_list))

# Per-character clean-up: line breaks and tabs become spaces, double quotes
# become single quotes, form-feed (\x0c) and file-separator (\x1c) are dropped.
_char_cleanup = str.maketrans({
    '\n': ' ',
    '\r': ' ',
    '\t': ' ',
    '"': '\'',
    '\x0c': None,
    '\x1c': None,
})

# Strip outer whitespace, apply the clean-up, then collapse runs of spaces so
# each document occupies exactly one line of corpus.tsv.
newdocs = [re.sub(' +', ' ', doc.strip().translate(_char_cleanup)) for doc in doc_list]

# write the two files needed to create an OCTIS dataset
# An explicit encoding keeps the output identical across platforms; the `with`
# block closes the file, so no separate close() call is needed.
with open("corpus.tsv", "w", encoding="utf-8") as f:
    f.write("\n".join(newdocs))

# `words` keeps every token occurrence, in document order.
words = []
for line in newdocs:
    words.extend(line.split())

# A vocabulary lists each distinct word once; dict.fromkeys removes the
# repeats while preserving first-seen order, so the file is deterministic.
vocabulary = list(dict.fromkeys(words))
with open("vocabulary.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(vocabulary))

# Coherence & Diversity Calculation

In [None]:
# Presumably set to silence the Hugging Face tokenizers parallelism warning
# when models are trained repeatedly in loops; confirm if it is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Candidate numbers of topics evaluated by the model loops below (2..10).
topic_range = range(2, 11)

# Whitespace-tokenise every article. str() matches the cast applied when
# doc_list is built, so non-string cells (e.g. NaN) no longer raise
# AttributeError on .split().
tokenized_documents = [str(doc).split() for doc in df['Article']]
# gensim id<->token mapping and the bag-of-words corpus derived from it.
dictionary = corpora.Dictionary(tokenized_documents)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_documents]

In [None]:
# LDA coherence, diversity
#
# Fits one LDA model per candidate topic count on the bag-of-words `corpus`
# and records its coherence and topic diversity. The template version of this
# cell had empty loop bodies and could not be executed.

coherence_values_lda = []
diversity_values_lda = []

# Number of top words per topic used by both metrics.
top_k = 10

for num_topics in topic_range:
    # Train the LDA model. The fixed seed keeps repeated runs comparable.
    # NOTE(review): every other hyperparameter is a gensim default; tune as needed.
    lda_model = LdaModel(corpus=corpus,
                         id2word=dictionary,
                         num_topics=num_topics,
                         random_state=42)

    # Collect the top words of each topic. formatted=False yields
    # (word, weight) pairs directly, so the '0.030*"word" + ...' strings that
    # print_topics() produces do not have to be parsed word by word.
    lda_topics = []
    for topic_id, topic_words in lda_model.show_topics(num_topics=num_topics,
                                                       num_words=top_k,
                                                       formatted=False):
        lda_topics.append([word for word, _ in topic_words])

    # Calculate diversity and save (OCTIS expects a dict with a 'topics' key).
    diversity_values_lda.append(TopicDiversity(topk=top_k).score({"topics": lda_topics}))

    # Calculate coherence and save.
    # NOTE(review): c_v is assumed here because CoherenceCV is imported for
    # CTM; confirm the intended coherence measure.
    coherence_model = CoherenceModel(topics=lda_topics,
                                     texts=tokenized_documents,
                                     dictionary=dictionary,
                                     coherence='c_v',
                                     topn=top_k)
    coherence_values_lda.append(coherence_model.get_coherence())

In [None]:
# nmf coherence, diversity
#
# Fits one gensim NMF model per candidate topic count on the bag-of-words
# `corpus` and records its coherence and topic diversity. The template version
# of this cell had empty loop bodies and could not be executed.
from gensim.models import Nmf

os.environ["TOKENIZERS_PARALLELISM"] = "false"

coherence_values_nmf = []
diversity_values_nmf = []

# Number of top words per topic used by both metrics.
top_k = 10

for num_topics in topic_range:
    # Train the NMF model. The fixed seed keeps repeated runs comparable.
    # NOTE(review): every other hyperparameter is a gensim default; tune as needed.
    nmf_model = Nmf(corpus=corpus,
                    id2word=dictionary,
                    num_topics=num_topics,
                    random_state=42)

    # Collect the top words of each topic. formatted=False yields
    # (word, weight) pairs directly, so the formatted strings from
    # print_topics() do not have to be parsed word by word.
    nmf_topics = []
    for topic_id, topic_words in nmf_model.show_topics(num_topics=num_topics,
                                                       num_words=top_k,
                                                       formatted=False):
        nmf_topics.append([word for word, _ in topic_words])

    # Calculate diversity and save (OCTIS expects a dict with a 'topics' key).
    diversity_values_nmf.append(TopicDiversity(topk=top_k).score({"topics": nmf_topics}))

    # Calculate coherence and save.
    # NOTE(review): c_v is assumed here because CoherenceCV is imported for
    # CTM; confirm the intended coherence measure.
    coherence_model = CoherenceModel(topics=nmf_topics,
                                     texts=tokenized_documents,
                                     dictionary=dictionary,
                                     coherence='c_v',
                                     topn=top_k)
    coherence_values_nmf.append(coherence_model.get_coherence())

In [None]:
# CTM coherence
#
# Trains one CombinedTM per candidate topic count and records its C_V
# coherence. The template version of this cell had an empty loop body and
# relied on `training_dataset`, which is only created in a later cell.
topic_range = range(2,11)
coherence_values_ctm = []
diversity_values_ctm = []

qt = TopicModelDataPreparation("all-mpnet-base-v2")

# Build the training set here so this cell runs on a fresh kernel: raw
# articles feed the sentence encoder, the pre-tokenised text feeds the BoW.
training_dataset = qt.fit(text_for_contextual=df['Article'], text_for_bow=df['tokenized_text'])

# Reference texts for the coherence measure.
# NOTE(review): assumes df['tokenized_text'] holds whitespace-joined tokens
# that are already normalised like the BoW vocabulary; confirm.
ctm_texts = [str(doc).split() for doc in df['tokenized_text']]

for num_topics in topic_range:
    # Load CTM model and train.
    # contextual_size=768 is the embedding width of all-mpnet-base-v2.
    # NOTE(review): remaining hyperparameters are library defaults; tune as needed.
    ctm_model = CombinedTM(bow_size=len(qt.vocab),
                           contextual_size=768,
                           n_components=num_topics)
    ctm_model.fit(training_dataset)

    # Save the topics (top 10 words per topic).
    ctm_topics = ctm_model.get_topic_lists(10)

    # Calculate coherence and save.
    coherence_values_ctm.append(CoherenceCV(topics=ctm_topics, texts=ctm_texts).score())

In [None]:
# CTM diversity
#
# Trains one CombinedTM per candidate topic count and records its topic
# diversity. The template version of this cell had an empty loop body.
# NOTE(review): the models are retrained here, so diversity is measured on
# different fits than the coherence values from the CTM coherence cell.
training_dataset = qt.fit(text_for_contextual=df['Article'], text_for_bow=df['tokenized_text'])

diversity_values_ctm = []

for num_topics in topic_range:
    # Load CTM model and train.
    # contextual_size=768 is the embedding width of all-mpnet-base-v2.
    # NOTE(review): remaining hyperparameters are library defaults; tune as needed.
    ctm_model = CombinedTM(bow_size=len(qt.vocab),
                           contextual_size=768,
                           n_components=num_topics)
    ctm_model.fit(training_dataset)

    # Save the topics (top 10 words per topic).
    ctm_topics = ctm_model.get_topic_lists(10)

    # Calculate diversity and save (OCTIS expects a dict with a 'topics' key).
    diversity_values_ctm.append(TopicDiversity(topk=10).score({"topics": ctm_topics}))

In [None]:
# BERTopic
# Shared building blocks for every BERTopic model fitted in this notebook.
embedding_model = SentenceTransformer("multi-qa-miniLM-L6-cos-v1")
# UMAP is stochastic; a fixed random_state makes the reduced embeddings, and
# therefore the resulting topics, repeatable across runs.
umap_model = UMAP(n_neighbors=30, n_components=3, min_dist=0.2, metric='cosine', random_state=42)
# prediction_data=True is kept from the original configuration.
hdbscan_model = HDBSCAN(min_cluster_size=16, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# English stop words are removed when the topic words are extracted.
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer()

In [None]:
# BERTopic coherence, diversity
#
# Fits one BERTopic model per candidate topic count and records its coherence
# and topic diversity. The model is trained inside the loop so the cell no
# longer depends on `topic_model`, `topics` or `result_get_topic` from cells
# further down the notebook.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

coherence_values_bertopic = []
diversity_values_bertopic = []

# Number of top words per topic used by both metrics.
top_k = 10

for num_topics in topic_range:

    # Train BERTopic model and get topics. The settings mirror the final
    # model cell; only nr_topics varies.
    topic_model = BERTopic(embedding_model=embedding_model,
                           umap_model=umap_model,
                           hdbscan_model=hdbscan_model,
                           vectorizer_model=vectorizer_model,
                           ctfidf_model=ctfidf_model,
                           top_n_words=12,
                           nr_topics=num_topics)
    topics, _unused_probs = topic_model.fit_transform(df['tokenized_text'])

    # Real topic ids of this fit. -1 is the outlier bucket and is skipped
    # explicitly, instead of assuming it always exists via `len(...) - 1`.
    topic_ids = sorted(topic for topic in set(topics) if topic != -1)

    # Top words per topic. Empty strings, used as padding when a topic has
    # fewer words than requested, are dropped because they are not in the
    # coherence dictionary.
    bertopic_topics = [
        [word for word, _ in topic_model.get_topic(topic) if word]
        for topic in topic_ids]

    if not bertopic_topics:
        # Every document was an outlier, so there is nothing to score.
        coherence_values_bertopic.append(np.nan)
        diversity_values_bertopic.append(np.nan)
        continue

    # OCTIS metrics read the topics from a dict with a 'topics' key.
    result = dict()
    result['topics'] = bertopic_topics

    # Concatenate the documents of each topic and clean them the same way
    # BERTopic does before extracting topic words.
    documents = pd.DataFrame({"Document": df['tokenized_text'],
                          "ID": range(len(df['tokenized_text'])),
                          "Topic": topics})
    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Tokenise with the model's own analyzer so the reference texts share the
    # vocabulary of the topic words.
    vectorizer = topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()
    tokens = [analyzer(doc) for doc in cleaned_docs]

    # Loop-local names: the global `dictionary` / `corpus` used by the LDA and
    # NMF cells must not be overwritten here.
    bertopic_dictionary = corpora.Dictionary(tokens)
    bertopic_corpus = [bertopic_dictionary.doc2bow(token) for token in tokens]

    # Calculate Coherence and Diversity
    # NOTE(review): c_v is assumed here because CoherenceCV is imported for
    # CTM; confirm the intended coherence measure.
    coherence_model = CoherenceModel(topics=bertopic_topics,
                                     texts=tokens,
                                     corpus=bertopic_corpus,
                                     dictionary=bertopic_dictionary,
                                     coherence='c_v',
                                     topn=top_k)
    coherence_values_bertopic.append(coherence_model.get_coherence())
    diversity_values_bertopic.append(TopicDiversity(topk=top_k).score(result))

# Topic Modeling Results

In [None]:
# Final model: the shared components configured earlier, 12 words kept per
# topic and nr_topics fixed at 8.
final_bertopic_settings = {
    "embedding_model": embedding_model,
    "umap_model": umap_model,
    "hdbscan_model": hdbscan_model,
    "vectorizer_model": vectorizer_model,
    "ctfidf_model": ctfidf_model,
    "top_n_words": 12,
    "nr_topics": 8,
}
topic_model = BERTopic(**final_bertopic_settings)

# Fit on the pre-tokenised text; `topics` and `probs` are the two values
# returned by fit_transform.
topics, probs = topic_model.fit_transform(df['tokenized_text'])

In [None]:
# Keep the fitted model's topic overview from get_topic_info() and display it.
result_get_topic = topic_model.get_topic_info()
result_get_topic

In [None]:
# Raw output of get_topics() for the fitted model, displayed as-is.
topic_weight = topic_model.get_topics()
topic_weight

In [None]:
# Tabulate the topic -> words mapping with one row per topic id. The table is
# built straight from the model rather than from the previous value of
# `topic_weight`, so re-running this cell never feeds a DataFrame back into
# from_dict.
topic_weight = pd.DataFrame.from_dict(topic_model.get_topics(), orient='index')
topic_weight

In [None]:
# Bar charts of the top words per topic. top_n_topics=25 is above the
# nr_topics=8 the model was built with, so presumably all topics are shown.
topic_model.visualize_barchart(top_n_topics=25)

In [None]:
# Heatmap view of the fitted topics, rendered at 800x600.
topic_model.visualize_heatmap(width=800, height=600)

In [None]:
# BERTopic's built-in overview plot of the fitted topics.
topic_model.visualize_topics()