In [None]:
from contextualized_topic_models.models.ctm import CTM
from contextualized_topic_models.utils.data_preparation import bert_embeddings_from_file, bert_embeddings_from_list
import os
import numpy as np
import pickle
from contextualized_topic_models.datasets.dataset import CTMDataset
from contextualized_topic_models.utils.data_preparation import TextHandler

### Load The Data

In [None]:
# Read the preprocessed corpus and build the vocabulary and training data.
handler = TextHandler("preprocessed_docs.txt")
handler.prepare()

# Contextual embeddings are computed from the *unpreprocessed* text with a
# multilingual sentence-embedding model.
train_bert = bert_embeddings_from_file(
    'unpreprocessed_docs.txt',
    "distiluse-base-multilingual-cased",
)

# Bundle the handler's `bow` data, the embeddings and the index-to-token
# mapping into the dataset object consumed by the model.
training_dataset = CTMDataset(
    handler.bow,
    train_bert,
    handler.idx2token,
)

### Train the Fully Contextualized Topic Model

In [None]:
# Number of topics the model is asked to learn.
num_topics = 50

# Take the embedding width from the training embeddings rather than
# hard-coding 512, so that switching the sentence-embedding model used to
# build `train_bert` does not leave the network with a mismatched input size.
# NOTE(review): assumes `train_bert` is a 2-D array of shape
# (n_documents, embedding_dim) — confirm against bert_embeddings_from_file.
bert_input_size = train_bert.shape[1]

ctm = CTM(
    input_size=len(handler.vocab),      # one input unit per vocabulary entry
    bert_input_size=bert_input_size,
    num_epochs=100,
    hidden_sizes=(100,),
    inference_type="contextual",
    n_components=num_topics,
    num_data_loader_workers=0,
)

ctm.fit(training_dataset)  # run the model

In [None]:
# Show the top words of every learned topic; the call is the cell's final
# expression so its result is displayed.
n_top_words = 5
ctm.get_topic_lists(n_top_words)

### Test the model on unseen documents in unseen languages

In [None]:
# Embed the unseen Italian documents with the same multilingual model that was
# used for the training embeddings.
testing_bert_italian = bert_embeddings_from_file(
    'unpreprocessed_docs_italian.txt',
    "distiluse-base-multilingual-cased",
)

# NOTE(review): the embeddings are passed in the first slot as well (where the
# training cell passes `handler.bow`) and the index-to-token mapping is empty.
# Presumably a placeholder because the model was built with
# inference_type="contextual" — confirm before reusing with another
# inference type.
testing_dataset_italian = CTMDataset(
    testing_bert_italian,
    testing_bert_italian,
    [],
)

In [None]:
# Sampling several times and averaging gives a more accurate estimate of the
# document-topic distribution; both calls below use the same sample count.
n_samples = 100

# Document-topic distribution for the test documents.
thetas = ctm.get_thetas(
    testing_dataset_italian,
    n_samples=n_samples,
)

# Predicted topic for each test document.
predicted_topics = ctm.get_predicted_topics(
    testing_dataset_italian,
    n_samples=n_samples,
)
