In [6]:
from contextualized_topic_models.models.cotm import COTM
from contextualized_topic_models.utils.data_preparation import to_bow
from contextualized_topic_models.utils.data_preparation import embed_documents
import os
import json
import numpy as np
import pickle
from contextualized_topic_models.datasets.dataset import LMTopicDataset

### Load The Data

In [7]:
from contextualized_topic_models.utils.data_preparation import VocabAndTextFromFile

In [8]:
# Create the VocabAndTextFromFile object for the GoogleNews corpus text file.
vocab_obj = VocabAndTextFromFile(
    "../contextualized_topic_models/data/gnews/GoogleNews.txt"
)

In [9]:
# Derive the vocabulary and the training ids from the corpus file.
# (Both are consumed further down: `training_ids` and `len(vocab)` go to to_bow.)
vocab, training_ids = vocab_obj.create_vocab_and_index()

# Alternative: instead of loading the stored embeddings below, you can also
# generate your embeddings yourself:
#
#   from contextualized_topic_models.utils.data_preparation import embed_documents
#
#   train_bert = embed_documents("../contextualized_topic_models/data/gnews/GoogleNews.txt",
#                                "distiluse-base-multilingual-cased")

# Load the stored embeddings from disk.
# SECURITY: pickle.load can execute arbitrary code while deserialising;
# only unpickle files that come from a trusted source.
with open("../contextualized_topic_models/data/gnews/bert_embeddings_gnews", "rb") as filino:
    train_bert = pickle.load(filino)



In [10]:
# Swap keys and values of the vocabulary
# (assumes vocab maps token -> index -- TODO confirm).
idx2token = {index: token for token, index in vocab.items()}

# Convert the training ids to bag-of-words form, sized by the vocabulary length.
train_bow = to_bow(training_ids, len(vocab))

# Bundle bag-of-words, embeddings and the inverted vocabulary into one dataset.
training_data = LMTopicDataset(train_bow, train_bert, idx2token)


In [12]:
# Configure the model. `input_size` is the vocabulary length and
# `bert_input_size` is the length of the first embedding vector.
cotm = COTM(
    input_size=len(vocab),
    bert_input_size=len(train_bert[0]),
    inferencetype="contextual",
    n_components=50,
    model_type="prodLDA",
    hidden_sizes=(100, ),
    activation='softplus',
    dropout=0.2,
    learn_priors=True,
    batch_size=200,
    lr=2e-3,
    momentum=0.99,
    solver='adam',
    num_epochs=2,
    reduce_on_plateau=False,
    num_data_loader_workers=0,
)

# Train on the dataset assembled above; progress is printed per epoch.
cotm.fit(training_data)


Settings: 
               N Components: 50
               Topic Prior Mean: 0.0
               Topic Prior Variance: 0.98
               Model Type: prodLDA
               Hidden Sizes: (100,)
               Activation: softplus
               Dropout: 0.2
               Learn Priors: True
               Learning Rate: 0.002
               Momentum: 0.99
               Reduce On Plateau: False
               Save Dir: None
Epoch: [1/2]	Samples: [11108/22216]	Train Loss: 92.77190522525994	Time: 0:00:07.715325
Epoch: [2/2]	Samples: [22216/22216]	Train Loss: 80.9538423004535	Time: 0:00:13.965166


In [13]:
# Preview the first five topics, five words each.
cotm.get_topic_lists(5)[:5]

[['xbox', 'nokia', 'lumia', 'patent', 'jellyfish'],
 ['president', 'fda', 'climate', 'mingles', 'afghan'],
 ['nsa', 'porn', 'habit', 'spied', 'discredit'],
 ['watkins', 'ian', 'guilty', 'singer', 'baby'],
 ['channel', 'table', 'philippine', 'typhoon', 'online']]

### Evaluate the Model

In [14]:
from contextualized_topic_models.evaluation.measures import TopicDiversity, CoherenceNPMI,\
    CoherenceWordEmbeddings,InvertedRBO

In [15]:
# Topic diversity, scored with topk=25 on topic lists requested with 25 words each.
topics_top25 = cotm.get_topic_lists(25)
td = TopicDiversity(topics_top25)
td.score(topk=25)


0.6

In [16]:
# Inverted RBO score computed on topic lists requested with 10 words each.
topics_top10 = cotm.get_topic_lists(10)
rbo = InvertedRBO(topics_top10)
rbo.score()

0.019312351303177842

In [9]:
# Evaluation of coherence in a word-embedding space. If `word2vec_path` is
# specified, the word embeddings are read from that file (word2vec format;
# binary=True is passed here); otherwise 'word2vec-google-news-300' is
# downloaded using gensim's APIs.
# NOTE: the path below is a placeholder -- replace it with the location of
# your own word2vec file before running this cell.
word2vec_path = "your\\path\\to\\word2vec.bin"
we_coh = CoherenceWordEmbeddings(
    word2vec_path=word2vec_path,
    topics=cotm.get_topic_lists(10),
    binary=True,
)
we_coh.score(topk=10)

In [17]:
# Read the corpus and split every line on whitespace: one token list per line.
with open(os.path.join('../contextualized_topic_models/data/gnews', 'GoogleNews.txt'), "r") as fr:
    texts = list(map(str.split, fr.read().splitlines()))

# NPMI coherence of the topic lists (10 words each) against the tokenised corpus.
npmi = CoherenceNPMI(texts=texts, topics=cotm.get_topic_lists(10))
npmi.score()

-0.19744172154597336

In [None]:
# Load the vocabulary, the training ids and the embeddings from files on disk.
# NOTE(review): this cell has no execution count and rebinds vocab, idx2token,
# train_bow and train_bert, which earlier cells already define -- confirm where
# in the run order it is meant to execute.

# NOTE(review): the file has a .pkl extension but is parsed as JSON -- confirm format.
vocab_path = os.path.join('../contextualized_topic_models/data/gnews', 'vocab.pkl')
with open(vocab_path, 'r') as vocab_file:  # context manager closes the handle
    vocab = json.load(vocab_file)
idx2token = {index: token for (token, index) in vocab.items()}

vocab_size = len(vocab)
# SECURITY: allow_pickle=True lets np.load unpickle arbitrary objects;
# only load files that come from a trusted source.
train = np.load(os.path.join('../contextualized_topic_models/data/gnews', 'train.txt.pkl'), encoding='latin1', allow_pickle=True)
train_bow = to_bow(train, vocab_size)

# SECURITY: pickle.load can execute arbitrary code; only unpickle trusted files.
with open("../contextualized_topic_models/data/gnews/bert_embeddings_gnews", "rb") as filino:
    train_bert = pickle.load(filino)