In [1]:
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from scipy.special import softmax
import multiprocessing as mp
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch
from torch.nn import functional as F
from sklearn.preprocessing import normalize

# Installing ProdLDA
**Restart notbook after the installation!!**

In [2]:
!git clone https://github.com/estebandito22/PyTorchAVITM

Cloning into 'PyTorchAVITM'...
remote: Enumerating objects: 19052, done.[K
remote: Total 19052 (delta 0), reused 0 (delta 0), pack-reused 19052[K
Receiving objects: 100% (19052/19052), 132.62 MiB | 13.90 MiB/s, done.
Resolving deltas: 100% (89/89), done.
Checking out files: 100% (37/37), done.


# 1. Creation of synthetic corpus

We consider a scenario with n parties, each of them as an associated corpus.
To generate the corpus associated with each of the parties, we consider a common beta distribution (word-topic distribution), but we freeze different topics/ assign different asymmetric Dirichlet priors favoring different topics at the time of generating the document that composes each party's corpus.

## 1.1. Function for permuting the Dirichlet prior at each node

In [3]:
def rotateArray(arr, n, d):
    temp = []
    i = 0
    while (i < d):
        temp.append(arr[i])
        i = i + 1
    i = 0
    while (d < n):
        arr[i] = arr[d]
        i = i + 1
        d = d + 1
    arr[:] = arr[: i] + temp
    return arr

## 1.2. Topic modeling and node settings

In [4]:
# Topic modeling settings
vocab_size = 5000
n_topics = 50
beta = 1e-2
alpha = 1/n_topics
n_docs = 1000
nwords = (150, 250) #Min and max lengths of the documents

# Nodes settings
n_nodes = 5
frozen_topics = 5
prior_frozen = frozen_topics * [alpha]
own_topics = int((n_topics-frozen_topics)/n_nodes)
prior_nofrozen = own_topics * [alpha] + (n_topics-frozen_topics-own_topics) * [alpha/10000]
#print(prior_frozen + prior_nofrozen)

## 1.3. Topics generation (common for all nodes)

In [5]:
topic_vectors = np.random.dirichlet(vocab_size*[beta], n_topics)
print('Probabilidades ordenadas para el primer vector de tópicos:')
print(np.sort(topic_vectors[0])[::-1])
print(topic_vectors.shape)

Probabilidades ordenadas para el primer vector de tópicos:
[0.06719642 0.05684285 0.03974969 ... 0.         0.         0.        ]
(50, 5000)


In [6]:
#Here we compare alignment of the topic_vector matrix with itself and with another randomly generated matrix
print('Tópicos (equivalentes) identificados correctamente (true):', np.sum(np.max(np.sqrt(topic_vectors).dot(np.sqrt(topic_vectors.T)), axis=0)))
topic_vectors2 = np.random.dirichlet(vocab_size*[beta], n_topics)
print('Tópicos (equivalentes) identificados correctamente (random):', np.sum(np.max(np.sqrt(topic_vectors2).dot(np.sqrt(topic_vectors.T)), axis=0)))

Tópicos (equivalentes) identificados correctamente (true): 50.00000000000006
Tópicos (equivalentes) identificados correctamente (random): 3.8182884263328414


## 1.4. Generation of document topic proportions and documents for each node


In [7]:
# Step 2 - generation of document topic proportions
doc_topics_all = []
for i in np.arange(n_nodes):
    doc_topics = np.random.dirichlet(prior_frozen + prior_nofrozen, n_docs)
    prior_nofrozen = rotateArray(prior_nofrozen, len(prior_nofrozen), own_topics)
    doc_topics_all.append(doc_topics)

In [8]:
# Step 3 - Document generation
documents_all = []
z_all = []

for i in np.arange(n_nodes):
    documents = [] # Document words
    #z = [] # Assignments
    for docid in np.arange(n_docs):
        doc_len = np.random.randint(low=nwords[0], high=nwords[1])
        this_doc_words = []
        #this_doc_assigns = []
        for wd_idx in np.arange(doc_len):
            tpc = np.nonzero(np.random.multinomial(1, doc_topics_all[i][docid]))[0][0]
            #this_doc_assigns.append(tpc)
            word = np.nonzero(np.random.multinomial(1, topic_vectors[tpc]))[0][0]
            this_doc_words.append('wd'+str(word))
        #z.append(this_doc_assigns)
        documents.append(this_doc_words)
    documents_all.append(documents)
    #z_all.append(z)

In [9]:
np.savez('synthetic_10000_beta_1.npz', n_nodes = n_nodes, vocab_size=vocab_size, n_topics=n_topics, frozen_topics = frozen_topics, beta=beta, alpha=alpha,
        n_docs=n_docs, nwords=nwords, topic_vectors=topic_vectors, doc_topics=doc_topics_all,
        documents=documents_all, z=z_all)

  val = np.asanyarray(val)


In [10]:
doc_topics_all_gt = doc_topics_all

# 2. ProdLDA model at node 0

In [12]:
cd /content/PyTorchAVITM/pytorchavitm/datasets

/content/PyTorchAVITM/pytorchavitm/datasets


In [13]:
from bow import BOWDataset

In [14]:
cd /content/PyTorchAVITM

/content/PyTorchAVITM


In [15]:
from pytorchavitm import AVITM

In [16]:
def train_avitm(docs_train):
  cv = CountVectorizer(input='content', lowercase=True, stop_words='english',
                      max_df=0.99, min_df=0.01, binary=False)
  
  docs = [" ".join(docs_train[i]) for i in np.arange(len(docs_train))]

  train_bow = cv.fit_transform(docs)
  train_bow = train_bow.toarray()

  idx2token = cv.get_feature_names()
  input_size = len(idx2token)

  id2token = {k: v for k, v in zip(range(0, len(idx2token)), idx2token)}

  train_data = BOWDataset(train_bow, idx2token)

  avitm = AVITM(input_size=input_size, n_components=n_topics, model_type='prodLDA',
                hidden_sizes=(100, 100), activation='softplus', dropout=0.2,
                learn_priors=True, batch_size=64, lr=2e-3, momentum=0.99,
                solver='adam', num_epochs=100, reduce_on_plateau=False)
  
  avitm.fit(train_data)

  return train_data, avitm, id2token

In [17]:
def get_doc_topic_distribution(avitm, dataset, n_samples=20):
    avitm.model.eval()

    loader = DataLoader(
            avitm.train_data, batch_size=avitm.batch_size, shuffle=True,
            num_workers=mp.cpu_count())

    pbar = tqdm(n_samples, position=0, leave=True)

    final_thetas = []
    for sample_index in range(n_samples):
        with torch.no_grad():
            collect_theta = []

            for batch_samples in loader:
                X = batch_samples['X']

                if avitm.USE_CUDA:
                  X = X.cuda()

                # forward pass
                avitm.model.zero_grad()
                
                with torch.no_grad():
                  posterior_mu, posterior_log_sigma = avitm.model.inf_net(X)

                  # Generate samples from theta
                  theta = F.softmax(
                          avitm.model.reparameterize(posterior_mu, posterior_log_sigma), dim=1)
                  theta = avitm.model.drop_theta(theta)

                collect_theta.extend(theta.cpu().numpy().tolist())

            pbar.update(1)
            pbar.set_description("Sampling: [{}/{}]".format(sample_index + 1, n_samples))

            final_thetas.append(np.array(collect_theta))
    pbar.close()
    return np.sum(final_thetas, axis=0) / n_samples

In [18]:
def get_topic_word_distribution(avtim_model):
  topic_word_matrix = avtim_model.model.beta.cpu().detach().numpy()
  return softmax(topic_word_matrix, axis=1)

In [39]:
def convert_topic_word_to_init_size(vocab_size, model, ntopics,
                                    id2token, all_words):
    """It converts the topic-word distribution matrix obtained from the
    training of a model into a matrix with the dimensions of the original
    topic-word distribution, assigning zeros to those words that are not
    present in the corpus. 
    It is only of use in case we are training a model over a synthetic dataset,
    so as to later compare the performance of the attained model in what regards
    to the similarity between the original and the trained model.

    Args:
        * vocab_size (int):       Size of the synethic'data vocabulary.
        * model (AVITM):          Model whose topic-word matrix is being transformed.
        * ntopics (int):          Number of topics of the trained model.
        * id2token (List[tuple]): Mappings with the content of the document-term matrix.
        * all_words (List[str]):  List of all the words of the vocabulary of size vocab_size.

    Returns:
        * ndarray: Normalized transormed topic-word distribution.
    """
    w_t_distrib = np.zeros((ntopics, vocab_size), dtype=np.float64)
    wd = get_topic_word_distribution(model)
    for i in np.arange(ntopics):
        for idx, word in id2token.items():
            for j in np.arange(len(all_words)):
                if all_words[j] == word:
                    w_t_distrib[i, j] = wd[i][idx]
                    break
    normalized_array = normalize(w_t_distrib,axis=1,norm='l1')
    return normalized_array
    

In [None]:
corpus_node = documents_all[0]
train_data, avitm, id2token = train_avitm(corpus_node)

### Document-topic distributions

In [20]:
doc_topic = get_doc_topic_distribution(avitm, train_data, n_samples=5) # get all the topic predictions
print("Document-topic distribution node 0")
print(np.array(doc_topic).shape)

Sampling: [5/5]: : 5it [00:00,  6.17it/s]

Document-topic distribution node 0
(1000, 50)





### Word-topic distributions 

In [42]:
all_words = ['wd'+str(word) for word in np.arange(vocab_size+1) if word > 0]

In [40]:
word_topic = convert_topic_word_to_init_size(vocab_size, avitm, n_topics, id2token, all_words)
word_topic.shape

# 3. Centralized ProdLDA model

In [23]:
documents_centr = [*documents_all[0], *documents_all[1], *documents_all[2], *documents_all[3], *documents_all[4]]
len(documents_centr)

5000

In [None]:
train_data_centr, avitm_centr, id2token_centr = train_avitm(documents_centr)

In [25]:
doc_topic_centr = get_doc_topic_distribution(avitm_centr, train_data_centr, n_samples=5) # get all the topic predictions
print(doc_topic_centr.shape)

Sampling: [5/5]: : 5it [00:02,  1.91it/s]

(5000, 50)





In [26]:
word_topic_centr = convert_topic_word_to_init_size(vocab_size, avitm_centr, n_topics, id2token_centr, all_words)
word_topic_centr.shape

# 4. Evaluation

In [27]:
doc_topic_centr_node_0 = doc_topic_centr[0:1000,:]

### Doc-topics

In [30]:
sim_mat_theoretical = np.sqrt(doc_topics_all[0]).dot(np.sqrt(doc_topics_all[0].T))
sim_mat_actual = np.sqrt(doc_topic).dot(np.sqrt(doc_topic.T))
print('Difference in evaluation of doc similarity (node 0):', np.sum(np.abs(sim_mat_theoretical - sim_mat_actual))/n_docs)

sim_mat_actual_centr = np.sqrt(doc_topic_centr_node_0).dot(np.sqrt(doc_topic_centr_node_0.T))
print('Difference in evaluation of doc similarity (centr):', np.sum(np.abs(sim_mat_theoretical - sim_mat_actual_centr))/n_docs)

Difference in evaluation of doc similarity (node 0): 694.4910918189195
Difference in evaluation of doc similarity (centr): 630.8609969128047


### Topic-words

In [41]:
print('Tópicos (equivalentes) evaluados correctamente (node 0):', np.sum(np.max(np.sqrt(word_topic).dot(np.sqrt(topic_vectors.T)), axis=0)))
print('Tópicos (equivalentes) evaluados correctamente (centr):', np.sum(np.max(np.sqrt(word_topic_centr).dot(np.sqrt(topic_vectors.T)), axis=0)))

Tópicos (equivalentes) evaluados correctamente (node 0): 5.022558925927156
Tópicos (equivalentes) evaluados correctamente (centr): 6.526784932472249
