In [1]:
!pip install python-Levenshtein

import numpy as np
import pandas as pd
import zipfile as zp
from pathlib import Path
from gensim.utils import check_output
from sklearn.preprocessing import normalize
from scipy.special import softmax
import shutil
from subprocess import check_output
import torch
from tqdm import tqdm
import colored
import itertools

You should consider upgrading via the '/export/usuarios_ml4ds/lbartolome/topicmodeler/.venv/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
mallet_path = '/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet'

In [3]:
modelsdir = Path("/export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test")

In [4]:
def printgr(text):
    print(colored.stylize(text, colored.fg('green')))

## **0. Results from previous executions**


---

![](https://drive.google.com/uc?export=view&id=1xdRQCkU_-6nLmWj95RntU-g8c-mFDYaQ)

---

![](https://drive.google.com/uc?export=view&id=1afnevRhfACV4XokzvAknNXR3NUy5HNiy)

---

![](https://drive.google.com/uc?export=view&id=1vHuM-XT3-NhwfjsouoSc0dec3dMiH1Zi)


---

![](https://drive.google.com/uc?export=view&id=1CnN0zOJ6d-sSnJZ0m6poZabkcfU_mpwz)


---

![](https://drive.google.com/uc?export=view&id=1E2ifsJsp20LvatH3hFwwdhdaB6a99pPF)



## **1. Creation of synthetic corpus**


---

We consider a scenario with n parties, each of them as an associated corpus.
To generate the corpus associated with each of the parties, we consider a common beta distribution (word-topic distribution), but we freeze different topics/ assign different asymmetric Dirichlet priors favoring different topics at the time of generating the document that composes each party's corpus.

We consider two scenarios for the last step (documents generation) according to whether we are utilzing LDA or ProdLDA's generative process.

* **According to LDA's generative process**
    **for** each document $d$ **do**  
    &nbsp;&nbsp;Draw topic distribution $\theta \sim Dirichlet(\alpha)$  
    &nbsp;&nbsp;&nbsp;&nbsp;**for** each word at position $n$ **do**  
    &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Sample topic $z_n \sim Multinomial(1,\theta)$  
    &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Sample word $w_n \sim Multinomial(1, \beta_{z_n})$  
    &nbsp;&nbsp;&nbsp;&nbsp;**end**  
    **end**

    where the distribution of $w_n \sim \beta$, $\theta$ is $Multinomial(1, \beta\theta)$

* **According to ProdLDA generative process**
    In comparisson to LDA's generative process:
    - $\beta$ is **unnormalized**
    - the conditional distribution of wn is defined as $w_n \mid \beta, \theta \sim Multinomial(1, \sigma(\beta\theta))$

### *Auxiliary functions for synthetic corpus generation*

In [5]:
def rotateArray(arr, n, d):
    temp = []
    i = 0
    while (i < d):
        temp.append(arr[i])
        i = i + 1
    i = 0
    while (d < n):
        arr[i] = arr[d]
        i = i + 1
        d = d + 1
    arr[:] = arr[: i] + temp
    return arr

In [6]:
def generateSynthetic(just_inf, vocab_size, n_topics, beta, alpha, n_docs,
                      n_docs_inf, n_docs_global_inf, nwords, alg, n_nodes,
                      frozen_topics, prior_frozen, own_topics, prior_nofrozen):
    
    if just_inf:
        n_total_docs = n_docs_global_inf
    else:
        n_total_docs = n_docs + n_docs_inf

    # Step 1 - generation of topics
    topic_vectors = np.random.dirichlet(vocab_size*[beta], n_topics)
    
    # Step 2 - generation of document topic proportions
    doc_topics_all = []
    for i in np.arange(n_nodes):
        doc_topics = np.random.dirichlet(prior_frozen + prior_nofrozen, n_total_docs)
        prior_nofrozen = rotateArray(prior_nofrozen, len(prior_nofrozen), own_topics)
        doc_topics_all.append(doc_topics)
        
    # Step 3 - Document generation
    documents_all = []
    z_all = []

    for i in np.arange(n_nodes):
        print("Generating document words for node ", str(i))
        documents = [] # Document words
        #z = [] # Assignments
        for docid in tqdm(np.arange(n_total_docs)):
            doc_len = np.random.randint(low=nwords[0], high=nwords[1])
            this_doc_words = []
            #this_doc_assigns = []
            for wd_idx in np.arange(doc_len):

                tpc = np.nonzero(np.random.multinomial(1, doc_topics_all[i][docid]))[0][0]
                #this_doc_assigns.append(tpc)
                if alg == "lda":
                    word = np.nonzero(np.random.multinomial(1, topic_vectors[tpc]))[0][0]
                else: #prodlda
                    pval = np.power(topic_vectors[tpc], doc_topics_all[i][docid][tpc])
                    weights = torch.tensor(pval, dtype=torch.float) # create a tensor of weights
                    word = torch.multinomial(weights, 1).numpy()[0]
                    #pval = normalize(pval[:,np.newaxis], norm='l1', axis=0).ravel()
                    #word = np.nonzero(np.random.multinomial(1, b))[0][0]
                this_doc_words.append('wd'+str(word))
            #z.append(this_doc_assigns)
            documents.append(this_doc_words)
        documents_all.append(documents)
        #z_all.append(z)
    
    return topic_vectors, doc_topics_all, documents_all

### *Topic modeling and Centralized settings*

In [7]:
n_nodes = 25

In [8]:
# Topic modeling settings
vocab_size = 5000
n_topics = 50
beta = 1e-2
alpha = 50/n_topics#1/n_topics
n_docs = 1000
n_docs_inf = 1000
n_docs_global_inf = int(n_docs / n_nodes)
nwords = (150, 250) #Min and max lengths of the documents
alg = "lda" #"prod"

tm_settings = {
    "vocab_size": vocab_size,
    "n_topics": n_topics,
    "beta": beta,
    "alpha": alpha,
    "n_docs": n_docs,
    "n_docs_inf": n_docs_inf,
    "n_docs_global_inf": n_docs_global_inf,
    "nwords": nwords,
    "alg": alg
}

In [9]:
# Centralized settings
frozen_topics = 5
prior_frozen = frozen_topics * [alpha]
own_topics = int((n_topics-frozen_topics)/n_nodes)
prior_nofrozen = own_topics * [alpha] + (n_topics-frozen_topics-own_topics) * [alpha/10000]

centralized_settings = {
    "n_nodes": n_nodes,
    "frozen_topics": frozen_topics,
    "prior_frozen": prior_frozen,
    "own_topics": own_topics,
    "prior_nofrozen": prior_nofrozen
}

### *1.1. Generation of training + validation corpus `inf1`*

We generate `ndocs + n_docs_inf` documents for each node, from which `ndocs`are used for training and `n_docs_inf`for validation (`inf1`).

In [10]:
topic_vectors, doc_topics_all, documents_all = generateSynthetic(False, **tm_settings, **centralized_settings)

Generating document words for node  0


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:49<00:00, 40.26it/s]


Generating document words for node  1


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:50<00:00, 39.90it/s]


Generating document words for node  2


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:50<00:00, 39.97it/s]


Generating document words for node  3


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:49<00:00, 40.57it/s]


Generating document words for node  4


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:51<00:00, 39.17it/s]


Generating document words for node  5


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:50<00:00, 39.74it/s]


Generating document words for node  6


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:50<00:00, 39.90it/s]


Generating document words for node  7


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:50<00:00, 39.50it/s]


Generating document words for node  8


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:50<00:00, 39.48it/s]


Generating document words for node  9


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:50<00:00, 39.36it/s]


Generating document words for node  10


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:50<00:00, 39.96it/s]


Generating document words for node  11


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:50<00:00, 39.89it/s]


Generating document words for node  12


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:49<00:00, 40.28it/s]


Generating document words for node  13


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:50<00:00, 39.70it/s]


Generating document words for node  14


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:00<00:00, 32.86it/s]


Generating document words for node  15


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:03<00:00, 31.49it/s]


Generating document words for node  16


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:04<00:00, 31.10it/s]


Generating document words for node  17


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:03<00:00, 31.57it/s]


Generating document words for node  18


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:04<00:00, 30.86it/s]


Generating document words for node  19


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:02<00:00, 31.96it/s]


Generating document words for node  20


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:03<00:00, 31.29it/s]


Generating document words for node  21


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:02<00:00, 31.99it/s]


Generating document words for node  22


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:03<00:00, 31.51it/s]


Generating document words for node  23


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:03<00:00, 31.60it/s]


Generating document words for node  24


100%|███████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:03<00:00, 31.71it/s]


### *1.2. Generation of second validation corpus `inf2`*

We generate `ndocs/n_nodes` documents for each node, and concatenate all of them in the variable `global_inf_corpus`. This constitutes the second inference corpus (`inf2`) in which both centralized and individual approaches are tested, i.e., does the centralized approach learns to predict documents generated from different topics, better than the non-collaborative approach?

In [11]:
topic_vectors2, doc_topics_all2, documents_all2 = generateSynthetic(True, **tm_settings, **centralized_settings)

Generating document words for node  0


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 32.32it/s]


Generating document words for node  1


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 30.56it/s]


Generating document words for node  2


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 31.39it/s]


Generating document words for node  3


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 30.10it/s]


Generating document words for node  4


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 33.19it/s]


Generating document words for node  5


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 31.73it/s]


Generating document words for node  6


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 32.04it/s]


Generating document words for node  7


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 33.09it/s]


Generating document words for node  8


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 33.14it/s]


Generating document words for node  9


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 30.65it/s]


Generating document words for node  10


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 31.58it/s]


Generating document words for node  11


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 34.64it/s]


Generating document words for node  12


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 33.47it/s]


Generating document words for node  13


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 31.52it/s]


Generating document words for node  14


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 32.17it/s]


Generating document words for node  15


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 30.84it/s]


Generating document words for node  16


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 31.89it/s]


Generating document words for node  17


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 32.14it/s]


Generating document words for node  18


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 31.84it/s]


Generating document words for node  19


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 31.06it/s]


Generating document words for node  20


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 31.18it/s]


Generating document words for node  21


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 29.55it/s]


Generating document words for node  22


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 32.78it/s]


Generating document words for node  23


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 32.56it/s]


Generating document words for node  24


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:01<00:00, 31.25it/s]


In [12]:
global_inf_corpus = list(itertools.chain.from_iterable(documents_all2))
print(len(global_inf_corpus))


for i in range(len(doc_topics_all2)):
    if i == 0:
        global_inf_doc_topics = doc_topics_all2[i]
    else:
        global_inf_doc_topics = np.concatenate((global_inf_doc_topics,doc_topics_all2[i])) 
print(global_inf_doc_topics.shape)

1000
(1000, 50)


## **2. Mallet**


---

### *Auxiliary functions*

In [13]:
def create_model_folder(modelname, modelsdir):
    
    # Create model folder and save model training configuration
    modeldir = modelsdir.joinpath(modelname)
    
    if modeldir.exists():

        # Remove current backup folder, if it exists
        old_model_dir = Path(str(modeldir) + '_old/')
        if old_model_dir.exists():
            shutil.rmtree(old_model_dir)

        # Copy current model folder to the backup folder.
        shutil.move(modeldir, old_model_dir)
        print(f'-- -- Creating backup of existing model in {old_model_dir}')

    modeldir.mkdir()
    configFile = modeldir.joinpath('trainconfig.json')
    
    return modeldir, configFile

In [14]:
def train(modelname, modelsdir, corpus):
    
    # Create model folder
    modeldir, configFile = create_model_folder(modelname, modelsdir)
    
    # Create corpus txt file
    corpusFile = modeldir.joinpath("corpus.txt")
    with open(corpusFile, 'w') as fout:
        [fout.write(str(idx) + ' 0 ' + ' '.join(doc) + '\n') for idx,doc in enumerate(corpus)]
    
    # Create corpus mallet file
    corpusMallet = modeldir.joinpath('corpus.mallet')

    cmd = mallet_path + \
        ' import-file --preserve-case --keep-sequence ' + \
        '--remove-stopwords --token-regex "' + '[\p{L}\p{N}][\p{L}\p{N}\p{P}]*' + \
        '" --input %s --output %s'
    cmd = cmd % (corpusFile, corpusMallet)

    try:
        print(f'-- -- Running command {cmd}')
        check_output(args=cmd, shell=True)
    except:
        print('-- -- Mallet failed to import data. Revise command')
    
    # Perform Mallet train
    with open(configFile, 'w', encoding='utf8') as fout:
        fout.write('input = ' + corpusMallet.resolve().as_posix() + '\n')
        fout.write('num-topics = ' + str(n_topics) + '\n')
        fout.write('alpha = 1\n')
        fout.write('optimize-interval = 10\n')
        fout.write('num-threads = 4\n')
        fout.write('num-iterations = 1000\n')
        fout.write('doc-topics-threshold = 0\n')
        fout.write('output-doc-topics = ' +
                    modeldir.joinpath('doc-topics.txt').resolve().as_posix() + '\n')
        fout.write('word-topic-counts-file = ' +
                   modeldir.joinpath('word-topic-counts.txt').resolve().as_posix() + '\n')
        fout.write('output-topic-keys = ' +
                   modeldir.joinpath('topickeys.txt').resolve().as_posix() + '\n')
        fout.write('inferencer-filename = ' + 
                   modeldir.joinpath('inferencer.mallet').resolve().as_posix() + '\n')
    cmd = mallet_path + ' train-topics --config ' + configFile.resolve().as_posix()
    print(cmd)
    try:
        print(
            f'-- -- Training mallet topic model. Command is {cmd}')
        check_output(args=cmd, shell=True)
    except:
        print('-- -- Model training failed. Revise command')
    
    return modeldir

In [15]:
def extractPipe(modeldir):
    
    path_corpus = modeldir.joinpath('corpus.mallet')
    if not path_corpus.is_file():
        print('-- Pipe extraction: Could not locate corpus file')
        return

    # Create auxiliary file with only first line from the original corpus file
    path_txt = modeldir.joinpath('corpus.txt')
    with path_txt.open('r', encoding='utf8') as f:
        first_line = f.readline()
        
    path_aux = modeldir.joinpath('corpus_aux.txt')
    with path_aux.open('w', encoding='utf8') as fout:
        fout.write(first_line + '\n')

    # We perform the import with the only goal to keep a small file containing the pipe
    print('-- Extracting pipeline')
    path_pipe = modeldir.joinpath('import.pipe')
    
    cmd = mallet_path + \
        ' import-file --use-pipe-from %s --input %s --output %s'   
    cmd = cmd % (path_corpus, path_aux, path_pipe)

    try:
        print(f'-- Running command {cmd}')
        check_output(args=cmd, shell=True)
    except:
        print('-- Failed to extract pipeline. Revise command')

    # Remove auxiliary file
    path_aux.unlink()

    return 

In [16]:
def inference(modeldir, corpus_inf):

    # A proper corpus should exist with the corresponding ipmortation pipe
    path_pipe = modeldir.joinpath('import.pipe')
    if not path_pipe.is_file():
        print('-- Inference error. Importation pipeline not found')
        return

    # Get inferencer
    inferencer = modeldir.joinpath('inferencer.mallet')
    
    # File for performing inference on
    corpus_file = modeldir.joinpath("corpus_inf.txt")
    with open(corpus_file, 'w') as fout:
        [fout.write(str(idx) + ' 0 ' + ' '.join(doc) + '\n') for idx,doc in enumerate(corpus_inf)]

    # The following files will be generated in the same folder
    corpus_mallet_inf = modeldir.joinpath('corpus_inf.mallet')  # mallet serialized
    doc_topics_file = modeldir.joinpath('doc-topics-inf.txt')

    # Import data to mallet
    print('-- Inference: Mallet Data Import')
   
    cmd = mallet_path + \
        ' import-file --use-pipe-from %s --input %s --output %s'
    cmd = cmd % (path_pipe, corpus_file, corpus_mallet_inf)

    try:
        print(f'-- Running command {cmd}')
        check_output(args=cmd, shell=True)
    except:
        print(
            '-- Mallet failed to import data. Revise command')
        return

    # Get topic proportions
    print('-- Inference: Inferring Topic Proportions')
    num_iterations = 100
    doc_topic_thr = 0

    cmd = mallet_path + \
        ' infer-topics --inferencer %s --input %s --output-doc-topics %s ' + \
        ' --doc-topics-threshold ' + str(doc_topic_thr) + \
        ' --num-iterations ' + str(num_iterations)
    cmd = cmd % (inferencer, corpus_mallet_inf, doc_topics_file)

    try:
        print(f'-- Running command {cmd}')
        check_output(args=cmd, shell=True)
    except:
        print('-- Mallet inference failed. Revise command')
        return

    return

In [17]:
def eval_betas(beta, topic_vectors):
    print('Tópicos (equivalentes) evaluados correctamente:')
    score = np.sum(np.max(np.sqrt(beta).dot(np.sqrt(topic_vectors.T)), axis=0))
    printgr(score)
    return score

In [18]:
def eval_thetas(thetas_theoretical, thetas_actual, n_docs):
    sim_mat_theoretical = np.sqrt(thetas_theoretical).dot(np.sqrt(thetas_theoretical.T))
    sim_mat_actual = np.sqrt(thetas).dot(np.sqrt(thetas.T))
    print('Difference in evaluation of doc similarity:')
    score = np.sum(np.abs(sim_mat_theoretical - sim_mat_actual))/n_docs
    printgr(score)
    return score

### *2.1. Centralized approach*

In [19]:
# Define corpus
corpus = [doc for docs_node in documents_all for doc in docs_node[0:n_docs]]
print(len(corpus))

# Train model 
modelname = "mallet_centralized"
modeldir = train(modelname, modelsdir, corpus)

25000
-- -- Creating backup of existing model in /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized_old
-- -- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "[\p{L}\p{N}][\p{L}\p{N}\p{P}]*" --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus.mallet
/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet train-topics --config /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/trainconfig.json
-- -- Training mallet topic model. Command is /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/

Mallet LDA: 50 topics, 6 topic bits, 111111 topic mask
Data loaded.
max tokens: 249
total tokens: 4987605
<10> LL/token: -7.94901
<20> LL/token: -7.0516
<30> LL/token: -6.83818
<40> LL/token: -6.73442

0	0.02	wd2721 wd1274 wd3212 wd1905 wd3701 wd3501 wd725 wd2024 wd3302 wd69 wd4689 wd2115 wd317 wd4677 wd4149 wd1484 wd4831 wd4283 wd603 wd4200 
1	0.02	wd1698 wd1233 wd3357 wd4545 wd2463 wd155 wd4514 wd4480 wd1437 wd4127 wd697 wd4629 wd2698 wd1552 wd2197 wd206 wd2864 wd602 wd2760 wd3365 
2	0.02	wd1698 wd1233 wd3357 wd4545 wd2463 wd155 wd4514 wd1437 wd4480 wd4127 wd697 wd4629 wd2698 wd1552 wd2197 wd602 wd2864 wd206 wd2760 wd3365 
3	0.02	wd562 wd379 wd3654 wd2163 wd3311 wd4197 wd1556 wd3020 wd597 wd1256 wd3382 wd2441 wd4038 wd2800 wd881 wd4553 wd4719 wd2838 wd1541 wd3837 
4	0.02	wd3551 wd66 wd1332 wd4049 wd2676 wd3479 wd1453 wd2648 wd2586 wd2547 wd939 wd2032 wd844 wd7 wd178 wd3132 wd1356 wd1727 wd2763 wd3974 
5	0.02	wd295 wd4382 wd2689 wd2872 wd1341 wd2020 wd763 wd3171 wd2103 wd4821 wd1316 w

In [20]:
# Get betas
betas = np.zeros((tm_settings["n_topics"], tm_settings["vocab_size"]))
with open(modeldir.joinpath('word-topic-counts.txt').resolve().as_posix(), 'r', encoding='utf8') as fin:
    for line in fin.readlines():
        tokens = line.split()[1:]
        pos = int(tokens[0][2:])
        for el in tokens[1:]:
            tpc = int(el.split(':')[0])
            cnt = int(el.split(':')[1])
            betas[tpc,pos] = cnt

betas = normalize(betas,axis=1,norm='l1')

# Get thetas
thetas = np.loadtxt(modeldir.joinpath('doc-topics.txt').resolve().as_posix(), delimiter='\t', dtype=np.float32)[:,2:][:n_docs,:]
printgr(thetas.shape)
thetas[thetas<3e-3] = 0
thetas = normalize(thetas,axis=1,norm='l1')
thetas_theoretical = doc_topics_all[0][0:n_docs]

# Eval betas and thetas
betas_21 = eval_betas(betas, topic_vectors)
thetas_21 = eval_thetas(thetas_theoretical, thetas, n_docs)

[38;5;2m(1000, 50)[0m
Tópicos (equivalentes) evaluados correctamente:
[38;5;2m30.015168665581044[0m
Difference in evaluation of doc similarity:
[38;5;2m422.7253461956504[0m


#### 2.1.1. Inference on the centralized approach with `inf1`

In [21]:
# Define inference corpus
corpus_inf = [doc for docs_node in documents_all for doc in docs_node[n_docs:(n_docs+n_docs_inf)]]
print(len(corpus_inf))

# Perform inference
extractPipe(modeldir)
inference(modeldir, corpus_inf)

25000
-- Extracting pipeline
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus.mallet --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus_aux.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/import.pipe
-- Inference: Mallet Data Import
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/import.pipe --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus_inf.txt --output /export/usuarios_ml4ds/lbartolome/data/pr

In [22]:
# Get inferred thetas
thetas_inf = np.loadtxt(modeldir.joinpath('doc-topics-inf.txt').resolve().as_posix(), delimiter='\t', dtype=np.float32)[:,2:][:n_docs,:]
thetas_inf[thetas_inf<3e-3] = 0
thetas_inf = normalize(thetas_inf,axis=1,norm='l1')
thetas_theoretical = doc_topics_all[0][n_docs:(n_docs+n_docs_inf)]

# Eval thetas
thetas_211 = eval_thetas(thetas_theoretical, thetas_inf, n_docs_inf)

Difference in evaluation of doc similarity:
[38;5;2m426.45521341110384[0m


#### 2.1.2. Inference on the centralized approach with with `inf2`

In [23]:
# Define inference corpus
corpus_inf = global_inf_corpus
print(len(corpus_inf))

# Perform inference
extractPipe(modeldir)
inference(modeldir, corpus_inf)

1000
-- Extracting pipeline
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus.mallet --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus_aux.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/import.pipe
-- Inference: Mallet Data Import
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/import.pipe --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus_inf.txt --output /export/usuarios_ml4ds/lbartolome/data/pro

In [24]:
# Get inferred thetas
thetas_inf = np.loadtxt(modeldir.joinpath('doc-topics-inf.txt').resolve().as_posix(), delimiter='\t', dtype=np.float32)[:,2:][:n_docs,:]
thetas_inf[thetas_inf<3e-3] = 0
thetas_inf = normalize(thetas_inf,axis=1,norm='l1')
print(thetas_inf.shape)
thetas_theoretical = global_inf_doc_topics
print(thetas_theoretical.shape)
# Eval thetas
thetas_212 = eval_thetas(thetas_theoretical, thetas_inf, n_docs_inf)

(1000, 50)
(1000, 50)
Difference in evaluation of doc similarity:
[38;5;2m307.2826246516663[0m


### *2.2. Just in one node approach*

In [25]:
# Define corpus
corpus = documents_all[0][0:n_docs]
print(len(corpus))

# Train model 
modelname = "mallet_node0"
modeldir = train(modelname, modelsdir, corpus)

1000
-- -- Creating backup of existing model in /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0_old
-- -- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "[\p{L}\p{N}][\p{L}\p{N}\p{P}]*" --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/corpus.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/corpus.mallet
/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet train-topics --config /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/trainconfig.json
-- -- Training mallet topic model. Command is /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet t

Mallet LDA: 50 topics, 6 topic bits, 111111 topic mask
Data loaded.
max tokens: 249
total tokens: 200168
<10> LL/token: -7.88803
<20> LL/token: -7.36069
<30> LL/token: -7.22429
<40> LL/token: -7.1477

0	0.02	wd3551 wd1332 wd1453 wd2586 wd2648 wd2547 wd178 wd2763 wd3974 wd570 wd2589 wd2620 wd2602 wd1623 wd769 wd2799 wd2454 wd146 wd813 wd1211 
1	0.02	wd2721 wd3551 wd4689 wd1332 wd3302 wd2648 wd3212 wd3701 wd1905 wd880 wd2602 wd4022 wd603 wd3974 wd2589 wd813 wd3681 wd2799 wd4066 wd4197 
2	0.02	wd3551 wd1332 wd2648 wd1453 wd2547 wd2586 wd178 wd570 wd3974 wd2763 wd2799 wd1623 wd3750 wd3442 wd2589 wd769 wd813 wd2602 wd3064 wd4632 
3	0.02	wd1562 wd3551 wd3245 wd880 wd2384 wd1618 wd3034 wd4873 wd298 wd212 wd1018 wd2574 wd3074 wd492 wd2365 wd3965 wd2463 wd4614 wd1332 wd1373 
4	0.02	wd3551 wd1453 wd2648 wd1332 wd2586 wd178 wd2547 wd2763 wd2721 wd2325 wd4652 wd3750 wd1274 wd3974 wd2799 wd769 wd4632 wd570 wd2454 wd2404 
5	0.02	wd4663 wd3365 wd2965 wd2894 wd2612 wd2463 wd2442 wd2388 wd2325 wd4930 w

In [26]:
# Get betas
betas = np.zeros((tm_settings["n_topics"], tm_settings["vocab_size"]))
with open(modeldir.joinpath('word-topic-counts.txt').resolve().as_posix(), 'r', encoding='utf8') as fin:
    for line in fin.readlines():
        tokens = line.split()[1:]
        pos = int(tokens[0][2:])
        for el in tokens[1:]:
            tpc = int(el.split(':')[0])
            cnt = int(el.split(':')[1])
            betas[tpc,pos] = cnt

betas = normalize(betas,axis=1,norm='l1')

# Get thetas
thetas = np.loadtxt(modeldir.joinpath('doc-topics.txt').resolve().as_posix(), delimiter='\t', dtype=np.float32)[:,2:][:n_docs,:]

thetas[thetas<3e-3] = 0
thetas = normalize(thetas,axis=1,norm='l1')
thetas_theoretical = doc_topics_all[0][0:n_docs]
print(thetas_theoretical.shape)

# Eval betas and thetas
betas_22 = eval_betas(betas, topic_vectors)
thetas_22 = eval_thetas(thetas_theoretical, thetas, n_docs)

(1000, 50)
Tópicos (equivalentes) evaluados correctamente:
[38;5;2m9.965826482161603[0m
Difference in evaluation of doc similarity:
[38;5;2m38.31733924349465[0m


#### 2.2.1. Inference on the just in one node approach with with `inf1`

In [27]:
# Define inference corpus
corpus_inf = documents_all[0][n_docs:(n_docs+n_docs_inf)]
print(len(corpus_inf))

# Perform inference
extractPipe(modeldir)
inference(modeldir, corpus_inf)

1000
-- Extracting pipeline
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/corpus.mallet --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/corpus_aux.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/import.pipe
-- Inference: Mallet Data Import
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/import.pipe --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/corpus_inf.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated

In [28]:
# Get inferred thetas
thetas_inf = np.loadtxt(modeldir.joinpath('doc-topics-inf.txt').resolve().as_posix(), delimiter='\t', dtype=np.float32)[:,2:][:n_docs,:]
thetas_inf[thetas_inf<3e-3] = 0
thetas_inf = normalize(thetas_inf,axis=1,norm='l1')
thetas_theoretical = doc_topics_all[0][n_docs:(n_docs+n_docs_inf)]

# Eval thetas
thetas_221 = eval_thetas(thetas_theoretical, thetas_inf, n_docs_inf)

Difference in evaluation of doc similarity:
[38;5;2m118.28287554533989[0m


#### 2.2.2. Inference on the just in one node approach with `inf2`

In [29]:
# Define inference corpus
corpus_inf = global_inf_corpus
print(len(corpus_inf))

# Perform inference
extractPipe(modeldir)
inference(modeldir, corpus_inf)

1000
-- Extracting pipeline
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/corpus.mallet --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/corpus_aux.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/import.pipe
-- Inference: Mallet Data Import
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/import.pipe --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/corpus_inf.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated

In [30]:
# Get inferred thetas
thetas_inf = np.loadtxt(modeldir.joinpath('doc-topics-inf.txt').resolve().as_posix(), delimiter='\t', dtype=np.float32)[:,2:][:n_docs,:]
thetas_inf[thetas_inf<3e-3] = 0
thetas_inf = normalize(thetas_inf,axis=1,norm='l1')
thetas_theoretical = global_inf_doc_topics

# Eval thetas
thetas_222 = eval_thetas(thetas_theoretical, thetas_inf, n_docs_inf)

Difference in evaluation of doc similarity:
[38;5;2m155.48133444601865[0m


## **3. ProdLDA**


---

In [31]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse

In [32]:
cd /export/usuarios_ml4ds/lbartolome/topicmodeler

/export/usuarios_ml4ds/lbartolome/topicmodeler


In [33]:
from src.topicmodeling.neural_models.pytorchavitm.datasets.bow_dataset import BOWDataset
from src.topicmodeling.neural_models.pytorchavitm.avitm_network.avitm import AVITM

### *Auxiliary functions*

In [34]:
def convert_topic_word_to_init_size(vocab_size, model, model_type,
                                    ntopics, id2token, all_words):
    """It converts the topic-word distribution matrix obtained from the training of a model into a matrix with the dimensions of the original topic-word distribution, assigning zeros to those words that are not present in the corpus. 
    It is only of use in case we are training a model over a synthetic dataset, so as to later compare the performance of the attained model in what regards to the similarity between the original and the trained model.

    Args:
        * vocab_size (int):       Size of the synethic'data vocabulary.
        * model (AVITM/CTM):      Model whose topic-word matrix is being transformed.
        * model_type (str):       Type of the trained model (e.g. AVITM)
        * ntopics (int):          Number of topics of the trained model.
        * id2token (List[tuple]): Mappings with the content of the document-term matrix.
        * all_words (List[str]):  List of all the words of the vocabulary of size vocab_size.

    Returns:
        * ndarray: Normalized transormed topic-word distribution.
    """
    if model_type == "avitm":
        w_t_distrib = np.zeros((ntopics, vocab_size), dtype=np.float64)
        wd = model.get_topic_word_distribution()
        for i in np.arange(ntopics):
            for idx, word in id2token.items():
                for j in np.arange(len(all_words)):
                    if all_words[j] == word:
                        w_t_distrib[i, j] = wd[i][idx]
                        break
        normalized_array = normalize(w_t_distrib,axis=1,norm='l1')
        return normalized_array
    else:
        print("Method not impleemnted for the selected model type")
        return None

In [35]:
def train_avitm(modelname, modelsdir, corpus):
    
    # Create model folder
    modeldir, configFile = create_model_folder(modelname, modelsdir)
    
    # Create corpus in ProdLDA format (BoWDataset)
    cv = CountVectorizer(input='content', lowercase=True, stop_words='english', binary=False)
    docs = [" ".join(corpus[i]) for i in np.arange(len(corpus))]
    train_bow = cv.fit_transform(docs).toarray()
    idx2token = cv.get_feature_names_out()
    train_dataset = BOWDataset(train_bow, idx2token)
    input_size = len(idx2token)
    id2token = {k: v for k, v in zip(range(0, len(idx2token)), idx2token)}
    
    avitm = AVITM(logger=None,
              input_size=input_size,
              n_components=n_topics,
              model_type="prodLDA",
              hidden_sizes=(100, 100),
              activation='softplus',
              dropout=0.2,
              learn_priors=True,
              batch_size=64,
              lr=2e-3,
              momentum=0.99,
              solver='adam',
              num_epochs=100,
              reduce_on_plateau=False,
              topic_prior_mean=0.0,
              topic_prior_variance=None,
              num_samples=10,
              num_data_loader_workers=0,
              verbose=True)
    avitm.fit(train_dataset)
    
    return modeldir, avitm, cv, id2token, idx2token

### *3.1. Centralized approach*

In [36]:
# Define corpus
corpus = [doc for docs_node in documents_all for doc in docs_node[0:n_docs]]
print(len(corpus))

# Train model 
modelname = "prod_centralized"
modeldir, avitm, cv, id2token, idx2token = train_avitm(modelname, modelsdir, corpus)

25000
-- -- Creating backup of existing model in /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/prod_centralized_old
Settings: 
                N Components: 50
                Topic Prior Mean: 0.0
                Topic Prior Variance: None
                Model Type: prodLDA
                Hidden Sizes: (100, 100)
                Activation: softplus
                Dropout: 0.2
                Learn Priors: True
                Learning Rate: 0.002
                Momentum: 0.99
                Reduce On Plateau: False
                Save Dir: None


Epoch: [100/100]	 Seen Samples: [2500000/2500000]	Train Loss: 1563.81712078125	Time: 0:00:06.235803: : 100it [10:10,  6.11s/it]
Sampling: [20/20]: : 20it [01:17,  3.89s/it]


In [37]:
# Get betas
betas = avitm.get_topic_word_distribution()
all_words = ['wd'+str(word) for word in np.arange(vocab_size+1) if word > 0]
betas = convert_topic_word_to_init_size(vocab_size=vocab_size,
                                        model=avitm,
                                        model_type="avitm",
                                        ntopics=n_topics,
                                        id2token=id2token,
                                        all_words=all_words)

# Get thetas
thetas = np.asarray(avitm.get_doc_topic_distribution(avitm.train_data))[0:n_docs,:]
thetas[thetas < 3e-3] = 0
thetas = normalize(thetas, axis=1, norm='l1')
thetas_theoretical = doc_topics_all[0][0:n_docs]

# Eval betas and thetas
betas_31 = eval_betas(betas, topic_vectors)
thetas_31 = eval_thetas(thetas_theoretical, thetas, n_docs)

Sampling: [20/20]: : 20it [01:18,  3.94s/it]


Tópicos (equivalentes) evaluados correctamente:
[38;5;2m8.45163662527067[0m
Difference in evaluation of doc similarity:
[38;5;2m84.84709719327337[0m


#### 3.1.1. Inference on the centralized approach with `inf1`

In [38]:
# Define inference corpus
corpus_inf = [doc for docs_node in documents_all for doc in docs_node[n_docs:(n_docs+n_docs_inf)]]
print(len(corpus_inf))


# Get inferred thetas
docs_val_conv = [" ".join(corpus_inf[i]) for i in np.arange(len(corpus_inf))]
val_bow = cv.transform(docs_val_conv)
val_bow = val_bow.toarray()
val_data = BOWDataset(val_bow, idx2token)

thetas_inf = np.asarray(avitm.get_doc_topic_distribution(val_data))[0:n_docs,:]
thetas_inf[thetas_inf < 3e-3] = 0
thetas_inf = normalize(thetas_inf, axis=1, norm='l1')

thetas_theoretical = doc_topics_all[0][n_docs:(n_docs+n_docs_inf)]

# Eval thetas
thetas_311 = eval_thetas(thetas_theoretical, thetas_inf, n_docs_inf)

25000


Sampling: [20/20]: : 20it [01:19,  3.96s/it]


Difference in evaluation of doc similarity:
[38;5;2m137.55107687431135[0m


#### 3.1.2. Inference on the centralized approach with with `inf2`

In [39]:
# Define inference corpus
corpus_inf = global_inf_corpus
print(len(corpus_inf))

# Get inferred thetas
docs_val_conv = [" ".join(corpus_inf[i]) for i in np.arange(len(corpus_inf))]
val_bow = cv.transform(docs_val_conv)
val_bow = val_bow.toarray()
val_data = BOWDataset(val_bow, idx2token)

thetas_inf = np.asarray(avitm.get_doc_topic_distribution(val_data))[0:n_docs,:]
thetas_inf[thetas_inf < 3e-3] = 0
thetas_inf = normalize(thetas_inf, axis=1, norm='l1')

thetas_theoretical = global_inf_doc_topics

# Eval thetas
thetas_312 = eval_thetas(thetas_theoretical, thetas_inf, n_docs_inf)

1000


Sampling: [20/20]: : 20it [00:36,  1.84s/it]

Difference in evaluation of doc similarity:
[38;5;2m161.78405839707912[0m





### *3.2. Just in one node approach*

In [40]:
# Define corpus
corpus = documents_all[0][0:n_docs]
print(len(corpus))

# Train model 
modelname = "prodlda_node0"
modeldir, avitm, cv, id2token, idx2token = train_avitm(modelname, modelsdir, corpus)

# Get betas
betas = avitm.get_topic_word_distribution()
all_words = ['wd'+str(word) for word in np.arange(vocab_size+1) if word > 0]
betas = convert_topic_word_to_init_size(vocab_size=vocab_size,
                                        model=avitm,
                                        model_type="avitm",
                                        ntopics=n_topics,
                                        id2token=id2token,
                                        all_words=all_words)

# Get thetas
thetas = np.asarray(avitm.get_doc_topic_distribution(avitm.train_data))
thetas[thetas < 3e-3] = 0
thetas = normalize(thetas, axis=1, norm='l1')
thetas_theoretical = doc_topics_all[0][0:n_docs]

# Eval betas and thetas
betas_32 = eval_betas(betas, topic_vectors)
thetas_32 = eval_thetas(thetas_theoretical, thetas, n_docs)

1000
-- -- Creating backup of existing model in /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/prodlda_node0_old
Settings: 
                N Components: 50
                Topic Prior Mean: 0.0
                Topic Prior Variance: None
                Model Type: prodLDA
                Hidden Sizes: (100, 100)
                Activation: softplus
                Dropout: 0.2
                Learn Priors: True
                Learning Rate: 0.002
                Momentum: 0.99
                Reduce On Plateau: False
                Save Dir: None


Epoch: [100/100]	 Seen Samples: [100000/100000]	Train Loss: 1409.938140625	Time: 0:00:01.867781: : 100it [03:16,  1.97s/it]
Sampling: [20/20]: : 20it [00:37,  1.86s/it]
Sampling: [20/20]: : 20it [00:34,  1.75s/it]

Tópicos (equivalentes) evaluados correctamente:
[38;5;2m5.128350820216635[0m
Difference in evaluation of doc similarity:
[38;5;2m134.69750122323586[0m





#### 3.2.1. Inference on the just in one node approach with `inf1`

In [41]:
# Define inference corpus
corpus_inf = documents_all[0][n_docs:(n_docs+n_docs_inf)]
print(len(corpus_inf))

# Get inferred thetas
docs_val_conv = [" ".join(corpus_inf[i]) for i in np.arange(len(corpus_inf))]
val_bow = cv.transform(docs_val_conv)
val_bow = val_bow.toarray()
val_data = BOWDataset(val_bow, idx2token)

thetas_inf = np.asarray(avitm.get_doc_topic_distribution(val_data))
thetas_inf[thetas_inf < 3e-3] = 0
thetas_inf = normalize(thetas_inf, axis=1, norm='l1')

thetas_theoretical = doc_topics_all[0][n_docs:(n_docs+n_docs_inf)]

# Eval thetas
thetas_321 = eval_thetas(thetas_theoretical, thetas_inf, n_docs_inf)

1000


Sampling: [20/20]: : 20it [00:35,  1.77s/it]

Difference in evaluation of doc similarity:
[38;5;2m179.62707169588805[0m





#### 2.2.2. Inference on the just in one node approach with `inf2`

In [42]:
# Define inference corpus
corpus_inf = global_inf_corpus
print(len(corpus_inf))

# Get inferred thetas
docs_val_conv = [" ".join(corpus_inf[i]) for i in np.arange(len(corpus_inf))]
val_bow = cv.transform(docs_val_conv)
val_bow = val_bow.toarray()
val_data = BOWDataset(val_bow, idx2token)

thetas_inf = np.asarray(avitm.get_doc_topic_distribution(val_data))
thetas_inf[thetas_inf < 3e-3] = 0
thetas_inf = normalize(thetas_inf, axis=1, norm='l1')

thetas_theoretical = global_inf_doc_topics

# Eval thetas
thetas_322 = eval_thetas(thetas_theoretical, thetas_inf, n_docs_inf)

1000


Sampling: [20/20]: : 20it [00:35,  1.77s/it]

Difference in evaluation of doc similarity:
[38;5;2m161.3290456853711[0m





In [43]:
print("_"*115)
print(f"{'Mallet-Centr':>50}\
        {'Mallet-Node0':>8}\
        {'ProdLDA-Centr':>6}\
        {'ProdLDA-Node0':>10}"
     )
print("-"*115)
print(f"{'Nr topics correctly evaluated:':<30}\
        {betas_21:>10.3f}\
        {betas_22:>11.3f}\
        {betas_31:>13.3f}\
        {betas_32:>13.3f}"
     )
print(f"{'Difference in doc sim:':<30}\
        {thetas_21:>10.3f}\
        {thetas_22:>11.3f}\
        {thetas_31:>13.3f}\
        {thetas_32:>13.3f}"
     )
print(f"{'Difference in doc sim inf1:':<30}\
        {thetas_211:>10.3f}\
        {thetas_221:>11.3f}\
        {thetas_311:>13.3f}\
        {thetas_321:>13.3f}"
     )
print(f"{'Difference in doc sim inf2:':<30}\
        {thetas_212:>10.3f}\
        {thetas_222:>11.3f}\
        {thetas_312:>13.3f}\
        {thetas_322:>13.3f}"
     )
print("_"*115)
print()

print("_"*115)
print(f"{'Stats':<15}\
        {'Nr nodes':>8}\
        {'Nr topics':>8}\
        {'Nr frozen':>8}\
        {'Alpha':>8}\
        {'Beta':>8}\
        {'Ndocs':>8}"
     )
print("-"*115)
print(f"{'':>15}\
        {n_nodes:>8}\
        {n_topics:>8}\
        {frozen_topics:>8}\
        {alpha:>8}\
        {beta:>8}\
        {n_docs:>8}"
     )
print("_"*115)

___________________________________________________________________________________________________________________
                                      Mallet-Centr        Mallet-Node0        ProdLDA-Centr        ProdLDA-Node0
-------------------------------------------------------------------------------------------------------------------
Nr topics correctly evaluated:            30.015              9.966                8.452                5.128
Difference in doc sim:                   422.725             38.317               84.847              134.698
Difference in doc sim inf1:              426.455            118.283              137.551              179.627
Difference in doc sim inf2:              307.283            155.481              161.784              161.329
___________________________________________________________________________________________________________________

__________________________________________________________________________________________________