In [1]:
import numpy as np
import pandas as pd
import zipfile as zp
from pathlib import Path
from gensim.utils import check_output
from sklearn.preprocessing import normalize
from scipy.special import softmax
import shutil
from subprocess import check_output
import torch
from tqdm import tqdm



In [2]:
mallet_path = '/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet'

# 1. Creation of synthetic corpus
We consider a scenario with n parties, each of them as an associated corpus.
To generate the corpus associated with each of the parties, we consider a common beta distribution (word-topic distribution), but we freeze different topics/ assign different asymmetric Dirichlet priors favoring different topics at the time of generating the document that composes each party's corpus.

In [3]:
def rotateArray(arr, n, d):
    temp = []
    i = 0
    while (i < d):
        temp.append(arr[i])
        i = i + 1
    i = 0
    while (d < n):
        arr[i] = arr[d]
        i = i + 1
        d = d + 1
    arr[:] = arr[: i] + temp
    return arr

In [24]:
# Topic modeling settings
vocab_size = 5000
n_topics = 50
beta = 1e-2
alpha = 1/n_topics
n_docs = 10000
n_docs_inf = 10000
n_docs += n_docs_inf
nwords = (150, 250) #Min and max lengths of the documents
alg = "lda" #"prod"

In [5]:
# Nodes settings
n_nodes = 5
frozen_topics = 5
prior_frozen = frozen_topics * [alpha]
own_topics = int((n_topics-frozen_topics)/n_nodes)
prior_nofrozen = own_topics * [alpha] + (n_topics-frozen_topics-own_topics) * [alpha/10000]

In [6]:
# Step 1 - generation of topics
topic_vectors = np.random.dirichlet(vocab_size*[beta], n_topics)
print('Probabilidades ordenadas para el primer vector de tópicos:')
print(np.sort(topic_vectors[0])[::-1])
print(topic_vectors.shape)

Probabilidades ordenadas para el primer vector de tópicos:
[6.93194658e-002 4.44549314e-002 4.36735821e-002 ... 1.38660722e-312
 0.00000000e+000 0.00000000e+000]
(50, 5000)


In [7]:
#Here we compare alignment of the topic_vector matrix with itself and with another randomly generated matrix
print('Tópicos (equivalentes) identificados correctamente (true):', np.sum(np.max(np.sqrt(topic_vectors).dot(np.sqrt(topic_vectors.T)), axis=0)))
topic_vectors2 = np.random.dirichlet(vocab_size*[beta], n_topics)
print('Tópicos (equivalentes) identificados correctamente (random):', np.sum(np.max(np.sqrt(topic_vectors2).dot(np.sqrt(topic_vectors.T)), axis=0)))

Tópicos (equivalentes) identificados correctamente (true): 50.00000000000004
Tópicos (equivalentes) identificados correctamente (random): 3.692321484117879


In [8]:
# Step 2 - generation of document topic proportions
doc_topics_all = []
for i in np.arange(n_nodes):
    doc_topics = np.random.dirichlet(prior_frozen + prior_nofrozen, n_docs)
    prior_nofrozen = rotateArray(prior_nofrozen, len(prior_nofrozen), own_topics)
    doc_topics_all.append(doc_topics)

## According to LDA's generative process
**for** each document $d$ **do**  
&nbsp;&nbsp;Draw topic distribution $\theta \sim Dirichlet(\alpha)$  
&nbsp;&nbsp;&nbsp;&nbsp;**for** each word at position $n$ **do**  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Sample topic $z_n \sim Multinomial(1,\theta)$  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Sample word $w_n \sim Multinomial(1, \beta_{z_n})$  
&nbsp;&nbsp;&nbsp;&nbsp;**end**  
**end**

where the distribution of $w_n \sim \beta$, $\theta$ is $Multinomial(1, \beta\theta)$

## According to ProdLDA generative process
In comparisson to LDA's generative process:
- $\beta$ is **unnormalized**
- the conditional distribution of wn is defined as $w_n \mid \beta, \theta \sim Multinomial(1, \sigma(\beta\theta))$

In [9]:
# Step 3 - Document generation
documents_all = []
z_all = []

for i in np.arange(n_nodes):
    print("Generating document words for node ", str(i))
    documents = [] # Document words
    #z = [] # Assignments
    for docid in tqdm(np.arange(n_docs)):
        doc_len = np.random.randint(low=nwords[0], high=nwords[1])
        this_doc_words = []
        #this_doc_assigns = []
        for wd_idx in np.arange(doc_len):
            
            tpc = np.nonzero(np.random.multinomial(1, doc_topics_all[i][docid]))[0][0]
            #this_doc_assigns.append(tpc)
            if alg == "lda":
                word = np.nonzero(np.random.multinomial(1, topic_vectors[tpc]))[0][0]
            else: #prodlda
                pval = np.power(topic_vectors[tpc], doc_topics_all[i][docid][tpc])
                weights = torch.tensor(pval, dtype=torch.float) # create a tensor of weights
                word = torch.multinomial(weights, 1).numpy()[0]
                #pval = normalize(pval[:,np.newaxis], norm='l1', axis=0).ravel()
                #word = np.nonzero(np.random.multinomial(1, b))[0][0]
            this_doc_words.append('wd'+str(word))
        #z.append(this_doc_assigns)
        documents.append(this_doc_words)
    documents_all.append(documents)
    #z_all.append(z)

Generating document words for node  0


100%|███████████████████████████████████████████████████████████████████████| 20000/20000 [08:01<00:00, 41.57it/s]


Generating document words for node  1


100%|███████████████████████████████████████████████████████████████████████| 20000/20000 [08:00<00:00, 41.59it/s]


Generating document words for node  2


100%|███████████████████████████████████████████████████████████████████████| 20000/20000 [08:14<00:00, 40.45it/s]


Generating document words for node  3


100%|███████████████████████████████████████████████████████████████████████| 20000/20000 [08:08<00:00, 40.97it/s]


Generating document words for node  4


100%|███████████████████████████████████████████████████████████████████████| 20000/20000 [08:06<00:00, 41.14it/s]


## Generation of second corpus for validation with different generative process from that of the training corpus

In [10]:
n_docs_inf2 = 10000

In [11]:
# Step 1 - generation of topics
topic_vectors2 = np.random.dirichlet(vocab_size*[beta], n_topics)
print('Probabilidades ordenadas para el primer vector de tópicos:')
print(np.sort(topic_vectors2[0])[::-1])
print(topic_vectors2.shape)

Probabilidades ordenadas para el primer vector de tópicos:
[0.04373123 0.03963138 0.03749698 ... 0.         0.         0.        ]
(50, 5000)


In [12]:
# Step 2 - generation of document topic proportions
doc_topics_all2 = []
for i in np.arange(n_nodes):
    doc_topics = np.random.dirichlet(prior_frozen + prior_nofrozen, n_docs_inf2)
    prior_nofrozen = rotateArray(prior_nofrozen, len(prior_nofrozen), own_topics)
    doc_topics_all2.append(doc_topics)

In [13]:
# Step 3 - Document generation
documents_all2 = []
z_all2 = []

for i in np.arange(n_nodes):
    print("Generating document words for node ", str(i))
    documents = [] # Document words
    #z = [] # Assignments
    for docid in tqdm(np.arange(n_docs_inf2)):
        doc_len = np.random.randint(low=nwords[0], high=nwords[1])
        this_doc_words = []
        #this_doc_assigns = []
        for wd_idx in np.arange(doc_len):
            tpc = np.nonzero(np.random.multinomial(1, doc_topics_all[i][docid]))[0][0]
            #this_doc_assigns.append(tpc)
            if alg == "lda":
                word = np.nonzero(np.random.multinomial(1, topic_vectors[tpc]))[0][0]
            else: #prodlda
                pval = np.power(topic_vectors[tpc], doc_topics_all[i][docid][tpc])
                weights = torch.tensor(pval, dtype=torch.float) # create a tensor of weights
                word = torch.multinomial(weights, 1).numpy()[0]
                #pval = normalize(pval[:,np.newaxis], norm='l1', axis=0).ravel()
                #word = np.nonzero(np.random.multinomial(1, b))[0][0]
            this_doc_words.append('wd'+str(word))
        #z.append(this_doc_assigns)
        documents.append(this_doc_words)
    documents_all2.append(documents)
    #z_all.append(z)

Generating document words for node  0


100%|███████████████████████████████████████████████████████████████████████| 10000/10000 [04:01<00:00, 41.47it/s]


Generating document words for node  1


100%|███████████████████████████████████████████████████████████████████████| 10000/10000 [04:00<00:00, 41.64it/s]


Generating document words for node  2


100%|███████████████████████████████████████████████████████████████████████| 10000/10000 [04:07<00:00, 40.40it/s]


Generating document words for node  3


100%|███████████████████████████████████████████████████████████████████████| 10000/10000 [04:03<00:00, 41.07it/s]


Generating document words for node  4


100%|███████████████████████████████████████████████████████████████████████| 10000/10000 [04:02<00:00, 41.19it/s]


# 2. Mallet

In [14]:
def extractPipe(modeldir):
    
    path_corpus = modeldir.joinpath('corpus.mallet')
    if not path_corpus.is_file():
        print('-- Pipe extraction: Could not locate corpus file')
        return

    # Create auxiliary file with only first line from the original corpus file
    path_txt = modeldir.joinpath('corpus.txt')
    with path_txt.open('r', encoding='utf8') as f:
        first_line = f.readline()
        
    path_aux = modeldir.joinpath('corpus_aux.txt')
    with path_aux.open('w', encoding='utf8') as fout:
        fout.write(first_line + '\n')

    # We perform the import with the only goal to keep a small file containing the pipe
    print('-- Extracting pipeline')
    path_pipe = modeldir.joinpath('import.pipe')
    
    cmd = mallet_path + \
        ' import-file --use-pipe-from %s --input %s --output %s'   
    cmd = cmd % (path_corpus, path_aux, path_pipe)

    try:
        print(f'-- Running command {cmd}')
        check_output(args=cmd, shell=True)
    except:
        print('-- Failed to extract pipeline. Revise command')

    # Remove auxiliary file
    path_aux.unlink()

    return

In [15]:
def inference(modeldir):

    # A proper corpus should exist with the corresponding ipmortation pipe
    path_pipe = modeldir.joinpath('import.pipe')
    if not path_pipe.is_file():
        print('-- Inference error. Importation pipeline not found')
        return

    # Get inferencer
    inferencer = modeldir.joinpath('inferencer.mallet')
    
    # File for performing inference on
    corpus_file = modeldir.joinpath('corpus_inf.txt')
    if not corpus_file.is_file():
        print('-- Inference error. File to perform the inference on not found')
        return

    # The following files will be generated in the same folder
    corpus_mallet_inf = modeldir.joinpath('corpus_inf.mallet')  # mallet serialized
    doc_topics_file = modeldir.joinpath('doc-topics-inf.txt')

    # Import data to mallet
    print('-- Inference: Mallet Data Import')
   
    cmd = mallet_path + \
        ' import-file --use-pipe-from %s --input %s --output %s'
    cmd = cmd % (path_pipe, corpus_file, corpus_mallet_inf)

    try:
        print(f'-- Running command {cmd}')
        check_output(args=cmd, shell=True)
    except:
        print(
            '-- Mallet failed to import data. Revise command')
        return

    # Get topic proportions
    print('-- Inference: Inferring Topic Proportions')
    num_iterations = 100
    doc_topic_thr = 0

    cmd = mallet_path + \
        ' infer-topics --inferencer %s --input %s --output-doc-topics %s ' + \
        ' --doc-topics-threshold ' + str(doc_topic_thr) + \
        ' --num-iterations ' + str(num_iterations)
    cmd = cmd % (inferencer, corpus_mallet_inf, doc_topics_file)

    try:
        print(f'-- Running command {cmd}')
        check_output(args=cmd, shell=True)
    except:
        print('-- Mallet inference failed. Revise command')
        return

    return

## 2.1. Centralized approach

In [16]:
# Create model folder and save model training configuration
modelname = "mallet_centralized"
modeldir = Path("/export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test").joinpath(modelname)
if modeldir.exists():

    # Remove current backup folder, if it exists
    old_model_dir = Path(str(modeldir) + '_old/')
    if old_model_dir.exists():
        shutil.rmtree(old_model_dir)

    # Copy current model folder to the backup folder.
    shutil.move(modeldir, old_model_dir)
    print(f'-- -- Creating backup of existing model in {old_model_dir}')

modeldir.mkdir()
configFile = modeldir.joinpath('trainconfig.json')

-- -- Creating backup of existing model in /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized_old


In [26]:
# Corpus for centralized approach
my_corpus = [doc for docs_node in documents_all for doc in docs_node[0:(n_docs-n_docs_inf)]]
print(len(my_corpus))

50000


In [27]:
corpusFile = modeldir.joinpath("corpus.txt")
with open(corpusFile, 'w') as fout:
    [fout.write(str(idx) + ' 0 ' + ' '.join(doc) + '\n') for idx,doc in enumerate(my_corpus)]

In [28]:
corpusMallet = modeldir.joinpath('corpus.mallet')

cmd = mallet_path + \
    ' import-file --preserve-case --keep-sequence ' + \
    '--remove-stopwords --token-regex "' + '[\p{L}\p{N}][\p{L}\p{N}\p{P}]*' + \
    '" --input %s --output %s'
cmd = cmd % (corpusFile, corpusMallet)

try:
    print(f'-- -- Running command {cmd}')
    check_output(args=cmd, shell=True)
except:
    print('-- -- Mallet failed to import data. Revise command')

-- -- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "[\p{L}\p{N}][\p{L}\p{N}\p{P}]*" --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus.mallet


In [29]:
with open(configFile, 'w', encoding='utf8') as fout:
    fout.write('input = ' + corpusMallet.resolve().as_posix() + '\n')
    fout.write('num-topics = ' + str(n_topics) + '\n')
    fout.write('alpha = 1\n')
    fout.write('optimize-interval = 10\n')
    fout.write('num-threads = 4\n')
    fout.write('num-iterations = 1000\n')
    fout.write('doc-topics-threshold = 0\n')
    fout.write('output-doc-topics = ' +
                modeldir.joinpath('doc-topics.txt').resolve().as_posix() + '\n')
    fout.write('word-topic-counts-file = ' +
               modeldir.joinpath('word-topic-counts.txt').resolve().as_posix() + '\n')
    fout.write('output-topic-keys = ' +
               modeldir.joinpath('topickeys.txt').resolve().as_posix() + '\n')
    fout.write('inferencer-filename = ' + 
               modeldir.joinpath('inferencer.mallet').resolve().as_posix() + '\n')
cmd = mallet_path + ' train-topics --config ' + configFile.resolve().as_posix()
print(cmd)
try:
    print(
        f'-- -- Training mallet topic model. Command is {cmd}')
    check_output(args=cmd, shell=True)
except:
    print('-- -- Model training failed. Revise command')

/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet train-topics --config /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/trainconfig.json
-- -- Training mallet topic model. Command is /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet train-topics --config /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/trainconfig.json


Mallet LDA: 50 topics, 6 topic bits, 111111 topic mask
Data loaded.
max tokens: 249
total tokens: 9975625
<10> LL/token: -5.75279
<20> LL/token: -5.33538
<30> LL/token: -5.17453
<40> LL/token: -5.09724

0	0.02	wd1535 wd4319 wd622 wd3749 wd109 wd2749 wd2036 wd1894 wd3899 wd904 wd1063 wd4845 wd1492 wd1962 wd4540 wd2065 wd4908 wd1217 wd2793 wd2687 
1	0.02	wd4257 wd649 wd3714 wd2947 wd1760 wd2902 wd3489 wd4410 wd4854 wd3227 wd1573 wd3643 wd3725 wd4713 wd2650 wd3357 wd4162 wd3590 wd2676 wd792 
2	0.02	wd3245 wd1961 wd2772 wd818 wd1192 wd1641 wd30 wd4946 wd592 wd3138 wd3497 wd3049 wd435 wd2132 wd120 wd3311 wd974 wd894 wd2511 wd674 
3	0.02	wd3909 wd1371 wd2180 wd1862 wd311 wd1606 wd240 wd1673 wd1373 wd1065 wd1813 wd2978 wd1514 wd4163 wd3108 wd1710 wd2111 wd3620 wd3282 wd329 
4	0.02	wd3522 wd4169 wd3607 wd2632 wd2795 wd472 wd1666 wd3037 wd4374 wd4421 wd2691 wd824 wd2504 wd3400 wd2822 wd3835 wd1432 wd980 wd4158 wd4423 
5	0.02	wd2266 wd4288 wd931 wd515 wd2274 wd962 wd1228 wd4604 wd3089 wd2421 wd3

<100> LL/token: -5.04355
<110> LL/token: -5.04376
<120> LL/token: -5.04402
<130> LL/token: -5.04422
<140> LL/token: -5.04425

0	0.02	wd1535 wd4319 wd3749 wd622 wd109 wd2749 wd2036 wd1894 wd3899 wd904 wd1063 wd4845 wd1492 wd1962 wd4540 wd2065 wd4908 wd1217 wd2793 wd2687 
1	0.02	wd4257 wd649 wd3714 wd2947 wd1760 wd2902 wd3489 wd4410 wd4854 wd3227 wd1573 wd3643 wd3725 wd4713 wd2650 wd3357 wd4162 wd3590 wd2676 wd792 
2	0.02	wd3245 wd1961 wd2772 wd818 wd1192 wd1641 wd30 wd4946 wd592 wd3138 wd3497 wd3049 wd435 wd2132 wd120 wd3311 wd974 wd894 wd2511 wd674 
3	0.02	wd3909 wd1371 wd2180 wd311 wd1862 wd1606 wd240 wd1373 wd1673 wd2978 wd1065 wd1813 wd3108 wd1514 wd1710 wd4163 wd2111 wd3620 wd329 wd4222 
4	0.02	wd3522 wd4169 wd3607 wd2632 wd2795 wd472 wd1666 wd3037 wd4374 wd4421 wd2691 wd824 wd2504 wd3400 wd2822 wd3835 wd1432 wd980 wd4158 wd4423 
5	0.02	wd2266 wd4288 wd931 wd515 wd2274 wd962 wd1228 wd4604 wd3089 wd2421 wd3316 wd4164 wd4850 wd4993 wd17 wd2889 wd1315 wd4601 wd4401 wd4605 
6	0.02	wd48

<200> LL/token: -5.04389
[beta: 0.0102] 
<210> LL/token: -5.03608
[beta: 0.01092] 
<220> LL/token: -5.03123
[beta: 0.01135] 
<230> LL/token: -5.02933
[beta: 0.01152] 
<240> LL/token: -5.02789

0	0.00395	wd1535 wd4319 wd622 wd3749 wd109 wd2749 wd2036 wd1894 wd3899 wd904 wd1063 wd4845 wd1492 wd1962 wd4540 wd2065 wd4908 wd1217 wd2793 wd2687 
1	0.00382	wd4257 wd649 wd3714 wd2947 wd1760 wd2902 wd3489 wd4410 wd4854 wd3227 wd1573 wd3643 wd3725 wd4713 wd2650 wd3357 wd4162 wd3590 wd2676 wd792 
2	0.00386	wd3245 wd1961 wd2772 wd818 wd1192 wd1641 wd30 wd4946 wd592 wd3138 wd3497 wd3049 wd435 wd120 wd2132 wd3311 wd974 wd894 wd2511 wd674 
3	0.00375	wd3909 wd1371 wd2180 wd311 wd1862 wd1606 wd240 wd1373 wd1673 wd2978 wd1065 wd1813 wd1514 wd3108 wd1710 wd4163 wd2111 wd3620 wd329 wd4222 
4	0.00402	wd3522 wd4169 wd3607 wd2632 wd2795 wd472 wd1666 wd3037 wd4374 wd4421 wd2691 wd824 wd2504 wd3400 wd2822 wd3835 wd1432 wd980 wd4158 wd4423 
5	0.00335	wd2266 wd4288 wd931 wd515 wd2274 wd962 wd1228 wd4604 wd3089 wd

[beta: 0.01179] 
<300> LL/token: -5.02355
[beta: 0.01176] 
<310> LL/token: -5.02318
[beta: 0.01174] 
<320> LL/token: -5.02299
[beta: 0.01179] 
<330> LL/token: -5.02241
[beta: 0.01179] 
<340> LL/token: -5.02225

0	0.00366	wd1535 wd4319 wd3749 wd622 wd109 wd2749 wd2036 wd1894 wd3899 wd904 wd1063 wd4845 wd1492 wd1962 wd4540 wd2065 wd4908 wd1217 wd2793 wd2687 
1	0.00352	wd4257 wd649 wd3714 wd2947 wd1760 wd2902 wd3489 wd4410 wd4854 wd3227 wd1573 wd3643 wd3725 wd4713 wd2650 wd3357 wd4162 wd3590 wd2676 wd792 
2	0.00358	wd3245 wd1961 wd2772 wd818 wd1192 wd1641 wd30 wd4946 wd592 wd3138 wd3497 wd3049 wd435 wd2132 wd120 wd3311 wd974 wd894 wd2511 wd674 
3	0.00348	wd3909 wd1371 wd2180 wd311 wd1862 wd1606 wd240 wd1373 wd1673 wd2978 wd1065 wd1813 wd1514 wd3108 wd1710 wd4163 wd2111 wd3620 wd329 wd4222 
4	0.00371	wd3522 wd4169 wd3607 wd2632 wd2795 wd472 wd1666 wd3037 wd4374 wd4421 wd2691 wd824 wd2504 wd3400 wd2822 wd3835 wd1432 wd980 wd4158 wd4423 
5	0.00309	wd2266 wd4288 wd931 wd515 wd2274 wd962 wd122

[beta: 0.01177] 
<400> LL/token: -5.02226
[beta: 0.01183] 
<410> LL/token: -5.0224
[beta: 0.01177] 
<420> LL/token: -5.02226
[beta: 0.01177] 
<430> LL/token: -5.0223
[beta: 0.01175] 
<440> LL/token: -5.0222

0	0.00366	wd1535 wd4319 wd3749 wd622 wd109 wd2749 wd2036 wd1894 wd3899 wd904 wd1063 wd4845 wd1492 wd1962 wd4540 wd2065 wd4908 wd1217 wd2793 wd2687 
1	0.00349	wd4257 wd649 wd3714 wd2947 wd1760 wd2902 wd3489 wd4410 wd4854 wd3227 wd1573 wd3643 wd3725 wd4713 wd2650 wd3357 wd4162 wd3590 wd2676 wd792 
2	0.00359	wd3245 wd1961 wd2772 wd818 wd1192 wd1641 wd30 wd4946 wd592 wd3138 wd3497 wd3049 wd435 wd120 wd2132 wd3311 wd974 wd894 wd2511 wd674 
3	0.00347	wd3909 wd1371 wd2180 wd311 wd1862 wd1606 wd240 wd1373 wd1673 wd2978 wd1065 wd1813 wd3108 wd1514 wd1710 wd4163 wd2111 wd3620 wd329 wd4222 
4	0.00369	wd3522 wd4169 wd3607 wd2632 wd2795 wd472 wd1666 wd3037 wd4374 wd4421 wd2691 wd824 wd2504 wd3400 wd2822 wd3835 wd1432 wd980 wd4158 wd4423 
5	0.00314	wd2266 wd4288 wd931 wd515 wd2274 wd962 wd1228 w

[beta: 0.01178] 
<500> LL/token: -5.02189
[beta: 0.01177] 
<510> LL/token: -5.02196
[beta: 0.01177] 
<520> LL/token: -5.02195
[beta: 0.01176] 
<530> LL/token: -5.0218
[beta: 0.01175] 
<540> LL/token: -5.02188

0	0.00366	wd1535 wd4319 wd622 wd3749 wd109 wd2749 wd2036 wd1894 wd3899 wd904 wd1063 wd4845 wd1492 wd1962 wd4540 wd2065 wd4908 wd1217 wd2793 wd2687 
1	0.00348	wd4257 wd649 wd3714 wd2947 wd1760 wd2902 wd3489 wd4410 wd4854 wd3227 wd1573 wd3643 wd3725 wd4713 wd2650 wd3357 wd4162 wd3590 wd2676 wd792 
2	0.00354	wd3245 wd1961 wd2772 wd818 wd1192 wd1641 wd30 wd4946 wd592 wd3138 wd3497 wd3049 wd435 wd120 wd2132 wd3311 wd974 wd2511 wd894 wd674 
3	0.00349	wd3909 wd1371 wd2180 wd311 wd1862 wd1606 wd240 wd1373 wd1673 wd2978 wd1065 wd1813 wd3108 wd1514 wd1710 wd4163 wd2111 wd3620 wd329 wd4222 
4	0.00373	wd3522 wd4169 wd3607 wd2632 wd2795 wd472 wd1666 wd3037 wd4374 wd4421 wd2691 wd824 wd2504 wd3400 wd2822 wd3835 wd1432 wd980 wd4158 wd4423 
5	0.00318	wd2266 wd4288 wd931 wd515 wd2274 wd962 wd1228

[beta: 0.01175] 
<600> LL/token: -5.0222
[beta: 0.01176] 
<610> LL/token: -5.02194
[beta: 0.01171] 
<620> LL/token: -5.02187
[beta: 0.01175] 
<630> LL/token: -5.02189
[beta: 0.01171] 
<640> LL/token: -5.02151

0	0.00363	wd1535 wd4319 wd622 wd3749 wd109 wd2749 wd2036 wd1894 wd3899 wd904 wd1063 wd4845 wd1492 wd1962 wd4540 wd2065 wd4908 wd1217 wd2793 wd2687 
1	0.00344	wd4257 wd649 wd3714 wd2947 wd1760 wd2902 wd3489 wd4410 wd4854 wd3227 wd1573 wd3643 wd3725 wd4713 wd2650 wd3357 wd4162 wd3590 wd2676 wd792 
2	0.00361	wd3245 wd1961 wd2772 wd818 wd1192 wd1641 wd30 wd4946 wd592 wd3138 wd3497 wd3049 wd435 wd2132 wd120 wd3311 wd974 wd894 wd2511 wd674 
3	0.00346	wd3909 wd1371 wd2180 wd311 wd1862 wd1606 wd240 wd1373 wd1673 wd2978 wd1065 wd1514 wd1813 wd3108 wd1710 wd4163 wd2111 wd3620 wd329 wd4222 
4	0.0037	wd3522 wd4169 wd3607 wd2632 wd2795 wd472 wd1666 wd3037 wd4374 wd4421 wd2691 wd824 wd2504 wd3400 wd2822 wd3835 wd1432 wd980 wd4158 wd4423 
5	0.00322	wd2266 wd4288 wd931 wd515 wd2274 wd962 wd1228 

[beta: 0.01179] 
<700> LL/token: -5.02187
[beta: 0.01175] 
<710> LL/token: -5.02183
[beta: 0.01174] 
<720> LL/token: -5.02168
[beta: 0.01172] 
<730> LL/token: -5.02152
[beta: 0.01174] 
<740> LL/token: -5.02147

0	0.00365	wd1535 wd4319 wd3749 wd622 wd109 wd2749 wd2036 wd1894 wd3899 wd904 wd1063 wd4845 wd1492 wd1962 wd4540 wd2065 wd4908 wd1217 wd2793 wd2687 
1	0.0035	wd4257 wd649 wd3714 wd2947 wd1760 wd2902 wd3489 wd4410 wd4854 wd3227 wd1573 wd3643 wd3725 wd4713 wd2650 wd3357 wd4162 wd3590 wd2676 wd792 
2	0.0036	wd3245 wd1961 wd2772 wd818 wd1192 wd1641 wd30 wd4946 wd592 wd3138 wd3497 wd3049 wd435 wd2132 wd120 wd3311 wd974 wd894 wd2511 wd674 
3	0.00354	wd3909 wd1371 wd2180 wd311 wd1862 wd1606 wd240 wd1373 wd1673 wd2978 wd1065 wd1813 wd1514 wd3108 wd1710 wd4163 wd2111 wd3620 wd329 wd4222 
4	0.00368	wd3522 wd4169 wd3607 wd2632 wd2795 wd472 wd1666 wd3037 wd4374 wd4421 wd2691 wd824 wd2504 wd3400 wd2822 wd3835 wd1432 wd980 wd4158 wd4423 
5	0.00326	wd2266 wd4288 wd931 wd515 wd2274 wd962 wd1228 

[beta: 0.01178] 
<800> LL/token: -5.02149
[beta: 0.01173] 
<810> LL/token: -5.02162
[beta: 0.01175] 
<820> LL/token: -5.02141
[beta: 0.01181] 
<830> LL/token: -5.02146
[beta: 0.01175] 
<840> LL/token: -5.02165

0	0.00364	wd1535 wd4319 wd3749 wd622 wd109 wd2749 wd2036 wd1894 wd3899 wd904 wd1063 wd4845 wd1492 wd1962 wd4540 wd2065 wd4908 wd1217 wd2793 wd2687 
1	0.00347	wd4257 wd649 wd3714 wd2947 wd1760 wd2902 wd3489 wd4410 wd4854 wd3227 wd1573 wd3643 wd3725 wd4713 wd2650 wd3357 wd4162 wd3590 wd2676 wd792 
2	0.00357	wd3245 wd1961 wd2772 wd818 wd1192 wd1641 wd30 wd4946 wd592 wd3138 wd3497 wd3049 wd435 wd120 wd2132 wd3311 wd974 wd894 wd2511 wd674 
3	0.00351	wd3909 wd1371 wd2180 wd311 wd1862 wd1606 wd240 wd1373 wd1673 wd2978 wd1065 wd1813 wd3108 wd1514 wd1710 wd4163 wd2111 wd3620 wd329 wd4222 
4	0.00372	wd3522 wd4169 wd3607 wd2632 wd2795 wd472 wd1666 wd3037 wd4374 wd4421 wd2691 wd824 wd2504 wd3400 wd2822 wd3835 wd1432 wd980 wd4158 wd4423 
5	0.00326	wd2266 wd4288 wd931 wd515 wd2274 wd962 wd122

[beta: 0.0117] 
<900> LL/token: -5.0217
[beta: 0.01175] 
<910> LL/token: -5.0217
[beta: 0.01178] 
<920> LL/token: -5.02139
[beta: 0.01178] 
<930> LL/token: -5.02156
[beta: 0.01176] 
<940> LL/token: -5.0214

0	0.00366	wd1535 wd4319 wd3749 wd622 wd109 wd2749 wd2036 wd1894 wd3899 wd904 wd1063 wd4845 wd1492 wd1962 wd4540 wd2065 wd4908 wd1217 wd2793 wd2687 
1	0.00349	wd4257 wd649 wd3714 wd2947 wd1760 wd2902 wd3489 wd4410 wd4854 wd3227 wd1573 wd3643 wd3725 wd4713 wd2650 wd3357 wd4162 wd3590 wd2676 wd792 
2	0.0036	wd3245 wd1961 wd2772 wd818 wd1192 wd1641 wd30 wd4946 wd592 wd3138 wd3497 wd3049 wd435 wd2132 wd120 wd3311 wd974 wd894 wd2511 wd674 
3	0.00347	wd3909 wd1371 wd2180 wd311 wd1862 wd1606 wd240 wd1373 wd1673 wd2978 wd1065 wd1813 wd1514 wd3108 wd1710 wd4163 wd2111 wd3620 wd329 wd4222 
4	0.00374	wd3522 wd4169 wd3607 wd2632 wd2795 wd472 wd1666 wd3037 wd4374 wd4421 wd2691 wd824 wd2504 wd3400 wd2822 wd3835 wd1432 wd980 wd4158 wd4423 
5	0.00331	wd2266 wd4288 wd931 wd515 wd2274 wd962 wd1228 wd4

[beta: 0.01174] 
<1000> LL/token: -5.0215

Total time: 4 minutes 38 seconds


In [30]:
#Recover and build beta matrix
beta = np.zeros((n_topics, vocab_size))
with open(modeldir.joinpath('word-topic-counts.txt').resolve().as_posix(), 'r', encoding='utf8') as fin:
    for line in fin.readlines():
        tokens = line.split()[1:]
        pos = int(tokens[0][2:])
        for el in tokens[1:]:
            tpc = int(el.split(':')[0])
            cnt = int(el.split(':')[1])
            beta[tpc,pos] = cnt

beta = normalize(beta,axis=1,norm='l1')

In [31]:
print('Tópicos (equivalentes) evaluados correctamente:', np.sum(np.max(np.sqrt(beta).dot(np.sqrt(topic_vectors.T)), axis=0)))

Tópicos (equivalentes) evaluados correctamente: 45.996142003556066


In [32]:
sim_mat_theoretical = np.sqrt(doc_topics_all[0][0:(n_docs-n_docs_inf)]).dot(np.sqrt(doc_topics_all[0][0:(n_docs-n_docs_inf)].T))

In [33]:
thetas = np.loadtxt(modeldir.joinpath('doc-topics.txt').resolve().as_posix(), delimiter='\t', dtype=np.float32)[:,2:][:(n_docs-n_docs_inf),:]
thetas[thetas<3e-3] = 0
thetas = normalize(thetas,axis=1,norm='l1')
print(thetas.shape)

(10000, 50)


In [34]:
sim_mat_actual = np.sqrt(thetas).dot(np.sqrt(thetas.T))

In [35]:
print('Difference in evaluation of doc similarity:', np.sum(np.abs(sim_mat_theoretical - sim_mat_actual))/(n_docs-n_docs_inf))

Difference in evaluation of doc similarity: 450.62922411649225


### 2.1.1. Inference on the centralized approach

In [36]:
# Inference corpus for centralized approach
my_corpus_inf = [doc for docs_node in documents_all for doc in docs_node[(n_docs-n_docs_inf):n_docs]]
print(len(my_corpus_inf))
corpusFile = modeldir.joinpath("corpus_inf.txt")
with open(corpusFile, 'w') as fout:
    [fout.write(str(idx) + ' 0 ' + ' '.join(doc) + '\n') for idx,doc in enumerate(my_corpus)]

50000


In [37]:
extractPipe(modeldir)
inference(modeldir)

-- Extracting pipeline
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus.mallet --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus_aux.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/import.pipe
-- Inference: Mallet Data Import
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/import.pipe --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus_inf.txt --output /export/usuarios_ml4ds/lbartolome/data/project_

In [38]:
thetas_inf = np.loadtxt(modeldir.joinpath('doc-topics-inf.txt').resolve().as_posix(), delimiter='\t', dtype=np.float32)[:,2:][:n_docs_inf,:]
thetas_inf[thetas_inf<3e-3] = 0
thetas_inf = normalize(thetas_inf,axis=1,norm='l1')
print(thetas_inf.shape)

(10000, 50)


In [39]:
sim_mat_theoretical_inf = np.sqrt(doc_topics_all[0][(n_docs-n_docs_inf):n_docs]).dot(np.sqrt(doc_topics_all[0][(n_docs-n_docs_inf):n_docs].T))
sim_mat_actual_inf = np.sqrt(thetas_inf).dot(np.sqrt(thetas_inf.T))
print('Difference in evaluation of doc similarity of inferred docs:', np.sum(np.abs(sim_mat_theoretical_inf - sim_mat_actual_inf))/n_docs_inf)

Difference in evaluation of doc similarity of inferred docs: 2064.6542869832606


In [44]:
# Inference corpus for centralized approach
my_corpus_inf2 = [doc for docs_node in documents_all2 for doc in docs_node]
print(len(my_corpus_inf2))
corpusFile = modeldir.joinpath("corpus_inf.txt")
with open(corpusFile, 'w') as fout:
    [fout.write(str(idx) + ' 0 ' + ' '.join(doc) + '\n') for idx,doc in enumerate(my_corpus)]

50000


In [45]:
extractPipe(modeldir)
inference(modeldir)

-- Extracting pipeline
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus.mallet --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus_aux.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/import.pipe
-- Inference: Mallet Data Import
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/import.pipe --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus_inf.txt --output /export/usuarios_ml4ds/lbartolome/data/project_

In [46]:
thetas_inf = np.loadtxt(modeldir.joinpath('doc-topics-inf.txt').resolve().as_posix(), delimiter='\t', dtype=np.float32)[:,2:][:n_docs_inf,:]
thetas_inf[thetas_inf<3e-3] = 0
thetas_inf = normalize(thetas_inf,axis=1,norm='l1')
print(thetas_inf.shape)

(10000, 50)


In [47]:
sim_mat_theoretical_inf = np.sqrt(doc_topics_all[0][(n_docs-n_docs_inf):n_docs]).dot(np.sqrt(doc_topics_all[0][(n_docs-n_docs_inf):n_docs].T))
sim_mat_actual_inf = np.sqrt(thetas_inf).dot(np.sqrt(thetas_inf.T))
print('Difference in evaluation of doc similarity of inferred docs:', np.sum(np.abs(sim_mat_theoretical_inf - sim_mat_actual_inf))/n_docs_inf)

Difference in evaluation of doc similarity of inferred docs: 2064.6542869832606


## 2.2. Just in one node approach

In [None]:
# Create model folder and save model training configuration
modelname = "mallet_node0"
modeldir = Path("/export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test").joinpath(modelname)
if modeldir.exists():

    # Remove current backup folder, if it exists
    old_model_dir = Path(str(modeldir) + '_old/')
    if old_model_dir.exists():
        shutil.rmtree(old_model_dir)

    # Copy current model folder to the backup folder.
    shutil.move(modeldir, old_model_dir)
    print(f'-- -- Creating backup of existing model in {old_model_dir}')

modeldir.mkdir()
configFile = modeldir.joinpath('trainconfig.json')

In [None]:
my_corpus = documents_all[0][0:(n_docs-n_docs_inf)]

In [None]:
corpusFile = modeldir.joinpath("corpus.txt")
with open(corpusFile, 'w') as fout:
    [fout.write(str(idx) + ' 0 ' + ' '.join(doc) + '\n') for idx,doc in enumerate(my_corpus)]

In [None]:
corpusMallet = modeldir.joinpath('corpus.mallet')

cmd = mallet_path + \
    ' import-file --preserve-case --keep-sequence ' + \
    '--remove-stopwords --token-regex "' + '[\p{L}\p{N}][\p{L}\p{N}\p{P}]*' + \
    '" --input %s --output %s'
cmd = cmd % (corpusFile, corpusMallet)

try:
    print(f'-- -- Running command {cmd}')
    check_output(args=cmd, shell=True)
except:
    print('-- -- Mallet failed to import data. Revise command')

In [None]:
with open(configFile, 'w', encoding='utf8') as fout:
    fout.write('input = ' + corpusMallet.resolve().as_posix() + '\n')
    fout.write('num-topics = ' + str(n_topics) + '\n')
    fout.write('alpha = 1\n')
    fout.write('optimize-interval = 10\n')
    fout.write('num-threads = 4\n')
    fout.write('num-iterations = 1000\n')
    fout.write('doc-topics-threshold = 0\n')
    fout.write('output-doc-topics = ' +
                modeldir.joinpath('doc-topics.txt').resolve().as_posix() + '\n')
    fout.write('word-topic-counts-file = ' +
               modeldir.joinpath('word-topic-counts.txt').resolve().as_posix() + '\n')
    fout.write('output-topic-keys = ' +
               modeldir.joinpath('topickeys.txt').resolve().as_posix() + '\n')
    fout.write('inferencer-filename = ' + 
               modeldir.joinpath('inferencer.mallet').resolve().as_posix() + '\n')
cmd = mallet_path + ' train-topics --config ' + configFile.resolve().as_posix()
print(cmd)
try:
    print(
        f'-- -- Training mallet topic model. Command is {cmd}')
    check_output(args=cmd, shell=True)
except:
    print('-- -- Model training failed. Revise command')

In [None]:
#Recover and build beta matrix
beta = np.zeros((n_topics, vocab_size))
with open(modeldir.joinpath('word-topic-counts.txt').resolve().as_posix(), 'r', encoding='utf8') as fin:
    for line in fin.readlines():
        tokens = line.split()[1:]
        pos = int(tokens[0][2:])
        for el in tokens[1:]:
            tpc = int(el.split(':')[0])
            cnt = int(el.split(':')[1])
            beta[tpc,pos] = cnt

beta = normalize(beta,axis=1,norm='l1')

In [None]:
print('Tópicos (equivalentes) evaluados correctamente:', np.sum(np.max(np.sqrt(beta).dot(np.sqrt(topic_vectors.T)), axis=0)))

In [None]:
sim_mat_theoretical = np.sqrt(doc_topics_all[0][0:(n_docs-n_docs_inf)]).dot(np.sqrt(doc_topics_all[0][0:(n_docs-n_docs_inf)].T))

In [None]:
thetas = np.loadtxt(modeldir.joinpath('doc-topics.txt').resolve().as_posix(), delimiter='\t', dtype=np.float32)[:,2:][:n_docs,:]
thetas[thetas<3e-3] = 0
thetas = normalize(thetas,axis=1,norm='l1')

In [None]:
sim_mat_actual = np.sqrt(thetas).dot(np.sqrt(thetas.T))

In [None]:
print('Difference in evaluation of doc similarity:', np.sum(np.abs(sim_mat_theoretical - sim_mat_actual))/n_docs)

### 2.2.1. Inference on one node approach

In [None]:
# Inference corpus for centralized approach
my_corpus_inf = documents_all[0][(n_docs-n_docs_inf):n_docs]
corpusFile = modeldir.joinpath("corpus_inf.txt")
with open(corpusFile, 'w') as fout:
    [fout.write(str(idx) + ' 0 ' + ' '.join(doc) + '\n') for idx,doc in enumerate(my_corpus)]

In [None]:
extractPipe(modeldir)
inference(modeldir)

In [None]:
thetas_inf = np.loadtxt(modeldir.joinpath('doc-topics-inf.txt').resolve().as_posix(), delimiter='\t', dtype=np.float32)[:,2:][:n_docs_inf,:]
thetas_inf[thetas_inf<3e-3] = 0
thetas_inf = normalize(thetas_inf,axis=1,norm='l1')
print(thetas_inf.shape)

In [None]:
sim_mat_theoretical_inf = np.sqrt(doc_topics_all[0][(n_docs-n_docs_inf):n_docs]).dot(np.sqrt(doc_topics_all[0][(n_docs-n_docs_inf):n_docs].T))
sim_mat_actual_inf = np.sqrt(thetas_inf).dot(np.sqrt(thetas_inf.T))
print('Difference in evaluation of doc similarity of inferred docs:', np.sum(np.abs(sim_mat_theoretical_inf - sim_mat_actual_inf))/n_docs_inf)

# 3. ProdLDA

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse

In [None]:
cd /export/usuarios_ml4ds/lbartolome/topicmodeler

In [None]:
from src.topicmodeling.neural_models.pytorchavitm.datasets.bow_dataset import BOWDataset
from src.topicmodeling.neural_models.pytorchavitm.avitm_network.avitm import AVITM

In [None]:
def convert_topic_word_to_init_size(vocab_size, model, model_type,
                                    ntopics, id2token, all_words):
    """It converts the topic-word distribution matrix obtained from the training of a model into a matrix with the dimensions of the original topic-word distribution, assigning zeros to those words that are not present in the corpus. 
    It is only of use in case we are training a model over a synthetic dataset, so as to later compare the performance of the attained model in what regards to the similarity between the original and the trained model.

    Args:
        * vocab_size (int):       Size of the synethic'data vocabulary.
        * model (AVITM/CTM):      Model whose topic-word matrix is being transformed.
        * model_type (str):       Type of the trained model (e.g. AVITM)
        * ntopics (int):          Number of topics of the trained model.
        * id2token (List[tuple]): Mappings with the content of the document-term matrix.
        * all_words (List[str]):  List of all the words of the vocabulary of size vocab_size.

    Returns:
        * ndarray: Normalized transormed topic-word distribution.
    """
    if model_type == "avitm":
        w_t_distrib = np.zeros((ntopics, vocab_size), dtype=np.float64)
        wd = model.get_topic_word_distribution()
        for i in np.arange(ntopics):
            for idx, word in id2token.items():
                for j in np.arange(len(all_words)):
                    if all_words[j] == word:
                        w_t_distrib[i, j] = wd[i][idx]
                        break
        normalized_array = normalize(w_t_distrib,axis=1,norm='l1')
        return normalized_array
    else:
        print("Method not impleemnted for the selected model type")
        return None

## 3.1. Centralized approach

In [None]:
# Create model folder and save model training configuration
modelname = "prod_centralized"
modeldir = Path("/export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test").joinpath(modelname)
if modeldir.exists():

    # Remove current backup folder, if it exists
    old_model_dir = Path(str(modeldir) + '_old/')
    if old_model_dir.exists():
        shutil.rmtree(old_model_dir)

    # Copy current model folder to the backup folder.
    shutil.move(modeldir, old_model_dir)
    print(f'-- -- Creating backup of existing model in {old_model_dir}')

In [None]:
# Corpus for centralized approach
my_corpus = [doc for docs_node in documents_all for doc in docs_node[0:(n_docs-n_docs_inf)]]
print(len(my_corpus))

In [None]:
cv = CountVectorizer(input='content', lowercase=True, stop_words='english', binary=False)
docs = [" ".join(my_corpus[i]) for i in np.arange(len(my_corpus))]
train_bow = cv.fit_transform(docs).toarray()
idx2token = cv.get_feature_names()
train_dataset = BOWDataset(train_bow, idx2token)
input_size = len(idx2token)
id2token = {k: v for k, v in zip(range(0, len(idx2token)), idx2token)}

In [None]:
avitm = AVITM(logger=None,
              input_size=input_size,
              n_components=n_topics,
              model_type="prodLDA",
              hidden_sizes=(100, 100),
              activation='softplus',
              dropout=0.2,
              learn_priors=True,
              batch_size=64,
              lr=2e-3,
              momentum=0.99,
              solver='adam',
              num_epochs=100,
              reduce_on_plateau=False,
              topic_prior_mean=0.0,
              topic_prior_variance=None,
              num_samples=10
            
              num_data_loader_workers=0,
              verbose=True)
avitm.fit(train_dataset)

In [None]:
thetas = np.asarray(avitm.get_doc_topic_distribution(avitm.train_data))[0:(n_docs-n_docs_inf),:] 
thetas[thetas < 3e-3] = 0
thetas = normalize(thetas, axis=1, norm='l1')
thetas = sparse.csr_matrix(thetas, copy=True)


betas = avitm.get_topic_word_distribution()
all_words = ['wd'+str(word) for word in np.arange(vocab_size+1) if word > 0]
betas = convert_topic_word_to_init_size(vocab_size=vocab_size,
                                        model=avitm,
                                        model_type="avitm",
                                        ntopics=n_topics,
                                        id2token=id2token,
                                        all_words=all_words)
print(thetas.shape)

In [None]:
print('Tópicos (equivalentes) evaluados correctamente:', np.sum(np.max(np.sqrt(betas).dot(np.sqrt(topic_vectors.T)), axis=0)))

In [None]:
sim_mat_theoretical = np.sqrt(doc_topics_all[0][0:(n_docs-n_docs_inf)]).dot(np.sqrt(doc_topics_all[0][0:(n_docs-n_docs_inf)].T))
sim_mat_actual = np.sqrt(thetas).dot(np.sqrt(thetas.T))
print('Difference in evaluation of doc similarity:', np.sum(np.abs(sim_mat_theoretical - sim_mat_actual))/n_docs)

### 3.1.1. Inference on the centralized approach

In [None]:
my_corpus_inf = [doc for docs_node in documents_all for doc in docs_node[(n_docs-n_docs_inf):n_docs]]

docs_val_conv = [" ".join(my_corpus_inf[i]) for i in np.arange(len(my_corpus_inf))]
val_bow = cv.transform(docs_val_conv)
val_bow = val_bow.toarray()
val_data = BOWDataset(val_bow, idx2token)

In [None]:
thetas_inf = np.asarray(avitm.get_doc_topic_distribution(val_data))[(n_docs-n_docs_inf):n_docs,:] 
thetas_inf[thetas_inf < 3e-3] = 0
thetas_inf = normalize(thetas_inf, axis=1, norm='l1')
thetas_inf = sparse.csr_matrix(thetas_inf, copy=True)

In [None]:
sim_mat_theoretical = np.sqrt(doc_topics_all[0][(n_docs-n_docs_inf):n_docs]).dot(np.sqrt(doc_topics_all[0][(n_docs-n_docs_inf):n_docs].T))
sim_mat_actual = np.sqrt(thetas_inf).dot(np.sqrt(thetas_inf.T))
print('Difference in evaluation of doc similarity:', np.sum(np.abs(sim_mat_theoretical - sim_mat_actual))/n_docs_inf)

## 3.2. Just in one node approach

In [None]:
# Create model folder and save model training configuration
modelname = "prod_node0"
modeldir = Path("/export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test").joinpath(modelname)
if modeldir.exists():

    # Remove current backup folder, if it exists
    old_model_dir = Path(str(modeldir) + '_old/')
    if old_model_dir.exists():
        shutil.rmtree(old_model_dir)

    # Copy current model folder to the backup folder.
    shutil.move(modeldir, old_model_dir)
    print(f'-- -- Creating backup of existing model in {old_model_dir}')

In [None]:
my_corpus = documents_all[0][0:(n_docs-n_docs_inf)]

In [None]:
cv = CountVectorizer(input='content', lowercase=True, stop_words='english', binary=False)
docs = [" ".join(my_corpus[i]) for i in np.arange(len(my_corpus))]
train_bow = cv.fit_transform(docs).toarray()
idx2token = cv.get_feature_names()
train_dataset = BOWDataset(train_bow, idx2token)
input_size = len(idx2token)
id2token = {k: v for k, v in zip(range(0, len(idx2token)), idx2token)}

In [None]:
avitm = AVITM(logger=None,
              input_size=input_size,
              n_components=n_topics,
              model_type="prodLDA",
              hidden_sizes=(100, 100),
              activation='softplus',
              dropout=0.2,
              learn_priors=True,
              batch_size=64,
              lr=2e-3,
              momentum=0.99,
              solver='adam',
              num_epochs=100,
              reduce_on_plateau=False,
              topic_prior_mean=0.0,
              topic_prior_variance=None,
              num_samples=10,
              num_data_loader_workers=0,
              verbose=True)
avitm.fit(train_dataset)

In [None]:
thetas = np.asarray(avitm.get_doc_topic_distribution(avitm.train_data)) 
thetas[thetas < 3e-3] = 0
thetas = normalize(thetas, axis=1, norm='l1')
thetas = sparse.csr_matrix(thetas, copy=True)

betas = avitm.get_topic_word_distribution()
all_words = ['wd'+str(word) for word in np.arange(vocab_size+1) if word > 0]
betas = convert_topic_word_to_init_size(vocab_size=vocab_size,
                                        model=avitm,
                                        model_type="avitm",
                                        ntopics=n_topics,
                                        id2token=id2token,
                                        all_words=all_words)

In [None]:
print('Tópicos (equivalentes) evaluados correctamente:', np.sum(np.max(np.sqrt(betas).dot(np.sqrt(topic_vectors.T)), axis=0)))

In [None]:
sim_mat_theoretical = np.sqrt(doc_topics_all[0][0:(n_docs-n_docs_inf)]).dot(np.sqrt(doc_topics_all[0][0:(n_docs-n_docs_inf)].T))
print('Difference in evaluation of doc similarity:', np.sum(np.abs(sim_mat_theoretical - sim_mat_actual))/n_docs)

### 3.2.1. Inference on node approach

In [None]:
my_corpus_inf = documents_all[0][(n_docs-n_docs_inf):n_docs]
docs_val_conv = [" ".join(my_corpus_inf[i]) for i in np.arange(len(my_corpus_inf))]
val_bow = cv.transform(docs_val_conv)
val_bow = val_bow.toarray()
val_data = BOWDataset(val_bow, idx2token)

In [None]:
thetas_inf = np.asarray(avitm.get_doc_topic_distribution(val_data))
thetas_inf[thetas_inf < 3e-3] = 0
thetas_inf = normalize(thetas_inf, axis=1, norm='l1')
thetas_inf = sparse.csr_matrix(thetas_inf, copy=True)

In [None]:
print(thetas_inf.shape)

In [None]:
sim_mat_theoretical_inf = np.sqrt(doc_topics_all[0][(n_docs-n_docs_inf):n_docs]).dot(np.sqrt(doc_topics_all[0][(n_docs-n_docs_inf):n_docs].T))
sim_mat_actual_inf = np.sqrt(thetas_inf).dot(np.sqrt(thetas_inf.T))
print('Difference in evaluation of doc similarity of inferred docs:', np.sum(np.abs(sim_mat_theoretical_inf - sim_mat_actual_inf))/n_docs_inf)