In [1]:
import numpy as np
import pandas as pd
import zipfile as zp
from pathlib import Path
from gensim.utils import check_output
from sklearn.preprocessing import normalize
import shutil
from subprocess import check_output



In [2]:
# Location of the Mallet launcher script; the import-file and train-topics
# commands in the cells below are built by prefixing this path.
# NOTE(review): machine-specific absolute path -- adjust it before running
# the notebook on another host.
mallet_path = '/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet'

# 1. Creation of synthetic corpus
We consider a scenario with n parties, each of which has an associated corpus.
To generate the corpus associated with each party, we use a common beta distribution (word-topic distribution), but we freeze different topics / assign different asymmetric Dirichlet priors (each favoring different topics) when generating the documents that make up each party's corpus.

In [3]:
def rotateArray(arr, n, d):
    """Rotate the first ``n`` elements of a list ``d`` positions to the left, in place.

    The element at index ``d`` becomes the first one and the leading ``d``
    elements are moved to the end. As in the original implementation, any
    element beyond the first ``n`` is dropped, so the list ends up with
    ``n`` elements.

    Args:
        arr (list): List to rotate. It is modified in place.
        n (int): Number of leading elements that take part in the rotation
            (normally ``len(arr)``). Values larger than ``len(arr)`` are
            clamped to ``len(arr)``.
        d (int): Number of positions to rotate. It is reduced modulo ``n``,
            so values larger than ``n`` wrap around instead of raising
            ``IndexError``.

    Returns:
        list: The same ``arr`` object, after rotation.
    """
    # Never look past the end of the list
    n = max(0, min(n, len(arr)))
    if n == 0:
        # Nothing to rotate: an empty window leaves an empty list
        arr[:] = []
        return arr

    shift = d % n
    # Slice assignment keeps the caller's list object (in-place semantics)
    arr[:] = arr[shift:n] + arr[:shift]
    return arr

In [4]:
# Topic modeling settings
vocab_size = 5000    # length of every topic's word distribution
n_topics = 50        # number of synthetic topics
beta = 1e-2          # symmetric Dirichlet concentration used to draw the topic-word distributions
alpha = 1/n_topics   # base Dirichlet concentration used for the document-topic priors
n_docs = 1000        # documents generated for each node
nwords = (150, 250)  # Document length range passed to np.random.randint: min (inclusive), max (exclusive)

# Seed NumPy's global generator so that the synthetic topics and documents
# are the same after every "Restart & Run All".
random_seed = 42
np.random.seed(random_seed)

In [5]:
# Nodes settings
n_nodes = 5          # number of parties
frozen_topics = 5    # leading topics that keep the base prior `alpha` for every node

# The remaining topics are split evenly: each node gets `own_topics` of them
own_topics = int((n_topics - frozen_topics) / n_nodes)

# Prior for the frozen topics
prior_frozen = [alpha] * frozen_topics

# Prior for the other topics: `alpha` for the node's own topics and a
# 10000 times smaller value for the rest
prior_nofrozen = [alpha] * own_topics + \
    [alpha / 10000] * (n_topics - frozen_topics - own_topics)

In [6]:
# Step 1 - generation of topics
# One symmetric Dirichlet draw per topic: row k holds topic k's distribution
# over the vocabulary
topic_prior = vocab_size * [beta]
topic_vectors = np.random.dirichlet(topic_prior, size=n_topics)

# Show the word probabilities of the first topic, largest first
first_topic_sorted = np.sort(topic_vectors[0])[::-1]
print('Probabilidades ordenadas para el primer vector de tópicos:')
print(first_topic_sorted)
print(topic_vectors.shape)

Probabilidades ordenadas para el primer vector de tópicos:
[0.04534345 0.03386211 0.032944   ... 0.         0.         0.        ]
(50, 5000)


In [7]:
# Sanity check of the alignment score: compare the topic matrix with itself
# (score should equal the number of topics) and with an independent random
# topic matrix (score should be much lower)
sqrt_true_topics_T = np.sqrt(topic_vectors.T)

score_true = np.sum(np.max(np.sqrt(topic_vectors).dot(sqrt_true_topics_T), axis=0))
print('Tópicos (equivalentes) identificados correctamente (true):', score_true)

topic_vectors2 = np.random.dirichlet(vocab_size*[beta], n_topics)
score_random = np.sum(np.max(np.sqrt(topic_vectors2).dot(sqrt_true_topics_T), axis=0))
print('Tópicos (equivalentes) identificados correctamente (random):', score_random)

Tópicos (equivalentes) identificados correctamente (true): 50.00000000000004
Tópicos (equivalentes) identificados correctamente (random): 3.5155221510147308


In [8]:
# Step 2 - generation of document topic proportions
# One (n_docs x n_topics) matrix per node. After each node the non-frozen
# prior is rotated by `own_topics` positions, so the next node favors a
# different block of topics. rotateArray modifies prior_nofrozen in place.
doc_topics_all = []
for i in range(n_nodes):
    node_prior = prior_frozen + prior_nofrozen
    doc_topics = np.random.dirichlet(node_prior, size=n_docs)
    doc_topics_all.append(doc_topics)
    prior_nofrozen = rotateArray(prior_nofrozen, len(prior_nofrozen), own_topics)

In [9]:
# Step 3 - Document generation
# For every node and document: draw a length, then for each word position
# draw a topic from the document's topic proportions and a word from that
# topic's word distribution. Words are stored as the strings 'wd<index>'.
documents_all = []
z_all = []  # per-word topic assignments are not stored; the list stays empty

for i in range(n_nodes):
    node_thetas = doc_topics_all[i]
    documents = []  # Document words
    for docid in range(n_docs):
        doc_len = np.random.randint(low=nwords[0], high=nwords[1])
        this_doc_words = []
        for wd_idx in range(doc_len):
            # multinomial(1, p) is one-hot, so argmax is the index drawn
            tpc = np.argmax(np.random.multinomial(1, node_thetas[docid]))
            word = np.argmax(np.random.multinomial(1, topic_vectors[tpc]))
            this_doc_words.append(f'wd{word}')
        documents.append(this_doc_words)
    documents_all.append(documents)

# 2. Mallet

## 2.1. Centralized approach

In [10]:
# Create model folder and save model training configuration
# An already existing model folder is moved to '<modeldir>_old' (replacing
# any previous backup) before a fresh, empty folder is created.
modelname = "mallet_centralized"
models_root = Path("/export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test")
modeldir = models_root / modelname

if modeldir.exists():
    old_model_dir = Path(f'{modeldir}_old/')

    # Drop the previous backup, if any
    if old_model_dir.exists():
        shutil.rmtree(old_model_dir)

    # Turn the current model folder into the new backup
    shutil.move(modeldir, old_model_dir)
    print(f'-- -- Creating backup of existing model in {old_model_dir}')

modeldir.mkdir()
configFile = modeldir / 'trainconfig.json'

-- -- Creating backup of existing model in /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized_old


In [11]:
# Corpus for centralized approach: the documents of all nodes, concatenated
# in node order (node 0 first)
my_corpus = []
for docs_node in documents_all:
    my_corpus.extend(docs_node)

In [12]:
# Write the corpus as one line per document: '<doc index> 0 <word> <word> ...'
corpusFile = modeldir.joinpath("corpus.txt")
# Explicit encoding, consistent with how the training config file is written
with open(corpusFile, 'w', encoding='utf8') as fout:
    # Plain loop: the writes are side effects, so no throwaway list is built
    for idx, doc in enumerate(my_corpus):
        fout.write(str(idx) + ' 0 ' + ' '.join(doc) + '\n')

In [13]:
# Convert the plain-text corpus into Mallet's binary format
corpusMallet = modeldir.joinpath('corpus.mallet')

# Raw string: '\p' is not a valid Python escape sequence, so a normal string
# literal triggers an invalid-escape warning. The resulting text is unchanged.
token_regex = r'[\p{L}\p{N}][\p{L}\p{N}\p{P}]*'

cmd = mallet_path + \
    ' import-file --preserve-case --keep-sequence ' + \
    '--remove-stopwords --token-regex "' + token_regex + \
    '" --input %s --output %s'
cmd = cmd % (corpusFile, corpusMallet)

try:
    print(f'-- -- Running command {cmd}')
    check_output(args=cmd, shell=True)
except Exception as err:
    # Best effort: report the failure and keep the notebook running, but do
    # not swallow KeyboardInterrupt/SystemExit and do show the cause.
    print('-- -- Mallet failed to import data. Revise command')
    print(f'-- -- Reason: {err}')

-- -- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "[\p{L}\p{N}][\p{L}\p{N}\p{P}]*" --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus.mallet


In [14]:
# Write the Mallet training configuration ('key = value' lines) and train.
# NOTE(review): with 'alpha = 1' the training log reports 0.02 per topic for
# 50 topics, so Mallet appears to spread this value over the topics -- confirm
# against the Mallet documentation.
config_lines = [
    'input = ' + corpusMallet.resolve().as_posix(),
    'num-topics = ' + str(n_topics),
    'alpha = 1',
    'optimize-interval = 10',
    'num-threads = 4',
    'num-iterations = 1000',
    'doc-topics-threshold = 0',
    'output-doc-topics = ' +
    modeldir.joinpath('doc-topics.txt').resolve().as_posix(),
    'word-topic-counts-file = ' +
    modeldir.joinpath('word-topic-counts.txt').resolve().as_posix(),
    'output-topic-keys = ' +
    modeldir.joinpath('topickeys.txt').resolve().as_posix(),
]
with open(configFile, 'w', encoding='utf8') as fout:
    fout.write('\n'.join(config_lines) + '\n')

cmd = mallet_path + ' train-topics --config ' + configFile.resolve().as_posix()
print(cmd)
try:
    print(
        f'-- -- Training mallet topic model. Command is {cmd}')
    check_output(args=cmd, shell=True)
except Exception as err:
    # Best effort: report the failure and keep the notebook running, but do
    # not swallow KeyboardInterrupt/SystemExit and do show the cause.
    print('-- -- Model training failed. Revise command')
    print(f'-- -- Reason: {err}')

/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet train-topics --config /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/trainconfig.json
-- -- Training mallet topic model. Command is /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet train-topics --config /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/trainconfig.json


Mallet LDA: 50 topics, 6 topic bits, 111111 topic mask
Data loaded.
max tokens: 249
total tokens: 997836
<10> LL/token: -5.70223
<20> LL/token: -5.30609
<30> LL/token: -5.27211
<40> LL/token: -5.26593

0	0.02	wd2258 wd957 wd3794 wd4590 wd2552 wd3810 wd1448 wd1323 wd624 wd3584 wd1393 wd4968 wd1724 wd3442 wd2153 wd4532 wd2189 wd1447 wd4890 wd88 
1	0.02	wd1874 wd734 wd2104 wd3088 wd4620 wd4918 wd584 wd1880 wd1200 wd2040 wd2013 wd4035 wd3787 wd1140 wd4384 wd1742 wd4356 wd2691 wd4028 wd4908 
2	0.02	wd1522 wd4108 wd187 wd1575 wd253 wd3 wd2286 wd3814 wd3621 wd4723 wd2956 wd1750 wd3971 wd2668 wd4488 wd3545 wd3020 wd931 wd2336 wd4344 
3	0.02	wd4046 wd4631 wd1498 wd2543 wd3286 wd2218 wd1973 wd1242 wd577 wd1352 wd2398 wd3768 wd4850 wd2784 wd3202 wd781 wd2701 wd4342 wd2665 wd4457 
4	0.02	wd2258 wd957 wd3794 wd4590 wd2552 wd3810 wd1448 wd1323 wd624 wd3584 wd4968 wd4532 wd1393 wd1724 wd4890 wd3442 wd2189 wd1447 wd88 wd2153 
5	0.02	wd4891 wd1198 wd3380 wd947 wd4901 wd3130 wd3480 wd4217 wd1491 wd4104 

<110> LL/token: -5.2226
<120> LL/token: -5.22147
<130> LL/token: -5.2201
<140> LL/token: -5.21688

0	0.02	wd2258 wd957 wd3794 wd4590 wd3810 wd2552 wd1448 wd1323 wd3584 wd4968 wd624 wd2189 wd4890 wd1724 wd4532 wd2153 wd3442 wd1393 wd1447 wd3612 
1	0.02	wd1874 wd734 wd2104 wd3088 wd4620 wd4918 wd584 wd1880 wd1200 wd2040 wd2013 wd4035 wd3787 wd1140 wd4356 wd4384 wd1742 wd2691 wd4908 wd4028 
2	0.02	wd1522 wd4108 wd187 wd1575 wd253 wd3 wd2286 wd4723 wd3814 wd3971 wd2956 wd3621 wd1750 wd4488 wd2668 wd4344 wd2336 wd4445 wd2884 wd931 
3	0.02	wd4046 wd4631 wd1498 wd2543 wd3286 wd2218 wd1973 wd1242 wd577 wd1352 wd2398 wd3768 wd4850 wd2784 wd3202 wd781 wd2701 wd4342 wd2665 wd4457 
4	0.02	wd2258 wd957 wd3794 wd4590 wd2552 wd3810 wd1448 wd1323 wd624 wd3584 wd4968 wd4532 wd1393 wd1724 wd3442 wd2189 wd4890 wd1447 wd88 wd4011 
5	0.02	wd4891 wd1198 wd3380 wd947 wd4901 wd3130 wd3480 wd1491 wd4217 wd4104 wd4153 wd3533 wd4798 wd715 wd1044 wd4779 wd3426 wd714 wd1589 wd4069 
6	0.02	wd757 wd2724 wd4245 wd294

[beta: 0.01133] 
<210> LL/token: -5.2068
[beta: 0.01192] 
<220> LL/token: -5.20175
[beta: 0.01219] 
<230> LL/token: -5.19828
[beta: 0.01229] 
<240> LL/token: -5.1958

0	0.00277	wd3794 wd2258 wd4590 wd957 wd3810 wd3584 wd2552 wd4968 wd1448 wd4890 wd4532 wd624 wd1323 wd3744 wd2153 wd4879 wd1724 wd3032 wd4441 wd893 
1	0.00418	wd1874 wd734 wd2104 wd3088 wd4620 wd4918 wd584 wd1880 wd1200 wd2040 wd2013 wd4035 wd3787 wd1140 wd4356 wd4384 wd1742 wd2691 wd4028 wd4908 
2	0.00423	wd1522 wd4108 wd187 wd1575 wd253 wd3 wd2286 wd4723 wd3971 wd3814 wd2956 wd3621 wd1750 wd4488 wd2668 wd4344 wd2336 wd4445 wd2884 wd931 
3	0.00435	wd4046 wd4631 wd1498 wd2543 wd3286 wd2218 wd1973 wd1242 wd577 wd1352 wd2398 wd3768 wd4850 wd2784 wd3202 wd781 wd2701 wd4342 wd2665 wd4457 
4	0.01444	wd2258 wd957 wd3794 wd4590 wd2552 wd3810 wd1448 wd1323 wd624 wd3584 wd4968 wd1393 wd4532 wd1724 wd3442 wd2189 wd4890 wd1447 wd88 wd4011 
5	0.00969	wd4891 wd1198 wd3380 wd947 wd4901 wd3480 wd3130 wd4217 wd1491 wd4104 wd4153 wd4798 wd

[beta: 0.01256] 
<310> LL/token: -5.18869
[beta: 0.01256] 
<320> LL/token: -5.18694
[beta: 0.01257] 
<330> LL/token: -5.1864
[beta: 0.01257] 
<340> LL/token: -5.18632

0	0	
1	0.00402	wd1874 wd734 wd2104 wd3088 wd4620 wd4918 wd584 wd1880 wd1200 wd2040 wd2013 wd4035 wd3787 wd1140 wd4384 wd4356 wd1742 wd2691 wd4028 wd4908 
2	0.00376	wd1522 wd4108 wd187 wd1575 wd253 wd3 wd2286 wd4723 wd3971 wd3814 wd2956 wd3621 wd1750 wd2668 wd4488 wd4344 wd2336 wd4445 wd2884 wd931 
3	0.00387	wd4046 wd4631 wd1498 wd2543 wd3286 wd2218 wd1973 wd1242 wd577 wd1352 wd2398 wd3768 wd4850 wd2784 wd3202 wd781 wd2701 wd4342 wd2665 wd4457 
4	0.01481	wd2258 wd957 wd3794 wd4590 wd2552 wd3810 wd1448 wd1323 wd624 wd3584 wd4968 wd1393 wd1724 wd4532 wd3442 wd2189 wd4890 wd1447 wd88 wd2153 
5	0.00836	wd4891 wd1198 wd3380 wd947 wd4901 wd3130 wd3480 wd4217 wd4104 wd1491 wd4153 wd3533 wd4798 wd1044 wd4779 wd3426 wd715 wd714 wd4069 wd4728 
6	0.00799	wd757 wd2724 wd4245 wd294 wd1794 wd3835 wd1485 wd4292 wd3842 wd4447 wd940 wd445

[beta: 0.01262] 
<410> LL/token: -5.18576
[beta: 0.01266] 
<420> LL/token: -5.18569
[beta: 0.01258] 
<430> LL/token: -5.18594
[beta: 0.01259] 
<440> LL/token: -5.18598

0	0	
1	0.00395	wd1874 wd734 wd2104 wd3088 wd4620 wd4918 wd584 wd1880 wd1200 wd2040 wd2013 wd4035 wd3787 wd1140 wd4384 wd4356 wd1742 wd2691 wd4028 wd4908 
2	0.00362	wd1522 wd4108 wd187 wd1575 wd253 wd3 wd2286 wd4723 wd3971 wd3814 wd2956 wd3621 wd1750 wd2668 wd4488 wd4344 wd2336 wd4445 wd2884 wd931 
3	0.00401	wd4046 wd4631 wd1498 wd2543 wd3286 wd2218 wd1973 wd1242 wd577 wd1352 wd2398 wd3768 wd4850 wd2784 wd3202 wd781 wd2701 wd4342 wd2665 wd4457 
4	0.01454	wd2258 wd957 wd3794 wd4590 wd2552 wd3810 wd1448 wd1323 wd624 wd3584 wd4968 wd1724 wd1393 wd4532 wd3442 wd4890 wd2189 wd1447 wd88 wd2153 
5	0.00832	wd4891 wd1198 wd3380 wd947 wd4901 wd3130 wd3480 wd4217 wd4104 wd1491 wd4153 wd3533 wd4798 wd3426 wd1044 wd715 wd4779 wd714 wd4728 wd4069 
6	0.00814	wd757 wd2724 wd4245 wd294 wd1794 wd3835 wd1485 wd4292 wd3842 wd4447 wd940 wd44

[beta: 0.0126] 
<510> LL/token: -5.18485
[beta: 0.01256] 
<520> LL/token: -5.18523
[beta: 0.01259] 
<530> LL/token: -5.18454
[beta: 0.01264] 
<540> LL/token: -5.18415

0	0	
1	0.00395	wd1874 wd734 wd2104 wd3088 wd4620 wd4918 wd584 wd1880 wd1200 wd2040 wd2013 wd4035 wd3787 wd1140 wd4356 wd4384 wd1742 wd4028 wd2691 wd4908 
2	0.0036	wd1522 wd4108 wd187 wd1575 wd253 wd3 wd2286 wd4723 wd3814 wd3971 wd2956 wd3621 wd1750 wd2668 wd4488 wd2336 wd4344 wd4445 wd2884 wd931 
3	0.00386	wd4046 wd4631 wd1498 wd2543 wd3286 wd2218 wd1973 wd1242 wd577 wd1352 wd2398 wd3768 wd4850 wd2784 wd3202 wd781 wd2701 wd4342 wd2665 wd4457 
4	0.01568	wd2258 wd957 wd3794 wd4590 wd2552 wd3810 wd1448 wd1323 wd624 wd3584 wd1393 wd4968 wd1724 wd4532 wd2189 wd3442 wd4890 wd1447 wd88 wd2153 
5	0.00806	wd4891 wd1198 wd3380 wd947 wd4901 wd3130 wd3480 wd4217 wd1491 wd4153 wd4104 wd4798 wd3533 wd3426 wd1044 wd4779 wd715 wd714 wd4069 wd4728 
6	0.00795	wd757 wd2724 wd4245 wd294 wd1794 wd3835 wd1485 wd4292 wd3842 wd4447 wd940 wd4457

[beta: 0.01257] 
<610> LL/token: -5.18579
[beta: 0.01257] 
<620> LL/token: -5.18614
[beta: 0.01254] 
<630> LL/token: -5.18546
[beta: 0.01257] 
<640> LL/token: -5.18512

0	0	
1	0.00404	wd1874 wd734 wd2104 wd3088 wd4620 wd4918 wd584 wd1880 wd1200 wd2040 wd2013 wd4035 wd3787 wd1140 wd4384 wd4356 wd1742 wd2691 wd4028 wd4908 
2	0.00382	wd1522 wd4108 wd187 wd1575 wd253 wd3 wd2286 wd4723 wd3971 wd3814 wd2956 wd3621 wd1750 wd4488 wd2668 wd4344 wd2336 wd4445 wd2884 wd931 
3	0.00399	wd4046 wd4631 wd1498 wd2543 wd3286 wd2218 wd1242 wd1973 wd577 wd1352 wd2398 wd3768 wd4850 wd2784 wd3202 wd781 wd2701 wd4342 wd2665 wd4457 
4	0.01575	wd2258 wd957 wd3794 wd4590 wd2552 wd3810 wd1448 wd1323 wd624 wd3584 wd1393 wd1724 wd4968 wd4532 wd2189 wd3442 wd4890 wd1447 wd88 wd2153 
5	0.00797	wd4891 wd1198 wd3380 wd947 wd4901 wd3130 wd3480 wd4217 wd1491 wd4153 wd4104 wd4798 wd3533 wd3426 wd1044 wd4779 wd715 wd714 wd4069 wd4728 
6	0.00817	wd757 wd2724 wd4245 wd294 wd1794 wd3835 wd1485 wd4292 wd3842 wd940 wd4447 wd44

[beta: 0.01254] 
<710> LL/token: -5.18377
[beta: 0.01257] 
<720> LL/token: -5.18394
[beta: 0.01259] 
<730> LL/token: -5.18356
[beta: 0.01255] 
<740> LL/token: -5.18431

0	0	
1	0.00394	wd1874 wd734 wd2104 wd3088 wd4620 wd4918 wd584 wd1880 wd1200 wd2040 wd2013 wd4035 wd3787 wd1140 wd4384 wd4356 wd1742 wd4028 wd2691 wd4908 
2	0.00379	wd1522 wd4108 wd187 wd1575 wd253 wd3 wd2286 wd4723 wd3971 wd3814 wd2956 wd3621 wd1750 wd4488 wd2668 wd4344 wd2336 wd4445 wd2884 wd931 
3	0.00409	wd4046 wd4631 wd1498 wd2543 wd3286 wd2218 wd1973 wd1242 wd577 wd1352 wd2398 wd3768 wd4850 wd2784 wd3202 wd781 wd2701 wd4342 wd2665 wd4457 
4	0.01594	wd2258 wd957 wd3794 wd4590 wd2552 wd3810 wd1448 wd1323 wd624 wd3584 wd1393 wd4968 wd1724 wd4532 wd3442 wd4890 wd2189 wd1447 wd88 wd2153 
5	0.00791	wd4891 wd1198 wd3380 wd947 wd4901 wd3130 wd3480 wd4217 wd4153 wd4104 wd1491 wd4798 wd3533 wd1044 wd3426 wd714 wd4779 wd715 wd4728 wd4069 
6	0.00789	wd757 wd2724 wd4245 wd294 wd1794 wd3835 wd1485 wd4292 wd3842 wd4447 wd940 wd44

[beta: 0.01256] 
<810> LL/token: -5.18443
[beta: 0.01257] 
<820> LL/token: -5.18417
[beta: 0.01258] 
<830> LL/token: -5.18471
[beta: 0.01262] 
<840> LL/token: -5.18367

0	0	
1	0.00401	wd1874 wd734 wd2104 wd3088 wd4620 wd4918 wd584 wd1880 wd1200 wd2040 wd2013 wd4035 wd3787 wd1140 wd4384 wd4356 wd1742 wd2691 wd4028 wd4908 
2	0.00359	wd1522 wd4108 wd187 wd1575 wd253 wd3 wd2286 wd4723 wd3814 wd3971 wd2956 wd3621 wd1750 wd4488 wd2668 wd4344 wd2336 wd4445 wd2884 wd931 
3	0.00384	wd4046 wd4631 wd1498 wd2543 wd3286 wd2218 wd1242 wd1973 wd577 wd1352 wd2398 wd3768 wd4850 wd2784 wd3202 wd781 wd2701 wd4342 wd2665 wd4457 
4	0.0167	wd2258 wd957 wd3794 wd4590 wd2552 wd3810 wd1448 wd1323 wd624 wd3584 wd1393 wd4968 wd1724 wd4532 wd3442 wd4890 wd2189 wd1447 wd88 wd2153 
5	0.0082	wd4891 wd1198 wd3380 wd947 wd4901 wd3130 wd3480 wd4217 wd4153 wd4104 wd1491 wd4798 wd3533 wd1044 wd3426 wd4779 wd714 wd715 wd4069 wd4642 
6	0.00796	wd757 wd2724 wd4245 wd294 wd1794 wd3835 wd1485 wd4292 wd3842 wd940 wd4447 wd4457

[beta: 0.01259] 
<910> LL/token: -5.1835
[beta: 0.01254] 
<920> LL/token: -5.18415
[beta: 0.01257] 
<930> LL/token: -5.18414
[beta: 0.01254] 
<940> LL/token: -5.18372

0	0	
1	0.00393	wd1874 wd734 wd2104 wd3088 wd4620 wd4918 wd584 wd1880 wd1200 wd2040 wd2013 wd4035 wd3787 wd1140 wd4384 wd4356 wd1742 wd2691 wd4028 wd4908 
2	0.00367	wd1522 wd4108 wd187 wd1575 wd253 wd3 wd2286 wd4723 wd3971 wd3814 wd2956 wd3621 wd1750 wd4488 wd2668 wd4344 wd2336 wd4445 wd2884 wd931 
3	0.00399	wd4046 wd4631 wd1498 wd2543 wd3286 wd2218 wd1242 wd1973 wd577 wd1352 wd2398 wd3768 wd4850 wd2784 wd3202 wd781 wd2701 wd4342 wd2665 wd4457 
4	0.01719	wd2258 wd957 wd3794 wd4590 wd2552 wd3810 wd1448 wd1323 wd624 wd3584 wd1393 wd4968 wd1724 wd4532 wd3442 wd4890 wd2189 wd1447 wd88 wd2153 
5	0.00752	wd4891 wd1198 wd3380 wd947 wd4901 wd3130 wd3480 wd4217 wd4153 wd1491 wd4104 wd3533 wd4798 wd3426 wd1044 wd4779 wd714 wd715 wd4069 wd4728 
6	0.00816	wd757 wd2724 wd4245 wd294 wd1794 wd3835 wd1485 wd4292 wd3842 wd4447 wd940 wd445

In [15]:
#Recover and build beta matrix
# Each line of word-topic-counts.txt is read as
# '<index> wd<word id> <topic>:<count> <topic>:<count> ...'.
# NOTE: this rebinds `beta` (the scalar prior defined in the settings cell)
# to the recovered (n_topics x vocab_size) topic-word matrix.
word_topic_counts_path = modeldir.joinpath('word-topic-counts.txt').resolve().as_posix()

beta = np.zeros((n_topics, vocab_size))
with open(word_topic_counts_path, 'r', encoding='utf8') as fin:
    for line in fin:
        tokens = line.split()[1:]
        # Column of the word: the number that follows the 'wd' prefix
        pos = int(tokens[0][2:])
        for el in tokens[1:]:
            pair = el.split(':')
            beta[int(pair[0]), pos] = int(pair[1])

# Turn the counts of every topic into a probability distribution
beta = normalize(beta, axis=1, norm='l1')

In [16]:
# For every true topic keep the best match among the learned topics and add
# up the scores
topic_affinity = np.sqrt(beta).dot(np.sqrt(topic_vectors.T))
print('Tópicos (equivalentes) evaluados correctamente:', np.sum(np.max(topic_affinity, axis=0)))

Tópicos (equivalentes) evaluados correctamente: 44.66266253457564


In [17]:
# Document-by-document similarity computed from the true topic proportions of
# node 0: entry (a, b) is sum_k sqrt(theta_a[k] * theta_b[k]).
sim_mat_theoretical = np.sqrt(doc_topics_all[0]).dot(np.sqrt(doc_topics_all[0].T))

In [18]:
# Load the document-topic proportions written by Mallet: skip the first two
# columns and keep the first n_docs rows
doc_topics_path = modeldir.joinpath('doc-topics.txt').resolve().as_posix()
raw_doc_topics = np.loadtxt(doc_topics_path, delimiter='\t', dtype=np.float32)
thetas = raw_doc_topics[:n_docs, 2:]

# Zero out proportions below 3e-3 and renormalize every row to sum to one
thetas[thetas < 3e-3] = 0
thetas = normalize(thetas, axis=1, norm='l1')
print(thetas.shape)

(1000, 50)


In [19]:
# Same document-by-document similarity, computed from the topic proportions
# estimated by the trained model
sim_mat_actual = np.sqrt(thetas).dot(np.sqrt(thetas.T))

In [20]:
# Sum of the absolute differences between both similarity matrices, divided
# by the number of documents
sim_abs_error = np.abs(sim_mat_theoretical - sim_mat_actual)
print('Difference in evaluation of doc similarity:', np.sum(sim_abs_error)/n_docs)

Difference in evaluation of doc similarity: 26.214126280117014


## 2.2. Just in one node approach

In [21]:
# Create model folder and save model training configuration
# An already existing model folder is moved to '<modeldir>_old' (replacing
# any previous backup) before a fresh, empty folder is created.
modelname = "mallet_node0"
models_root = Path("/export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test")
modeldir = models_root / modelname

if modeldir.exists():
    old_model_dir = Path(f'{modeldir}_old/')

    # Drop the previous backup, if any
    if old_model_dir.exists():
        shutil.rmtree(old_model_dir)

    # Turn the current model folder into the new backup
    shutil.move(modeldir, old_model_dir)
    print(f'-- -- Creating backup of existing model in {old_model_dir}')

modeldir.mkdir()
configFile = modeldir / 'trainconfig.json'

-- -- Creating backup of existing model in /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0_old


In [22]:
# Corpus for the single-node approach: only the documents generated for node 0
my_corpus = documents_all[0]

In [23]:
# Write the corpus as one line per document: '<doc index> 0 <word> <word> ...'
corpusFile = modeldir.joinpath("corpus.txt")
# Explicit encoding, consistent with how the training config file is written
with open(corpusFile, 'w', encoding='utf8') as fout:
    # Plain loop: the writes are side effects, so no throwaway list is built
    for idx, doc in enumerate(my_corpus):
        fout.write(str(idx) + ' 0 ' + ' '.join(doc) + '\n')

In [24]:
# Convert the plain-text corpus into Mallet's binary format
corpusMallet = modeldir.joinpath('corpus.mallet')

# Raw string: '\p' is not a valid Python escape sequence, so a normal string
# literal triggers an invalid-escape warning. The resulting text is unchanged.
token_regex = r'[\p{L}\p{N}][\p{L}\p{N}\p{P}]*'

cmd = mallet_path + \
    ' import-file --preserve-case --keep-sequence ' + \
    '--remove-stopwords --token-regex "' + token_regex + \
    '" --input %s --output %s'
cmd = cmd % (corpusFile, corpusMallet)

try:
    print(f'-- -- Running command {cmd}')
    check_output(args=cmd, shell=True)
except Exception as err:
    # Best effort: report the failure and keep the notebook running, but do
    # not swallow KeyboardInterrupt/SystemExit and do show the cause.
    print('-- -- Mallet failed to import data. Revise command')
    print(f'-- -- Reason: {err}')

-- -- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "[\p{L}\p{N}][\p{L}\p{N}\p{P}]*" --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/corpus.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/corpus.mallet


In [25]:
# Write the Mallet training configuration ('key = value' lines) and train.
# NOTE(review): with 'alpha = 1' the training log reports 0.02 per topic for
# 50 topics, so Mallet appears to spread this value over the topics -- confirm
# against the Mallet documentation.
config_lines = [
    'input = ' + corpusMallet.resolve().as_posix(),
    'num-topics = ' + str(n_topics),
    'alpha = 1',
    'optimize-interval = 10',
    'num-threads = 4',
    'num-iterations = 1000',
    'doc-topics-threshold = 0',
    'output-doc-topics = ' +
    modeldir.joinpath('doc-topics.txt').resolve().as_posix(),
    'word-topic-counts-file = ' +
    modeldir.joinpath('word-topic-counts.txt').resolve().as_posix(),
    'output-topic-keys = ' +
    modeldir.joinpath('topickeys.txt').resolve().as_posix(),
]
with open(configFile, 'w', encoding='utf8') as fout:
    fout.write('\n'.join(config_lines) + '\n')

cmd = mallet_path + ' train-topics --config ' + configFile.resolve().as_posix()
print(cmd)
try:
    print(
        f'-- -- Training mallet topic model. Command is {cmd}')
    check_output(args=cmd, shell=True)
except Exception as err:
    # Best effort: report the failure and keep the notebook running, but do
    # not swallow KeyboardInterrupt/SystemExit and do show the cause.
    print('-- -- Model training failed. Revise command')
    print(f'-- -- Reason: {err}')

/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet train-topics --config /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/trainconfig.json
-- -- Training mallet topic model. Command is /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet train-topics --config /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/trainconfig.json


Mallet LDA: 50 topics, 6 topic bits, 111111 topic mask
Data loaded.
max tokens: 249
total tokens: 201020
<10> LL/token: -5.75978
<20> LL/token: -5.73526
<30> LL/token: -5.70098
<40> LL/token: -5.67858

0	0.02	wd4891 wd3380 wd1198 wd4901 wd3480 wd3130 wd4153 wd4642 wd4728 wd715 wd947 wd3460 wd3426 wd1491 wd1075 wd4217 wd4104 wd1589 wd1044 wd1922 
1	0.02	wd719 wd2309 wd1235 wd3029 wd3748 wd3917 wd4908 wd2652 wd617 wd2505 wd1367 wd2336 wd2573 wd651 wd1273 wd315 wd3231 wd3732 wd3140 wd4430 
2	0.02	wd719 wd2165 wd2309 wd1958 wd839 wd2573 wd1123 wd3917 wd3686 wd4533 wd1387 wd4114 wd4552 wd3678 wd1093 wd2921 wd4879 wd121 wd358 wd3231 
3	0.02	wd1054 wd4299 wd2139 wd3573 wd2580 wd369 wd4119 wd3114 wd1767 wd268 wd2226 wd3244 wd2897 wd1411 wd3222 wd1283 wd2122 wd2932 wd4902 wd1000 
4	0.02	wd4152 wd498 wd971 wd1863 wd3505 wd270 wd4659 wd4969 wd1572 wd252 wd666 wd2605 wd1791 wd200 wd2960 wd1923 wd4190 wd3470 wd2088 wd3353 
5	0.02	wd3389 wd1958 wd3613 wd3296 wd135 wd1165 wd3961 wd1552 wd3512 wd1303 

<110> LL/token: -5.53397
<120> LL/token: -5.51463
<130> LL/token: -5.49265
<140> LL/token: -5.47716

0	0.02	wd4891 wd3380 wd1198 wd714 wd4642 wd947 wd3460 wd3130 wd4728 wd3480 wd4217 wd1251 wd4069 wd4542 wd315 wd1313 wd3100 wd3213 wd3426 wd2488 
1	0.02	wd3917 wd4327 wd3855 wd1730 wd190 wd3787 wd613 wd1482 wd4868 wd1810 wd1027 wd3358 wd1247 wd2844 wd3298 wd1178 wd3669 wd2422 wd4631 wd4256 
2	0.02	wd2573 wd4896 wd3686 wd3917 wd1617 wd839 wd4114 wd1093 wd1606 wd1243 wd3790 wd2418 wd3498 wd3895 wd1400 wd2397 wd3644 wd1619 wd333 wd2434 
3	0.02	wd4299 wd1054 wd2139 wd2580 wd369 wd3573 wd1411 wd1252 wd4104 wd3114 wd1767 wd4119 wd2122 wd4141 wd1283 wd354 wd2897 wd2000 wd264 wd3244 
4	0.02	wd4152 wd498 wd971 wd1863 wd4659 wd3353 wd3505 wd252 wd4969 wd666 wd200 wd3470 wd1572 wd270 wd1791 wd2605 wd2960 wd3807 wd2088 wd3955 
5	0.02	wd3389 wd1958 wd135 wd3296 wd3613 wd3961 wd3512 wd4679 wd1552 wd1165 wd2189 wd4356 wd1850 wd1303 wd1988 wd1380 wd3738 wd1558 wd3395 wd3204 
6	0.02	wd719 wd526 wd2573 wd

[beta: 0.01213] 
<210> LL/token: -5.36916
[beta: 0.01098] 
<220> LL/token: -5.35086
[beta: 0.012] 
<230> LL/token: -5.3356
[beta: 0.01248] 
<240> LL/token: -5.32023

0	0.00362	wd3460 wd1198 wd3213 wd714 wd4542 wd3380 wd1313 wd1251 wd791 wd1877 wd2585 wd4314 wd4217 wd3256 wd2488 wd4069 wd3310 wd1096 wd489 wd4364 
1	0	
2	0.00014	
3	0.01398	wd3573 wd4299 wd2139 wd1767 wd4119 wd1054 wd369 wd1252 wd2580 wd2122 wd3114 wd1872 wd1411 wd373 wd3222 wd3244 wd354 wd2932 wd2778 wd264 
4	0.02296	wd4152 wd498 wd971 wd1863 wd4659 wd3353 wd252 wd270 wd4969 wd2605 wd666 wd3505 wd200 wd3470 wd1572 wd699 wd2088 wd1791 wd2960 wd3807 
5	0.0252	wd3389 wd1958 wd135 wd3296 wd3613 wd3512 wd1165 wd3961 wd4679 wd1552 wd1303 wd4356 wd2189 wd1850 wd3738 wd1988 wd1380 wd3395 wd2962 wd1572 
6	0.02158	wd4896 wd3917 wd2165 wd719 wd3851 wd2573 wd271 wd4361 wd1123 wd4579 wd1093 wd2309 wd2333 wd4910 wd4371 wd2382 wd839 wd1617 wd4087 wd526 
7	0.00345	wd2440 wd2929 wd2399 wd4406 wd4116 wd1367 wd3620 wd4076 wd3263 wd2779 wd9

[beta: 0.01482] 
<310> LL/token: -5.18662
[beta: 0.01512] 
<320> LL/token: -5.16639
[beta: 0.01522] 
<330> LL/token: -5.14864
[beta: 0.01601] 
<340> LL/token: -5.13367

0	0	
1	0	
2	0	
3	0.00013	wd4223 wd4405 
4	0.02156	wd4152 wd498 wd4659 wd971 wd1863 wd3353 wd270 wd252 wd4969 wd666 wd2605 wd3505 wd200 wd3470 wd699 wd1572 wd2088 wd2960 wd1791 wd3394 
5	0.02217	wd3389 wd1958 wd135 wd3296 wd3613 wd3512 wd1165 wd3961 wd4679 wd1552 wd1303 wd2189 wd4356 wd1850 wd1988 wd3738 wd1380 wd3395 wd1558 wd1572 
6	0.01325	wd2165 wd719 wd271 wd4361 wd2207 wd3855 wd968 wd4896 wd684 wd2573 wd2309 wd121 wd2382 wd3520 wd3917 wd3390 wd3125 wd2921 wd2678 wd237 
7	0.00038	wd4406 wd903 wd4986 wd3088 
8	0.01814	wd1662 wd4634 wd4126 wd507 wd1111 wd3825 wd3578 wd1791 wd4438 wd4380 wd1749 wd1800 wd450 wd4852 wd2046 wd2197 wd2057 wd3019 wd1953 wd2241 
9	0.01514	wd3256 wd4166 wd2504 wd805 wd4620 wd1360 wd437 wd2465 wd4640 wd591 wd4479 wd2532 wd731 wd3830 wd3734 wd4358 wd1195 wd126 wd513 wd850 
10	0.0005	
11	0	
12	0

[beta: 0.01846] 
<460> LL/token: -5.06054
[beta: 0.01859] 
<470> LL/token: -5.06033
[beta: 0.01872] 
<480> LL/token: -5.06015
[beta: 0.01869] 
<490> LL/token: -5.06041

0	0	
1	0	
2	0	
3	0	
4	0.01893	wd4152 wd498 wd4659 wd971 wd1863 wd3353 wd270 wd252 wd4969 wd666 wd2605 wd3505 wd200 wd3470 wd699 wd1572 wd2088 wd1791 wd2960 wd3394 
5	0.01909	wd3389 wd1958 wd135 wd3296 wd3613 wd3512 wd1165 wd3961 wd4679 wd1552 wd1303 wd4356 wd2189 wd1850 wd1988 wd1380 wd3738 wd3395 wd1558 wd1572 
6	0.00011	wd3640 wd4814 wd4506 wd1178 wd3917 
7	0	
8	0.01711	wd4634 wd1662 wd4126 wd1111 wd4380 wd507 wd4438 wd3825 wd1791 wd3578 wd1800 wd2046 wd1749 wd450 wd3019 wd4852 wd2197 wd2057 wd1953 wd2746 
9	0.01363	wd3256 wd4166 wd805 wd437 wd4640 wd1360 wd4620 wd731 wd2532 wd2465 wd3830 wd4479 wd2504 wd3545 wd1195 wd3734 wd591 wd4358 wd4408 wd2075 
10	0	
11	0	
12	0.02177	wd2724 wd4245 wd1485 wd4292 wd940 wd2666 wd2154 wd499 wd3088 wd1644 wd1794 wd4279 wd3833 wd4159 wd2962 wd3384 wd2440 wd3475 wd2628 wd2132 
13	0.020

[beta: 0.01871] 
<610> LL/token: -5.05569
[beta: 0.01883] 
<620> LL/token: -5.05443
[beta: 0.01879] 
<630> LL/token: -5.05133
[beta: 0.0186] 
<640> LL/token: -5.05093

0	0	
1	0	
2	0	
3	0	
4	0.01851	wd4152 wd498 wd4659 wd971 wd1863 wd3353 wd270 wd252 wd4969 wd666 wd2605 wd3505 wd200 wd3470 wd699 wd1572 wd2088 wd2960 wd1791 wd3394 
5	0.01959	wd3389 wd1958 wd135 wd3296 wd3613 wd3512 wd1165 wd3961 wd4679 wd1552 wd1303 wd4356 wd2189 wd1850 wd3738 wd1988 wd1380 wd1572 wd3395 wd2962 
6	0	
7	0	
8	0.0172	wd4634 wd1662 wd4126 wd1111 wd4380 wd507 wd4438 wd3825 wd1791 wd3578 wd1800 wd2046 wd1749 wd450 wd3019 wd4852 wd2197 wd2057 wd1953 wd2746 
9	0.01273	wd3256 wd4166 wd437 wd805 wd4620 wd4479 wd1360 wd3734 wd4182 wd2075 wd625 wd2532 wd3984 wd4299 wd2504 wd4317 wd850 wd126 wd591 wd513 
10	0	
11	0	
12	0.022	wd2724 wd4245 wd1485 wd4292 wd940 wd2666 wd2154 wd499 wd3088 wd1644 wd1794 wd4279 wd3833 wd4159 wd2962 wd3384 wd2440 wd3475 wd2132 wd388 
13	0.02055	wd719 wd2253 wd2165 wd2309 wd271 wd526 wd1235 

[beta: 0.01887] 
<760> LL/token: -5.03628
[beta: 0.0191] 
<770> LL/token: -5.03398
[beta: 0.01914] 
<780> LL/token: -5.03266
[beta: 0.01918] 
<790> LL/token: -5.02878

0	0	
1	0	
2	0	
3	0	
4	0.01806	wd4152 wd498 wd4659 wd971 wd1863 wd3353 wd270 wd252 wd4969 wd666 wd2605 wd3505 wd200 wd3470 wd699 wd1572 wd2088 wd1791 wd2960 wd3394 
5	0.01871	wd3389 wd1958 wd135 wd3296 wd3613 wd3512 wd1165 wd3961 wd4679 wd1552 wd1303 wd4356 wd2189 wd1850 wd1988 wd1380 wd3738 wd1572 wd3395 wd2962 
6	0	
7	0	
8	0.01691	wd4634 wd1662 wd4126 wd1111 wd4380 wd507 wd4438 wd3825 wd1791 wd3578 wd1800 wd2046 wd1749 wd450 wd3019 wd4852 wd2197 wd2057 wd1953 wd2746 
9	0.00432	wd706 wd1361 wd1051 wd2585 wd187 wd437 wd3158 wd4214 wd931 wd1360 wd4620 wd4166 wd4527 wd693 wd654 wd186 wd2044 wd3362 wd2808 wd1962 
10	0	
11	0	
12	0.02142	wd2724 wd4245 wd1485 wd4292 wd940 wd2666 wd2154 wd499 wd3088 wd1644 wd1794 wd4279 wd3833 wd4159 wd2962 wd3384 wd2440 wd3475 wd2628 wd2132 
13	0.02013	wd719 wd2253 wd2165 wd2309 wd271 wd526 wd2

[beta: 0.01941] 
<960> LL/token: -5.02549
[beta: 0.01953] 
<970> LL/token: -5.02671
[beta: 0.01959] 
<980> LL/token: -5.02588
[beta: 0.01938] 
<990> LL/token: -5.02542

0	0	
1	0	
2	0	
3	0	
4	0.01782	wd4152 wd498 wd4659 wd971 wd1863 wd3353 wd270 wd252 wd4969 wd666 wd2605 wd3505 wd200 wd3470 wd699 wd1572 wd2088 wd1791 wd2960 wd3394 
5	0.01902	wd3389 wd1958 wd135 wd3296 wd3613 wd3512 wd1165 wd3961 wd4679 wd1552 wd1303 wd2189 wd4356 wd1850 wd1988 wd1380 wd3738 wd3395 wd2962 wd1572 
6	0	
7	0	
8	0.01727	wd4634 wd1662 wd4126 wd1111 wd4380 wd507 wd4438 wd3825 wd1791 wd3578 wd1800 wd2046 wd1749 wd4852 wd450 wd3019 wd2197 wd2057 wd1953 wd2746 
9	0	
10	0	
11	0	
12	0.02102	wd2724 wd4245 wd1485 wd4292 wd940 wd2666 wd2154 wd499 wd3088 wd1644 wd1794 wd4279 wd3833 wd4159 wd2962 wd3384 wd2440 wd3475 wd2628 wd2132 
13	0.01879	wd719 wd2253 wd2165 wd2309 wd271 wd526 wd1235 wd2333 wd1958 wd121 wd4579 wd4910 wd269 wd2336 wd2382 wd1850 wd1297 wd617 wd4552 wd4793 
14	0	
15	0	
16	0	
17	0	
18	0	
19	0	
20	0	
21	

In [26]:
#Recover and build beta matrix
# Each line of word-topic-counts.txt is read as
# '<index> wd<word id> <topic>:<count> <topic>:<count> ...'.
# NOTE: this rebinds `beta` (the scalar prior defined in the settings cell)
# to the recovered (n_topics x vocab_size) topic-word matrix.
word_topic_counts_path = modeldir.joinpath('word-topic-counts.txt').resolve().as_posix()

beta = np.zeros((n_topics, vocab_size))
with open(word_topic_counts_path, 'r', encoding='utf8') as fin:
    for line in fin:
        tokens = line.split()[1:]
        # Column of the word: the number that follows the 'wd' prefix
        pos = int(tokens[0][2:])
        for el in tokens[1:]:
            pair = el.split(':')
            beta[int(pair[0]), pos] = int(pair[1])

# Turn the counts of every topic into a probability distribution
beta = normalize(beta, axis=1, norm='l1')

In [27]:
# For every true topic keep the best match among the learned topics and add
# up the scores
topic_affinity = np.sqrt(beta).dot(np.sqrt(topic_vectors.T))
print('Tópicos (equivalentes) evaluados correctamente:', np.sum(np.max(topic_affinity, axis=0)))

Tópicos (equivalentes) evaluados correctamente: 16.480160058080475


In [28]:
# Document-by-document similarity computed from the true topic proportions of
# node 0: entry (a, b) is sum_k sqrt(theta_a[k] * theta_b[k]).
sim_mat_theoretical = np.sqrt(doc_topics_all[0]).dot(np.sqrt(doc_topics_all[0].T))

In [29]:
# Load the document-topic proportions written by Mallet: skip the first two
# columns and keep the first n_docs rows
doc_topics_path = modeldir.joinpath('doc-topics.txt').resolve().as_posix()
raw_doc_topics = np.loadtxt(doc_topics_path, delimiter='\t', dtype=np.float32)
thetas = raw_doc_topics[:n_docs, 2:]

# Zero out proportions below 3e-3 and renormalize every row to sum to one
thetas[thetas < 3e-3] = 0
thetas = normalize(thetas, axis=1, norm='l1')

In [30]:
# Same document-by-document similarity, computed from the topic proportions
# estimated by the trained model
sim_mat_actual = np.sqrt(thetas).dot(np.sqrt(thetas.T))

In [31]:
# Sum of the absolute differences between both similarity matrices, divided
# by the number of documents
sim_abs_error = np.abs(sim_mat_theoretical - sim_mat_actual)
print('Difference in evaluation of doc similarity:', np.sum(sim_abs_error)/n_docs)

Difference in evaluation of doc similarity: 13.425832360293896


# 3. ProdLDA

In [32]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse

In [33]:
# Change to the project root so that the `src....` imports in the next cell
# resolve. The explicit %cd magic does not depend on IPython's automagic
# setting or on `cd` not being defined as a variable.
%cd /export/usuarios_ml4ds/lbartolome/topicmodeler

/export/usuarios_ml4ds/lbartolome/topicmodeler


In [34]:
from src.topicmodeling.neural_models.pytorchavitm.datasets.bow_dataset import BOWDataset
from src.topicmodeling.neural_models.pytorchavitm.avitm_network.avitm import AVITM

In [35]:
def convert_topic_word_to_init_size(vocab_size, model, model_type,
                                    ntopics, id2token, all_words):
    """Expand a trained topic-word matrix to the full synthetic vocabulary.

    The trained model only knows the words that appear in its training corpus.
    This helper copies each of its topic-word weights into the column that the
    word occupies in `all_words`, leaves the remaining columns at zero and
    L1-normalizes every row, so the result can be compared against a
    topic-word matrix defined over the whole vocabulary.

    Args:
        * vocab_size (int):        Number of columns of the returned matrix.
        * model:                   Trained model; only its
                                   `get_topic_word_distribution()` method is used.
        * model_type (str):        Type of the trained model. Only "avitm" is handled.
        * ntopics (int):           Number of topics (rows) of the trained model.
        * id2token (dict):         Maps a column index of the model's topic-word
                                   matrix to its word.
        * all_words (List[str]):   Words of the full vocabulary; the position of a
                                   word is its column in the returned matrix.

    Returns:
        * ndarray: Row-normalized (L1) matrix of shape (ntopics, vocab_size), or
          None (after printing a message) when `model_type` is not "avitm".
    """
    if model_type != "avitm":
        print("Method not impleemnted for the selected model type")
        return None

    wd = model.get_topic_word_distribution()

    # Word -> target column. `setdefault` keeps the first occurrence of a
    # repeated word, which is the position a left-to-right scan would find.
    column_of = {}
    for j, word in enumerate(all_words):
        column_of.setdefault(word, j)

    # (model column, target column) pairs, resolved once for all topics.
    # Words that are not part of `all_words` are ignored.
    placements = [(idx, column_of[word])
                  for idx, word in id2token.items()
                  if word in column_of]

    w_t_distrib = np.zeros((ntopics, vocab_size), dtype=np.float64)
    for i in range(ntopics):
        for idx, j in placements:
            w_t_distrib[i, j] = wd[i][idx]

    return normalize(w_t_distrib, axis=1, norm='l1')

## 3.1. Centralized approach

In [36]:
# Select the folder for this model. If a folder with that name is already
# there, move it aside to "<name>_old", replacing any earlier backup.
# NOTE(review): the model folder itself is not created in this cell.
modelname = "prod_centralized"
modeldir = Path("/export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test") / modelname

if modeldir.exists():
    old_model_dir = modeldir.with_name(modeldir.name + '_old')

    # Only one backup generation is kept.
    if old_model_dir.exists():
        shutil.rmtree(old_model_dir)

    shutil.move(modeldir, old_model_dir)
    print(f'-- -- Creating backup of existing model in {old_model_dir}')

In [37]:
# Centralized corpus: the documents of every node, concatenated in node order.
my_corpus = []
for docs_node in documents_all:
    my_corpus.extend(docs_node)

In [38]:
# Build the bag-of-words training set from the current `my_corpus`.
cv = CountVectorizer(input='content', lowercase=True, stop_words='english', binary=False)
# Each document is a sequence of tokens; the vectorizer expects one string per document.
docs = [" ".join(doc) for doc in my_corpus]
train_bow = cv.fit_transform(docs).toarray()

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when available and keep the old call as a fallback.
if hasattr(cv, 'get_feature_names_out'):
    idx2token = cv.get_feature_names_out().tolist()
else:
    idx2token = cv.get_feature_names()

train_dataset = BOWDataset(train_bow, idx2token)
input_size = len(idx2token)
# Column index of the bag-of-words matrix -> word.
id2token = dict(enumerate(idx2token))



In [39]:
# Hyperparameters of the ProdLDA model, gathered in one place so they can be
# inspected before the model is built.
avitm_settings = dict(
    logger=None,
    input_size=input_size,
    n_components=n_topics,
    model_type="prodLDA",
    hidden_sizes=(100, 100),
    activation='softplus',
    dropout=0.2,
    learn_priors=True,
    batch_size=64,
    lr=2e-3,
    momentum=0.99,
    solver='adam',
    num_epochs=100,
    reduce_on_plateau=False,
    topic_prior_mean=0.0,
    topic_prior_variance=None,
    num_samples=10,
    num_data_loader_workers=0,
    verbose=True,
)

# Build the model and train it on the bag-of-words dataset.
avitm = AVITM(**avitm_settings)
avitm.fit(train_dataset)

Settings: 
                N Components: 50
                Topic Prior Mean: 0.0
                Topic Prior Variance: None
                Model Type: prodLDA
                Hidden Sizes: (100, 100)
                Activation: softplus
                Dropout: 0.2
                Learn Priors: True
                Learning Rate: 0.002
                Momentum: 0.99
                Reduce On Plateau: False
                Save Dir: None


Epoch: [100/100]	 Seen Samples: [500000/500000]	Train Loss: 1299.7313498046874	Time: 0:00:01.266435: : 100it [02:01,  1.22s/it]
Sampling: [20/20]: : 20it [00:17,  1.11it/s]


In [43]:
# Doc-topic proportions of the first n_docs training documents: zero out
# negligible weights, rescale every row to sum to 1 and store as sparse.
thetas = np.asarray(avitm.get_doc_topic_distribution(avitm.train_data))[:n_docs, :]
thetas[thetas < 3e-3] = 0
thetas = normalize(thetas, axis=1, norm='l1')
thetas = sparse.csr_matrix(thetas, copy=True)

# Topic-word matrix expressed over the full synthetic vocabulary. The helper
# queries the model itself, so no separate get_topic_word_distribution() call
# is needed here.
# NOTE(review): with this list 'wd<K>' lands in column K-1, while the Mallet
# section stores 'wd<K>' in column K. Confirm which one matches the columns of
# `topic_vectors`.
all_words = [f'wd{k}' for k in range(1, vocab_size + 1)]
betas = convert_topic_word_to_init_size(vocab_size=vocab_size,
                                        model=avitm,
                                        model_type="avitm",
                                        ntopics=n_topics,
                                        id2token=id2token,
                                        all_words=all_words)
print(thetas.shape)

Sampling: [20/20]: : 20it [00:18,  1.05it/s]


(1000, 50)


In [44]:
# Score how well the learned topics match the ground truth: for every column
# of the (learned x true) similarity matrix keep the best match, then add the
# per-topic maxima together.
print(
    'Tópicos (equivalentes) evaluados correctamente:',
    (np.sqrt(betas) @ np.sqrt(topic_vectors).T).max(axis=0).sum(),
)

Tópicos (equivalentes) evaluados correctamente: 8.671070001084397


In [45]:
# Pairwise document similarities (entry (a, b) is sum_k sqrt(p_ak * p_bk)),
# first from the reference doc-topic matrix of node 0, then from the
# proportions inferred by the trained model.
sim_mat_theoretical = np.sqrt(doc_topics_all[0]) @ np.sqrt(doc_topics_all[0]).T
sim_mat_actual = np.sqrt(thetas) @ np.sqrt(thetas).T

# Sum of absolute entry-wise gaps, divided by the number of documents.
print(
    'Difference in evaluation of doc similarity:',
    np.abs(sim_mat_theoretical - sim_mat_actual).sum() / n_docs,
)

Difference in evaluation of doc similarity: 515.0291223693249


## 3.2. Single-node approach (node 0 only)

In [46]:
# Select the folder for this model. If a folder with that name is already
# there, move it aside to "<name>_old", replacing any earlier backup.
# NOTE(review): the model folder itself is not created in this cell.
modelname = "prod_node0"
modeldir = Path("/export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test") / modelname

if modeldir.exists():
    old_model_dir = modeldir.with_name(modeldir.name + '_old')

    # Only one backup generation is kept.
    if old_model_dir.exists():
        shutil.rmtree(old_model_dir)

    shutil.move(modeldir, old_model_dir)
    print(f'-- -- Creating backup of existing model in {old_model_dir}')

In [47]:
# Train only on the documents held by the first node (index 0).
my_corpus = documents_all[0]

In [48]:
# Build the bag-of-words training set from the current `my_corpus`.
cv = CountVectorizer(input='content', lowercase=True, stop_words='english', binary=False)
# Each document is a sequence of tokens; the vectorizer expects one string per document.
docs = [" ".join(doc) for doc in my_corpus]
train_bow = cv.fit_transform(docs).toarray()

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when available and keep the old call as a fallback.
if hasattr(cv, 'get_feature_names_out'):
    idx2token = cv.get_feature_names_out().tolist()
else:
    idx2token = cv.get_feature_names()

train_dataset = BOWDataset(train_bow, idx2token)
input_size = len(idx2token)
# Column index of the bag-of-words matrix -> word.
id2token = dict(enumerate(idx2token))



In [49]:
# Hyperparameters of the ProdLDA model, gathered in one place so they can be
# inspected before the model is built.
avitm_settings = dict(
    logger=None,
    input_size=input_size,
    n_components=n_topics,
    model_type="prodLDA",
    hidden_sizes=(100, 100),
    activation='softplus',
    dropout=0.2,
    learn_priors=True,
    batch_size=64,
    lr=2e-3,
    momentum=0.99,
    solver='adam',
    num_epochs=100,
    reduce_on_plateau=False,
    topic_prior_mean=0.0,
    topic_prior_variance=None,
    num_samples=10,
    num_data_loader_workers=0,
    verbose=True,
)

# Build the model and train it on the bag-of-words dataset.
avitm = AVITM(**avitm_settings)
avitm.fit(train_dataset)

Settings: 
                N Components: 50
                Topic Prior Mean: 0.0
                Topic Prior Variance: None
                Model Type: prodLDA
                Hidden Sizes: (100, 100)
                Activation: softplus
                Dropout: 0.2
                Learn Priors: True
                Learning Rate: 0.002
                Momentum: 0.99
                Reduce On Plateau: False
                Save Dir: None


Epoch: [100/100]	 Seen Samples: [100000/100000]	Train Loss: 1254.18375390625	Time: 0:00:00.925325: : 100it [01:30,  1.10it/s]
Sampling: [20/20]: : 20it [00:16,  1.23it/s]


In [50]:
# Doc-topic proportions of the training documents: zero out negligible
# weights, rescale every row to sum to 1 and store as sparse.
thetas = np.asarray(avitm.get_doc_topic_distribution(avitm.train_data))
thetas[thetas < 3e-3] = 0
thetas = normalize(thetas, axis=1, norm='l1')
thetas = sparse.csr_matrix(thetas, copy=True)

# Topic-word matrix expressed over the full synthetic vocabulary. The helper
# queries the model itself, so no separate get_topic_word_distribution() call
# is needed here.
# NOTE(review): with this list 'wd<K>' lands in column K-1, while the Mallet
# section stores 'wd<K>' in column K. Confirm which one matches the columns of
# `topic_vectors`.
all_words = [f'wd{k}' for k in range(1, vocab_size + 1)]
betas = convert_topic_word_to_init_size(vocab_size=vocab_size,
                                        model=avitm,
                                        model_type="avitm",
                                        ntopics=n_topics,
                                        id2token=id2token,
                                        all_words=all_words)

Sampling: [20/20]: : 20it [00:16,  1.24it/s]


In [51]:
# Score how well the learned topics match the ground truth: for every column
# of the (learned x true) similarity matrix keep the best match, then add the
# per-topic maxima together.
print(
    'Tópicos (equivalentes) evaluados correctamente:',
    (np.sqrt(betas) @ np.sqrt(topic_vectors).T).max(axis=0).sum(),
)

Tópicos (equivalentes) evaluados correctamente: 6.773036044805549


In [52]:
# Pairwise document similarities (entry (a, b) is sum_k sqrt(p_ak * p_bk)),
# first from the reference doc-topic matrix of node 0, then from the
# proportions inferred by the trained model.
sim_mat_theoretical = np.sqrt(doc_topics_all[0]) @ np.sqrt(doc_topics_all[0]).T
sim_mat_actual = np.sqrt(thetas) @ np.sqrt(thetas).T

# Sum of absolute entry-wise gaps, divided by the number of documents.
print(
    'Difference in evaluation of doc similarity:',
    np.abs(sim_mat_theoretical - sim_mat_actual).sum() / n_docs,
)

Difference in evaluation of doc similarity: 531.721231823675
