In [1]:
!pip install python-Levenshtein

import numpy as np
import pandas as pd
import zipfile as zp
from pathlib import Path
from gensim.utils import check_output
from sklearn.preprocessing import normalize
from scipy.special import softmax
import shutil
from subprocess import check_output
import torch
from tqdm import tqdm
import colored

You should consider upgrading via the '/export/usuarios_ml4ds/lbartolome/topicmodeler/.venv/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
mallet_path = '/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet'

In [3]:
modelsdir = Path("/export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test")

In [4]:
def printgr(text):
    print(colored.stylize(text, colored.fg('green')))

## **0. Results from previous executions**


---

![](https://drive.google.com/uc?export=view&id=1aveMEBkLa4G2IS3gsf9clOsHa2CkDy6S)

---
![](https://drive.google.com/uc?export=view&id=1KTBvGSqO6EP-07-seMp8ftzBsVlb6DlS)

---
![](https://drive.google.com/uc?export=view&id=190u-98U2EhEQCW9nYjfmVha8okRS3LyA)

---
![](https://drive.google.com/uc?export=view&id=1gE2hyQEUomv1w9ngkglx6YT18j9ofn-q)

## **1. Creation of synthetic corpus**


---

We consider a scenario with n parties, each of them as an associated corpus.
To generate the corpus associated with each of the parties, we consider a common beta distribution (word-topic distribution), but we freeze different topics/ assign different asymmetric Dirichlet priors favoring different topics at the time of generating the document that composes each party's corpus.

We consider two scenarios for the last step (documents generation) according to whether we are utilzing LDA or ProdLDA's generative process.

* **According to LDA's generative process**
    **for** each document $d$ **do**  
    &nbsp;&nbsp;Draw topic distribution $\theta \sim Dirichlet(\alpha)$  
    &nbsp;&nbsp;&nbsp;&nbsp;**for** each word at position $n$ **do**  
    &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Sample topic $z_n \sim Multinomial(1,\theta)$  
    &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Sample word $w_n \sim Multinomial(1, \beta_{z_n})$  
    &nbsp;&nbsp;&nbsp;&nbsp;**end**  
    **end**

    where the distribution of $w_n \sim \beta$, $\theta$ is $Multinomial(1, \beta\theta)$

* **According to ProdLDA generative process**
    In comparisson to LDA's generative process:
    - $\beta$ is **unnormalized**
    - the conditional distribution of wn is defined as $w_n \mid \beta, \theta \sim Multinomial(1, \sigma(\beta\theta))$

### *Auxiliary functions for synthetic corpus generation*

In [5]:
def rotateArray(arr, n, d):
    temp = []
    i = 0
    while (i < d):
        temp.append(arr[i])
        i = i + 1
    i = 0
    while (d < n):
        arr[i] = arr[d]
        i = i + 1
        d = d + 1
    arr[:] = arr[: i] + temp
    return arr

In [6]:
def generateSynthetic(just_inf, vocab_size, n_topics, beta, alpha, n_docs,
                      n_docs_inf, nwords, alg, n_nodes, frozen_topics,
                      prior_frozen, own_topics, prior_nofrozen):
    
    if just_inf:
        n_total_docs = n_docs_inf
    else:
        n_total_docs = n_docs + n_docs_inf

    # Step 1 - generation of topics
    topic_vectors = np.random.dirichlet(vocab_size*[beta], n_topics)
    
    # Step 2 - generation of document topic proportions
    doc_topics_all = []
    for i in np.arange(n_nodes):
        doc_topics = np.random.dirichlet(prior_frozen + prior_nofrozen, n_total_docs)
        prior_nofrozen = rotateArray(prior_nofrozen, len(prior_nofrozen), own_topics)
        doc_topics_all.append(doc_topics)
        
    # Step 3 - Document generation
    documents_all = []
    z_all = []

    for i in np.arange(n_nodes):
        print("Generating document words for node ", str(i))
        documents = [] # Document words
        #z = [] # Assignments
        for docid in tqdm(np.arange(n_total_docs)):
            doc_len = np.random.randint(low=nwords[0], high=nwords[1])
            this_doc_words = []
            #this_doc_assigns = []
            for wd_idx in np.arange(doc_len):

                tpc = np.nonzero(np.random.multinomial(1, doc_topics_all[i][docid]))[0][0]
                #this_doc_assigns.append(tpc)
                if alg == "lda":
                    word = np.nonzero(np.random.multinomial(1, topic_vectors[tpc]))[0][0]
                else: #prodlda
                    pval = np.power(topic_vectors[tpc], doc_topics_all[i][docid][tpc])
                    weights = torch.tensor(pval, dtype=torch.float) # create a tensor of weights
                    word = torch.multinomial(weights, 1).numpy()[0]
                    #pval = normalize(pval[:,np.newaxis], norm='l1', axis=0).ravel()
                    #word = np.nonzero(np.random.multinomial(1, b))[0][0]
                this_doc_words.append('wd'+str(word))
            #z.append(this_doc_assigns)
            documents.append(this_doc_words)
        documents_all.append(documents)
        #z_all.append(z)
    
    return topic_vectors, doc_topics_all, documents_all

### *Topic modeling and Centralized settings*

In [7]:
# Topic modeling settings
vocab_size = 5000
n_topics = 50
beta = 1e-2
alpha = 50/n_topics#1/n_topics
n_docs = 10000
n_docs_inf = 10000
nwords = (150, 250) #Min and max lengths of the documents
alg = "lda" #"prod"

tm_settings = {
    "vocab_size": vocab_size,
    "n_topics": n_topics,
    "beta": beta,
    "alpha": alpha,
    "n_docs": n_docs,
    "n_docs_inf": n_docs_inf,
    "nwords": nwords,
    "alg": alg
}

In [8]:
# Centralized settings
n_nodes = 5
frozen_topics = 5
prior_frozen = frozen_topics * [alpha]
own_topics = int((n_topics-frozen_topics)/n_nodes)
prior_nofrozen = own_topics * [alpha] + (n_topics-frozen_topics-own_topics) * [alpha/10000]

centralized_settings = {
    "n_nodes": n_nodes,
    "frozen_topics": frozen_topics,
    "prior_frozen": prior_frozen,
    "own_topics": own_topics,
    "prior_nofrozen": prior_nofrozen
}

### *1.1. Generation of training + validation corpus (same generative process)*

In [9]:
topic_vectors, doc_topics_all, documents_all = generateSynthetic(False, **tm_settings, **centralized_settings)

Generating document words for node  0


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 20000/20000 [08:18<00:00, 40.11it/s]


Generating document words for node  1


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 20000/20000 [08:08<00:00, 40.91it/s]


Generating document words for node  2


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 20000/20000 [08:19<00:00, 40.07it/s]


Generating document words for node  3


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 20000/20000 [08:19<00:00, 40.07it/s]


Generating document words for node  4


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 20000/20000 [08:19<00:00, 40.04it/s]


### *1.2. Generation of second validation corpus (different generative process)*

In [10]:
topic_vectors2, doc_topics_all2, documents_all2 = generateSynthetic(True, **tm_settings, **centralized_settings)

Generating document words for node  0


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [04:08<00:00, 40.19it/s]


Generating document words for node  1


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [04:10<00:00, 39.84it/s]


Generating document words for node  2


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [04:10<00:00, 39.98it/s]


Generating document words for node  3


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [04:09<00:00, 40.06it/s]


Generating document words for node  4


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [04:08<00:00, 40.30it/s]


## **2. Mallet**


---

### *Auxiliary functions*

In [11]:
def create_model_folder(modelname, modelsdir):
    
    # Create model folder and save model training configuration
    modeldir = modelsdir.joinpath(modelname)
    
    if modeldir.exists():

        # Remove current backup folder, if it exists
        old_model_dir = Path(str(modeldir) + '_old/')
        if old_model_dir.exists():
            shutil.rmtree(old_model_dir)

        # Copy current model folder to the backup folder.
        shutil.move(modeldir, old_model_dir)
        print(f'-- -- Creating backup of existing model in {old_model_dir}')

    modeldir.mkdir()
    configFile = modeldir.joinpath('trainconfig.json')
    
    return modeldir, configFile

In [12]:
def train(modelname, modelsdir, corpus):
    
    # Create model folder
    modeldir, configFile = create_model_folder(modelname, modelsdir)
    
    # Create corpus txt file
    corpusFile = modeldir.joinpath("corpus.txt")
    with open(corpusFile, 'w') as fout:
        [fout.write(str(idx) + ' 0 ' + ' '.join(doc) + '\n') for idx,doc in enumerate(corpus)]
    
    # Create corpus mallet file
    corpusMallet = modeldir.joinpath('corpus.mallet')

    cmd = mallet_path + \
        ' import-file --preserve-case --keep-sequence ' + \
        '--remove-stopwords --token-regex "' + '[\p{L}\p{N}][\p{L}\p{N}\p{P}]*' + \
        '" --input %s --output %s'
    cmd = cmd % (corpusFile, corpusMallet)

    try:
        print(f'-- -- Running command {cmd}')
        check_output(args=cmd, shell=True)
    except:
        print('-- -- Mallet failed to import data. Revise command')
    
    # Perform Mallet train
    with open(configFile, 'w', encoding='utf8') as fout:
        fout.write('input = ' + corpusMallet.resolve().as_posix() + '\n')
        fout.write('num-topics = ' + str(n_topics) + '\n')
        fout.write('alpha = 1\n')
        fout.write('optimize-interval = 10\n')
        fout.write('num-threads = 4\n')
        fout.write('num-iterations = 1000\n')
        fout.write('doc-topics-threshold = 0\n')
        fout.write('output-doc-topics = ' +
                    modeldir.joinpath('doc-topics.txt').resolve().as_posix() + '\n')
        fout.write('word-topic-counts-file = ' +
                   modeldir.joinpath('word-topic-counts.txt').resolve().as_posix() + '\n')
        fout.write('output-topic-keys = ' +
                   modeldir.joinpath('topickeys.txt').resolve().as_posix() + '\n')
        fout.write('inferencer-filename = ' + 
                   modeldir.joinpath('inferencer.mallet').resolve().as_posix() + '\n')
    cmd = mallet_path + ' train-topics --config ' + configFile.resolve().as_posix()
    print(cmd)
    try:
        print(
            f'-- -- Training mallet topic model. Command is {cmd}')
        check_output(args=cmd, shell=True)
    except:
        print('-- -- Model training failed. Revise command')
    
    return modeldir

In [13]:
def extractPipe(modeldir):
    
    path_corpus = modeldir.joinpath('corpus.mallet')
    if not path_corpus.is_file():
        print('-- Pipe extraction: Could not locate corpus file')
        return

    # Create auxiliary file with only first line from the original corpus file
    path_txt = modeldir.joinpath('corpus.txt')
    with path_txt.open('r', encoding='utf8') as f:
        first_line = f.readline()
        
    path_aux = modeldir.joinpath('corpus_aux.txt')
    with path_aux.open('w', encoding='utf8') as fout:
        fout.write(first_line + '\n')

    # We perform the import with the only goal to keep a small file containing the pipe
    print('-- Extracting pipeline')
    path_pipe = modeldir.joinpath('import.pipe')
    
    cmd = mallet_path + \
        ' import-file --use-pipe-from %s --input %s --output %s'   
    cmd = cmd % (path_corpus, path_aux, path_pipe)

    try:
        print(f'-- Running command {cmd}')
        check_output(args=cmd, shell=True)
    except:
        print('-- Failed to extract pipeline. Revise command')

    # Remove auxiliary file
    path_aux.unlink()

    return 

In [14]:
def inference(modeldir, corpus_inf):

    # A proper corpus should exist with the corresponding ipmortation pipe
    path_pipe = modeldir.joinpath('import.pipe')
    if not path_pipe.is_file():
        print('-- Inference error. Importation pipeline not found')
        return

    # Get inferencer
    inferencer = modeldir.joinpath('inferencer.mallet')
    
    # File for performing inference on
    corpus_file = modeldir.joinpath("corpus_inf.txt")
    with open(corpus_file, 'w') as fout:
        [fout.write(str(idx) + ' 0 ' + ' '.join(doc) + '\n') for idx,doc in enumerate(corpus_inf)]

    # The following files will be generated in the same folder
    corpus_mallet_inf = modeldir.joinpath('corpus_inf.mallet')  # mallet serialized
    doc_topics_file = modeldir.joinpath('doc-topics-inf.txt')

    # Import data to mallet
    print('-- Inference: Mallet Data Import')
   
    cmd = mallet_path + \
        ' import-file --use-pipe-from %s --input %s --output %s'
    cmd = cmd % (path_pipe, corpus_file, corpus_mallet_inf)

    try:
        print(f'-- Running command {cmd}')
        check_output(args=cmd, shell=True)
    except:
        print(
            '-- Mallet failed to import data. Revise command')
        return

    # Get topic proportions
    print('-- Inference: Inferring Topic Proportions')
    num_iterations = 100
    doc_topic_thr = 0

    cmd = mallet_path + \
        ' infer-topics --inferencer %s --input %s --output-doc-topics %s ' + \
        ' --doc-topics-threshold ' + str(doc_topic_thr) + \
        ' --num-iterations ' + str(num_iterations)
    cmd = cmd % (inferencer, corpus_mallet_inf, doc_topics_file)

    try:
        print(f'-- Running command {cmd}')
        check_output(args=cmd, shell=True)
    except:
        print('-- Mallet inference failed. Revise command')
        return

    return

In [15]:
def eval_betas(beta, topic_vectors):
    print('Tópicos (equivalentes) evaluados correctamente:')
    score = np.sum(np.max(np.sqrt(beta).dot(np.sqrt(topic_vectors.T)), axis=0))
    printgr(score)
    return score

In [16]:
def eval_thetas(thetas_theoretical, thetas_actual, n_docs):
    sim_mat_theoretical = np.sqrt(thetas_theoretical).dot(np.sqrt(thetas_theoretical.T))
    sim_mat_actual = np.sqrt(thetas).dot(np.sqrt(thetas.T))
    print('Difference in evaluation of doc similarity:')
    score = np.sum(np.abs(sim_mat_theoretical - sim_mat_actual))/n_docs
    printgr(score)
    return score

### *2.1. Centralized approach*

In [17]:
# Define corpus
corpus = [doc for docs_node in documents_all for doc in docs_node[0:n_docs]]
print(len(corpus))

# Train model 
modelname = "mallet_centralized"
modeldir = train(modelname, modelsdir, corpus)

50000
-- -- Creating backup of existing model in /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized_old
-- -- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "[\p{L}\p{N}][\p{L}\p{N}\p{P}]*" --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus.mallet
/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet train-topics --config /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/trainconfig.json
-- -- Training mallet topic model. Command is /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/

Mallet LDA: 50 topics, 6 topic bits, 111111 topic mask
Data loaded.
max tokens: 249
total tokens: 9978456
<10> LL/token: -8.13427
<20> LL/token: -7.8399
<30> LL/token: -7.71965
<40> LL/token: -7.63989

0	0.02	wd2409 wd3547 wd1613 wd3699 wd4269 wd2787 wd1269 wd4192 wd1483 wd1142 wd3779 wd2036 wd1027 wd4195 wd4365 wd4843 wd2638 wd4197 wd3439 wd4129 
1	0.02	wd247 wd1055 wd3773 wd914 wd3844 wd967 wd491 wd3441 wd1855 wd2828 wd2647 wd3665 wd4237 wd2224 wd4881 wd4978 wd3580 wd3924 wd2346 wd4080 
2	0.02	wd247 wd204 wd1055 wd2907 wd3773 wd4336 wd967 wd914 wd3954 wd2751 wd3429 wd3844 wd3777 wd2887 wd2828 wd491 wd3441 wd2647 wd2224 wd4237 
3	0.02	wd1855 wd3580 wd2346 wd1420 wd4029 wd2593 wd4286 wd2120 wd2318 wd1660 wd3192 wd1349 wd275 wd2024 wd3002 wd1947 wd4703 wd1 wd1743 wd3236 
4	0.02	wd4584 wd3319 wd2401 wd14 wd378 wd2600 wd1079 wd325 wd1574 wd4701 wd3089 wd418 wd855 wd1665 wd1786 wd3239 wd1440 wd4183 wd1535 wd734 
5	0.02	wd247 wd1055 wd3773 wd914 wd967 wd3844 wd3441 wd491 wd2828 wd2647 wd423

In [18]:
# Get betas
betas = np.zeros((tm_settings["n_topics"], tm_settings["vocab_size"]))
with open(modeldir.joinpath('word-topic-counts.txt').resolve().as_posix(), 'r', encoding='utf8') as fin:
    for line in fin.readlines():
        tokens = line.split()[1:]
        pos = int(tokens[0][2:])
        for el in tokens[1:]:
            tpc = int(el.split(':')[0])
            cnt = int(el.split(':')[1])
            betas[tpc,pos] = cnt

betas = normalize(betas,axis=1,norm='l1')

# Get thetas
thetas = np.loadtxt(modeldir.joinpath('doc-topics.txt').resolve().as_posix(), delimiter='\t', dtype=np.float32)[:,2:][:n_docs,:]
printgr(thetas.shape)
thetas[thetas<3e-3] = 0
thetas = normalize(thetas,axis=1,norm='l1')
thetas_theoretical = doc_topics_all[0][0:n_docs]

# Eval betas and thetas
betas_21 = eval_betas(betas, topic_vectors)
thetas_21 = eval_thetas(thetas_theoretical, thetas, n_docs)

[38;5;2m(10000, 50)[0m
Tópicos (equivalentes) evaluados correctamente:
[38;5;2m37.8101737657931[0m
Difference in evaluation of doc similarity:
[38;5;2m3904.0740468546105[0m


#### 2.1.1. Inference on the centralized approach with validation corpus generated with the same GP

In [19]:
# Define inference corpus
corpus_inf = [doc for docs_node in documents_all for doc in docs_node[n_docs:(n_docs+n_docs_inf)]]
print(len(corpus_inf))

# Perform inference
extractPipe(modeldir)
inference(modeldir, corpus_inf)

50000
-- Extracting pipeline
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus.mallet --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus_aux.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/import.pipe
-- Inference: Mallet Data Import
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/import.pipe --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus_inf.txt --output /export/usuarios_ml4ds/lbartolome/data/pr

In [20]:
# Get inferred thetas
thetas_inf = np.loadtxt(modeldir.joinpath('doc-topics-inf.txt').resolve().as_posix(), delimiter='\t', dtype=np.float32)[:,2:][:n_docs,:]
thetas_inf[thetas_inf<3e-3] = 0
thetas_inf = normalize(thetas_inf,axis=1,norm='l1')
thetas_theoretical = doc_topics_all[0][n_docs:(n_docs+n_docs_inf)]

# Eval thetas
thetas_211 = eval_thetas(thetas_theoretical, thetas_inf, n_docs_inf)

Difference in evaluation of doc similarity:
[38;5;2m3931.209908308005[0m


#### 2.1.2. Inference on the centralized approach with validation corpus generated with different GP

In [21]:
# Define inference corpus
corpus_inf = [doc for docs_node in documents_all2 for doc in docs_node]
print(len(corpus_inf))

# Perform inference
extractPipe(modeldir)
inference(modeldir, corpus_inf)

50000
-- Extracting pipeline
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus.mallet --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus_aux.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/import.pipe
-- Inference: Mallet Data Import
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/import.pipe --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_centralized/corpus_inf.txt --output /export/usuarios_ml4ds/lbartolome/data/pr

In [22]:
# Get inferred thetas
thetas_inf = np.loadtxt(modeldir.joinpath('doc-topics-inf.txt').resolve().as_posix(), delimiter='\t', dtype=np.float32)[:,2:][:n_docs,:]
thetas_inf[thetas_inf<3e-3] = 0
thetas_inf = normalize(thetas_inf,axis=1,norm='l1')
thetas_theoretical = doc_topics_all2[0][0:n_docs_inf]

# Eval thetas
thetas_212 = eval_thetas(thetas_theoretical, thetas_inf, n_docs_inf)

Difference in evaluation of doc similarity:
[38;5;2m3919.636855588864[0m


### *2.2. Just in one node approach*

In [23]:
# Define corpus
corpus = documents_all[0][0:n_docs]
print(len(corpus))

# Train model 
modelname = "mallet_node0"
modeldir = train(modelname, modelsdir, corpus)

10000
-- -- Creating backup of existing model in /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0_old
-- -- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "[\p{L}\p{N}][\p{L}\p{N}\p{P}]*" --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/corpus.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/corpus.mallet
/export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet train-topics --config /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/trainconfig.json
-- -- Training mallet topic model. Command is /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet 

Mallet LDA: 50 topics, 6 topic bits, 111111 topic mask
Data loaded.
max tokens: 249
total tokens: 1990327
<10> LL/token: -9.18485
<20> LL/token: -8.4155
<30> LL/token: -8.17196
<40> LL/token: -8.0354

0	0.02	wd2825 wd3692 wd623 wd3874 wd4919 wd3626 wd3308 wd2721 wd1135 wd961 wd2212 wd1023 wd4793 wd1119 wd2473 wd3428 wd3858 wd384 wd4997 wd1766 
1	0.02	wd4982 wd1392 wd2790 wd2841 wd2238 wd1262 wd4821 wd3948 wd3643 wd1397 wd3368 wd1171 wd4548 wd3622 wd4769 wd4500 wd3360 wd2982 wd3629 wd1422 
2	0.02	wd1855 wd3580 wd2346 wd4029 wd1420 wd4286 wd2593 wd2120 wd2318 wd1660 wd3192 wd2024 wd275 wd1349 wd1947 wd3002 wd4876 wd4703 wd4675 wd3236 
3	0.02	wd1756 wd204 wd2907 wd3429 wd247 wd473 wd2751 wd3777 wd4336 wd2482 wd1055 wd3954 wd2236 wd2887 wd2662 wd534 wd1676 wd1927 wd1807 wd4783 
4	0.02	wd232 wd4272 wd1908 wd4801 wd3570 wd2330 wd4310 wd2829 wd1322 wd2203 wd3660 wd460 wd472 wd988 wd3319 wd2544 wd1436 wd3305 wd2287 wd368 
5	0.02	wd1756 wd473 wd1807 wd1927 wd625 wd1495 wd4172 wd4459 wd4128 wd54

In [24]:
# Get betas
betas = np.zeros((tm_settings["n_topics"], tm_settings["vocab_size"]))
with open(modeldir.joinpath('word-topic-counts.txt').resolve().as_posix(), 'r', encoding='utf8') as fin:
    for line in fin.readlines():
        tokens = line.split()[1:]
        pos = int(tokens[0][2:])
        for el in tokens[1:]:
            tpc = int(el.split(':')[0])
            cnt = int(el.split(':')[1])
            betas[tpc,pos] = cnt

betas = normalize(betas,axis=1,norm='l1')

# Get thetas
thetas = np.loadtxt(modeldir.joinpath('doc-topics.txt').resolve().as_posix(), delimiter='\t', dtype=np.float32)[:,2:][:n_docs,:]

thetas[thetas<3e-3] = 0
thetas = normalize(thetas,axis=1,norm='l1')
thetas_theoretical = doc_topics_all[0][0:n_docs]
print(thetas_theoretical.shape)

# Eval betas and thetas
betas_22 = eval_betas(betas, topic_vectors)
thetas_22 = eval_thetas(thetas_theoretical, thetas, n_docs)

(10000, 50)
Tópicos (equivalentes) evaluados correctamente:
[38;5;2m16.85952221836009[0m
Difference in evaluation of doc similarity:
[38;5;2m2401.504144264718[0m


#### 2.2.1. Inference on the just in one node approach with validation corpus generated with the same GP

In [25]:
# Define inference corpus
corpus_inf = documents_all[0][n_docs:(n_docs+n_docs_inf)]
print(len(corpus_inf))

# Perform inference
extractPipe(modeldir)
inference(modeldir, corpus_inf)

10000
-- Extracting pipeline
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/corpus.mallet --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/corpus_aux.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/import.pipe
-- Inference: Mallet Data Import
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/import.pipe --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/corpus_inf.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federate

In [26]:
# Get inferred thetas
thetas_inf = np.loadtxt(modeldir.joinpath('doc-topics-inf.txt').resolve().as_posix(), delimiter='\t', dtype=np.float32)[:,2:][:n_docs,:]
thetas_inf[thetas_inf<3e-3] = 0
thetas_inf = normalize(thetas_inf,axis=1,norm='l1')
thetas_theoretical = doc_topics_all[0][n_docs:(n_docs+n_docs_inf)]

# Eval thetas
thetas_221 = eval_thetas(thetas_theoretical, thetas_inf, n_docs_inf)

Difference in evaluation of doc similarity:
[38;5;2m2432.69175161107[0m


#### 2.2.2. Inference on the just in one node approach with validation corpus generated with different GP

In [27]:
# Define inference corpus
corpus_inf = documents_all2[0][0:n_docs]
print(len(corpus_inf))

# Perform inference
extractPipe(modeldir)
inference(modeldir, corpus_inf)

10000
-- Extracting pipeline
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/corpus.mallet --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/corpus_aux.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/import.pipe
-- Inference: Mallet Data Import
-- Running command /export/usuarios_ml4ds/jarenas/github/IntelComp/ITMT/topicmodeler/src/topicmodeling/mallet-2.0.8/bin/mallet import-file --use-pipe-from /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/import.pipe --input /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/mallet_node0/corpus_inf.txt --output /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federate

In [28]:
# Get inferred thetas
thetas_inf = np.loadtxt(modeldir.joinpath('doc-topics-inf.txt').resolve().as_posix(), delimiter='\t', dtype=np.float32)[:,2:][:n_docs,:]
thetas_inf[thetas_inf<3e-3] = 0
thetas_inf = normalize(thetas_inf,axis=1,norm='l1')
thetas_theoretical = doc_topics_all2[0]

# Eval thetas
thetas_222 = eval_thetas(thetas_theoretical, thetas_inf, n_docs_inf)

Difference in evaluation of doc similarity:
[38;5;2m2422.072309360402[0m


## **3. ProdLDA**


---

In [29]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse

In [30]:
cd /export/usuarios_ml4ds/lbartolome/topicmodeler

/export/usuarios_ml4ds/lbartolome/topicmodeler


In [31]:
from src.topicmodeling.neural_models.pytorchavitm.datasets.bow_dataset import BOWDataset
from src.topicmodeling.neural_models.pytorchavitm.avitm_network.avitm import AVITM

### *Auxiliary functions*

In [32]:
def convert_topic_word_to_init_size(vocab_size, model, model_type,
                                    ntopics, id2token, all_words):
    """It converts the topic-word distribution matrix obtained from the training of a model into a matrix with the dimensions of the original topic-word distribution, assigning zeros to those words that are not present in the corpus. 
    It is only of use in case we are training a model over a synthetic dataset, so as to later compare the performance of the attained model in what regards to the similarity between the original and the trained model.

    Args:
        * vocab_size (int):       Size of the synethic'data vocabulary.
        * model (AVITM/CTM):      Model whose topic-word matrix is being transformed.
        * model_type (str):       Type of the trained model (e.g. AVITM)
        * ntopics (int):          Number of topics of the trained model.
        * id2token (List[tuple]): Mappings with the content of the document-term matrix.
        * all_words (List[str]):  List of all the words of the vocabulary of size vocab_size.

    Returns:
        * ndarray: Normalized transormed topic-word distribution.
    """
    if model_type == "avitm":
        w_t_distrib = np.zeros((ntopics, vocab_size), dtype=np.float64)
        wd = model.get_topic_word_distribution()
        for i in np.arange(ntopics):
            for idx, word in id2token.items():
                for j in np.arange(len(all_words)):
                    if all_words[j] == word:
                        w_t_distrib[i, j] = wd[i][idx]
                        break
        normalized_array = normalize(w_t_distrib,axis=1,norm='l1')
        return normalized_array
    else:
        print("Method not impleemnted for the selected model type")
        return None

In [33]:
def train_avitm(modelname, modelsdir, corpus):
    
    # Create model folder
    modeldir, configFile = create_model_folder(modelname, modelsdir)
    
    # Create corpus in ProdLDA format (BoWDataset)
    cv = CountVectorizer(input='content', lowercase=True, stop_words='english', binary=False)
    docs = [" ".join(corpus[i]) for i in np.arange(len(corpus))]
    train_bow = cv.fit_transform(docs).toarray()
    idx2token = cv.get_feature_names_out()
    train_dataset = BOWDataset(train_bow, idx2token)
    input_size = len(idx2token)
    id2token = {k: v for k, v in zip(range(0, len(idx2token)), idx2token)}
    
    avitm = AVITM(logger=None,
              input_size=input_size,
              n_components=n_topics,
              model_type="prodLDA",
              hidden_sizes=(100, 100),
              activation='softplus',
              dropout=0.2,
              learn_priors=True,
              batch_size=64,
              lr=2e-3,
              momentum=0.99,
              solver='adam',
              num_epochs=100,
              reduce_on_plateau=False,
              topic_prior_mean=0.0,
              topic_prior_variance=None,
              num_samples=10,
              num_data_loader_workers=0,
              verbose=True)
    avitm.fit(train_dataset)
    
    return modeldir, avitm, cv, id2token, idx2token

### *3.1. Centralized approach*

In [34]:
# Define corpus
corpus = [doc for docs_node in documents_all for doc in docs_node[0:n_docs]]
print(len(corpus))

# Train model 
modelname = "prod_centralized"
modeldir, avitm, cv, id2token, idx2token = train_avitm(modelname, modelsdir, corpus)

50000
-- -- Creating backup of existing model in /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/prod_centralized_old
Settings: 
                N Components: 50
                Topic Prior Mean: 0.0
                Topic Prior Variance: None
                Model Type: prodLDA
                Hidden Sizes: (100, 100)
                Activation: softplus
                Dropout: 0.2
                Learn Priors: True
                Learning Rate: 0.002
                Momentum: 0.99
                Reduce On Plateau: False
                Save Dir: None


Epoch: [100/100]	 Seen Samples: [5000000/5000000]	Train Loss: 1568.4424375390624	Time: 0:00:09.726430: : 100it [15:52,  9.52s/it]
Sampling: [20/20]: : 20it [01:50,  5.55s/it]


In [35]:
# Get betas
betas = avitm.get_topic_word_distribution()
all_words = ['wd'+str(word) for word in np.arange(vocab_size+1) if word > 0]
betas = convert_topic_word_to_init_size(vocab_size=vocab_size,
                                        model=avitm,
                                        model_type="avitm",
                                        ntopics=n_topics,
                                        id2token=id2token,
                                        all_words=all_words)

# Get thetas
thetas = np.asarray(avitm.get_doc_topic_distribution(avitm.train_data))[0:n_docs,:]
thetas[thetas < 3e-3] = 0
thetas = normalize(thetas, axis=1, norm='l1')
thetas_theoretical = doc_topics_all[0][0:n_docs]

# Eval betas and thetas
betas_31 = eval_betas(betas, topic_vectors)
thetas_31 = eval_thetas(thetas_theoretical, thetas, n_docs)

Sampling: [20/20]: : 20it [01:54,  5.72s/it]


Tópicos (equivalentes) evaluados correctamente:
[38;5;2m8.826222539834463[0m
Difference in evaluation of doc similarity:
[38;5;2m728.767726628274[0m


#### 3.1.1. Inference on the centralized approach with validation corpus generated with the same GP

In [36]:
# Define inference corpus
corpus_inf = [doc for docs_node in documents_all for doc in docs_node[n_docs:(n_docs+n_docs_inf)]]
print(len(corpus_inf))


# Get inferred thetas
docs_val_conv = [" ".join(corpus_inf[i]) for i in np.arange(len(corpus_inf))]
val_bow = cv.transform(docs_val_conv)
val_bow = val_bow.toarray()
val_data = BOWDataset(val_bow, idx2token)

thetas_inf = np.asarray(avitm.get_doc_topic_distribution(val_data))[0:n_docs,:]
thetas_inf[thetas_inf < 3e-3] = 0
thetas_inf = normalize(thetas_inf, axis=1, norm='l1')

thetas_theoretical = doc_topics_all[0][n_docs:(n_docs+n_docs_inf)]

# Eval thetas
thetas_311 = eval_thetas(thetas_theoretical, thetas_inf, n_docs_inf)

50000


Sampling: [20/20]: : 20it [01:53,  5.67s/it]


Difference in evaluation of doc similarity:
[38;5;2m928.1226728932468[0m


#### 3.1.2. Inference on the centralized approach with validation corpus generated with different GP

In [37]:
# Define inference corpus
corpus_inf = [doc for docs_node in documents_all2 for doc in docs_node]
print(len(corpus_inf))

# Get inferred thetas
docs_val_conv = [" ".join(corpus_inf[i]) for i in np.arange(len(corpus_inf))]
val_bow = cv.transform(docs_val_conv)
val_bow = val_bow.toarray()
val_data = BOWDataset(val_bow, idx2token)

thetas_inf = np.asarray(avitm.get_doc_topic_distribution(val_data))[0:n_docs,:]
thetas_inf[thetas_inf < 3e-3] = 0
thetas_inf = normalize(thetas_inf, axis=1, norm='l1')

thetas_theoretical = doc_topics_all2[0][0:n_docs_inf]

# Eval thetas
thetas_312 = eval_thetas(thetas_theoretical, thetas_inf, n_docs_inf)

50000


Sampling: [20/20]: : 20it [01:54,  5.72s/it]


Difference in evaluation of doc similarity:
[38;5;2m938.5847124722367[0m


### *3.2. Just in one node approach*

In [38]:
# Define corpus
corpus = documents_all[0][0:n_docs]
print(len(corpus))

# Train model 
modelname = "prodlda_node0"
modeldir, avitm, cv, id2token, idx2token = train_avitm(modelname, modelsdir, corpus)

# Get betas
betas = avitm.get_topic_word_distribution()
all_words = ['wd'+str(word) for word in np.arange(vocab_size+1) if word > 0]
betas = convert_topic_word_to_init_size(vocab_size=vocab_size,
                                        model=avitm,
                                        model_type="avitm",
                                        ntopics=n_topics,
                                        id2token=id2token,
                                        all_words=all_words)

# Get thetas
thetas = np.asarray(avitm.get_doc_topic_distribution(avitm.train_data))
thetas[thetas < 3e-3] = 0
thetas = normalize(thetas, axis=1, norm='l1')
thetas_theoretical = doc_topics_all[0][0:n_docs]

# Eval betas and thetas
betas_32 = eval_betas(betas, topic_vectors)
thetas_32 = eval_thetas(thetas_theoretical, thetas, n_docs)

10000
-- -- Creating backup of existing model in /export/usuarios_ml4ds/lbartolome/data/project_folder/TMmodels/Federated_test/prodlda_node0_old
Settings: 
                N Components: 50
                Topic Prior Mean: 0.0
                Topic Prior Variance: None
                Model Type: prodLDA
                Hidden Sizes: (100, 100)
                Activation: softplus
                Dropout: 0.2
                Learn Priors: True
                Learning Rate: 0.002
                Momentum: 0.99
                Reduce On Plateau: False
                Save Dir: None


Epoch: [100/100]	 Seen Samples: [1000000/1000000]	Train Loss: 1565.370505859375	Time: 0:00:03.882485: : 100it [06:33,  3.94s/it]
Sampling: [20/20]: : 20it [01:05,  3.29s/it]
Sampling: [20/20]: : 20it [01:04,  3.23s/it]


Tópicos (equivalentes) evaluados correctamente:
[38;5;2m7.4636626571774505[0m
Difference in evaluation of doc similarity:
[38;5;2m654.0007020386628[0m


#### 3.2.1. Inference on the just in one node approach with validation corpus generated with the same GP

In [39]:
# Define inference corpus
corpus_inf = documents_all[0][n_docs:(n_docs+n_docs_inf)]
print(len(corpus_inf))

# Get inferred thetas
docs_val_conv = [" ".join(corpus_inf[i]) for i in np.arange(len(corpus_inf))]
val_bow = cv.transform(docs_val_conv)
val_bow = val_bow.toarray()
val_data = BOWDataset(val_bow, idx2token)

thetas_inf = np.asarray(avitm.get_doc_topic_distribution(val_data))
thetas_inf[thetas_inf < 3e-3] = 0
thetas_inf = normalize(thetas_inf, axis=1, norm='l1')

thetas_theoretical = doc_topics_all[0][n_docs:(n_docs+n_docs_inf)]

# Eval thetas
thetas_321 = eval_thetas(thetas_theoretical, thetas_inf, n_docs_inf)

10000


Sampling: [20/20]: : 20it [01:02,  3.13s/it]


Difference in evaluation of doc similarity:
[38;5;2m967.6032233903243[0m


#### 2.2.2. Inference on the just in one node approach with validation corpus generated with different GP

In [40]:
# Define inference corpus
corpus_inf = documents_all2[0][0:n_docs]
print(len(corpus_inf))

# Get inferred thetas
docs_val_conv = [" ".join(corpus_inf[i]) for i in np.arange(len(corpus_inf))]
val_bow = cv.transform(docs_val_conv)
val_bow = val_bow.toarray()
val_data = BOWDataset(val_bow, idx2token)

thetas_inf = np.asarray(avitm.get_doc_topic_distribution(val_data))
thetas_inf[thetas_inf < 3e-3] = 0
thetas_inf = normalize(thetas_inf, axis=1, norm='l1')

thetas_theoretical = doc_topics_all2[0]

# Eval thetas
thetas_322 = eval_thetas(thetas_theoretical, thetas_inf, n_docs_inf)

10000


Sampling: [20/20]: : 20it [01:02,  3.11s/it]


Difference in evaluation of doc similarity:
[38;5;2m974.0166815560478[0m


In [41]:
print("_"*115)
print(f"{'Mallet-Centr':>50}\
        {'Mallet-Node0':>8}\
        {'ProdLDA-Centr':>6}\
        {'ProdLDA-Node0':>10}"
     )
print("-"*115)
print(f"{'Nr topics correctly evaluated:':<30}\
        {betas_21:>10.3f}\
        {betas_22:>11.3f}\
        {betas_31:>13.3f}\
        {betas_32:>13.3f}"
     )
print(f"{'Difference in doc sim:':<30}\
        {thetas_21:>10.3f}\
        {thetas_22:>11.3f}\
        {thetas_31:>13.3f}\
        {thetas_32:>13.3f}"
     )
print(f"{'Difference in doc sim inf1:':<30}\
        {thetas_211:>10.3f}\
        {thetas_221:>11.3f}\
        {thetas_311:>13.3f}\
        {thetas_321:>13.3f}"
     )
print(f"{'Difference in doc sim inf2:':<30}\
        {thetas_212:>10.3f}\
        {thetas_222:>11.3f}\
        {thetas_312:>13.3f}\
        {thetas_322:>13.3f}"
     )
print("_"*115)
print()

print("_"*100)
print(f"{'Stats':<15}\
        {'Nr nodes':>8}\
        {'Nr topics':>8}\
        {'Alpha':>8}\
        {'Beta':>8}\
        {'Ndocs':>8}"
     )
print("-"*100)
print(f"{'':>15}\
        {n_nodes:>8}\
        {n_topics:>8}\
        {alpha:>8}\
        {beta:>8}\
        {n_docs:>8}"
     )
print("_"*100)

___________________________________________________________________________________________________________________
                                      Mallet-Centr        Mallet-Node0        ProdLDA-Centr        ProdLDA-Node0
-------------------------------------------------------------------------------------------------------------------
Nr topics correctly evaluated:            37.810             16.860                8.826                7.464
Difference in doc sim:                  3904.074           2401.504              728.768              654.001
Difference in doc sim inf1:             3931.210           2432.692              928.123              967.603
Difference in doc sim inf2:             3919.637           2422.072              938.585              974.017
___________________________________________________________________________________________________________________

__________________________________________________________________________________________________