In [5]:
import numpy as np
import pandas as pd
import zipfile as zp
from pathlib import Path
from gensim.utils import check_output
from sklearn.preprocessing import normalize
from scipy.special import softmax
import shutil
from subprocess import check_output
import torch
from tqdm import tqdm
import colored
import itertools
import matplotlib.pyplot as plt
import os
import pickle

In [6]:
def printgr(text):
    """Print ``text`` to stdout rendered in green.

    The value is styled with ``colored.stylize`` using the ``'green'``
    foreground before being handed to ``print``.
    """
    green = colored.fg('green')
    styled = colored.stylize(text, green)
    print(styled)

In [7]:
cd /export/usuarios_ml4ds/lbartolome/2hNTM

/export/usuarios_ml4ds/lbartolome/2hNTM


In [8]:
ls

LICENSE  README.md  config.cf.default  [0m[01;34mdata[0m/  main.py  [00;32mrequirements.txt[0m  [01;34msrc[0m/


In [9]:
# Absolute location of the project's output directory on the author's server.
# NOTE(review): this name is not referenced again in the visible cells; the
# evaluation section re-declares the same location as `path_output`.
output_path = Path("/export/usuarios_ml4ds/lbartolome/2hNTM/data/output")

## **0. Auxiliary functions**

---

In [11]:
def unpickler(file: str):
    """Load and return the object stored in a pickle file.

    Args:
        file: Path of the pickle file; it is opened in binary read mode.

    Returns:
        The object deserialized by ``pickle.load``.

    Note:
        Unpickling can execute arbitrary code, so only use this on files
        that come from a trusted source.
    """
    with open(file, mode='rb') as stream:
        payload = pickle.load(stream)
    return payload

In [12]:
def rotateArray(arr, n, d):
    """Rotate the first ``n`` items of the list ``arr`` left by ``d``, in place.

    After the call ``arr`` holds ``arr[d:n] + arr[:d]`` (computed on the
    original contents); any items beyond position ``n`` are dropped, which
    matches the historical behavior of this helper.

    Args:
        arr: List to rotate. It is modified in place (slice assignment).
        n:   Number of leading items taking part in the rotation; values
             larger than ``len(arr)`` are clamped to ``len(arr)``.
        d:   Number of positions to rotate to the left. It is reduced
             modulo ``n``, so values outside ``[0, n)`` (including negative
             ones) are accepted instead of raising ``IndexError``.

    Returns:
        The same list object ``arr``, after rotation.
    """
    n = min(n, len(arr))
    if n <= 0:
        # Nothing takes part in the rotation: the result is an empty list.
        arr[:] = []
        return arr

    d %= n
    # Build the rotated copy first, then overwrite the list contents so that
    # every existing reference to `arr` observes the rotation.
    arr[:] = arr[d:n] + arr[:d]
    return arr

In [13]:
def generateSynthetic(just_inf, gen_docs, vocab_size, n_topics, beta, alpha, n_docs,
                      n_docs_inf, n_docs_global_inf, nwords, alg, n_nodes,
                      frozen_topics, prior_frozen, own_topics, prior_nofrozen):
    """Generate a synthetic multi-node topic-model corpus.

    Args:
        just_inf (bool):  If True, ``n_docs_global_inf`` documents are drawn
                          per node; otherwise ``n_docs + n_docs_inf``.
        gen_docs (bool):  If False, only topics and document-topic
                          proportions are sampled (no words are generated).
        vocab_size (int): Number of words in the vocabulary.
        n_topics (int):   Number of topics. It should equal
                          ``len(prior_frozen) + len(prior_nofrozen)`` because
                          sampled topic ids index ``topic_vectors``.
        beta (float):     Symmetric Dirichlet concentration of the topics.
        alpha:            Not read by this function (the priors are passed
                          explicitly); kept for interface compatibility.
        n_docs (int):     See ``just_inf``.
        n_docs_inf (int): See ``just_inf``.
        n_docs_global_inf (int): See ``just_inf``.
        nwords (tuple):   ``(low, high)`` handed to ``np.random.randint`` to
                          draw each document length (``high`` is exclusive).
        alg (str):        ``"lda"`` samples words from the topic's word
                          distribution; any other value takes the
                          ProdLDA-style branch.
        n_nodes (int):    Number of nodes.
        frozen_topics:    Not read by this function; kept for interface
                          compatibility.
        prior_frozen (list):   Dirichlet prior of the topics shared by all
                               nodes.
        own_topics (int):      Number of positions by which the non-shared
                               prior is rotated from one node to the next.
        prior_nofrozen (list): Dirichlet prior of the non-shared topics for
                               the first node. The caller's list is left
                               untouched.

    Returns:
        tuple: ``(topic_vectors, doc_topics_all, documents_all)`` where
        ``topic_vectors`` is the array returned by ``np.random.dirichlet``
        with one row per topic, ``doc_topics_all`` is a list with one
        document-topic array per node, and ``documents_all`` is a list with
        one list of documents per node (empty when ``gen_docs`` is False).
        Each document is a list of ``'wd<index>'`` strings.
    """
    n_total_docs = n_docs_global_inf if just_inf else n_docs + n_docs_inf

    # Work on private copies: the priors usually live inside a settings dict
    # that is reused for later calls, so they must not be modified here.
    prior_frozen = list(prior_frozen)
    prior_nofrozen = list(prior_nofrozen)

    # Step 1 - generation of topics
    topic_vectors = np.random.dirichlet(vocab_size*[beta], n_topics)

    # Step 2 - generation of document topic proportions
    doc_topics_all = []
    for _ in range(n_nodes):
        doc_topics_all.append(
            np.random.dirichlet(prior_frozen + prior_nofrozen, n_total_docs))
        # Non-mutating left rotation by `own_topics` (same result rotateArray
        # gives for a full-length rotation) so the next node gets its own
        # block of high-prior topics.
        prior_nofrozen = prior_nofrozen[own_topics:] + prior_nofrozen[:own_topics]

    # Step 3 - Document generation
    documents_all = []
    if gen_docs:
        for i in range(n_nodes):
            print("Generating document words for node ", str(i))
            documents = []  # Document words
            for docid in tqdm(np.arange(n_total_docs)):
                theta = doc_topics_all[i][docid]
                doc_len = np.random.randint(low=nwords[0], high=nwords[1])
                this_doc_words = []
                for _ in range(doc_len):
                    # One-hot multinomial draw -> index of the sampled topic
                    tpc = np.nonzero(np.random.multinomial(1, theta))[0][0]
                    if alg == "lda":
                        word = np.nonzero(np.random.multinomial(1, topic_vectors[tpc]))[0][0]
                    else:  # prodlda
                        # Word weights are the topic's word probabilities
                        # raised to this document's proportion of the topic;
                        # they are passed to torch.multinomial as-is.
                        pval = np.power(topic_vectors[tpc], theta[tpc])
                        weights = torch.tensor(pval, dtype=torch.float)
                        word = torch.multinomial(weights, 1).numpy()[0]
                    this_doc_words.append('wd'+str(word))
                documents.append(this_doc_words)
            documents_all.append(documents)

    return topic_vectors, doc_topics_all, documents_all

In [14]:
def convert_topic_word_to_init_size(vocab_size, model, model_type,
                                    ntopics, id2token, all_words):
    """Expand a trained topic-word matrix to the full synthetic vocabulary.

    The topic-word distribution obtained from ``model`` only covers the words
    present in the training corpus. This function places each of those
    columns at the position the word has in ``all_words`` and leaves zeros
    for the words the model never saw, so the result can be compared against
    the original (generating) topic-word matrix of a synthetic dataset.

    Args:
        * vocab_size (int):   Size of the synthetic data vocabulary (number
                              of columns of the returned matrix).
        * model (AVITM/CTM):  Model whose topic-word matrix is being
                              transformed; it must provide
                              ``get_topic_word_distribution()``.
        * model_type (str):   Type of the trained model. Only ``"avitm"`` is
                              supported.
        * ntopics (int):      Number of topics of the trained model.
        * id2token (dict):    Mapping from column index in the model's
                              topic-word matrix to the word it represents
                              (iterated with ``.items()``).
        * all_words (List[str]): All the words of the vocabulary of size
                              vocab_size; a word's position is its column in
                              the returned matrix (first occurrence wins).

    Returns:
        * ndarray: L1-row-normalized transformed topic-word distribution of
          shape ``(ntopics, vocab_size)``, or ``None`` when ``model_type`` is
          not supported.
    """
    if model_type != "avitm":
        print("Method not impleemnted for the selected model type")
        return None

    wd = model.get_topic_word_distribution()

    # Word -> column lookup built once, instead of scanning `all_words` for
    # every (topic, token) pair. `setdefault` keeps the first occurrence,
    # which is what the previous linear search (with `break`) selected.
    column_of = {}
    for position, word in enumerate(all_words):
        column_of.setdefault(word, position)

    w_t_distrib = np.zeros((ntopics, vocab_size), dtype=np.float64)
    for idx, word in id2token.items():
        column = column_of.get(word)
        if column is None:
            # Word unknown to the reference vocabulary: nothing to copy.
            continue
        for topic in range(ntopics):
            w_t_distrib[topic, column] = wd[topic][idx]

    return normalize(w_t_distrib, axis=1, norm='l1')

In [15]:
def eval_betas(beta, topic_vectors):
    """Score how well the rows of ``beta`` match the rows of ``topic_vectors``.

    An affinity matrix is built as ``sqrt(beta) . sqrt(topic_vectors.T)``
    (for rows that are probability distributions each entry is the
    Bhattacharyya coefficient of a pair of rows). For every row of
    ``topic_vectors`` the best affinity over the rows of ``beta`` is kept,
    and those maxima are added up.

    The score is printed in green and returned.
    """
    print('Tópicos (equivalentes) evaluados correctamente:')
    root_beta = np.sqrt(beta)
    root_reference = np.sqrt(topic_vectors.T)
    affinity = root_beta.dot(root_reference)
    best_match = np.max(affinity, axis=0)
    score = np.sum(best_match)
    printgr(score)
    return score

In [16]:
def eval_thetas(thetas_theoretical, thetas_actual, n_docs):
    """Compare the document-similarity structure of two doc-topic matrices.

    For each input a document-by-document matrix is computed as
    ``sqrt(thetas) . sqrt(thetas.T)``. The score is the sum of the absolute
    entry-wise differences between both matrices divided by ``n_docs``.

    The score is printed in green and returned.
    """
    def _pairwise_affinity(thetas):
        # Document-by-document affinity of the rows of `thetas`.
        return np.sqrt(thetas).dot(np.sqrt(thetas.T))

    reference_sim = _pairwise_affinity(thetas_theoretical)
    observed_sim = _pairwise_affinity(thetas_actual)
    print('Difference in evaluation of doc similarity:')
    total_gap = np.sum(np.abs(reference_sim - observed_sim))
    score = total_gap/n_docs
    printgr(score)
    return score

In [25]:
import torch
from torch.utils.data import Dataset


class BOWDataset(Dataset):
    """Torch ``Dataset`` wrapping a bag-of-words document-term matrix."""

    def __init__(self, X, idx2token, cv):
        """
        Store the data backing the dataset.

        Parameters
        ----------
        X : array-like, shape=(n_samples, n_features)
            Document-term matrix; must support ``len()`` and row indexing.
        idx2token : list
            A list of feature names (stored as given).
        cv : object
            Stored as given. In this notebook it is the object whose
            ``transform`` produced ``X`` (see ``prepare_hold_out_dataset``).
        """
        self.X, self.idx2token, self.cv = X, idx2token, cv

    def __len__(self):
        """Number of samples, i.e. ``len`` of the document-term matrix."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, i):
        """Return sample ``i`` as ``{'X': FloatTensor}``."""
        row = self.X[i]
        return dict(X=torch.FloatTensor(row))


In [None]:
def prepare_hold_out_dataset(hold_out_corpus, cv, idx2token):
    """Turn a tokenized hold-out corpus into a ``BOWDataset``.

    Args:
        hold_out_corpus: Sequence of documents, each a sequence of string
            tokens; every document is joined with single spaces.
        cv: Object whose ``transform`` maps the joined documents to a matrix
            exposing ``toarray()`` (presumably the vectorizer fitted at
            training time - confirm against the model code).
        idx2token: Feature names, forwarded unchanged to ``BOWDataset``.

    Returns:
        BOWDataset: Dataset built on the dense bag-of-words matrix.
    """
    joined_docs = [" ".join(tokens) for tokens in hold_out_corpus]
    dense_bow = cv.transform(joined_docs).toarray()
    return BOWDataset(dense_bow, idx2token, cv)

## **1. Generation of documents**

---

In [17]:
# Number of nodes: generateSynthetic draws one set of document-topic
# proportions (and, optionally, one document collection) per node.
n_nodes = 5

In [18]:
# Topic modeling settings (forwarded to generateSynthetic as keyword arguments)
vocab_size = 5000            # number of distinct words
n_topics = 50                # number of topics
beta = 1e-2                  # Dirichlet concentration of the topic-word distributions
alpha = 5/n_topics           # base Dirichlet concentration of the doc-topic priors
n_docs = 1000                # documents per node
n_docs_inf = 1000            # additional documents per node
n_docs_global_inf = 1000     # alternative considered: int(n_docs / n_nodes)
nwords = (150, 250)          # Min and max lengths of the documents
alg = "lda"                  # alternative: "prod"

tm_settings = dict(
    vocab_size=vocab_size,
    n_topics=n_topics,
    beta=beta,
    alpha=alpha,
    n_docs=n_docs,
    n_docs_inf=n_docs_inf,
    n_docs_global_inf=n_docs_global_inf,
    nwords=nwords,
    alg=alg,
)

In [19]:
# Centralized settings

# Topics shared by every node, all with the base concentration `alpha`.
frozen_topics = 5
prior_frozen = [alpha] * frozen_topics

# The remaining topics are split evenly among the nodes. In the prior below
# the first `own_topics` entries get `alpha` and every other non-shared topic
# gets a concentration 10000 times smaller.
own_topics = int((n_topics-frozen_topics)/n_nodes)
prior_nofrozen = (
    [alpha] * own_topics
    + [alpha/10000] * (n_topics-frozen_topics-own_topics)
)

centralized_settings = dict(
    n_nodes=n_nodes,
    frozen_topics=frozen_topics,
    prior_frozen=prior_frozen,
    own_topics=own_topics,
    prior_nofrozen=prior_nofrozen,
)

In [20]:
# Generate documents
# Seed both random generators used by generateSynthetic (numpy's global
# generator, and torch for the non-"lda" branch) so that a fresh
# "Restart & Run All" reproduces the same synthetic corpus.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# just_inf=False, gen_docs=True: draw n_docs + n_docs_inf documents per node
# and generate their words.
topic_vectors, doc_topics_all, documents_all = generateSynthetic(False, True, **tm_settings, **centralized_settings)

Generating document words for node  0


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:48<00:00, 41.06it/s]


Generating document words for node  1


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:48<00:00, 41.12it/s]


Generating document words for node  2


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:49<00:00, 40.51it/s]


Generating document words for node  3


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:49<00:00, 40.77it/s]


Generating document words for node  4


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:49<00:00, 40.69it/s]


In [21]:
# Label attached to the documents of each node, in node order.
fields_of_study = [
    "computer_science",
    "economics",
    "sociology",
    "philosophy",
    "political_science",
]

if len(documents_all) > len(fields_of_study):
    # Fail loudly instead of silently leaving some nodes out of the dataset.
    raise ValueError(
        "documents_all has more nodes than labels in fields_of_study")

# One frame per node holding its first `n_docs` documents as
# whitespace-joined text plus the node's label.
node_frames = []
for node_docs, field in zip(documents_all, fields_of_study):
    docs = [' '.join(doc) for doc in node_docs[0:n_docs]]
    node_frames.append(
        pd.DataFrame({"bow_text": docs, "fieldsOfStudy": [field] * len(docs)}))

# As before, every node keeps its own 0-based index in the concatenation.
df = pd.concat(node_frames)
df

Unnamed: 0,bow_text,fieldsOfStudy
0,wd4385 wd4657 wd1715 wd4656 wd4827 wd3446 wd26...,computer_science
1,wd4110 wd205 wd4781 wd3873 wd559 wd2228 wd559 ...,computer_science
2,wd2621 wd2717 wd3522 wd1996 wd1110 wd1294 wd94...,computer_science
3,wd3304 wd4907 wd1543 wd3278 wd3388 wd665 wd366...,computer_science
4,wd963 wd4973 wd2901 wd1363 wd2980 wd2980 wd413...,computer_science
...,...,...
995,wd378 wd1803 wd1043 wd3308 wd3432 wd3357 wd360...,political_science
996,wd2374 wd1786 wd3060 wd2088 wd328 wd3849 wd253...,political_science
997,wd2730 wd4500 wd1326 wd3772 wd4569 wd4012 wd55...,political_science
998,wd3153 wd29 wd1188 wd2964 wd2911 wd205 wd3185 ...,political_science


In [24]:
# Generate inference corpus and its docs_topics
# Per node, the documents after the first `n_docs` (up to `n_docs_global_inf`
# of them) form the inference set; nodes are stacked in order.
inf_start, inf_stop = n_docs, n_docs+n_docs_global_inf

inf = []
for docs_node in documents_all:
    inf.extend(docs_node[inf_start:inf_stop])
print("Length of the inference corpus ", str(len(inf)))

# Matching document-topic proportions, stacked in the same node order.
inf_doc_topics = np.concatenate(
    [node_thetas[inf_start:inf_stop] for node_thetas in doc_topics_all])
print("Shape of inf_doc_topics", str(inf_doc_topics.shape))

Length of the inference corpus  5000
Shape of inf_doc_topics (5000, 50)


In [23]:
# Save training dataset in file
# The path is relative, so it is resolved against the current working
# directory (changed by the `cd` cell near the top of the notebook).
filepath = Path('data/training_data/out.csv')  
# Create the destination folder (and any missing parents) if needed.
filepath.parent.mkdir(parents=True, exist_ok=True)  
# The DataFrame index is written too, as the first (unnamed) CSV column.
df.to_csv(filepath)  

## **2. Evaluation**

---

In [40]:
path_output = Path('/export/usuarios_ml4ds/lbartolome/2hNTM/data/output')

# Ground-truth document-topic proportions of the inference corpus; they are
# the same for every evaluated model.
thetas_theoretical = inf_doc_topics

sim_thetas_nodes = []
sim_thetas_centralized = None
# Sorted so that the evaluation order does not depend on the file system.
for entry in sorted(path_output.iterdir()):
    path_model = entry.joinpath("modelFiles/model.pickle")
    if not path_model.is_file():
        # Stray file or folder without a trained model: nothing to evaluate.
        continue

    # Get model object to perform inference
    # NOTE: unpickling can execute arbitrary code; only load trusted models.
    avitm = unpickler(path_model)

    # Get inf corpus in avitm format
    ho_data = prepare_hold_out_dataset(
        inf, avitm.train_data.cv, avitm.train_data.idx2token)

    thetas_inf = np.asarray(avitm.get_doc_topic_distribution(ho_data))

    # Eval thetas
    thetas_sim = eval_thetas(thetas_theoretical, thetas_inf, len(thetas_inf))

    # Folders whose name starts with "node" hold the per-node models; any
    # other folder is taken as the second-level model.
    if entry.name.startswith("node"):
        sim_thetas_nodes.append(thetas_sim)
    else:
        sim_thetas_centralized = thetas_sim

# Average over the node models actually found (not over the configured
# number of nodes), so a missing model cannot silently bias the figure.
if sim_thetas_nodes:
    avg_sim_thetas_nodes = sum(sim_thetas_nodes)/len(sim_thetas_nodes)
else:
    avg_sim_thetas_nodes = float("nan")

# Baseline
# Baseline doc-topics generation: freshly sampled proportions, unrelated to
# the inference corpus. Stored under their own name so that the
# `topic_vectors` / `doc_topics_all` used to build the corpus are preserved.
_, baseline_doc_topics, _ = generateSynthetic(True, False, **tm_settings, **centralized_settings)
thetas_bas = np.concatenate(baseline_doc_topics)

thetas_baseline = eval_thetas(thetas_theoretical, thetas_bas, len(thetas_bas))

print("####################################################")
print("Averages of inferred thetas in nodes (level-1 models): ", avg_sim_thetas_nodes)
print("Inferred thetas with second level model: ", sim_thetas_centralized)
print("Baseline: ", thetas_baseline)

Sampling: [20/20]: : 20it [00:38,  1.91s/it]


[[0.10225347 0.08972973 0.11785859 ... 0.13537964 0.11281251 0.07142649]
 [0.08366234 0.10211106 0.03991462 ... 0.01728298 0.09018794 0.0590278 ]
 [0.07233718 0.09996512 0.10105768 ... 0.11944315 0.14564925 0.09656231]
 ...
 [0.11041382 0.14063583 0.08788011 ... 0.10886418 0.08803174 0.08655686]
 [0.10145532 0.07431257 0.09304321 ... 0.04613498 0.13449026 0.06671481]
 [0.11221697 0.10069423 0.08394611 ... 0.02899571 0.1286096  0.08313668]]
Difference in evaluation of doc similarity:
[38;5;2m4111.595927833973[0m


Sampling: [20/20]: : 20it [01:00,  3.05s/it]


[[0.1010623  0.08040764 0.10750438 ... 0.12043143 0.12487704 0.07974143]
 [0.1126292  0.1027429  0.10769147 ... 0.07497612 0.0860968  0.09462127]
 [0.08610949 0.07778461 0.08552249 ... 0.11447899 0.15131176 0.12126468]
 ...
 [0.20180932 0.0915784  0.10235601 ... 0.06723996 0.08419517 0.0881216 ]
 [0.18810379 0.06488642 0.10371054 ... 0.07440621 0.07720394 0.09226294]
 [0.16082843 0.06065461 0.10503435 ... 0.07018349 0.07126443 0.08907101]]
Difference in evaluation of doc similarity:
[38;5;2m4196.128134706327[0m


Sampling: [20/20]: : 20it [00:40,  2.00s/it]


[[0.04823256 0.17635389 0.08902908 ... 0.0459877  0.14566489 0.21372318]
 [0.05765652 0.05504364 0.10553588 ... 0.07237964 0.05265043 0.08393895]
 [0.04832333 0.08013556 0.14371341 ... 0.0861653  0.09775459 0.12772186]
 ...
 [0.10762317 0.06059394 0.08227239 ... 0.07313255 0.09156939 0.19718215]
 [0.05585344 0.08707504 0.19382078 ... 0.18452522 0.05765669 0.18174644]
 [0.0607813  0.03585553 0.09391747 ... 0.09457948 0.05914036 0.0804162 ]]
Difference in evaluation of doc similarity:
[38;5;2m4131.990892491831[0m


Sampling: [20/20]: : 20it [00:40,  2.01s/it]


[[0.0971758  0.10062097 0.09833107 ... 0.15578528 0.06597111 0.11631232]
 [0.0819947  0.05237578 0.06023697 ... 0.14115332 0.09656935 0.1012709 ]
 [0.09813085 0.09338076 0.13665884 ... 0.08573293 0.07618148 0.11354846]
 ...
 [0.09003014 0.11010802 0.09959604 ... 0.11332934 0.07839242 0.11658286]
 [0.10488924 0.12911632 0.11506834 ... 0.10177713 0.07389606 0.09492969]
 [0.08649923 0.08551789 0.04947664 ... 0.10318171 0.09594417 0.07254157]]
Difference in evaluation of doc similarity:
[38;5;2m4144.742258359256[0m


Sampling: [20/20]: : 20it [00:40,  2.03s/it]


[[0.09949548 0.10819748 0.13640513 ... 0.07729032 0.13251615 0.13443898]
 [0.10015063 0.21524402 0.05697452 ... 0.05780128 0.03372726 0.11331372]
 [0.0718631  0.11162189 0.09783238 ... 0.08127696 0.09580944 0.12422001]
 ...
 [0.11695234 0.09008134 0.07174911 ... 0.10950576 0.15367256 0.09825709]
 [0.09501    0.15423849 0.05818317 ... 0.07234916 0.15165965 0.1026371 ]
 [0.12493182 0.13098422 0.06291202 ... 0.09708492 0.05506542 0.07703925]]
Difference in evaluation of doc similarity:
[38;5;2m4139.749366920617[0m


Sampling: [20/20]: : 20it [00:40,  2.03s/it]


[[0.08542518 0.07607844 0.09754066 ... 0.19401567 0.08479702 0.09454972]
 [0.03905668 0.02452578 0.02728593 ... 0.32572629 0.22696612 0.05531969]
 [0.08506558 0.06716078 0.09140845 ... 0.09611458 0.25881431 0.07281016]
 ...
 [0.08194279 0.05790422 0.05936885 ... 0.16132596 0.13182935 0.13840428]
 [0.10423137 0.07257192 0.06151288 ... 0.12071347 0.11253575 0.16314245]
 [0.07001191 0.04233691 0.05205844 ... 0.29163649 0.14413459 0.06231043]]
Difference in evaluation of doc similarity:
[38;5;2m4157.72458740711[0m
Difference in evaluation of doc similarity:
[38;5;2m761.4294931685141[0m
####################################################
Averages of inferred thetas in nodes (level-1 models):  4137.160606602558
Inferred thetas with second level model:  4196.128134706327
Baseline:  761.4294931685141
