In [None]:
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from scipy.special import softmax
import multiprocessing as mp
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch
from torch.nn import functional as F
from sklearn.preprocessing import normalize
import gensim
from gensim.models.wrappers.ldamallet import malletmodel2ldamodel

In [None]:
import os #importing os to set environment variable

def install_java():
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64" #set environment variab
  !java -version #check java version

install_java()

openjdk version "11.0.13" 2021-10-19
OpenJDK Runtime Environment (build 11.0.13+8-Ubuntu-0ubuntu1.18.04)
OpenJDK 64-Bit Server VM (build 11.0.13+8-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)


In [None]:
%%capture
!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip #Mallet download
!unzip mallet-2.0.8.zip

In [None]:
os.environ['MALLET_HOME'] = '/content/mallet-2.0.8' #set environment variable MALLET_HOME
mallet_path = '/content/mallet-2.0.8/bin/mallet'

# 1. Creation of synthetic corpus

We consider a scenario with n parties, each of them as an associated corpus.
To generate the corpus associated with each of the parties, we consider a common beta distribution (word-topic distribution), but we freeze different topics/ assign different asymmetric Dirichlet priors favoring different topics at the time of generating the document that composes each party's corpus.

## 1.1. Function for permuting the Dirichlet prior at each node

In [None]:
def rotateArray(arr, n, d):
    temp = []
    i = 0
    while (i < d):
        temp.append(arr[i])
        i = i + 1
    i = 0
    while (d < n):
        arr[i] = arr[d]
        i = i + 1
        d = d + 1
    arr[:] = arr[: i] + temp
    return arr

## 1.2. Topic modeling and node settings

In [None]:
# Topic modeling settings
vocab_size = 10000
n_topics = 10
beta = 1 #1e-2
alpha = 5/n_topics
n_docs = 1000
nwords = (150, 250) #Min and max lengths of the documents

# Nodes settings
n_nodes = 5
frozen_topics = 3
dirichlet_symmetric = False
prior = (n_topics)*[0.9]
prior[0] = prior[1] = prior[2] = 0.1
print(prior)

[0.1, 0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9]


## 1.3. Topics generation (common for all nodes)

In [None]:
topic_vectors = np.random.dirichlet(vocab_size*[beta], n_topics)
print('Ordered probabilities for the first topic vector:')
print(np.sort(topic_vectors[0])[::-1])
print(topic_vectors.shape)

Ordered probabilities for the first topic vector:
[9.97038953e-04 8.81063162e-04 8.29019050e-04 ... 1.74076137e-08
 1.10299467e-08 6.78132215e-09]
(10, 10000)


## 1.4. Generation of document topic proportions and documents for each node


In [None]:
doc_topics_all_gt = []
documents_all = []
z_all = []
for i in np.arange(n_nodes):
  # Step 2 - generation of document topic proportions for each node
  if dirichlet_symmetric:
    doc_topics = np.random.dirichlet((n_topics)*[alpha], n_docs)
  else:
    doc_topics = np.random.dirichlet(prior, n_docs)
    prior = rotateArray(prior, len(prior), 3)
    print(prior)
  print('Ordered probabilities for the first document - node', str(i), ':')
  print(np.sort(doc_topics[0])[::-1])
  doc_topics_all_gt.append(doc_topics)
  # Step 3 - Document generation
  documents = [] # Document words
  z = [] # Assignments
  for docid in np.arange(n_docs):
      doc_len = np.random.randint(low=nwords[0], high=nwords[1])
      this_doc_words = []
      this_doc_assigns = []
      for wd_idx in np.arange(doc_len):
          tpc = np.nonzero(np.random.multinomial(1, doc_topics[docid]))[0][0]
          this_doc_assigns.append(tpc)
          word = np.nonzero(np.random.multinomial(1, topic_vectors[tpc]))[0][0]
          this_doc_words.append('wd'+str(word))
      z.append(this_doc_assigns)
      documents.append(this_doc_words)
  print("Documents of node", str(i), "generated.")
  documents_all.append(documents)
  z_all.append(z)

[0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.1, 0.1, 0.1]
Ordered probabilities for the first document - node 0 :
[2.29261975e-01 2.11470344e-01 2.09714483e-01 1.35515211e-01
 1.34462622e-01 6.49942691e-02 1.42732233e-02 2.84482430e-04
 2.33905934e-05 1.06571932e-09]
Documents of node 0 generated.
[0.9, 0.9, 0.9, 0.9, 0.1, 0.1, 0.1, 0.9, 0.9, 0.9]
Ordered probabilities for the first document - node 1 :
[4.61419792e-01 2.81965118e-01 1.49825846e-01 5.25369268e-02
 4.18029231e-02 6.30235985e-03 5.51213409e-03 3.16427089e-04
 3.10018985e-04 8.45417846e-06]
Documents of node 1 generated.
[0.9, 0.1, 0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9]
Ordered probabilities for the first document - node 2 :
[4.55968488e-01 2.00659356e-01 1.10106517e-01 1.09342398e-01
 7.59038610e-02 3.96122953e-02 4.26871027e-03 3.99939443e-03
 1.38979970e-04 1.12917064e-17]
Documents of node 2 generated.
[0.1, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.1, 0.1]
Ordered probabilities for the first document - node 3 :
[3.07623329e-01 2

In [None]:
doc_topics_all_gt[0].shape

(1000, 10)

In [None]:
all_words = []
for word in np.arange(vocab_size+1):
  if word > 0:
    all_words.append('wd'+str(word))
print(all_words)

['wd1', 'wd2', 'wd3', 'wd4', 'wd5', 'wd6', 'wd7', 'wd8', 'wd9', 'wd10', 'wd11', 'wd12', 'wd13', 'wd14', 'wd15', 'wd16', 'wd17', 'wd18', 'wd19', 'wd20', 'wd21', 'wd22', 'wd23', 'wd24', 'wd25', 'wd26', 'wd27', 'wd28', 'wd29', 'wd30', 'wd31', 'wd32', 'wd33', 'wd34', 'wd35', 'wd36', 'wd37', 'wd38', 'wd39', 'wd40', 'wd41', 'wd42', 'wd43', 'wd44', 'wd45', 'wd46', 'wd47', 'wd48', 'wd49', 'wd50', 'wd51', 'wd52', 'wd53', 'wd54', 'wd55', 'wd56', 'wd57', 'wd58', 'wd59', 'wd60', 'wd61', 'wd62', 'wd63', 'wd64', 'wd65', 'wd66', 'wd67', 'wd68', 'wd69', 'wd70', 'wd71', 'wd72', 'wd73', 'wd74', 'wd75', 'wd76', 'wd77', 'wd78', 'wd79', 'wd80', 'wd81', 'wd82', 'wd83', 'wd84', 'wd85', 'wd86', 'wd87', 'wd88', 'wd89', 'wd90', 'wd91', 'wd92', 'wd93', 'wd94', 'wd95', 'wd96', 'wd97', 'wd98', 'wd99', 'wd100', 'wd101', 'wd102', 'wd103', 'wd104', 'wd105', 'wd106', 'wd107', 'wd108', 'wd109', 'wd110', 'wd111', 'wd112', 'wd113', 'wd114', 'wd115', 'wd116', 'wd117', 'wd118', 'wd119', 'wd120', 'wd121', 'wd122', 'wd123', 

# 2. Creation of Mallet LDA models at each node

In [None]:
def train_lda_mallet_model(corpus, n_topics, alpha):
  # Create and filter dictionary
  D = gensim.corpora.Dictionary(corpus) # Created from all tokens
  len_D_orig = len(D)
  no_below = 5 # Minimum number of documents in which a term must appear to keep this term in the dictionary
  no_above = 0.5 # Maximum proportion of documents in which a term can appear to be kept in the dictionary
  D.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
  print("Terms in the dictionary filtered from", len_D_orig, "to", len(D))
  # Build BoW
  corpus_bow = [D.doc2bow(doc) for doc in corpus]
  # Create LDA Mallet model
  lda_mallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus_bow, id2word=D, num_topics=n_topics, alpha=alpha, iterations=1000) 
  return lda_mallet, corpus_bow

In [None]:
def convert_beta_to_init_vocab(topic_words_lda):
  w_t_distrib = np.zeros((n_topics,vocab_size), dtype=np.float64) 
  for tpc in np.arange(n_topics):
    tpc_words_distrib = lda_gensim.show_topic(tpc, topic_words_lda.shape[1])
    for idx in np.arange(len(tpc_words_distrib)):
      for j in np.arange(len(all_words)):
        word = tpc_words_distrib[idx][0]
        prob_word = tpc_words_distrib[idx][1]
        if all_words[j] == word:
          w_t_distrib[tpc,j] = prob_word
          break
  return w_t_distrib

In [None]:
n_topics_train = 10
alpha_train = 5/n_topics_train
lda_mallet_models = []
lda_gensim_models = []
corpus_bows = []
doc_topic_all = []
word_topic_all = []
for node in np.arange(n_nodes):
  lda_mallet, corpus_bow = train_lda_mallet_model(documents_all[node], n_topics_train, alpha_train)
  lda_mallet_models.append(lda_mallet)
  corpus_bows.append(corpus_bow)

  lda_gensim = malletmodel2ldamodel(lda_mallet)
  lda_gensim_models.append(lda_gensim)

  doc_topics_lda = [el for el in lda_gensim[corpus_bow[:]]] 
  doc_topics_lda = gensim.matutils.corpus2dense(doc_topics_lda, n_topics_train).T
  doc_topic_all.append(doc_topics_lda)

  topic_words_lda = lda_gensim.get_topics()
  converted_dim_topic_words_lda = convert_beta_to_init_vocab(topic_words_lda)
  word_topic_all.append(converted_dim_topic_words_lda)

Terms in the dictionary filtered from 10000 to 9904


  result = np.column_stack(sparse2full(doc, num_terms) for doc in corpus)


Terms in the dictionary filtered from 10000 to 9920
Terms in the dictionary filtered from 9999 to 9918
Terms in the dictionary filtered from 10000 to 9935
Terms in the dictionary filtered from 9998 to 9939


# 3. Creation of the centralized model

In [None]:
documents_centr = [*documents_all[0], *documents_all[1], *documents_all[2], *documents_all[3], *documents_all[4]]
len(documents_centr)

5000

In [None]:
lda_mallet_centr, corpus_bow_centr = train_lda_mallet_model(documents_centr, n_topics_train, alpha_train)
lda_gensim_centr = malletmodel2ldamodel(lda_mallet_centr)

doc_topics_centr = [el for el in lda_gensim_centr[corpus_bow_centr[:]]] 
doc_topics_centr = gensim.matutils.corpus2dense(doc_topics_centr, n_topics_train).T

topic_words_before_centr = lda_gensim_centr.get_topics()
topic_words_centr = convert_beta_to_init_vocab(topic_words_before_centr)

Terms in the dictionary filtered from 10000 to 10000


  result = np.column_stack(sparse2full(doc, num_terms) for doc in corpus)


# 4. Get similarity through Frobenius

In [None]:
doc_topic_centr_all = []
doc_topic_centr_all.append(doc_topics_centr[0:1000,:])
doc_topic_centr_all.append(doc_topics_centr[1000:2000,:])
doc_topic_centr_all.append(doc_topics_centr[2000:3000,:])
doc_topic_centr_all.append(doc_topics_centr[3000:4000,:])
doc_topic_centr_all.append(doc_topics_centr[4000:5000,:])

In [None]:
for node in np.arange(n_nodes):
  # Ground truth in node vs inferred in node
  doc_topics_lda_sqrt_node = np.sqrt(doc_topic_all[node])
  similarity_lda_node = doc_topics_lda_sqrt_node.dot(doc_topics_lda_sqrt_node.T)

  doc_topics_gt_sqrt_node = np.sqrt(doc_topics_all_gt[node])
  similarity_gt = doc_topics_gt_sqrt_node.dot(doc_topics_gt_sqrt_node.T)

  diff_sims = similarity_lda_node - similarity_gt
  frobenius_diff_sims_node = np.linalg.norm(diff_sims,'fro')

  # Ground truth in node vs centralized (for documents of such a node)
  doc_topics_lda_sqrt_centr_node = np.sqrt(doc_topic_centr_all[node])
  similarity_lda_centr = doc_topics_lda_sqrt_centr_node.dot(doc_topics_lda_sqrt_centr_node.T)

  diff_sims = similarity_lda_centr - similarity_gt
  frobenius_diff_sims_avg = np.linalg.norm(diff_sims,'fro')

  print("NODE", str(node))
  print("GT vs inferred in node:", frobenius_diff_sims_node)
  print("GT vs centralized in node", frobenius_diff_sims_avg)
  print("***************************************************************")

NODE 0
GT vs inferred in node: 143.2911516249535
GT vs centralized in node 233.46675161536274
***************************************************************
NODE 1
GT vs inferred in node: 142.7989499767236
GT vs centralized in node 232.92920094859073
***************************************************************
NODE 2
GT vs inferred in node: 147.02617525319747
GT vs centralized in node 233.89877523459222
***************************************************************
NODE 3
GT vs inferred in node: 144.00207782812947
GT vs centralized in node 231.82537092439395
***************************************************************
NODE 4
GT vs inferred in node: 139.47133394147986
GT vs centralized in node 228.18312396662668
***************************************************************


In [None]:
print(word_topic_all[0].shape)

(10, 10000)


In [None]:
for node in np.arange(n_nodes):
  # GT vs inferred in node
  print(node)
  topic_words_gt_sqrt = np.sqrt(topic_vectors)
  topic_words_lda_node_sqrt = np.sqrt(word_topic_all[node])
  simmat_t_w = topic_words_gt_sqrt.dot(topic_words_lda_node_sqrt.T)

  simmat_t_w_pd = pd.DataFrame(simmat_t_w)
  maxValues_rows = simmat_t_w_pd.max(axis = 1)
  max_values_rows_sum = maxValues_rows.sum()

  print("NODE", str(node))
  print("Original vs inferred in node sum max row:", max_values_rows_sum)
  print("***************************************************************")

# GT vs centralized
topic_words_lda_centr_sqrt = np.sqrt(topic_words_centr)
simmat_t_w = topic_words_gt_sqrt.dot(topic_words_lda_centr_sqrt.T)

simmat_t_w_pd = pd.DataFrame(simmat_t_w)
maxValues_rows = simmat_t_w_pd.max(axis = 1)
max_values_rows_sum_centr = maxValues_rows.sum()

print("CENTRALIZED")
print("Original vs avg of inferred in nodes sum max row", max_values_rows_sum_centr)

0
NODE 0
Original vs inferred in node sum max row: 8.814833845463367
***************************************************************
1
NODE 1
Original vs inferred in node sum max row: 8.82388545054106
***************************************************************
2
NODE 2
Original vs inferred in node sum max row: 8.819747396645102
***************************************************************
3
NODE 3
Original vs inferred in node sum max row: 8.827138968908683
***************************************************************
4
NODE 4
Original vs inferred in node sum max row: 8.830822617397017
***************************************************************
CENTRALIZED
Original vs avg of inferred in nodes sum max row 8.830822617397017
