-
Notifications
You must be signed in to change notification settings - Fork 0
/
lda_matching.py
301 lines (242 loc) · 12.8 KB
/
lda_matching.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
"""main file to run the lda matching"""
# -*- coding: utf-8 -*-
# !/usr/bin/env python3
import time
import socket
import datetime
import argparse
import os
from pprint import pprint
import torch
import torch.nn as nn
import numpy as np
import gensim
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary
from matplotlib import pyplot as plt
from utils.network import KLDivLoss
from utils.lda_matcher import LdaMatcher
LDA_PATH = "./models/"
DATA_PATH = "./data/wiki_data.tar" # the path of the data on which the lda should be tested
PLOT_PATH = "./plots/"
NUM_TOPICS = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
def plot_difference(mdiff: np.ndarray, num_topics: int) -> None:
    """Plot and save the topic-difference (confusion) matrix between two LDA models.

    :param mdiff: square matrix of pairwise topic distances between the two models
    :param num_topics: number of topics used to train the two LDA models
    :return: None (the figure is saved under PLOT_PATH)
    """
    # this runs before save_results(), so PLOT_PATH may not exist yet;
    # exist_ok avoids a check-then-create race
    os.makedirs(PLOT_PATH, exist_ok=True)
    fig, axs = plt.subplots(figsize=(18, 14))
    data = axs.imshow(mdiff, cmap="RdBu_r", origin="lower")
    plt.title(f"Topic Difference Matrix for {num_topics} topics")
    plt.colorbar(data)
    plt.savefig(PLOT_PATH+f"difference_matrix_{num_topics}.png")
    # close the figure: this is called once per entry in NUM_TOPICS and
    # unclosed figures accumulate (matplotlib warns after 20 open figures)
    plt.close(fig)
def train_lda(num_topics: int, path_suffix: str) -> LdaMulticore:
    """Train an LDA model on the Wikipedia bag-of-words dump and persist it.

    :param num_topics: number of topics to train the LDA model with
    :param path_suffix: suffix appended to the save path of the LDA model
    :return: the trained LDA model (returned even if saving fails)
    """
    # Dictionary is already imported from gensim.corpora at file top
    dictionary = Dictionary.load_from_text("./data/wikipedia_dump/wiki_wordids.txt")
    bow_list = gensim.corpora.MmCorpus("./data/wikipedia_dump/wiki_bow.mm.bz2")
    # compute the lda model; eval_every=0 disables per-chunk perplexity evaluation
    ldamodel = LdaMulticore(bow_list, num_topics=num_topics,
                            id2word=dictionary,
                            passes=1, workers=os.cpu_count(),
                            eval_every=0)
    save_path = LDA_PATH + "matching_lda_model" + str(path_suffix)
    # exist_ok avoids the check-then-create race of isdir() followed by mkdir()
    os.makedirs(LDA_PATH, exist_ok=True)
    try:
        ldamodel.save(save_path)
    except Exception as exception_lda:
        # saving is best effort: a failed save must not abort the benchmark run
        print("[ could not save the lda model ]")
        print(exception_lda)
    return ldamodel
def _annotate_bars(axs, rects) -> None:
    """Write each bar's height (rounded to 2 decimals) just above the bar."""
    for rect in rects:
        height = rect.get_height()
        axs.annotate("{}".format(np.round(height, 2)),
                     xy=(rect.get_x() + rect.get_width() / 2, height),
                     xytext=(0, 3),  # 3 points vertical offset
                     textcoords="offset points",
                     ha="center", va="bottom")


def save_results(sim_dict: dict, diff_dict: dict) -> None:
    """helper function to visualize and save the resulting differences as an graph image

    :param sim_dict: dict of tensors with tuples of (NUM_TOPICS, similarites) -> cos sim.
    :param diff_dict: diction of tensors with tuples of (NUM_TOPICS, difference) -> KLDiv
    :return: None (the bar plot is saved under PLOT_PATH)
    """
    print("Average similarity per topic number: ")
    pprint(sim_dict)
    print("Average difference per topic number: ")
    pprint(diff_dict)
    os.makedirs(PLOT_PATH, exist_ok=True)
    plt.style.use("ggplot")
    fig, axs = plt.subplots()
    idx = np.arange(len(diff_dict))
    width = 0.35
    rects1 = axs.bar(idx - width/2, list(diff_dict.values()), width=width, label="KLDiv")
    rects2 = axs.bar(idx + width/2, list(sim_dict.values()), width=width, label="Cosine_Similarity")
    axs.set_xticks(idx)
    # scale the y-axis to the largest value across BOTH series so that neither
    # the KLDiv nor the similarity bars overflow the tick range
    y_max = max(max(diff_dict.values()), max(sim_dict.values()))
    axs.set_yticks(np.arange(0., float(y_max), 0.1))
    axs.set_xticklabels(list(diff_dict.keys()), rotation=85)
    _annotate_bars(axs, rects1)
    _annotate_bars(axs, rects2)
    axs.legend()
    axs.set_xlabel("# Topics")
    axs.set_ylabel("Difference & Similarity")
    fig.tight_layout()
    plt.savefig(PLOT_PATH+"difference_plot.png")
    # plt.show()
def _topic_prob_vector(lda: LdaMulticore, topic_id: int, vocab_size: int) -> torch.Tensor:
    """Build a dense word-probability vector for one topic of an LDA model.

    :param lda: the LDA model to read the topic-word distribution from
    :param topic_id: the topic whose distribution is extracted
    :param vocab_size: size of the vocabulary (length of the returned vector)
    :return: tensor of shape (vocab_size,) with the topic's word probabilities
    """
    # tiny epsilon keeps the KL divergence finite for zero-probability words
    probs = torch.zeros(vocab_size) + 10e-16
    for word_id, word_prob in lda.get_topic_terms(topicid=topic_id, topn=vocab_size):
        # assign the word probability to its ID position in the vector
        probs[word_id] = float(word_prob)
    return probs


def match_topics(lda1: LdaMulticore, lda2: LdaMulticore, topic_id: int, num_topics: int) -> int:
    """helper function to find, for a chosen topic of the first LDA, the most similar
    topic of the second LDA according to the KL divergence between their
    topic-word distributions.

    :param lda1: the first of the two LDA models
    :param lda2: the second LDA model
    :param topic_id: the topic of the first LDA to match
    :param num_topics: number of topics used to train the two LDA models
    :return: the most similar topic of the second LDA w.r.t the chosen topic of the first LDA
    """
    print("--> Match topics")
    kldiv_loss = KLDivLoss()
    dictionary = gensim.corpora.Dictionary.load_from_text("./data/wikipedia_dump/wiki_wordids.txt")
    word_prob_vec1 = _topic_prob_vector(lda1, topic_id, len(dictionary))
    # initial difference of the same topic id on both LDAs as the reference
    word_prob_vec2 = _topic_prob_vector(lda2, topic_id, len(dictionary))
    difference = kldiv_loss(word_prob_vec1, word_prob_vec2)
    print(f"Initial diff. of the same topic ({topic_id}) on both LDAs: {difference:.{5}f}")
    best_topic = topic_id  # the topic itself. Gets updated if a better is found
    # iterate over every topic of the second LDA to calculate their difference and find the best
    for curr_topic_id in range(num_topics):
        curr_difference = kldiv_loss(word_prob_vec1,
                                     _topic_prob_vector(lda2, curr_topic_id, len(dictionary)))
        if curr_difference < difference:
            # if the current diff. is better, update the reference diff. and the best topic id
            difference = curr_difference
            best_topic = curr_topic_id
    # NOTE: the original printed dictionary[topic_id], which indexes the WORD
    # dictionary by a TOPIC id and therefore showed an unrelated vocabulary word
    print(f"Topic {topic_id} of LDA1 got matched with topic "
          f"{best_topic} from LDA2 "
          f"with kldiv: {difference:.{5}f}")
    return best_topic
def compare_lda_models(lda1: LdaMulticore, lda2: LdaMulticore, num_topics: int) -> tuple:
    """helper function to calculate the difference between two lda models according to their
    topic-word distribution.

    :param lda1: the first of the two LDA models
    :param lda2: the second LDA model
    :param num_topics: number of topics used to train the two LDA models
    :return: tuple (similarity, difference) of scalar torch tensors, each averaged
        over all topic ids
    """
    print("--> Compare LDA models")
    kldiv_loss = KLDivLoss()
    cosine_similarity = nn.CosineSimilarity(dim=0, eps=1e-8)
    dictionary = gensim.corpora.Dictionary.load_from_text("./data/wikipedia_dump/wiki_wordids.txt")
    # compute the confusion matrix between two models
    mdiff, _ = lda1.diff(lda2, distance="hellinger", num_words=500)
    plot_difference(mdiff, num_topics)
    # iterate over every topic and calculate the average difference
    tmp_sim = 0.0   # accumulator for the cosine-based score
    tmp_diff = 0.0  # accumulator for the kullback leibler divergence
    for topic_id in range(num_topics):
        topics1 = lda1.get_topic_terms(topicid=topic_id, topn=len(dictionary))
        topics2 = lda2.get_topic_terms(topicid=topic_id, topn=len(dictionary))
        # vectors with the word IDs in their contribution order for the current topic ID
        # (note: `tuple` was shadowing the builtin here; renamed to word_tuple)
        word_id_vec1 = [word_tuple[0] for word_tuple in topics1]
        word_id_vec2 = [word_tuple[0] for word_tuple in topics2]
        # NOTE(review): the cosine is computed over raw word IDs ordered by
        # contribution, not over probabilities — confirm this metric is intended
        cos_sim = cosine_similarity(torch.FloatTensor(word_id_vec1),
                                    torch.FloatTensor(word_id_vec2))
        # NOTE(review): nn.CosineSimilarity already returns a similarity
        # (1 = identical direction); accumulating (1 - cos_sim) under the name
        # "similarity" looks inverted — kept as-is to preserve the benchmark,
        # but verify against how the results are interpreted downstream
        tmp_sim += (1 - cos_sim)
        # dense word-probability vectors; epsilon keeps the KLDiv finite
        word_prob_vec1 = torch.zeros(len(dictionary)) + 10e-16
        word_prob_vec2 = torch.zeros(len(dictionary)) + 10e-16
        for (word_tuple1, word_tuple2) in zip(topics1, topics2):
            # assign the word probabilites to their ID in the vector
            word_prob_vec1[word_tuple1[0]] = torch.FloatTensor([word_tuple1[1]])
            word_prob_vec2[word_tuple2[0]] = torch.FloatTensor([word_tuple2[1]])
        tmp_diff += kldiv_loss(word_prob_vec1, word_prob_vec2)
    similarity = tmp_sim / num_topics
    print(f"--> similarity between current two LDA word vectors: {similarity:.{5}f}")
    difference = tmp_diff / num_topics
    print(f"--> difference between current two LDA prob. distributions: {difference:.{5}f}")
    del dictionary
    return similarity, difference
def main(benchmark: bool) -> None:
    """Entry point for the LDA matching run.

    :param benchmark: when True, run the similarity/difference benchmark over
        all topic counts in NUM_TOPICS; otherwise run a single matcher demo
    :return: None
    """
    start = time.perf_counter()
    # banner with timestamp and hardware summary
    print("\n\n\n"+"#"*55)
    print("## " + str(datetime.datetime.now().strftime("%A, %d. %B %Y %I:%M%p")))
    print(f"## System: {torch.get_num_threads()} CPU cores with "
          f"{os.cpu_count()} threads and "
          f"{torch.cuda.device_count()} GPUs on {socket.gethostname()}")
    print("#"*55)
    print()
    if benchmark:
        # result dictionaries in the format {NUM_TOPIC: value}
        similarity_results = {}  # similarity between the word vectors and their order
        difference_results = {}  # difference between the probability distributions
        for n_topics in NUM_TOPICS:
            print(f"--> Current number of topics: {n_topics}")
            ref_path = f"./models/matching_lda_model_ref_{n_topics}"
            tmp_path = f"./models/matching_lda_model_tmp_{n_topics}"
            # load the reference LDA from disk when present, otherwise train it
            if os.path.isfile(ref_path):
                print("--> Loading reference LDA")
                ref_lda = LdaMulticore.load(ref_path)
            else:
                print("--> Training reference LDA")
                ref_lda = train_lda(n_topics, f"_ref_{n_topics}")
            # same load-or-train logic for the second (comparison) LDA
            if os.path.isfile(tmp_path):
                print("--> Loading second LDA")
                tmp_lda = LdaMulticore.load(tmp_path)
            else:
                print("--> Training second LDA")
                tmp_lda = train_lda(n_topics, f"_tmp_{n_topics}")
            # match a randomly selected topic as a test
            _ = match_topics(ref_lda, tmp_lda, np.random.randint(n_topics), n_topics)
            # compare the two LDA models and record the results
            sim, diff = compare_lda_models(ref_lda, tmp_lda, n_topics)
            similarity_results[n_topics] = sim
            difference_results[n_topics] = diff
        save_results(similarity_results, difference_results)
    else:
        # build a small list of pre-trained LDA models for the matcher demo
        lda_list = [
            LdaMulticore.load("./models/matching_lda_model_ref_10"),
            LdaMulticore.load("./models/matching_lda_model_tmp_10"),
        ]
        lda_dict = Dictionary.load_from_text("./data/wikipedia_dump/wiki_wordids.txt")
        matcher = LdaMatcher(lda_list=lda_list, threshold=0.5, num_topics=10, dictionary=lda_dict)
        print(matcher.get_mapping())
        print(matcher.get_core_topic_ids())
    end = time.perf_counter()
    duration = (np.round(end - start) / 60.) / 60.
    print(f"\nComputation time: {duration:0.4f} hours")
if __name__ == "__main__":
    # parse CLI flags and forward them to main() as keyword arguments
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--benchmark", "-b",
                            help="runs the difference and similarity benchmarks",
                            action="store_true", default=False)
    main(**vars(arg_parser.parse_args()))