Skip to content

Commit

Permalink
fix #45 and update history
Browse files Browse the repository at this point in the history
  • Loading branch information
silviatti committed Dec 8, 2021
1 parent 44e4115 commit a8ac5d2
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 17 deletions.
3 changes: 3 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
=======
History
=======
1.10.1 (2021-12-08)
--------------------
* Bug fix: coherence with word embeddings (#43, #45)

1.10.0 (2021-11-21)
--------------------
Expand Down
39 changes: 23 additions & 16 deletions octis/evaluation_metrics/coherence_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,13 +119,16 @@ def score(self, model_output):
for word in topic[0:self.topk]:
if word in self._wv.key_to_index.keys():
word_embedding = self._wv.__getitem__(word)
normalized_we = word_embedding/word_embedding.sum()
normalized_we = word_embedding / word_embedding.sum()
E.append(normalized_we)
E = np.array(E)
if len(E) > 0:
E = np.array(E)

# Perform cosine similarity between E rows
distances = np.sum(pairwise_distances(E, metric='cosine'))
topic_coherence = (distances)/(2*self.topk*(self.topk-1))
# Perform cosine similarity between E rows
distances = np.sum(pairwise_distances(E, metric='cosine'))
topic_coherence = distances/(2*self.topk*(self.topk-1))
else:
topic_coherence = -1

# Update result with the computed coherence of the topic
result += topic_coherence
Expand Down Expand Up @@ -177,8 +180,7 @@ def score(self, model_output):
for topic in topics:
E = []
# average vector of the words in topic (centroid)
t = [0] * len(self._wv.__getitem__(topic[0]))

t = np.zeros(self._wv.vector_size)
# Create matrix E (normalize word embeddings of
# words represented as vectors in wv) and
# average vector of the words in topic
Expand All @@ -188,16 +190,21 @@ def score(self, model_output):
normalized_we = word_embedding/sum(word_embedding)
E.append(normalized_we)
t = list(map(add, t, word_embedding))

t = np.array(t)
t = t/(len(t)*sum(t))

topic_coherence = 0
# Perform cosine similarity between each word embedding in E
# and t.
for word_embedding in E:
distance = spatial.distance.cosine(word_embedding, t)
topic_coherence += distance
topic_coherence = topic_coherence/self.topk
if t.ndim > 1:
t = t/(len(t)*sum(t))

if len(E) > 0:
topic_coherence = 0
# Perform cosine similarity between each word embedding in E
# and t.
for word_embedding in E:
distance = spatial.distance.cosine(word_embedding, t)
topic_coherence += distance
topic_coherence = topic_coherence/self.topk
else:
topic_coherence = -1

# Update result with the computed coherence of the topic
result += topic_coherence
Expand Down
19 changes: 18 additions & 1 deletion tests/test_evaluation_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def test_npmi_coherence_measures(dataset, model_output):
assert type(score) == np.float64 or type(score) == float
assert -1 <= score <= 1


def test_we_coherence_measures(dataset, model_output):
metric = WECoherenceCentroid(topk=5)
score = metric.score(model_output)
Expand All @@ -92,6 +93,23 @@ def test_we_coherence_measures(dataset, model_output):
assert -1 <= score <= 1


def test_we_coherence_measures_oov(dataset):
model_output = {'topics':
[['dsa', 'dsadgfd', '11111', '22222', 'bbbbbbbb'],
['aaaaa', 'bbb', 'cc', 'd', 'EEE']]}
metric = WECoherenceCentroid(topk=5)
score = metric.score(model_output)
assert type(score) == np.float64 or type(score) == np.float32 or type(score) == float
assert -1 <= score <= 1
print(score)

metric = WECoherencePairwise(topk=10)
score = metric.score(model_output)
assert type(score) == np.float64 or type(score) == np.float32 or type(score) == float
assert -1 <= score <= 1
print(score)


def test_diversity_measures(dataset, model_output):
metric = TopicDiversity(topk=10)
score = metric.score(model_output)
Expand Down Expand Up @@ -141,7 +159,6 @@ def test_similarity_measures(dataset, model_output):
assert 0 <= score <= 1



def test_irbo(dataset, model_output):
metric = InvertedRBO(topk=10)
score = metric.score(model_output)
Expand Down

0 comments on commit a8ac5d2

Please sign in to comment.