-
Notifications
You must be signed in to change notification settings - Fork 343
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Added MMR * Update documentation and pypi version
- Loading branch information
Showing
6 changed files
with
99 additions
and
89 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,92 +1,56 @@ | ||
# Copyright (c) 2017-present, Swisscom (Schweiz) AG. | ||
# All rights reserved. | ||
# | ||
#Authors: Kamil Bennani-Smires, Yann Savary | ||
|
||
|
||
import numpy as np | ||
from sklearn.metrics.pairwise import cosine_similarity | ||
from typing import List | ||
|
||
|
||
def MMR(doc_embedd, candidates, X, beta, N):
    """
    Core method using Maximal Marginal Relevance in charge to return the top-N candidates
    :param doc_embedd: 1-d numpy array with the embedding of the document
    :param candidates: list of candidates (string)
    :param X: numpy array with the embedding of each candidate in each row
    :param beta: hyperparameter beta for MMR (control tradeoff between informativeness and diversity)
    :param N: number of candidates to extract
    :return: A tuple with 2 elements :
        1) the full list of candidates (list of string)
        2) list of indices of the selected top-N candidates (or fewer if there are
           not enough candidates) into `candidates` (list of int)
    """
    N = min(N, len(candidates))

    # Cosine similarity of every candidate with the document, shape (n_candidates, 1).
    doc_sim = cosine_similarity(X, doc_embedd.reshape(1, -1))

    # Max-normalize then z-score (shifted to 0.5) so document similarities live on
    # a scale comparable with the between-candidate similarities below.
    doc_sim_norm = doc_sim / np.max(doc_sim)
    doc_sim_norm = 0.5 + (doc_sim_norm - np.average(doc_sim_norm)) / np.std(doc_sim_norm)

    # Pairwise candidate similarities; mask the diagonal (self-similarity) with NaN
    # so it does not participate in the nan-aware max/mean/std below.
    # NOTE: np.nan, not np.NaN -- the np.NaN alias was removed in NumPy 2.0.
    sim_between = cosine_similarity(X)
    np.fill_diagonal(sim_between, np.nan)

    sim_between_norm = sim_between / np.nanmax(sim_between, axis=0)
    sim_between_norm = \
        0.5 + (sim_between_norm - np.nanmean(sim_between_norm, axis=0)) / np.nanstd(sim_between_norm, axis=0)

    selected_candidates = []
    unselected_candidates = list(range(len(candidates)))

    # Seed the selection with the candidate most similar to the document.
    j = int(np.argmax(doc_sim))
    selected_candidates.append(j)
    unselected_candidates.remove(j)

    for _ in range(N - 1):
        selec_array = np.array(selected_candidates)
        unselec_array = np.array(unselected_candidates)

        distance_to_doc = doc_sim_norm[unselec_array, :]
        dist_between = sim_between_norm[unselec_array][:, selec_array]
        if dist_between.ndim == 1:
            dist_between = dist_between[:, np.newaxis]
        # MMR criterion: beta * relevance - (1 - beta) * redundancy w.r.t. what is
        # already selected; argmax runs over the remaining (unselected) candidates.
        j = np.argmax(beta * distance_to_doc - (1 - beta) * np.max(dist_between, axis=1).reshape(-1, 1))
        item_idx = unselected_candidates[j]
        selected_candidates.append(item_idx)
        unselected_candidates.remove(item_idx)

    return candidates, selected_candidates
""" | ||
|
||
# Extract similarity within words, and between words and the document | ||
word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding) | ||
word_similarity = cosine_similarity(word_embeddings) | ||
|
||
def max_normalization(array):
    """
    Compute maximum normalization (max is set to 1) of the array
    :param array: 1-d array, or 2-d column vector of shape (n, 1)
    :return: 1-d array max-normalized : each value is multiplied by 1/max value
    """
    arr = np.asarray(array)
    # Call sites pass a column vector (n, 1); squeeze it down to 1-d. A genuinely
    # 1-d input -- which is what the docstring promises -- is now accepted too,
    # instead of crashing inside squeeze(axis=1).
    if arr.ndim == 2:
        arr = arr.squeeze(axis=1)
    return 1 / np.max(arr) * arr
def get_aliases(kp_sim_between, candidates, threshold):
    """
    Find candidates which are very similar to the keyphrases (aliases)
    :param kp_sim_between: ndarray of shape (nb_kp , nb candidates) containing the similarity
    of each kp with all the candidates. Note that the similarity between the keyphrase and itself should be set to
    NaN or 0
    :param candidates: array of candidates (array of string)
    :param threshold: minimum similarity for a candidate to count as an alias of a keyphrase
    :return: list containing for each keyphrase a list that contain candidates which are aliases
    (very similar) (list of list of string)
    """
    # BUG FIX: the second positional argument of np.nan_to_num is `copy`, not the
    # fill value -- the previous call nan_to_num(kp_sim_between, 0) meant copy=False
    # and silently mutated the caller's array. Use the keyword form (default copy=True);
    # the replacement value for NaN is 0.0 either way.
    kp_sim_between = np.nan_to_num(kp_sim_between, nan=0.0)

    # For each keyphrase, walk its candidates from most to least similar and
    # collect them until the similarity drops below the threshold.
    idx_sorted = np.flip(np.argsort(kp_sim_between), 1)
    aliases = []
    for kp_idx, item in enumerate(idx_sorted):
        alias_for_item = []
        for i in item:
            if kp_sim_between[kp_idx, i] >= threshold:
                alias_for_item.append(candidates[i])
            else:
                break
        aliases.append(alias_for_item)

    return aliases
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters