Add MMR (#4)
* Added MMR
* Update documentation and pypi version
MaartenGr committed Oct 27, 2020
1 parent 222cc5b commit abb5231
Showing 6 changed files with 99 additions and 89 deletions.
1 change: 0 additions & 1 deletion Makefile
@@ -10,7 +10,6 @@ install-test:

pypi:
python setup.py sdist
python setup.py bdist_wheel --universal
twine upload dist/*

clean:
25 changes: 25 additions & 0 deletions README.md
@@ -117,6 +117,31 @@ of words you would like in the resulting keyphrases:
'learning function']
```

To diversify the results, we can use Maximal Marginal Relevance (MMR) to select
keywords/keyphrases, which is likewise based on cosine similarity. The results
with **high diversity**:

```python
>>> model.extract_keywords(doc, keyphrase_length=3, stop_words='english', use_mmr=True, diversity=0.7)
['algorithm generalize training',
'labels unseen instances',
'new examples optimal',
'determine class labels',
'supervised learning algorithm']
```

The results with **low diversity**:

```python
>>> model.extract_keywords(doc, keyphrase_length=3, stop_words='english', use_mmr=True, diversity=0.2)
['algorithm generalize training',
'learning machine learning',
'learning algorithm analyzes',
'supervised learning algorithm',
'algorithm analyzes training']
```
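Note how the low-diversity selection stays close to the plain cosine-similarity ranking, with several overlapping "learning"/"algorithm" phrases, while the high-diversity selection covers more distinct parts of the document at the cost of some relevance.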


## References
Below, you can find several resources that were used for the creation of KeyBERT
but, most importantly, are amazing resources for creating impressive keyword extraction models:
118 changes: 41 additions & 77 deletions keybert/mmr.py
@@ -1,92 +1,56 @@
-# Copyright (c) 2017-present, Swisscom (Schweiz) AG.
-# All rights reserved.
-#
-# Authors: Kamil Bennani-Smires, Yann Savary
-
-
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
+from typing import List


-def MMR(doc_embedd, candidates, X, beta, N):
-    """
-    Core method using Maximal Marginal Relevance, in charge of returning the top-N candidates
-    :param candidates: list of candidates (string)
-    :param X: numpy array with the embedding of each candidate in each row
-    :param beta: hyperparameter beta for MMR (controls the trade-off between informativeness and diversity)
-    :param N: number of candidates to extract
-    :return: A tuple with 3 elements:
-    1) list of the top-N candidates (or fewer if there are not enough candidates) (list of string)
-    2) list of associated relevance scores (list of float)
-    3) list containing for each keyphrase a list of aliases (list of list of string)
-    """
-
-    N = min(N, len(candidates))
-    doc_sim = cosine_similarity(X, doc_embedd.reshape(1, -1))
-
-    doc_sim_norm = doc_sim / np.max(doc_sim)
-    doc_sim_norm = 0.5 + (doc_sim_norm - np.average(doc_sim_norm)) / np.std(doc_sim_norm)
-
-    sim_between = cosine_similarity(X)
-    np.fill_diagonal(sim_between, np.NaN)
-
-    sim_between_norm = sim_between / np.nanmax(sim_between, axis=0)
-    sim_between_norm = \
-        0.5 + (sim_between_norm - np.nanmean(sim_between_norm, axis=0)) / np.nanstd(sim_between_norm, axis=0)
-
-    selected_candidates = []
-    unselected_candidates = [c for c in range(len(candidates))]
-
-    j = int(np.argmax(doc_sim))
-    selected_candidates.append(j)
-    unselected_candidates.remove(j)
-
-    for _ in range(N - 1):
-        selec_array = np.array(selected_candidates)
-        unselec_array = np.array(unselected_candidates)
-
-        distance_to_doc = doc_sim_norm[unselec_array, :]
-        dist_between = sim_between_norm[unselec_array][:, selec_array]
-        if dist_between.ndim == 1:
-            dist_between = dist_between[:, np.newaxis]
-        j = np.argmax(beta * distance_to_doc - (1 - beta) * np.max(dist_between, axis=1).reshape(-1, 1))
-        item_idx = unselected_candidates[j]
-        selected_candidates.append(item_idx)
-        unselected_candidates.remove(item_idx)
-
-    return candidates, selected_candidates
-
-
-def max_normalization(array):
-    """
-    Compute maximum normalization (max is set to 1) of the array
-    :param array: 1-d array
-    :return: 1-d array max-normalized: each value is multiplied by 1/max value
-    """
-    return 1 / np.max(array) * array.squeeze(axis=1)
-
-
-def get_aliases(kp_sim_between, candidates, threshold):
-    """
-    Find candidates which are very similar to the keyphrases (aliases)
-    :param kp_sim_between: ndarray of shape (nb_kp, nb_candidates) containing the similarity
-    of each kp with all the candidates. Note that the similarity between the keyphrase and itself should be set to
-    NaN or 0
-    :param candidates: array of candidates (array of string)
-    :return: list containing, for each keyphrase, a list of candidates that are aliases
-    (very similar) (list of list of string)
-    """
-
-    kp_sim_between = np.nan_to_num(kp_sim_between, 0)
-    idx_sorted = np.flip(np.argsort(kp_sim_between), 1)
-    aliases = []
-    for kp_idx, item in enumerate(idx_sorted):
-        alias_for_item = []
-        for i in item:
-            if kp_sim_between[kp_idx, i] >= threshold:
-                alias_for_item.append(candidates[i])
-            else:
-                break
-        aliases.append(alias_for_item)
-
-    return aliases
+def mmr(doc_embedding: np.ndarray,
+        word_embeddings: np.ndarray,
+        words: List[str],
+        top_n: int = 5,
+        diversity: float = 0.8) -> List[str]:
+    """ Calculate Maximal Marginal Relevance (MMR)
+    between candidate keywords and the document.
+
+    MMR considers the similarity of keywords/keyphrases with the
+    document, along with the similarity of already selected
+    keywords and keyphrases. This results in a selection of keywords
+    that maximizes diversity while remaining relevant to the document.
+
+    Arguments:
+        doc_embedding: The document embeddings
+        word_embeddings: The embeddings of the selected candidate keywords/phrases
+        words: The selected candidate keywords/keyphrases
+        top_n: The number of keywords/keyphrases to return
+        diversity: How diverse the selected keywords/keyphrases are.
+                   Values between 0 and 1 with 0 being not diverse at all
+                   and 1 being most diverse.
+
+    Returns:
+        List[str]: The selected keywords/keyphrases
+    """
+
+    # Extract similarity within words, and between words and the document
+    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
+    word_similarity = cosine_similarity(word_embeddings)
+
+    # Initialize candidates and already choose the best keyword/keyphrase
+    keywords_idx = [np.argmax(word_doc_similarity)]
+    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]
+
+    for _ in range(top_n - 1):
+        # Extract similarities within candidates and
+        # between candidates and selected keywords/phrases
+        candidate_similarities = word_doc_similarity[candidates_idx, :]
+        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)
+
+        # Calculate MMR
+        mmr = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
+        mmr_idx = candidates_idx[np.argmax(mmr)]
+
+        # Update keywords & candidates
+        keywords_idx.append(mmr_idx)
+        candidates_idx.remove(mmr_idx)

+    return [words[idx] for idx in keywords_idx]
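
At each step, the new implementation scores every remaining candidate as `(1 - diversity) * sim(candidate, doc) - diversity * max(sim(candidate, selected))` and picks the argmax. A minimal sketch of calling `mmr` directly; the 3-dimensional embeddings and candidate words here are made up for illustration rather than real model output:

```python
import numpy as np

from keybert.mmr import mmr

# Hypothetical toy embeddings: one document and four candidate words
doc_embedding = np.array([[1.0, 0.0, 0.0]])
word_embeddings = np.array([
    [0.9, 0.1, 0.0],  # "alpha": very similar to the document
    [0.8, 0.2, 0.0],  # "beta": similar to the document, near-duplicate of "alpha"
    [0.1, 0.9, 0.0],  # "gamma": dissimilar to the document
    [0.0, 0.0, 1.0],  # "delta": orthogonal to everything else
])
words = ["alpha", "beta", "gamma", "delta"]

# High diversity penalizes candidates that resemble already selected keywords,
# so the near-duplicate "beta" is passed over in favor of "delta" and "gamma"
print(mmr(doc_embedding, word_embeddings, words, top_n=3, diversity=0.7))
# ['alpha', 'delta', 'gamma']

# Low diversity stays close to the plain relevance ranking
print(mmr(doc_embedding, word_embeddings, words, top_n=3, diversity=0.2))
# ['alpha', 'beta', 'gamma']
```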
32 changes: 25 additions & 7 deletions keybert/model.py
@@ -5,6 +5,7 @@
from tqdm import tqdm
from typing import List, Union
import warnings
from .mmr import mmr


class KeyBERT:
@@ -36,7 +37,9 @@ def extract_keywords(self,
keyphrase_length: int = 1,
stop_words: Union[str, List[str]] = 'english',
top_n: int = 5,
min_df: int = 1) -> Union[List[str], List[List[str]]]:
min_df: int = 1,
use_mmr: bool = False,
diversity: float = 0.5) -> Union[List[str], List[List[str]]]:
""" Extract keywords/keyphrases
NOTE:
@@ -61,6 +64,10 @@ def extract_keywords(self,
top_n: Return the top n keywords/keyphrases
min_df: Minimum document frequency of a word across all documents
if keywords for multiple documents need to be extracted
use_mmr: Whether to use Maximal Marginal Relevance (MMR) for the
selection of keywords/keyphrases
diversity: The diversity of the results between 0 and 1 if use_mmr
is set to True
Returns:
keywords: the top n keywords for a document
@@ -71,7 +78,9 @@ def extract_keywords(self,
return self._extract_keywords_single_doc(docs,
keyphrase_length,
stop_words,
top_n)
top_n,
use_mmr,
diversity)
elif isinstance(docs, list):
warnings.warn("Although extracting keywords for multiple documents is faster "
"than iterating over single documents, it requires significant memory "
@@ -86,14 +95,18 @@ def _extract_keywords_single_doc(self,
doc: str,
keyphrase_length: int = 1,
stop_words: Union[str, List[str]] = 'english',
top_n: int = 5) -> List[str]:
top_n: int = 5,
use_mmr: bool = False,
diversity: float = 0.5) -> List[str]:
""" Extract keywords/keyphrases for a single document
Arguments:
doc: The document for which to extract keywords/keyphrases
keyphrase_length: Length, in words, of the extracted keywords/keyphrases
stop_words: Stopwords to remove from the document
top_n: Return the top n keywords/keyphrases
use_mmr: Whether to use MMR
diversity: The diversity of results between 0 and 1 if use_mmr is True
Returns:
keywords: The top n keywords for a document
@@ -106,14 +119,17 @@ def _extract_keywords_single_doc(self,
words = count.get_feature_names()

# Extract Embeddings
doc_embeddings = self.model.encode([doc])
doc_embedding = self.model.encode([doc])
word_embeddings = self.model.encode(words)

# Calculate distances and extract keywords
distances = cosine_similarity(doc_embeddings, word_embeddings)
keywords = [words[index] for index in distances.argsort()[0][-top_n:]]
if use_mmr:
keywords = mmr(doc_embedding, word_embeddings, words, top_n, diversity)
else:
distances = cosine_similarity(doc_embedding, word_embeddings)
keywords = [words[index] for index in distances.argsort()[0][-top_n:]][::-1]

return keywords[::-1]
return keywords
except ValueError:
return []

@@ -125,6 +141,8 @@ def _extract_keywords_multiple_docs(self,
min_df: int = 1):
""" Extract keywords/keyphrases for a multiple documents
This currently does not use MMR as
Arguments:
docs: The document for which to extract keywords/keyphrases
keyphrase_length: Length, in words, of the extracted keywords/keyphrases
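
Taken together, the public API change can be exercised as follows (a sketch: the model name mirrors the one used in the KeyBERT README, and `doc` stands in for any input document):

```python
from keybert import KeyBERT

doc = """Supervised learning is the machine learning task of learning
         a function that maps an input to an output based on example
         input-output pairs."""
model = KeyBERT('distilbert-base-nli-mean-tokens')

# Default behaviour: rank candidates purely by cosine similarity
keywords = model.extract_keywords(doc, keyphrase_length=3, stop_words='english')

# New in this commit: re-rank with MMR, trading relevance for diversity
diverse_keywords = model.extract_keywords(doc, keyphrase_length=3,
                                          stop_words='english',
                                          use_mmr=True, diversity=0.7)
```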
2 changes: 1 addition & 1 deletion setup.py
@@ -25,7 +25,7 @@
setuptools.setup(
name="keybert",
packages=["keybert"],
version="0.0.1",
version="0.1.0",
author="Maarten Grootendorst",
author_email="maartengrootendorst@gmail.com",
description="KeyBERT performs keyword extraction with state-of-the-art transformer models.",
10 changes: 7 additions & 3 deletions tests/test_model.py
@@ -16,11 +16,15 @@ def test_single_doc(keyphrase_length, base_keybert):
assert len(keyword.split(" ")) == keyphrase_length


@pytest.mark.parametrize("keyphrase_length", [i+1 for i in range(5)])
def test_extract_keywords_single_doc(keyphrase_length, base_keybert):
@pytest.mark.parametrize("keyphrase_length, mmr", [(i+1, truth) for i in range(5) for truth in [True, False]])
def test_extract_keywords_single_doc(keyphrase_length, mmr, base_keybert):
""" Test extraction of protected single document method """
top_n = 5
keywords = base_keybert._extract_keywords_single_doc(doc_one, top_n=top_n, keyphrase_length=keyphrase_length)
keywords = base_keybert._extract_keywords_single_doc(doc_one,
top_n=top_n,
keyphrase_length=keyphrase_length,
use_mmr=mmr,
diversity=0.5)
assert isinstance(keywords, list)
assert isinstance(keywords[0], str)
assert len(keywords) == top_n
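
A follow-up test could also pin down that the `diversity` knob is accepted across its whole range; a sketch reusing the file's existing `doc_one` and `base_keybert` fixtures:

```python
@pytest.mark.parametrize("diversity", [0.0, 0.5, 1.0])
def test_mmr_diversity_range(diversity, base_keybert):
    """ MMR should return exactly top_n keywords for any diversity value """
    top_n = 5
    keywords = base_keybert._extract_keywords_single_doc(doc_one,
                                                         top_n=top_n,
                                                         keyphrase_length=2,
                                                         use_mmr=True,
                                                         diversity=diversity)
    assert isinstance(keywords, list)
    assert len(keywords) == top_n
```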
