In [16]:
# Sample text used as the document in the keyword-extraction cells below.
# The leading/trailing whitespace and newlines are part of the string.
doc = """
         Reuters - Private investment firm Carlyle Group, which has a reputation for making well-timed and occasionally controversial plays in the defense industry, has quietly placed its bets on another part of the market.
      """

In [45]:

from sklearn.feature_extraction.text import CountVectorizer

# Candidate phrases are the 1- to 3-word n-grams of the document.
n_gram_range = (1, 3)
stop_words = "english"

# Extract candidate words/phrases
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])
# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name only on releases that do not have it yet.
_get_names = getattr(count, "get_feature_names_out", None) or count.get_feature_names
# Keep `candidates` a plain list of str, as the original cell produced.
candidates = [str(name) for name in _get_names()]

In [10]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model by name.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# Embed the document (passed as a one-element list) with that model.
doc_embedding = model.encode([doc])
# Embed every candidate phrase with the same model.
# NOTE: re-run this cell whenever `candidates` changes; later cells use
# positions derived from `candidate_embeddings` to index into `candidates`.
candidate_embeddings = model.encode(candidates)

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

# Number of keywords to keep.
top_n = 3
# Despite the name, `distances` holds cosine similarities between the
# document and each candidate embedding.
distances = cosine_similarity(doc_embedding, candidate_embeddings)
# argsort is ascending, so the last `top_n` positions are the highest scores.
# The positions refer to rows of `candidate_embeddings`; `candidates` must be
# the list those embeddings were computed from, otherwise a position can fall
# outside the list.
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]

IndexError: list index out of range

In [47]:
import numpy as np
import itertools

def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
    """Pick the ``top_n`` mutually least-similar keywords among the best candidates.

    The ``nr_candidates`` words most similar to the document are shortlisted.
    Every ``top_n``-sized combination of that shortlist is then scored by the
    sum of its pairwise cosine similarities, and the combination with the
    smallest sum is returned (the first one found wins ties).

    Args:
        doc_embedding: Embedding of the document; only its first row is used,
            so it is expected to be a 2-D array with a single row.
        word_embeddings: 2-D array with one row per entry of ``words``.
        words: Candidate words/phrases, aligned with ``word_embeddings``.
        top_n: Number of keywords to return. Clamped to the shortlist size so
            that asking for too many returns the whole shortlist.
        nr_candidates: Size of the shortlist taken from the words most similar
            to the document.

    Returns:
        A list of at most ``top_n`` entries of ``words``, in order of
        increasing similarity to the document. Empty when ``words`` is empty.
    """
    if len(words) == 0:
        return []

    # Score against the arguments that were passed in (not same-named notebook
    # globals) so that indices, embeddings and words always line up.
    doc_similarity = cosine_similarity(doc_embedding, word_embeddings)
    pairwise_similarity = cosine_similarity(word_embeddings, word_embeddings)

    # argsort is ascending: the tail holds the words closest to the document.
    words_idx = list(doc_similarity.argsort()[0][-nr_candidates:])
    words_vals = [words[index] for index in words_idx]
    pairwise_similarity = pairwise_similarity[np.ix_(words_idx, words_idx)]

    # No combination exists when more items are requested than shortlisted.
    top_n = max(0, min(top_n, len(words_idx)))

    # Exhaustive search for the combination with the smallest pairwise sum.
    min_sim = np.inf
    candidate = ()
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum(
            pairwise_similarity[i][j]
            for i in combination
            for j in combination
            if i != j
        )
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [48]:
# Shortlist the 10 highest-scoring candidates and return 5 of them.
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=10)

IndexError: list index out of range

In [23]:
# Select 5 keywords with `mmr`; diversity=0.9 gives the similarity-to-chosen
# penalty a weight of 0.9 and the document similarity a weight of 0.1.
mmr(doc_embedding, candidate_embeddings, candidates, top_n=5, diversity=0.9)

['controversial plays',
 'private',
 'investment firm',
 'carlyle group',
 'defense']

In [29]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

def mmr(doc_embedding, word_embeddings, words, top_n, diversity):
    """Select keywords by Maximal Marginal Relevance (MMR).

    The word most similar to the document is chosen first. Each further pick
    maximises ``(1 - diversity) * similarity_to_document
    - diversity * max_similarity_to_already_chosen``.

    Args:
        doc_embedding: Embedding of the document; expected to be a 2-D array
            with a single row (the first pick relies on that).
        word_embeddings: 2-D array with one row per entry of ``words``.
        words: Candidate words/phrases, aligned with ``word_embeddings``.
        top_n: Number of keywords to return. Capped at ``len(words)``.
        diversity: Weight of the redundancy penalty, as used in the formula
            above.

    Returns:
        A list of at most ``top_n`` entries of ``words`` in selection order.
        Empty when ``top_n <= 0`` or ``words`` is empty.
    """
    if top_n <= 0 or len(words) == 0:
        return []

    # Extract similarity within words, and between words and the document
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Initialize candidates and already choose the best keyword/keyphrase
    keywords_idx = [int(np.argmax(word_doc_similarity))]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    # Stop once the candidate pool is exhausted instead of indexing an empty pool.
    for _ in range(min(top_n - 1, len(candidates_idx))):
        # Extract similarities within candidates and
        # between candidates and selected keywords/phrases
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate MMR (kept as a column so it lines up with candidate_similarities)
        mmr_scores = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr_scores)]

        # Update keywords & candidates
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

def load_document_phrase_from_keybert(datafolder, top_n=5, diversity=0.9):
    """Mine key phrases for every line of ``data/<datafolder>/<datafolder>_train.txt``.

    Each line is treated as one document. ``-`` and ``/`` are replaced by
    spaces, 1- to 3-word n-grams are extracted as candidates, and ``mmr``
    selects the key phrases from them.

    Args:
        datafolder: Dataset name; used both as the directory under ``data/``
            and as the prefix of the ``_train.txt`` file.
        top_n: Maximum number of phrases kept per document.
        diversity: Diversity weight forwarded to ``mmr``.

    Returns:
        A tuple ``(document_id_to_phrases, phrases, overall_phrases)``:
        ``document_id_to_phrases[i]`` is the phrase list of line ``i`` (empty
        when the line yields no candidates), ``phrases`` is the list of
        distinct phrases over all documents, and ``overall_phrases`` is an
        empty list that is never filled in.

    Raises:
        OSError: If the training file cannot be opened.
    """
    # TODO: load which phrases each document has?
    filename = f"data/{datafolder}/{datafolder}_train.txt"
    with open(filename, encoding="UTF-8") as file:
        lines = [line.rstrip().replace("-", " ").replace("/", " ") for line in file]

    # init, load transformers
    n_gram_range = (1, 3)
    stop_words = "english"
    model = SentenceTransformer('distilbert-base-nli-mean-tokens')

    document_id_to_phrases = []
    phrases = set()
    overall_phrases = []  # never populated; kept so the return shape is unchanged
    print("mining document phrases...")
    for doc in tqdm(lines):
        # Extract candidate words/phrases
        try:
            count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])
        except ValueError:
            # Empty vocabulary (blank line or only stop words): record an
            # empty result so list positions still match line numbers.
            document_id_to_phrases.append([])
            continue
        # `get_feature_names` was removed in scikit-learn 1.2; prefer its
        # replacement and fall back only on releases that lack it.
        get_names = getattr(count, "get_feature_names_out", None) or count.get_feature_names
        candidates = [str(name) for name in get_names()]

        # get embeddings
        doc_embedding = model.encode([doc])
        candidate_embeddings = model.encode(candidates)
        # Short documents can have fewer candidates than requested phrases.
        res = mmr(doc_embedding, candidate_embeddings, candidates,
                  top_n=min(top_n, len(candidates)), diversity=diversity)
        document_id_to_phrases.append(res)
        phrases.update(res)

    return (document_id_to_phrases, list(phrases), overall_phrases)

In [31]:
import warnings
# Silences every warning from here on, not only those raised by the call below.
warnings.filterwarnings('ignore')
# Mine key phrases for each line of data/agnews/agnews_train.txt. As the last
# expression of the cell, the returned tuple is also the cell's output.
load_document_phrase_from_keybert("agnews")

mining document phrases...



  0%|                                                                                          | 0/500 [00:00<?, ?it/s]
  0%|▏                                                                                 | 1/500 [00:00<03:25,  2.43it/s]
  0%|▎                                                                                 | 2/500 [00:00<03:46,  2.20it/s]
  1%|▍                                                                                 | 3/500 [00:01<04:21,  1.90it/s]
  1%|▋                                                                                 | 4/500 [00:02<04:44,  1.74it/s]
  1%|▊                                                                                 | 5/500 [00:03<05:11,  1.59it/s]
  1%|▉                                                                                 | 6/500 [00:03<05:34,  1.48it/s]
  1%|█▏                                                                                | 7/500 [00:04<05:41,  1.44it/s]
  2%|█▎                                

 14%|███████████                                                                      | 68/500 [00:38<05:04,  1.42it/s]
 14%|███████████▏                                                                     | 69/500 [00:38<04:47,  1.50it/s]
 14%|███████████▎                                                                     | 70/500 [00:39<05:44,  1.25it/s]
 14%|███████████▌                                                                     | 71/500 [00:40<05:59,  1.19it/s]
 14%|███████████▋                                                                     | 72/500 [00:41<05:25,  1.31it/s]
 15%|███████████▊                                                                     | 73/500 [00:41<04:47,  1.48it/s]
 15%|███████████▉                                                                     | 74/500 [00:42<04:36,  1.54it/s]
 15%|████████████▏                                                                    | 75/500 [00:42<04:27,  1.59it/s]
 15%|████████████▎                      

 27%|█████████████████████▊                                                          | 136/500 [01:19<03:33,  1.70it/s]
 27%|█████████████████████▉                                                          | 137/500 [01:19<03:35,  1.68it/s]
 28%|██████████████████████                                                          | 138/500 [01:20<03:22,  1.79it/s]
 28%|██████████████████████▏                                                         | 139/500 [01:20<03:15,  1.84it/s]
 28%|██████████████████████▍                                                         | 140/500 [01:21<03:45,  1.59it/s]
 28%|██████████████████████▌                                                         | 141/500 [01:22<04:01,  1.48it/s]
 28%|██████████████████████▋                                                         | 142/500 [01:23<04:23,  1.36it/s]
 29%|██████████████████████▉                                                         | 143/500 [01:23<04:07,  1.44it/s]
 29%|███████████████████████            

 41%|████████████████████████████████▋                                               | 204/500 [02:17<01:59,  2.48it/s]
 41%|████████████████████████████████▊                                               | 205/500 [02:18<02:15,  2.17it/s]
 41%|████████████████████████████████▉                                               | 206/500 [02:18<02:30,  1.95it/s]
 41%|█████████████████████████████████                                               | 207/500 [02:19<02:31,  1.94it/s]
 42%|█████████████████████████████████▎                                              | 208/500 [02:19<02:35,  1.87it/s]
 42%|█████████████████████████████████▍                                              | 209/500 [02:20<02:35,  1.88it/s]
 42%|█████████████████████████████████▌                                              | 210/500 [02:21<02:49,  1.71it/s]
 42%|█████████████████████████████████▊                                              | 211/500 [02:22<03:38,  1.32it/s]
 42%|█████████████████████████████████▉ 

 54%|███████████████████████████████████████████▌                                    | 272/500 [03:03<03:51,  1.02s/it]
 55%|███████████████████████████████████████████▋                                    | 273/500 [03:04<03:41,  1.02it/s]
 55%|███████████████████████████████████████████▊                                    | 274/500 [03:05<04:01,  1.07s/it]
 55%|████████████████████████████████████████████                                    | 275/500 [03:06<03:50,  1.03s/it]
 55%|████████████████████████████████████████████▏                                   | 276/500 [03:07<03:37,  1.03it/s]
 55%|████████████████████████████████████████████▎                                   | 277/500 [03:08<03:39,  1.01it/s]
 56%|████████████████████████████████████████████▍                                   | 278/500 [03:09<03:42,  1.00s/it]
 56%|████████████████████████████████████████████▋                                   | 279/500 [03:10<03:21,  1.10it/s]
 56%|███████████████████████████████████

KeyboardInterrupt: 

In [6]:
import numpy as np
import itertools

def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
    """Pick the ``top_n`` mutually least-similar keywords among the best candidates.

    The ``nr_candidates`` words most similar to the document are shortlisted.
    Every ``top_n``-sized combination of that shortlist is then scored by the
    sum of its pairwise cosine similarities, and the combination with the
    smallest sum is returned (the first one found wins ties).

    Args:
        doc_embedding: Embedding of the document; only its first row is used,
            so it is expected to be a 2-D array with a single row.
        word_embeddings: 2-D array with one row per entry of ``words``.
        words: Candidate words/phrases, aligned with ``word_embeddings``.
        top_n: Number of keywords to return. Clamped to the shortlist size so
            that asking for too many returns the whole shortlist.
        nr_candidates: Size of the shortlist taken from the words most similar
            to the document.

    Returns:
        A list of at most ``top_n`` entries of ``words``, in order of
        increasing similarity to the document. Empty when ``words`` is empty.
    """
    if len(words) == 0:
        return []

    # Score against the arguments that were passed in (not same-named notebook
    # globals) so that indices, embeddings and words always line up.
    doc_similarity = cosine_similarity(doc_embedding, word_embeddings)
    pairwise_similarity = cosine_similarity(word_embeddings, word_embeddings)

    # argsort is ascending: the tail holds the words closest to the document.
    words_idx = list(doc_similarity.argsort()[0][-nr_candidates:])
    words_vals = [words[index] for index in words_idx]
    pairwise_similarity = pairwise_similarity[np.ix_(words_idx, words_idx)]

    # No combination exists when more items are requested than shortlisted.
    top_n = max(0, min(top_n, len(words_idx)))

    # Exhaustive search for the combination with the smallest pairwise sum.
    min_sim = np.inf
    candidate = ()
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum(
            pairwise_similarity[i][j]
            for i in combination
            for j in combination
            if i != j
        )
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [7]:
import numpy as np

def mmr(doc_embedding, word_embeddings, words, top_n, diversity):
    """Select keywords by Maximal Marginal Relevance (MMR).

    The word most similar to the document is chosen first. Each further pick
    maximises ``(1 - diversity) * similarity_to_document
    - diversity * max_similarity_to_already_chosen``.

    Args:
        doc_embedding: Embedding of the document; expected to be a 2-D array
            with a single row (the first pick relies on that).
        word_embeddings: 2-D array with one row per entry of ``words``.
        words: Candidate words/phrases, aligned with ``word_embeddings``.
        top_n: Number of keywords to return. Capped at ``len(words)``.
        diversity: Weight of the redundancy penalty, as used in the formula
            above.

    Returns:
        A list of at most ``top_n`` entries of ``words`` in selection order.
        Empty when ``top_n <= 0`` or ``words`` is empty.
    """
    if top_n <= 0 or len(words) == 0:
        return []

    # Extract similarity within words, and between words and the document
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Initialize candidates and already choose the best keyword/keyphrase
    keywords_idx = [int(np.argmax(word_doc_similarity))]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    # Stop once the candidate pool is exhausted instead of indexing an empty pool.
    for _ in range(min(top_n - 1, len(candidates_idx))):
        # Extract similarities within candidates and
        # between candidates and selected keywords/phrases
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate MMR (kept as a column so it lines up with candidate_similarities)
        mmr_scores = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr_scores)]

        # Update keywords & candidates
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]