In [None]:
!pip install sentence-transformers keybert spacy spacycake

Collecting sentence-transformers
  Using cached https://files.pythonhosted.org/packages/f5/5a/6e41e8383913dd2ba923cdcd02be2e03911595f4d2f9de559ecbed80d2d3/sentence-transformers-0.3.9.tar.gz
Collecting keybert
  Using cached https://files.pythonhosted.org/packages/e3/43/ba9b9be17d1831a112e63625ed328b292f57ace0c570062df9a5e6fea87c/keybert-0.1.2.tar.gz
Collecting spacycake
  Using cached https://files.pythonhosted.org/packages/a0/d3/4a4ad10ca61d6fb18b3b8c62b91a6136822b9c5618c1c53d25e24e1ec07d/spacycake-1.0.0-py3-none-any.whl
Collecting transformers<3.6.0,>=3.1.0
  Using cached https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl
Collecting spacybert>=1.0.0
  Using cached https://files.pythonhosted.org/packages/f2/ec/f693d08a1d2c16901123c7a07247df202b3b95375263feae501bb39ffe88/spacybert-1.0.1-py3-none-any.whl
Collecting sentencepiece==0.1.91
  Using cached https://files.pythonhosted.org/packages/d4/a4

In [None]:
# Sample input for the keyword-extraction experiments below: web copy from two NYC
# dental practices ("209 NYC Dental" and "NYC Smile Design") kept verbatim as one string.
doc = """Best Dentists in NYC
209 NYC Dental is the oldest continuing dental practice in New York State. 
Established in 1887 our dental office has been providing quality dental care to New York City patients for over a century. 
This legacy of treatment comes with responsibility. A responsibility to treat people with respect, excellence, and compassion.
209 NYC Dental team is a wonderfully eclectic group of top rated dentists, hygienists, and staff. 
We have great clinical and people skills. We the very best, high quality of dental care. 
Having all dental specialties at 209 NYC Dental, we can comprehensively serve all your dental needs, 
from routine cleanings to dental implants, from whitening to more advanced cosmetic dentistry.
Dental Care at 209 NYC Dental
Our 209 NYC Dentists and staff understand the challenges that NYC patients face. Our patients have stayed with us for decades.
They travel from all over the US, Europe and distant parts of the world to see us.
Whether you have Dental Insurance or not, take advantage of a Free Consultation!
NYC Smile Design.
NYC Cosmetic Dentists Serving Manhattan and New York, NY.
COVID-19 Message to Our Patients, Future Patients, and Friends.
At NYC Smile Design, experienced New York City cosmetic dentists Dr. Elisa Mello and Dr. Ramin Tabib collaborate to provide you with comprehensive dental care and outstanding cosmetic dentistry results. Together they founded NYC Smile Design in 1994 and have been dedicated to providing life-changing dentistry ever since. To schedule a consultation, please call us at 212-452-3344.
As partners in marriage as well as business, Dr. Mello and Dr. Tabib have a warmth and commitment to excellence that influences all aspects of their lives. Their collaboration at NYC Smile Design means your care is approached in a multi-disciplinary, comprehensive manner. The dentists and entire staff share a belief in the importance of being a complete person: adhering to the highest professional standards while continuing to grow at the personal level.
Dr. Mello and Dr. Tabib are committed to understanding your perspective and respecting that you are entrusting them with your smile and dental health. Our dentists are also active in giving back to the community and have worked with programs including "Smiles for Success," restoring the smiles of battered women to build their confidence as they begin anew. Dr. Mello and Dr. Tabib have also provided services to participants of the Doe Fund, an organization that helps homeless men attain housing and employment.
"""

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Candidate phrases are every 1- and 2-word n-gram of the document, with English
# stop words removed before the n-grams are built.
n_gram_range = (1, 2)
stop_words = "english"

# Extract candidate words/phrases
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); support both so the cell runs on old and new installs.
# .tolist() keeps `candidates` a plain list of str, as downstream cells expect.
if hasattr(count, "get_feature_names_out"):
    candidates = count.get_feature_names_out().tolist()
else:
    candidates = count.get_feature_names()

In [None]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model (weights are downloaded on first use,
# as the progress bar in the recorded output shows).
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# Embed the whole document (wrapped in a one-element list) and every candidate phrase
# with the same model so they can be compared in the next cell.
doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(candidates)

100%|██████████| 245M/245M [00:13<00:00, 17.8MB/s]


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

top_n = 5
# NOTE: despite the name, `distances` holds cosine *similarities* (higher = closer
# to the document), one row for the document and one column per candidate.
distances = cosine_similarity(doc_embedding, candidate_embeddings)
# argsort() is ascending, so the last `top_n` indices of row 0 are the most similar
# candidates; the resulting list is ordered from least to most similar.
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]

In [None]:
keywords

['dentists nyc',
 'world dental',
 'dental oldest',
 'nyc dentists',
 'best dentists']

In [None]:
import numpy as np
import itertools

def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
    """Pick `top_n` keywords that are relevant to the document but dissimilar to each other.

    The `nr_candidates` words most similar to the document are shortlisted, then
    every `top_n`-sized combination of the shortlist is scored by the sum of its
    pairwise cosine similarities; the combination with the smallest sum wins.

    Args:
        doc_embedding: 2-D array holding the document embedding; only row 0 of the
            document/word similarity matrix is used.
        word_embeddings: 2-D array with one embedding row per entry of `words`.
        words: sequence of candidate words/phrases, aligned with `word_embeddings`.
        top_n: number of keywords to return.
        nr_candidates: size of the shortlist to search; must be >= `top_n`.

    Returns:
        list: the selected words, in shortlist order (ascending similarity to the
        document). Fewer than `top_n` items are returned when `words` is shorter
        than `top_n`; an empty list when `top_n <= 0` or `words` is empty.

    Raises:
        ValueError: if `top_n` is larger than `nr_candidates`.
    """
    if top_n > nr_candidates:
        raise ValueError(
            "nr_candidates (%d) must be at least top_n (%d)" % (nr_candidates, top_n)
        )
    if top_n <= 0 or len(words) == 0:
        return []

    # Use the arguments, not the notebook globals `candidate_embeddings` /
    # `candidates`, so the function works for any inputs it is given.
    distances = cosine_similarity(doc_embedding, word_embeddings)
    distances_candidates = cosine_similarity(word_embeddings, word_embeddings)

    # Shortlist the nr_candidates words most similar to the document
    # (argsort is ascending, so they are the trailing indices).
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    words_vals = [words[index] for index in words_idx]
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # The shortlist can be shorter than requested when there are few words.
    top_n = min(top_n, len(words_idx))

    def pairwise_sim(combination):
        # Sum of similarities over all ordered pairs (i, j), i != j, of the combination.
        return sum(distances_candidates[i][j]
                   for i in combination for j in combination if i != j)

    # min() keeps the first of equally-scored combinations, matching a strict
    # `sim < min_sim` scan over the combinations in generation order.
    best = min(itertools.combinations(range(len(words_idx)), top_n), key=pairwise_sim)

    return [words_vals[idx] for idx in best]

In [None]:
# Choose 5 mutually dissimilar keywords out of the 10 candidates closest to the document
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=10)

['quality dental',
 'dentists entire',
 'nyc dental',
 'dentists nyc',
 'dental oldest']

In [None]:
from keybert import KeyBERT

In [None]:
# NOTE: this rebinds `model` (previously the SentenceTransformer) to a KeyBERT
# instance, and `keywords` below replaces the list computed by hand earlier.
# Re-running the earlier `model.encode(...)` cell after this one needs the
# SentenceTransformer cell to be run again first.
model = KeyBERT('distilbert-base-nli-mean-tokens')
# Single-word keyphrases, with no stop-word list supplied
keywords = model.extract_keywords(doc, keyphrase_length=1, stop_words=None)

In [None]:
keywords

['dentists', 'dentistry', 'dental', 'nyc', 'professional']

In [None]:
# Three-word keyphrases with the max-sum option enabled: 5 results chosen from 20 candidates
model.extract_keywords(doc, keyphrase_length=3, stop_words='english', use_maxsum=True, nr_candidates=20, top_n=5)

['209 nyc dental',
 'dentists entire staff',
 'high quality dental',
 'oldest continuing dental',
 'nyc cosmetic dentists']

In [None]:
# Three-word keyphrases with the MMR option at a high diversity setting (0.7);
# compare the recorded output with the diversity=0.2 run further down.
model.extract_keywords(doc, keyphrase_length=3, stop_words='english', use_mmr=True, diversity=0.7)

['best dentists nyc',
 'patients stayed decades',
 'implants whitening advanced',
 'homeless men attain',
 'dentists entire staff']

In [None]:
# Same MMR extraction at a low diversity setting (0.2); the recorded output stays
# much closer to the "nyc dental / dentists" theme than the diversity=0.7 run.
model.extract_keywords(doc, keyphrase_length=3, stop_words='english', use_mmr=True, diversity=0.2)

['best dentists nyc',
 'nyc dental oldest',
 'dentists serving manhattan',
 'nyc dental comprehensively',
 'dental team wonderfully']

In [None]:
import spacy
from spacycake import BertKeyphraseExtraction as bake
# NOTE(review): 'en' is a shortcut name, which only resolves on spaCy v2.x with a
# linked English model; spaCy v3 requires a full package name such as
# 'en_core_web_sm' — confirm against the installed spaCy version.
nlp = spacy.load('en')

In [None]:
# Build the spacycake keyphrase component on top of the 'bert-base-uncased' weights
# (top_k=5 — presumably the number of phrases to keep; confirm in the spacycake docs).
cake = bake(nlp, from_pretrained='bert-base-uncased', top_k=5)
# Register it as the last step of the pipeline. Passing the component object itself
# is the spaCy v2 calling convention.
nlp.add_pipe(cake, last=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…

In [None]:
# Run the sample text through the spaCy pipeline (which now ends with the keyphrase
# component) and show the phrases stored on the resulting Doc's `_.extracted_phrases`.
# The pipeline object is `nlp`; `sdoc` was never defined and raised NameError.
print(nlp(doc)._.extracted_phrases)