In [13]:
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

Matplotlib is building the font cache; this may take a moment.


In [14]:
# Build the token -> vector lookup from the GloVe text file.
# Every line is split on whitespace: the first field is the token and the
# remaining fields are parsed as its float32 components.
embeddings_dict = {}
with open("data/glove/glove.6B.100d.txt", 'r', encoding='utf-8') as glove_file:
    for row in glove_file:
        fields = row.split()
        embeddings_dict[fields[0]] = np.asarray(fields[1:], "float32")

In [15]:
def find_closest_embeddings(embedding, cutoff=25):
    """Return every token of ``embeddings_dict`` ordered by distance to ``embedding``.

    Args:
        embedding: Query vector. It is compared against each stored vector
            with ``spatial.distance.euclidean``.
        cutoff: Accepted but never read in the body, so it has no effect on
            the result. NOTE(review): the intended meaning (result count vs.
            distance threshold) is not visible here -- confirm before wiring
            it in.

    Returns:
        A list containing all tokens, nearest first.
    """
    def distance_to_query(token):
        return spatial.distance.euclidean(embeddings_dict[token], embedding)

    return sorted(embeddings_dict, key=distance_to_query)

In [16]:
# Analogy-style query: twig - branch + hand, then print the five nearest tokens.
analogy_vector = embeddings_dict["twig"] - embeddings_dict["branch"] + embeddings_dict["hand"]
print(find_closest_embeddings(analogy_vector)[:5])

['flashlight', 'twig', 'clipboard', 'shove', 'hand']


In [46]:
# Example sentences to compare: A and B both describe a nearest-neighbours
# model, while C is unrelated prose.
sentence_a = "Here we perform 5-fold cross validation of a KNN model after using a standard scaler"
sentence_b = "In this kernel I present a very simple K-nearest neighbors model based on the quantiles of the distribution"
sentence_c = "And so it begins but you can't have them all. Your heart has to settle down somewhere"

In [47]:
# Three independent zero vectors, one per sentence, each matching the shape
# and dtype of a stored word vector.
embedding_a, embedding_b, embedding_c = (
    np.zeros_like(embeddings_dict["branch"]) for _ in range(3)
)

In [48]:
def sum_word_vectors(sentence):
    """Sum the stored vectors of the whitespace-separated tokens of ``sentence``.

    Tokens are looked up verbatim in ``embeddings_dict``; any token without an
    entry is skipped. NOTE(review): because the lookup is an exact match,
    capitalised or punctuation-attached tokens (e.g. "Here", "all.") only
    contribute if the vocabulary contains them in that exact form -- consider
    normalising the text first.

    Args:
        sentence: Text to embed; split with ``str.split()``.

    Returns:
        A newly allocated vector with the same shape and dtype as a stored
        word vector. It is all zeros when no token is found.
    """
    # Start from a fresh accumulator on every call so that re-running the cell
    # cannot add the same word vectors a second time.
    total = np.zeros_like(embeddings_dict["branch"])
    for word in sentence.split():
        if word in embeddings_dict:
            total += embeddings_dict[word]
    return total


# Rebinding (rather than adding in place to arrays created elsewhere) keeps
# this cell idempotent.
embedding_a = sum_word_vectors(sentence_a)
embedding_b = sum_word_vectors(sentence_b)
embedding_c = sum_word_vectors(sentence_c)

In [50]:
# Five vocabulary tokens nearest to the vector built for sentence_a.
nearest_to_a = find_closest_embeddings(embedding_a)
print(nearest_to_a[:5])

['a', 'an', 'this', 'be', 'for']


In [54]:
# Euclidean distance between the vectors built for sentence_a and sentence_b.
spatial.distance.euclidean(embedding_a, embedding_b)

23.454050064086914

In [52]:
# Euclidean distance between the vectors built for sentence_a and sentence_c.
spatial.distance.euclidean(embedding_a, embedding_c)

32.27524185180664

In [53]:
# Euclidean distance between the vectors built for sentence_b and sentence_c.
spatial.distance.euclidean(embedding_b, embedding_c)

30.673566818237305

In [None]:
# Reference note: the per-token distance that find_closest_embeddings ranks by.
# Kept as a comment because the bare fragment had an unmatched ")" and uses
# `token` / `embedding`, which only exist inside that function.
#   spatial.distance.euclidean(embeddings_dict[token], embedding)

In [None]:
# Reference note: mirrors the ranking performed by find_closest_embeddings.
# Kept as a comment because a bare `return` outside a function body is a
# SyntaxError and would stop a full run of the notebook.
#   sorted(embeddings_dict.keys(), key=lambda token: spatial.distance.euclidean(embeddings_dict[token], embedding))

# BERT sentence embeddings (Sentence-Transformers)

In [1]:
! pip install -U sentence-transformers
# https://www.sbert.net/

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting sentence-transformers
  Using cached sentence_transformers-2.2.0-py3-none-any.whl
Collecting sentencepiece
  Using cached sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
Collecting nltk
  Using cached nltk-3.7-py3-none-any.whl (1.5 MB)
Installing collected packages: sentencepiece, nltk, sentence-transformers
Successfully installed nltk-3.7 sentence-transformers-2.2.0 sentencepiece-0.1.96


In [4]:
from sentence_transformers import SentenceTransformer, util

In [5]:
# Build the sentence encoder from the 'paraphrase-MiniLM-L6-v2' model name.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [6]:
# Candidate sentences that are compared pairwise by paraphrase_mining.
sentences = [
    'The cat sits outside',
    'A man is playing guitar',
    'I love pasta',
    'The new movie is awesome',
    'The cat plays in the garden',
    'A woman watches TV',
    'The new movie is so great',
    'Do you like pizza?',
]

paraphrases = util.paraphrase_mining(model, sentences)

# Each result unpacks into a score and the two sentence indices it refers to;
# print the first ten entries.
for score, i, j in paraphrases[:10]:
    print(f"{sentences[i]} \t\t {sentences[j]} \t\t Score: {score:.4f}")


The new movie is awesome 		 The new movie is so great 		 Score: 0.9298
The cat sits outside 		 The cat plays in the garden 		 Score: 0.6497
I love pasta 		 Do you like pizza? 		 Score: 0.4269
I love pasta 		 The new movie is so great 		 Score: 0.2594
I love pasta 		 The new movie is awesome 		 Score: 0.2528
Do you like pizza? 		 The new movie is awesome 		 Score: 0.1423
The new movie is so great 		 Do you like pizza? 		 Score: 0.1327
The new movie is awesome 		 A woman watches TV 		 Score: 0.1318
A woman watches TV 		 Do you like pizza? 		 Score: 0.1185
The new movie is so great 		 A woman watches TV 		 Score: 0.0961
