In [1]:
"""
This basic example loads a pre-trained model from the web and uses it to
generate sentence embeddings for a given list of sentences.
"""

from sentence_transformers import SentenceTransformer, LoggingHandler
import numpy as np
import logging

# Display setup: numpy summarises (elides) any array with more than 100
# elements when printing, which keeps printed embeddings short.
np.set_printoptions(threshold=100)

# Logging setup: INFO-level records are formatted as "<timestamp> - <message>"
# and routed through sentence_transformers' LoggingHandler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)


# Load the pre-trained DistilBERT-based Sentence Transformer model by name;
# it is fetched automatically when it is not already available locally.
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

2020-08-26 14:35:59 - Load pretrained SentenceTransformer: distilbert-base-nli-stsb-mean-tokens
2020-08-26 14:35:59 - Did not find a '/' or '\' in the name. Assume to download model from server.
2020-08-26 14:35:59 - Load SentenceTransformer from folder: /Users/medbeji/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_distilbert-base-nli-stsb-mean-tokens.zip
2020-08-26 14:35:59 - loading configuration file /Users/medbeji/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_distilbert-base-nli-stsb-mean-tokens.zip/0_Transformer/config.json
2020-08-26 14:35:59 - Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertModel"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_

In [2]:
# Sentences to embed: three French news headlines about the coronavirus.
# NOTE(review): the model loaded above is named
# 'distilbert-base-nli-stsb-mean-tokens', which does not indicate multilingual
# training -- confirm it is suitable for French text before trusting the scores.
sentences = [
    'Coronavirus en France : 22 nouveaux décès, le taux de positivité progresse encore',
    'Port du masque obligatoire à Toulouse : il fallait "passer à autre chose de beaucoup plus fort", affirme le maire Jean-Luc Moudenc',
    'Coronavirus : la Tunisie impose le port obligatoire du masque',
]

# Encode all sentences in one call; the embeddings come back in the same
# order as `sentences` (the next cell relies on this when zipping them).
sentence_embeddings = model.encode(sentences)

2020-08-26 14:36:02 - Start tokenization 3 sentences


HBox(children=(IntProgress(value=0, description='Batches', max=1, style=ProgressStyle(description_width='initi…




In [3]:
from scipy.spatial import distance as dst

In [4]:
# The result is a list of sentence embeddings as numpy arrays.
# Compare every ordered pair of sentences (self-pairs included) and print
# their cosine similarity.
# scipy's dst.cosine returns the cosine *distance* (1 - similarity), so
# subtracting it from 1 yields the similarity: 1.0 for identical directions.
# The printed label therefore says "similarity", not "distance".
for sentence, embedding in zip(sentences, sentence_embeddings):
    for sentence_1, embedding_1 in zip(sentences, sentence_embeddings):
        print('The cosine similarity between : \n - {} \n - {} '.format(sentence, sentence_1))
        print('is equal to : {}'.format(1 - dst.cosine(embedding, embedding_1)))
        
        

The cosine distance between : 
 - Coronavirus en France : 22 nouveaux décès, le taux de positivité progresse encore 
 - Coronavirus en France : 22 nouveaux décès, le taux de positivité progresse encore 
is equal to : 1.0
The cosine distance between : 
 - Coronavirus en France : 22 nouveaux décès, le taux de positivité progresse encore 
 - Port du masque obligatoire à Toulouse : il fallait "passer à autre chose de beaucoup plus fort", affirme le maire Jean-Luc Moudenc 
is equal to : 0.43000683188438416
The cosine distance between : 
 - Coronavirus en France : 22 nouveaux décès, le taux de positivité progresse encore 
 - Coronavirus : la Tunisie impose le port obligatoire du masque 
is equal to : 0.5688125491142273
The cosine distance between : 
 - Port du masque obligatoire à Toulouse : il fallait "passer à autre chose de beaucoup plus fort", affirme le maire Jean-Luc Moudenc 
 - Coronavirus en France : 22 nouveaux décès, le taux de positivité progresse encore 
is equal to : 0.430006831