# SiameseBert Semantic Sentence Similarity

In [3]:
from sentence_transformers import SentenceTransformer

# Load the BERT sentence-embedding model.
# Lists of the available pretrained checkpoints:
#   * trained on Natural Language Inference (NLI):
#     https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/nli-models.md
#   * trained on Semantic Textual Similarity (STS):
#     https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/sts-models.md
MODEL_NAME = 'xlm-r-100langs-bert-base-nli-stsb-mean-tokens'

model = SentenceTransformer(MODEL_NAME)

In [4]:
# The corpus: a list of short phrases, each one treated as a separate "sentence".
# Entries are kept unique ("date" used to be listed twice) so the same phrase
# cannot appear more than once in the search results.
# NOTE(review): 'rythm' looks like a misspelling of 'rhythm'; it is left as-is
# because changing the text changes its embedding — confirm before correcting.
sentences = ['clock', 'time', 'how many hours', 'minutes', 'date', 'calendar', 'schedule', 'timetable', 'plan', 'agenda', 'task list', 'things to do', 'todo list', 'work', 'music', 'play music', 'rythm', 'melody']
# Each sentence is encoded as a 1-D vector; the length printed below is the
# embedding size (768 in the recorded run).
sentence_embeddings = model.encode(sentences)

print('Sample BERT embedding vector - length', len(sentence_embeddings[0]))

print('Sample BERT embedding vector - note includes negative values', sentence_embeddings[0])

Sample BERT embedding vector - length 768
Sample BERT embedding vector - note includes negative values [ 1.96707711e-01  5.17741740e-01  1.93440509e+00 -2.43900269e-01
  1.98107734e-01  5.86221039e-01  7.48901293e-02  7.85903573e-01
 -1.08110690e+00 -2.58463383e-01  6.10251367e-01  2.63684779e-01
 -2.53063068e-03  1.23407829e+00  2.57874489e-01 -7.06427395e-01
 -2.49437064e-01 -1.60612524e-01  2.40972549e-01 -6.54870510e-01
  3.94866824e-01  5.05520105e-01  1.10606885e+00 -2.20283806e-01
 -8.17135453e-01 -6.08660221e-01 -7.56687641e-01 -1.05306840e+00
 -1.10706568e-01 -9.81161371e-02 -1.05209678e-01 -3.34081620e-01
 -1.13889903e-01  1.53898582e-01 -3.98865968e-01 -8.63566637e-01
 -1.18840545e-01  1.94269121e-01 -6.77994639e-02 -1.50896221e-01
  4.13627177e-02 -4.20629889e-01 -9.50373709e-02  2.05497652e-01
 -1.69430465e-01 -1.53817445e-01 -4.84115243e-01 -2.45825469e-01
 -5.59904397e-01 -1.01968706e+00 -3.83721679e-01 -1.21297412e-01
  6.25217557e-01  8.76988709e-01 -3.26742351e-01  8.

In [5]:
#@title Semantic Search Form

# code adapted from https://github.com/UKPLab/sentence-transformers/blob/master/examples/application_semantic_search.py
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.spatial.distance` has been loaded.
import scipy.spatial.distance

query = "what is my plan of work for tomorrow?" #@param {type: 'string'}

queries = [query]
query_embeddings = model.encode(queries)

# How many of the closest corpus sentences to show for each query (ranked by cosine similarity)
number_top_matches = 5 #@param {type: "number"}

print("Semantic Search Results")

# `current_query` avoids shadowing the form variable `query` above.
for current_query, query_embedding in zip(queries, query_embeddings):
    # Cosine distance from this query to every corpus sentence; smaller means more similar.
    distances = scipy.spatial.distance.cdist([query_embedding], sentence_embeddings, "cosine")[0]

    # (corpus index, distance) pairs ordered from closest to farthest.
    results = sorted(enumerate(distances), key=lambda pair: pair[1])

    print("\n\n======================\n\n")
    print("Query:", current_query)
    # Report the configured count instead of a hard-coded "5".
    print("\nTop %d most similar sentences in corpus:" % number_top_matches)

    for idx, distance in results[:number_top_matches]:
        # Cosine similarity is 1 minus cosine distance.
        print(sentences[idx].strip(), "(Cosine Score: %.4f)" % (1 - distance))

Semantic Search Results




Query: what is my plan of work for tomorrow?

Top 5 most similar sentences in corpus:
plan (Cosine Score: 0.5234)
schedule (Cosine Score: 0.4780)
timetable (Cosine Score: 0.4590)
agenda (Cosine Score: 0.4222)
task list (Cosine Score: 0.3617)
