<a href="https://colab.research.google.com/github/JacopoMangiavacchi/SBERT-ZSC/blob/main/SBERT-Cosine-Similarity-Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.', 
    'The quick brown fox jumps over the lazy dog.'
]

# Test Sentence BERT with Hugginface Transformer 

In [2]:
!pip install transformers

from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F
from scipy import spatial

tokenizer = AutoTokenizer.from_pretrained('deepset/sentence_bert')
model = AutoModel.from_pretrained('deepset/sentence_bert')



Some weights of the model checkpoint at deepset/sentence_bert were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# run inputs through model and mean-pool over the sequence
# dimension to get sequence-level representations
inputs = tokenizer.batch_encode_plus(sentences,
                                     return_tensors='pt',
                                     pad_to_max_length=True)



In [4]:
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
output = model(input_ids, attention_mask=attention_mask)[0]
sentence_rep = output.mean(dim=1)

In [5]:
# find the highest cosine similarities between sentences
print(F.cosine_similarity(sentence_rep[0], sentence_rep[0], dim=0))
print(F.cosine_similarity(sentence_rep[0], sentence_rep[1], dim=0))
print(F.cosine_similarity(sentence_rep[0], sentence_rep[2], dim=0))
print(F.cosine_similarity(sentence_rep[1], sentence_rep[2], dim=0))

tensor(1., grad_fn=<DivBackward0>)
tensor(0.5969, grad_fn=<DivBackward0>)
tensor(-0.1929, grad_fn=<DivBackward0>)
tensor(-0.0781, grad_fn=<DivBackward0>)


In [6]:
embeddings = sentence_rep.detach().numpy()

print(1 - spatial.distance.cosine(embeddings[0], embeddings[0]))
print(1 - spatial.distance.cosine(embeddings[0], embeddings[1]))
print(1 - spatial.distance.cosine(embeddings[0], embeddings[2]))
print(1 - spatial.distance.cosine(embeddings[1], embeddings[2]))

1.0
0.5969038605690002
-0.19288042187690735
-0.07813181728124619


# Test Sentence BERT with Sentence-Transformer

In [7]:
!pip install -U sentence-transformers

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')



In [8]:
print("Max Sequence Length:", model.max_seq_length)

embeddings = model.encode(sentences)

Max Sequence Length: 256


In [9]:
print(1 - spatial.distance.cosine(embeddings[0], embeddings[0]))
print(1 - spatial.distance.cosine(embeddings[0], embeddings[1]))
print(1 - spatial.distance.cosine(embeddings[0], embeddings[2]))
print(1 - spatial.distance.cosine(embeddings[1], embeddings[2]))

1.0
0.5380793213844299
0.11805637180805206
0.10358978062868118


# Test Huggingface with 'bert-base-uncased' Tokenizer

In [11]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('deepset/sentence_bert')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/sentence_bert were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
inputs = tokenizer.batch_encode_plus(sentences,
                                     return_tensors='pt',
                                     pad_to_max_length=True)

input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
output = model(input_ids, attention_mask=attention_mask)[0]
sentence_rep = output.mean(dim=1)



In [13]:
print(F.cosine_similarity(sentence_rep[0], sentence_rep[0], dim=0))
print(F.cosine_similarity(sentence_rep[0], sentence_rep[1], dim=0))
print(F.cosine_similarity(sentence_rep[0], sentence_rep[2], dim=0))
print(F.cosine_similarity(sentence_rep[1], sentence_rep[2], dim=0))

tensor(1., grad_fn=<DivBackward0>)
tensor(0.5969, grad_fn=<DivBackward0>)
tensor(-0.1929, grad_fn=<DivBackward0>)
tensor(-0.0781, grad_fn=<DivBackward0>)
