## import model

In [11]:
from sentence_transformers import SentenceTransformer

In [12]:
from transformers import AutoTokenizer, AutoModel
import torch

## mean pooling 

In [13]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, skipping masked positions.

    Args:
        model_output: Indexable model result; element 0 is used as the token
            embeddings (assumed shape (batch, seq_len, hidden) -- TODO confirm).
        attention_mask: Mask with one dimension fewer than the token
            embeddings; positions holding 0 do not contribute to the average.

    Returns:
        The mask-weighted mean of the token embeddings along dimension 1.
    """
    embeddings = model_output[0]
    # Give the mask a trailing axis and stretch it to the embeddings' size so
    # it can be multiplied element-wise.
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_total = (embeddings * mask).sum(dim=1)
    # Clamp the denominator so a fully masked row divides by 1e-9, not by 0.
    mask_count = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / mask_count


## initialize model and tokenizer 

In [15]:
# Checkpoint identifier, named once so the tokenizer and the model are always
# loaded from the same checkpoint.
MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

In [61]:
# Demo inputs; the first sentence is the one the others are compared against.
sentences = [
    'i love ice cream',
    'i love summer',
    'my name is nikos',
    'today is raining',
]

## encode input 

In [62]:
# Tokenize the whole batch in one call, with padding and truncation enabled,
# and ask for PyTorch tensors ('pt') as the return type.
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors='pt',
)


## compute embeddings

In [63]:
# Run the forward pass with gradient tracking disabled: the embeddings are
# only read, never back-propagated through.
with torch.no_grad():
    # The tokenizer's outputs are passed to the model as keyword arguments.
    model_output = model(**encoded_input)

## pooling 

In [64]:
# Collapse the per-token embeddings into one vector per sentence, using the
# attention mask to decide which token positions count towards the mean.
sentence_embeddings = mean_pooling(
    model_output,
    encoded_input['attention_mask'],
)


In [65]:
# Show each sentence followed by its pooled embedding vector.
for i, embedding in enumerate(sentence_embeddings):
    print(f"embedings of sentence {i} == {sentences[i]}")
    print(embedding)

embedings of sentence 0 == i love ice cream
tensor([-8.0233e-02, -5.9561e-01,  1.3601e-01,  2.4189e-02,  7.2587e-02,
         3.7004e-01,  6.9979e-01, -6.0084e-02, -9.6385e-02, -1.8673e-01,
        -4.6589e-01, -4.3960e-01, -4.2343e-03, -1.6616e-01,  2.2284e-03,
         6.7262e-02,  4.0143e-01, -1.0349e-01,  9.8340e-02, -5.3053e-01,
        -2.8681e-01,  2.5533e-01, -2.8782e-01,  4.1160e-01, -5.1841e-01,
        -1.6700e-01,  3.4259e-01,  2.7450e-01, -2.5793e-01,  2.8928e-01,
         3.4717e-01, -2.1574e-01,  5.5264e-01,  2.2878e-01, -1.1593e-01,
        -7.1598e-02,  1.8216e-03, -2.5760e-01,  3.0719e-01,  2.6315e-01,
         2.1875e-02, -1.2320e-01,  6.4599e-02, -2.0752e-02, -1.0291e-01,
         1.5998e-01,  8.8520e-02, -2.5669e-01,  1.2080e-01,  5.9477e-01,
         1.8772e-01, -1.4886e-01, -2.4022e-01,  3.1221e-01,  4.8516e-01,
        -2.6546e-01, -3.3504e-01, -5.5933e-01, -4.1757e-02, -2.1948e-01,
        -1.3416e-01, -4.2179e-01,  2.0841e-01, -1.4743e-01, -2.2229e-01,
       

## calculate similarity using cosine similarity

In [66]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the first sentence's embedding and each of the
# others, keyed by the text of the sentence being compared.
# NOTE: the result name `similarites` (sic) is kept as-is; it is the name this
# cell exports.
query_embedding = sentence_embeddings[0].reshape(1, -1)
similarites = {
    # cosine_similarity returns a 1x1 matrix for two single-row inputs;
    # [0][0] extracts the scalar score.
    sentences[i]: cosine_similarity(
        query_embedding,
        sentence_embeddings[i].reshape(1, -1),
    )[0][0]
    for i in range(1, len(sentence_embeddings))
}





In [67]:
# Report how similar each remaining sentence is to the first one.
# The dictionary is read under the name `similarites` (sic) because that is
# the name the similarity cell actually defines; the previous reference to
# `similarities` raised NameError.
print(f"similarities with sentence '{sentences[0]}':")
for key, value in similarites.items():
    # Print the score next to the sentence (previously only the key was shown).
    print(f"{key}: {value:.4f}")

{'i love summer': 0.3511556,
 'my name is nikos': 0.12415543,
 'today is raining': 0.1207068}