In [None]:
!pip install transformers torch



In [None]:
from transformers import BertModel, BertTokenizer, AutoTokenizer, AutoModel
import torch
from scipy.spatial.distance import cosine

# Embedding model and its matching tokenizer, both loaded by name through the
# transformers Auto* classes.
model_name = 'mixedbread-ai/mxbai-embed-large-v1'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# The two sets of answers to compare against each other
# (conjunto_a / conjunto_b = "set A" / "set B").
# NOTE(review): conjunto_a spells "Tokio" while conjunto_b spells "Tokyo" --
# presumably copied verbatim from the answer source; confirm before normalizing,
# since the spelling feeds directly into the embeddings.
conjunto_a = ["Agent_2 has been to Rome and Florence",
              "Agent_2 couldn't go to Venice",
              "Agent_1 wants to go to Florence",
              "Agent_1 has been to Tokio",
              "Agent_2 will need help from Agent_1 for the travel planning of Tokio",
              "Agent_1 thinks that Tokio looks like anime"
              ]
conjunto_b = ["Agent_2 loves to travel, especially now as a student",
              "Agent_2 recently visited Rome and Florence",
              "Agent_2 wanted to visit Venice but lacked time",
              "Agent_2 thinks Florence is the best city in Italy",
              "Agent_1 wants to visit Florence",
              "Agent_1 recently traveled to Tokyo",
              "Agent_2 is interested in visiting Tokyo",
              "Agent_1 found Tokyo very nice but big, hard to see all",
              "Tokyo's scenery is similar to anime, according to Agent_1",
              "Japanese culture is very different from European culture, noted by Agent_1",
              "Agent_2 plans to visit Tokyo next year and seeks Agent_1's help",
              "Agent_1 is willing to help Agent_2 with the trip to Tokyo, mentioning knowing how to go there"
              ]

def get_sentence_embedding(sentence, tokenizer, model):
    """Return a mean-pooled embedding of ``sentence`` as a NumPy array.

    The sentence is tokenized (truncated to 512 tokens), passed through
    ``model``, and the token vectors of ``last_hidden_state`` are averaged
    along the token axis. Size-1 dimensions are squeezed out of the result.

    Args:
        sentence: Text to embed; it is forwarded unchanged to ``tokenizer``.
        tokenizer: Callable that accepts the sentence plus the keyword
            arguments used below and returns a mapping of model inputs.
        model: Callable that accepts those inputs as keyword arguments and
            returns an object exposing ``last_hidden_state``.

    Returns:
        The pooled embedding converted with ``Tensor.numpy()``.
    """
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Inference only: skip building the autograd graph for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state

    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: every position counts, as in a plain mean.
        pooled = torch.mean(token_embeddings, dim=1)
    else:
        # padding=True can add pad positions when several sentences are
        # tokenized together; weight by the mask so they do not dilute the mean.
        mask = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against all-zero masks
        pooled = (token_embeddings * mask).sum(dim=1) / token_counts

    return pooled.squeeze().detach().numpy()


def calculate_similarity_and_print(conjunto_a, conjunto_b, tokenizer, model):
    """Return the mean best-match cosine similarity from ``conjunto_a`` to ``conjunto_b``.

    For each sentence in ``conjunto_a``, the highest cosine similarity against
    any sentence in ``conjunto_b`` is taken; the function returns the average
    of those per-sentence maxima. The measure is directional: swapping the two
    arguments generally gives a different value.

    Despite its name, the function does not print anything.

    Args:
        conjunto_a: Sentences that are each matched against ``conjunto_b``.
        conjunto_b: Candidate sentences to match against.
        tokenizer: Tokenizer forwarded to ``get_sentence_embedding``.
        model: Model forwarded to ``get_sentence_embedding``.

    Returns:
        The mean of the per-sentence maximum similarities. A sentence
        contributes ``-1`` when ``conjunto_b`` is empty.

    Raises:
        ZeroDivisionError: If ``conjunto_a`` is empty.
    """
    # Embed conjunto_b once up front; re-embedding it for every sentence of
    # conjunto_a would cost len(conjunto_a) * len(conjunto_b) forward passes.
    embeddings_b = [
        get_sentence_embedding(sent_b, tokenizer, model) for sent_b in conjunto_b
    ]

    max_similarities = []
    for sent_a in conjunto_a:
        embedding_a = get_sentence_embedding(sent_a, tokenizer, model)
        max_similarity = -1  # lower bound of cosine similarity
        for embedding_b in embeddings_b:
            # scipy's cosine() is a distance, so 1 - distance is the similarity.
            similarity = 1 - cosine(embedding_a, embedding_b)
            if similarity > max_similarity:
                max_similarity = similarity
        max_similarities.append(max_similarity)

    mean_similarity = sum(max_similarities) / len(max_similarities)
    return mean_similarity

# Compute the similarity in both directions. The two scores can differ because
# each call averages, over the sentences of its first argument, the best match
# found in its second argument.
similarity_a_b = calculate_similarity_and_print(conjunto_a, conjunto_b, tokenizer, model)
print(f"Similitud media de A a B: {similarity_a_b}\n")
similarity_b_a = calculate_similarity_and_print(conjunto_b, conjunto_a, tokenizer, model)
print(f"Similitud media de B a A: {similarity_b_a}\n")


Similitud media de A a B: 0.9210421939690908

Similitud media de B a A: 0.8690484861532847

