In [6]:
import os
from decouple import config, AutoConfig
from langchain_community.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
config = AutoConfig(search_path="/home/harry/Chatbot") 

In [7]:
MISTRAL_API_KEY = config("MISTRAL_API_KEY")

In [8]:
phrase_1 = "The dog ate my homework"
phrase_2 = "The homework ate my dog"

In [9]:
phrase_1_as_list = sorted([x.lower() for x in phrase_1.split(" ")])
phrase_2_as_list = sorted([x.lower() for x in phrase_2.split(" ")])
print(phrase_1_as_list)
print(phrase_2_as_list)

['ate', 'dog', 'homework', 'my', 'the']
['ate', 'dog', 'homework', 'my', 'the']


In [10]:
phrase_1_as_list == phrase_2_as_list

True

In [11]:
from mistralai import Mistral

model = "mistral-embed"

client = Mistral(api_key=MISTRAL_API_KEY)

In [16]:
def get_embedding(text, model=model):
    text = text.replace("\n", " ")
    return client.embeddings.create(model=model, inputs=[text]).data[0].embedding

In [17]:
documents = [
    "The cat jumped over the dog",
    "The cow jumped over the moon",
    "The turkey ran in circles",
]

In [19]:
embeddings = [get_embedding(x) for x in documents]

In [20]:
embeddings[0]

[0.021087646484375,
 -0.004070281982421875,
 0.058624267578125,
 0.0195465087890625,
 0.033599853515625,
 -0.0079345703125,
 0.05157470703125,
 -0.0252227783203125,
 -0.0090484619140625,
 -0.028472900390625,
 -0.034881591796875,
 0.0333251953125,
 -0.0516357421875,
 0.01184844970703125,
 -0.0285797119140625,
 0.020263671875,
 -0.00946807861328125,
 -0.01092529296875,
 0.05975341796875,
 -0.01153564453125,
 -0.00417327880859375,
 -0.0121002197265625,
 -0.0281982421875,
 0.004718780517578125,
 -0.01464080810546875,
 0.018341064453125,
 0.0115509033203125,
 -0.032958984375,
 -0.035186767578125,
 0.0009150505065917969,
 -0.01280975341796875,
 -0.057403564453125,
 -0.00926971435546875,
 0.01160430908203125,
 0.0131988525390625,
 -0.046630859375,
 -0.01369476318359375,
 -0.039031982421875,
 0.03240966796875,
 0.016510009765625,
 -0.03814697265625,
 -0.0264892578125,
 -0.01369476318359375,
 0.00022423267364501953,
 0.0120697021484375,
 0.013671875,
 -0.0016298294067382812,
 -0.045684814453125

In [21]:
import numpy as np

In [22]:
np.array(embeddings[0]).shape

(1024,)

In [23]:
def calculate_cosine_metrics(v1, v2):
    dot_product = np.dot(v1, v2)
    magnitude1 = np.linalg.norm(v1)
    magnitude2 = np.linalg.norm(v2)
    cosine_similarity = dot_product / (magnitude1 * magnitude2)
    cosine_distance = 1 - cosine_similarity
    return int(cosine_similarity * 100), int(cosine_distance * 100)

In [24]:
print(calculate_cosine_metrics(embeddings[0], embeddings[0]))
print(calculate_cosine_metrics(embeddings[0], embeddings[1]))
print(calculate_cosine_metrics(embeddings[0], embeddings[2]))

(100, 0)
(80, 19)
(72, 27)


In [25]:
query_str = "The moose sat by the turkey"
query_embedding = get_embedding(query_str)

for embedding in embeddings:
    print(calculate_cosine_metrics(query_embedding, embedding))

(70, 29)
(74, 25)
(82, 17)


In [26]:
phrase_1_embedding = get_embedding("The dog ate my homework and then burped it up")
phrase_2_embedding = get_embedding("The green dog ate my homework and then burped it up")

In [27]:
calculate_cosine_metrics(phrase_1_embedding, phrase_2_embedding)

(94, 5)