In [18]:
import os
import numpy as np
from decouple import config, AutoConfig
# Directory handed to decouple's AutoConfig as its settings search path.
# The machine-specific default can be overridden with the CHATBOT_CONFIG_DIR
# environment variable so the notebook runs on other checkouts unchanged.
# NOTE: this rebinds `config`, shadowing the `config` object imported above.
config = AutoConfig(
    search_path=os.environ.get("CHATBOT_CONFIG_DIR", "/home/harry/chatbotDjango")
)

In [19]:
# Read the Mistral API key through `config` so the secret is not hard-coded here.
MISTRAL_API_KEY = config("MISTRAL_API_KEY")

In [20]:
from mistralai import Mistral

# Name of the embedding model requested from the Mistral API.
model = "mistral-embed"

# API client authenticated with the key loaded into MISTRAL_API_KEY.
client = Mistral(api_key=MISTRAL_API_KEY)

In [21]:
def get_embedding(text, model=model):
    """Embed one string or a batch of strings via the Mistral embeddings API.

    Args:
        text: A single string, or an iterable of strings (list, tuple, ...).
            A lone string is sent as a one-element batch.
        model: Embedding model name forwarded to the API. Defaults to the
            value the module-level ``model`` had when this cell was executed.

    Returns:
        A NumPy array built from ``response.data``, one row per returned
        entry (a one-element batch yields an array with a single row).
    """
    # Test for `str` instead of "not a list": a tuple or other sequence of
    # strings is a batch and must not be wrapped as one nested input.
    if isinstance(text, str):
        inputs = [text]
    else:
        inputs = list(text)
    response = client.embeddings.create(model=model, inputs=inputs)
    return np.array([entry.embedding for entry in response.data])

In [22]:
def cosine_similarity(emb1, emb2):
    """Return the cosine of the angle between two 1-D vectors.

    Args:
        emb1: First vector (array-like).
        emb2: Second vector (array-like), same length as ``emb1``.

    Returns:
        ``dot(emb1, emb2) / (||emb1|| * ||emb2||)``. If either vector has
        zero norm the similarity is undefined; ``0.0`` is returned instead
        of the NaN (plus RuntimeWarning) a 0/0 division would produce.
    """
    norm_product = np.linalg.norm(emb1) * np.linalg.norm(emb2)
    if norm_product == 0:
        # Zero-length vector: no direction to compare against.
        return np.float64(0.0)
    return np.dot(emb1, emb2) / norm_product

In [23]:
# Toy corpus: three similarly worded sentences about Harry plus one
# unrelated sentence about a video game.
docs = [
    "Harry Was Here before you", 
    "You Were Here before Harry",
    "Harry Was not Here",
    "The new Assassin's Creed game would be release in two weeks",
]

In [24]:
# Peek at the first document.
docs[0]

'Harry Was Here before you'

In [25]:
# Wrap each raw string in a record that also carries its position in `docs`,
# echoing every (position, text) pair as it is processed.
documents = []
for position, text in enumerate(docs):
    print(position, text)
    documents.append({"index": position, "data": text})

documents

0 Harry Was Here before you
1 You Were Here before Harry
2 Harry Was not Here
3 The new Assassin's Creed game would be release in two weeks


[{'index': 0, 'data': 'Harry Was Here before you'},
 {'index': 1, 'data': 'You Were Here before Harry'},
 {'index': 2, 'data': 'Harry Was not Here'},
 {'index': 3,
  'data': "The new Assassin's Creed game would be release in two weeks"}]

In [26]:
# Explicit copy of the records: each entry pairs a document's index with its text.
dataset = [
    {"index": 0, "data": "Harry Was Here before you"},
    {"index": 1, "data": "You Were Here before Harry"},
    {"index": 2, "data": "Harry Was not Here"},
    {
        "index": 3,
        "data": "The new Assassin's Creed game would be release in two weeks",
    },
]

In [27]:
import time

# Embed every record with its own API request and keep (index, embedding)
# pairs. The 2-second pause after each request is presumably there to stay
# under the API rate limit -- confirm against the account's quota.
document_embeddings = []
for record in dataset:
    vector = get_embedding(record['data'])
    document_embeddings.append((record['index'], vector))
    time.sleep(2)

In [31]:
# Free-text query to score against the stored document embeddings.
query = "I bought a new game called Assassin's Creed"
# Passed as a one-element list; the result is indexed with [0] when scoring.
query_embedding = get_embedding([query])

In [32]:
# Inspect the array shape of the query embedding.
query_embedding.shape

(1, 1024)

In [33]:
# Score every stored document against the query. Each embedding is held as a
# one-row array, so [0] selects the vector itself.
results = [
    (doc_id, cosine_similarity(query_embedding[0], doc_embedding[0]))
    for doc_id, doc_embedding in document_embeddings
]

# Rank documents from most to least similar to the query.
sorted_results = sorted(results, key=lambda pair: pair[1], reverse=True)
sorted_results

[(3, np.float64(0.7721628054416644)),
 (2, np.float64(0.5582020361938994)),
 (0, np.float64(0.5134759028374886)),
 (1, np.float64(0.4897160057580173))]