In [46]:
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
from tqdm import tqdm
import numpy as np
import pandas as pd
import requests 

In [28]:
# Name of the sentence-transformers model to load; the resulting
# `embedding_model` is what encodes the question and the documents below.
model_name = 'multi-qa-distilbert-cos-v1'
embedding_model = SentenceTransformer(model_name)

## Q1: What's the first value of the resulting vector?

In [29]:
# Embed a sample user question and inspect the first component of the vector.
user_question = "I just discovered the course. Can I still join it?"
user_question_embedding = embedding_model.encode(user_question)
user_question_embedding[0]

0.078222655

### Preparing the documents

In [30]:
# Download the id-tagged FAQ documents as JSON from the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail here with a clear HTTP error rather than a confusing JSON decode
# error on the next line when the server answers with an error page.
docs_response.raise_for_status()
documents = docs_response.json()

In [31]:
# Keep only the documents that belong to the course being evaluated
# and report how many remain.
course = 'machine-learning-zoomcamp'
course_documents = [
    document
    for document in documents
    if document['course'] == course
]
print(len(course_documents))

375


In [34]:
# Peek at the first filtered document to see which fields a record carries.
course_documents[0]

{'text': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'section': 'General course-related questions',
 'question': 'How do I sign up?',
 'course': 'machine-learning-zoomcamp',
 'id': '0227b872'}

## Q2: What's the shape of X?

In [32]:
# Embed every course document as "<question> <answer>" text, one call per
# document, then stack the vectors into a 2-D matrix (one row per document).
course_documents_embeddings = []
for doc in tqdm(course_documents):
    question, answer = doc['question'], doc['text']
    qa_text = f'{question} {answer}'
    qa_embedding = embedding_model.encode(qa_text)
    course_documents_embeddings.append(qa_embedding)

X = np.array(course_documents_embeddings)
X.shape

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 375/375 [01:02<00:00,  6.02it/s]


(375, 768)

## Q3: What's the highest score in the results?

In [35]:
# Dot product of every document embedding with the question embedding;
# the largest value is the score of the best-matching document.
scores = np.dot(X, user_question_embedding)
scores.max()

0.6506573

In [37]:
class VectorSearchEngine:
    """Brute-force similarity search over precomputed document embeddings.

    Documents are ranked by the dot product between their embedding and the
    query embedding, highest score first.
    """

    def __init__(self, documents, embeddings):
        """Store the corpus and its embedding matrix.

        Args:
            documents: Sequence of documents, indexable by integer position.
            embeddings: Array whose i-th row is the embedding of
                ``documents[i]``; it must provide a ``.dot`` method.
        """
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        """Return the documents most similar to a query embedding.

        Args:
            v_query: Embedding vector of the query.
            num_results: Maximum number of documents to return.

        Returns:
            A list of at most ``num_results`` documents, ordered by
            descending dot-product score.
        """
        similarity = self.embeddings.dot(v_query)
        # Negating the scores makes the ascending argsort rank best-first.
        ranked_positions = np.argsort(-similarity)
        top_documents = []
        for position in ranked_positions[:num_results]:
            top_documents.append(self.documents[position])
        return top_documents
    
def hit_rate(queries_relevance):
    """Compute the fraction of queries with at least one relevant result.

    Args:
        queries_relevance: Sized collection with one entry per query; each
            entry is a collection of booleans marking which retrieved
            documents matched the expected one.

    Returns:
        The number of entries containing ``True`` divided by the number of
        entries, or ``0.0`` when there are no entries at all.
    """
    total = len(queries_relevance)
    if total == 0:
        # No queries evaluated: report 0.0 instead of raising ZeroDivisionError.
        return 0.0
    hits = sum(1 for record in queries_relevance if True in record)
    return hits / total

In [41]:
# Build the in-memory engine over the course documents and fetch the five
# closest documents for the sample question.
search_engine = VectorSearchEngine(course_documents, X)
result = search_engine.search(v_query=user_question_embedding, num_results=5)

In [42]:
# Load the ground-truth dataset and keep only the rows for this course.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = (
    pd.read_csv(ground_truth_url)
    .loc[lambda frame: frame['course'] == 'machine-learning-zoomcamp']
)
# One dict per row: the question, its course and the expected document id.
ground_truth = df_ground_truth.to_dict(orient='records')

In [43]:
# Show one ground-truth record to check its fields.
ground_truth[0]

{'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

## Q4: Calculate the hit rate of `VectorSearchEngine` with `num_results=5`

In [44]:
# The search engine already holds the id-tagged documents and their embeddings.
# For every ground-truth question, embed it, retrieve the top 5 documents and
# record, per retrieved document, whether it is the expected one.

queries_relevance = []
for doc in tqdm(ground_truth):
    question = doc['question']
    question_id = doc['document']
    question_embedding = embedding_model.encode(question)
    retrieved_docs = search_engine.search(question_embedding, num_results=5)
    # `hit` avoids reusing the loop variable name inside the comprehension.
    relevance = [question_id == hit['id'] for hit in retrieved_docs]
    queries_relevance.append(relevance)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1830/1830 [01:23<00:00, 21.79it/s]


In [45]:
# Fraction of ground-truth questions whose expected document was among the
# retrieved results.
hit_rate(queries_relevance)

0.9398907103825137

### Now let's index these documents with Elasticsearch

In [None]:
# Create an Elasticsearch client pointed at the local node URL.
es_client= Elasticsearch('http://localhost:9200')

In [None]:
# Index configuration: a single shard without replicas, the plain document
# fields, and three 768-dimensional dense-vector fields compared by cosine
# similarity.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            # The three vector fields share one definition; each still gets
            # its own dict object.
            **{
                vector_field: {
                    "type": "dense_vector",
                    "dims": 768,
                    "index": True,
                    "similarity": "cosine",
                }
                for vector_field in (
                    "question_vector",
                    "text_vector",
                    "question_text_vector",
                )
            },
        }
    },
}