In [1]:
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
from tqdm import tqdm
import numpy as np
import pandas as pd
import requests 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Name of the pretrained sentence-transformers checkpoint used for every embedding in this notebook.
model_name = 'multi-qa-distilbert-cos-v1'
# Load the model once; later cells reuse `embedding_model` to encode questions and documents.
embedding_model = SentenceTransformer(model_name)

## Q1: What's the first value of the resulting vector?

In [3]:
# Encode a sample user question; the same embedding is reused by the search cells further down.
user_question = "I just discovered the course. Can I still join it?"
user_question_embedding=embedding_model.encode(user_question)
# Q1 answer: the first component of the embedding vector.
user_question_embedding[0]

0.078222655

### Preparing the documents

In [4]:
# Download the FAQ documents (each carrying a stable `id`) from the course repository.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=60)
# Fail loudly on a 4xx/5xx response instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents = docs_response.json()

In [5]:
# Restrict the corpus to a single course; `course` is reused later as the Elasticsearch filter value.
course = 'machine-learning-zoomcamp'
course_documents = [
    faq_entry
    for faq_entry in documents
    if faq_entry['course'] == course
]
print(len(course_documents))

375


In [6]:
# Inspect one record to see the available fields (text, section, question, course, id).
course_documents[0]

{'text': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'section': 'General course-related questions',
 'question': 'How do I sign up?',
 'course': 'machine-learning-zoomcamp',
 'id': '0227b872'}

## Q2: What's the shape of X?

In [7]:
# Embed every FAQ entry as "<question> <answer text>", one model call per document.
course_documents_embeddings = [
    embedding_model.encode(f"{faq_entry['question']} {faq_entry['text']}")
    for faq_entry in tqdm(course_documents)
]

# Stack the per-document vectors into a single matrix (one row per document).
X = np.array(course_documents_embeddings)
X.shape

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 375/375 [01:06<00:00,  5.67it/s]


(375, 768)

## Q3: What's the highest score in the results?

In [8]:
# Dot product of every document embedding with the query embedding: one score per document.
scores=X.dot(user_question_embedding)
# Q3 answer: the highest score across all documents.
scores.max()

0.6506573

In [9]:
class VectorSearchEngine():
    """Exhaustive dot-product search over a fixed set of document embeddings."""

    def __init__(self, documents, embeddings):
        """Store the documents together with their embeddings.

        Args:
            documents: Indexable collection of documents; position ``i`` must
                correspond to row ``i`` of ``embeddings``.
            embeddings: Object exposing ``.dot(query)`` that yields one score
                per document (the notebook passes a 2-D NumPy array).
        """
        self.embeddings = embeddings
        self.documents = documents

    def search(self, v_query, num_results=10):
        """Return the documents that score highest against ``v_query``.

        Args:
            v_query: Embedding of the query.
            num_results: Maximum number of documents to return.

        Returns:
            A list of documents ordered from highest to lowest dot-product score.
        """
        similarity = self.embeddings.dot(v_query)
        # Negating the scores makes the ascending argsort rank the best match first.
        ranking = np.argsort(-similarity)
        best_positions = ranking[:num_results]
        matches = []
        for position in best_positions:
            matches.append(self.documents[position])
        return matches
    
def hit_rate(queries_relevance):
    """Return the fraction of queries whose results contain at least one hit.

    Args:
        queries_relevance: A sized collection with one entry per query; each
            entry is a collection of booleans telling whether the corresponding
            retrieved document was the expected one.

    Returns:
        The share of entries that contain ``True``, as a float between 0 and 1.
        Returns ``0.0`` when there are no queries at all.
    """
    total = len(queries_relevance)
    if total == 0:
        # Nothing was evaluated; report 0.0 rather than dividing by zero.
        return 0.0
    hits = sum(1 for record in queries_relevance if True in record)
    return hits / total

In [10]:
# Build the in-memory search engine over the course documents and their embedding matrix X,
# then retrieve the 5 highest-scoring documents for the sample user question.
search_engine = VectorSearchEngine(documents=course_documents, embeddings=X)
result=search_engine.search(user_question_embedding, num_results=5)

In [11]:
# Load the ground-truth dataset and keep only the rows for the course under evaluation.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = (
    pd.read_csv(ground_truth_url)
    .loc[lambda frame: frame.course == 'machine-learning-zoomcamp']
)
# One dict per row, which is the shape the evaluation loops iterate over.
ground_truth = df_ground_truth.to_dict(orient='records')

In [12]:
# Inspect one ground-truth record: a question, its course, and the id of the expected document.
ground_truth[0]

{'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

## Q4: Calculate the hit rate of VectorSearchEngine with num_results=5

In [13]:
# The search engine was initialised with the id-carrying documents and their embeddings.
# For every ground-truth question, record which of the top-5 retrieved documents
# (if any) has the id stored in the record's `document` field.

queries_relevance = []
for gt_record in tqdm(ground_truth):
    query_text = gt_record['question']
    expected_id = gt_record['document']
    query_vector = embedding_model.encode(query_text)
    top_docs = search_engine.search(query_vector, num_results=5)
    queries_relevance.append([expected_id == found['id'] for found in top_docs])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1830/1830 [01:32<00:00, 19.81it/s]


In [14]:
# Q4 answer: share of ground-truth questions whose expected document appears in the top 5.
hit_rate(queries_relevance)

0.9398907103825137

### Now let's index these documents with Elasticsearch

In [15]:
# Client for an Elasticsearch node expected to be listening on localhost:9200.
es_client= Elasticsearch('http://localhost:9200')

In [16]:
# Index definition: one shard, no replicas, the FAQ fields, and three
# 768-dimensional embedding fields compared with cosine similarity.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # `course` and `id` use the keyword type; the kNN queries filter on `course` with a term clause.
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            # All three vector fields share the same dense_vector configuration;
            # the comprehension builds an independent dict for each of them.
            **{
                vector_field: {
                    "type": "dense_vector",
                    "dims": 768,
                    "index": True,
                    "similarity": "cosine",
                }
                for vector_field in (
                    "question_vector",
                    "text_vector",
                    "question_text_vector",
                )
            },
        }
    },
}

In [17]:
# (Re)create the index using the settings and mappings held in `index_settings`.
index_name = "course-questions"
# Drop any index that already has this name so the cell can be re-run from scratch;
# note that this discards whatever was indexed under that name before.
es_client.indices.delete(index = index_name , ignore_unavailable=True) # delete index if it exists
es_client.indices.create(index = index_name , body= index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [18]:
# Look at one record again before embedding and indexing the documents.
course_documents[0]

{'text': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'section': 'General course-related questions',
 'question': 'How do I sign up?',
 'course': 'machine-learning-zoomcamp',
 'id': '0227b872'}

In [20]:
# Embed each document three ways (question only, answer text only, question + text)
# and store it in Elasticsearch together with those vectors.
for doc in tqdm(course_documents):
    question = doc['question']
    text = doc['text']
    question_text = question + ' ' + text

    # Build a separate payload instead of adding keys to `doc`: the very same dict
    # objects are held by `course_documents` and by `search_engine.documents`, so
    # mutating them would attach three large vectors to every record used elsewhere
    # in the notebook. `.tolist()` turns each NumPy array into a plain list of
    # floats, so the request body does not depend on NumPy-aware serialisation.
    indexed_doc = {
        **doc,
        'question_vector': embedding_model.encode(question).tolist(),
        'text_vector': embedding_model.encode(text).tolist(),
        'question_text_vector': embedding_model.encode(question_text).tolist(),
    }

    es_client.index(index=index_name, document=indexed_doc)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 375/375 [02:28<00:00,  2.52it/s]


## Q5: Indexing with Elasticsearch: What is the ID with the highest score?

In [22]:
# kNN query on the question embeddings, restricted to documents of the selected course.
query = dict(
    field="question_vector",
    query_vector=user_question_embedding,
    k=5,
    num_candidates=10000,
    filter=dict(term=dict(course=course)),
)
res = es_client.search(
    index=index_name,
    knn=query,
    # Return only the original FAQ fields, not the stored vectors.
    source=["text", "section", "question", "course", "id"],
)
# Show the first hit of the response.
res['hits']['hits'][0]

{'_index': 'course-questions',
 '_id': '9oCBwZABJrY4uIZmF1yO',
 '_score': 0.96726,
 '_source': {'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'id': 'ee58a693'}}

In [28]:
def elastic_search_knn(field, vector, course, k=5, num_candidates=10000):
    """Run a kNN search against the Elasticsearch index and return the matched documents.

    Uses the module-level ``es_client`` and ``index_name``.

    Args:
        field: Name of the vector field to search (for example ``"question_vector"``).
        vector: Query embedding to compare against ``field``.
        course: Only documents whose ``course`` equals this value are considered.
        k: Number of neighbours requested. Defaults to 5, the value that was
            previously hard-coded.
        num_candidates: Value sent as ``num_candidates`` in the kNN clause.
            Defaults to 10000, the value that was previously hard-coded.

    Returns:
        A list with the ``_source`` of every hit, in the order the hits were returned.
    """
    query = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    res = es_client.search(
        index=index_name,
        knn=query,
        # Only fetch the original FAQ fields; the stored vectors are not needed by callers.
        source=["text", "section", "question", "course", "id"],
    )

    return [hit['_source'] for hit in res['hits']['hits']]

def question_vector_knn(q, field):
    """Embed a ground-truth question and search for it with Elasticsearch kNN.

    Args:
        q: Mapping with at least the keys ``'question'`` and ``'course'``.
        field: Name of the vector field to search.

    Returns:
        Whatever ``elastic_search_knn`` returns for the embedded question.
    """
    query_text, target_course = q['question'], q['course']
    query_embedding = embedding_model.encode(query_text)
    return elastic_search_knn(
        field=field,
        vector=query_embedding,
        course=target_course,
    )

In [29]:
def evaluate(ground_truth, search_function, field):
    """Compute retrieval metrics for ``search_function`` over the ground-truth set.

    Args:
        ground_truth: Iterable of records, each with a ``'document'`` key holding
            the id of the expected document.
        search_function: Callable invoked as ``search_function(record, field)``
            that returns documents, each with an ``'id'`` key.
        field: Forwarded unchanged to ``search_function``.

    Returns:
        A dict with a single key, ``'hit_rate'``.
    """
    def _relevance_flags(entry):
        # One boolean per returned document: does it have the expected id?
        expected_id = entry['document']
        return [found['id'] == expected_id for found in search_function(entry, field)]

    per_query_flags = [_relevance_flags(entry) for entry in tqdm(ground_truth)]

    return {'hit_rate': hit_rate(per_query_flags)}

In [None]:
# Hit rate of Elasticsearch kNN retrieval when searching the combined question + text embeddings.
evaluate(ground_truth, question_vector_knn, "question_text_vector")

 21%|██████████████████████████████▍                                                                                                                   | 381/1830 [00:19<01:07, 21.42it/s]