## In this notebook we evaluate retrieval that uses vector search, basically combining the previous two notebooks

In [35]:
import json
import pandas as pd
from tqdm import tqdm
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

In [13]:
# 1. Read the documents.
# JSON interchange files are UTF-8; pin the encoding so decoding does not
# depend on the platform's default locale encoding.
with open('data-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [8]:
# Set up the sentence-embedding model and the Elasticsearch client that the
# cells below use for encoding text and for indexing/searching.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)
es_client = Elasticsearch('http://localhost:9200')

In [9]:
# Index configuration: a single shard with no replicas, plus the field mapping.
# The three embedding fields share one dense_vector definition (384 dimensions,
# indexed, cosine similarity), so they are generated instead of spelled out.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            **{
                vector_field: {
                    "type": "dense_vector",
                    "dims": 384,
                    "index": True,
                    "similarity": "cosine",
                }
                for vector_field in (
                    "question_vector",
                    "text_vector",
                    "question_text_vector",
                )
            },
        }
    },
}

In [11]:
# (Re)create the index now that the mapping is defined.
index_name = "course-questions"

# Drop any previous copy first; ignore_unavailable=True keeps this from
# failing when the index does not exist yet.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [15]:
# Peek at the first record to see which fields each document carries.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [16]:
# Embed every document three ways (question only, answer text only, and the two
# joined by a space), store the vectors on the document, then index it.
for doc in tqdm(documents):
    question, text = doc['question'], doc['text']
    question_text = question + ' ' + text

    inputs_by_field = {
        'question_vector': question,
        'text_vector': text,
        'question_text_vector': question_text,
    }
    for vector_field, content in inputs_by_field.items():
        doc[vector_field] = model.encode(content)

    es_client.index(index=index_name, document=doc)

100%|████████████████████████████████████████| 948/948 [02:18<00:00,  6.86it/s]


In [18]:
# Example user question, embedded with the same model used for the documents.
query = 'I just discovered the course, can i still join it?'
query_embeddings = model.encode(query)

In [24]:
# kNN request: the 5 nearest `question_vector` neighbours of the query
# embedding, restricted to a single course by a term filter.
query = {
    "field": "question_vector",
    "query_vector": query_embeddings,
    "k": 5,
    "num_candidates": 10000,
    "filter": {"term": {"course": "data-engineering-zoomcamp"}},
}

In [25]:
# Run the kNN search, asking for just the listed fields in each hit's _source.
res = es_client.search(
    index=index_name,
    knn=query,
    source=["text", "section", "question", "course", "id"],
)

In [26]:
# Show the list of hits from the search response.
res['hits']['hits']

[{'_index': 'course-questions',
  '_id': 'el2MtJABJzd8G14Cn8T_',
  '_score': 0.8980063,
  '_source': {'question': 'Course - Can I still join the course after the start date?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
   'id': '7842b56a'}},
 {'_index': 'course-questions',
  '_id': 'f12MtJABJzd8G14CosRF',
  '_score': 0.85993636,
  '_source': {'question': 'Course - Can I follow the course after it finishes?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the nex

In [32]:
## Generic version of the search above: the vector field is a parameter.

def elastic_search_knn(field, vector, course, k=5, num_candidates=10000):
    """Run a course-filtered kNN search against one dense-vector field.

    Args:
        field: Name of the vector field to search (for example
            ``"question_vector"``).
        vector: Query embedding to compare against ``field``.
        course: Value the ``course`` field must match (term filter).
        k: Number of nearest neighbours to return. Defaults to 5, the value
            that used to be hard-coded.
        num_candidates: Value passed as the kNN ``num_candidates`` option.
            Defaults to 10000, the value that used to be hard-coded.

    Returns:
        A list with the ``_source`` dict of every hit, in the order
        Elasticsearch returned them. Each dict is limited to the ``text``,
        ``section``, ``question``, ``course`` and ``id`` fields.
    """
    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        "filter": {"term": {"course": course}},
    }

    response = es_client.search(
        index=index_name,
        knn=knn_query,
        source=["text", "section", "question", "course", "id"],
    )

    return [hit['_source'] for hit in response['hits']['hits']]

### 1. Evaluate & Search based on question field

In [48]:
def question_vector_knn(q, field):
    """Embed a ground-truth question and search ``field`` within its course.

    ``q`` is a record with ``question`` and ``course`` keys; the question text
    is encoded with ``model`` and passed to ``elastic_search_knn``.
    Returns whatever ``elastic_search_knn`` returns.
    """
    question_text, course_name = q['question'], q['course']
    query_vector = model.encode(question_text)
    return elastic_search_knn(field=field, vector=query_vector, course=course_name)

In [49]:
def hit_rate(relevance_total):
    """Return the fraction of queries with at least one relevant result.

    Args:
        relevance_total: One list per query, holding a boolean per returned
            result (``True`` when that result is the expected document).

    Returns:
        The share of queries whose list contains ``True``, as a float in
        [0, 1]. Returns 0.0 when there are no queries, instead of raising
        ZeroDivisionError.
    """
    if not relevance_total:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

In [50]:
def mrr(relevance_total):
    """Return the mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: One list per query, holding a boolean per returned
            result in rank order (``True`` when that result is the expected
            document).

    Returns:
        The average over all queries of ``1 / rank`` of the first relevant
        result (0 for a query with none), as a float in [0, 1]. Returns 0.0
        when there are no queries, instead of raising ZeroDivisionError.
    """
    if not relevance_total:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this break a
                # query with several matching hits would add several
                # reciprocal ranks and could push the score above 1.
                break

    return total_score / len(relevance_total)

In [51]:
def evaluate(ground_truth, search_function, field):
    """Score a search function against the ground-truth records.

    For each record, ``search_function(record, field)`` is called and every
    returned result is flagged by whether its ``id`` equals the record's
    ``document`` value. The flags are summarised as ``hit_rate`` and ``mrr``.

    Returns a dict with the keys ``'hit_rate'`` and ``'mrr'``.
    """
    relevance_total = []

    for record in tqdm(ground_truth):
        expected_id = record['document']
        retrieved = search_function(record, field)
        relevance_total.append([hit['id'] == expected_id for hit in retrieved])

    metrics = {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }
    return metrics

In [52]:
# Load the ground-truth questions and turn each CSV row into a dict record.
df_ground_truth = pd.read_csv('ground-truth-data.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [53]:
# Score retrieval when queries are matched against the question-only vectors.
evaluate(
    ground_truth=ground_truth,
    search_function=question_vector_knn,
    field='question_vector',
)

100%|██████████████████████████████████████| 4627/4627 [01:39<00:00, 46.59it/s]


{'hit_rate': 0.773071104387292, 'mrr': 0.6666810748505158}

In [54]:
# Score retrieval when queries are matched against the answer-text vectors.
evaluate(
    ground_truth=ground_truth,
    search_function=question_vector_knn,
    field='text_vector',
)

100%|██████████████████████████████████████| 4627/4627 [01:35<00:00, 48.41it/s]


{'hit_rate': 0.8286146531229739, 'mrr': 0.7062315395144454}

In [55]:
# Score retrieval when queries are matched against the question+text vectors.
evaluate(
    ground_truth=ground_truth,
    search_function=question_vector_knn,
    field='question_text_vector',
)

100%|██████████████████████████████████████| 4627/4627 [01:35<00:00, 48.46it/s]


{'hit_rate': 0.9172249837907932, 'mrr': 0.824306606152295}

In [57]:
# Good to know: scoring can also combine several vector fields in one query,
# which is where one field could be given more weight than another.

def elastic_search_knn_combined(vector, course):
    """Search one course, scoring documents on all three vector fields at once.

    The script score is the sum of the cosine similarities between ``vector``
    and the ``question_vector``, ``text_vector`` and ``question_text_vector``
    fields, plus 1. The terms are summed as-is here; the script is the place
    to change that if one field should count for more.

    Returns the ``_source`` dicts (``text``, ``section``, ``question``,
    ``course``, ``id``) of at most 5 hits, in the order Elasticsearch returned
    them.
    """
    # The script text, including its whitespace, is kept exactly as it was.
    similarity_script = """
                                    cosineSimilarity(params.query_vector, 'question_vector') + 
                                    cosineSimilarity(params.query_vector, 'text_vector') + 
                                    cosineSimilarity(params.query_vector, 'question_text_vector') + 
                                    1
                                """

    # The same course restriction is used inside script_score and as the bool filter.
    course_term = {"term": {"course": course}}

    script_score_clause = {
        "script_score": {
            "query": course_term,
            "script": {
                "source": similarity_script,
                "params": {"query_vector": vector},
            },
        }
    }

    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": [script_score_clause],
                "filter": course_term,
            }
        },
        "_source": ["text", "section", "question", "course", "id"],
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in es_results['hits']['hits']]