In [26]:
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import requests
from openai import OpenAI
from tqdm.auto import tqdm
import hashlib

In [27]:
def generate_document_id(doc:dict) -> str:
    """Function to take dictionary and return dictionary"""

    combined_string = f"{doc['section']}-{doc["course"]}-{doc['question']}-{doc['text'][:16]}"
    generated_hash = hashlib.md5(combined_string.encode())
    stored_hash = generated_hash.hexdigest()[:16]
    return stored_hash

In [28]:
DOCS_URL = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
HOST = "http://localhost:9200"

In [29]:
model_name = "all-MiniLM-L6-v2" #Bert model
model = SentenceTransformer(model_name)

In [30]:
with requests.get(DOCS_URL) as docs_resp:
    if docs_resp.status_code == 200:
        docs_raw = docs_resp.json()
        documents = []
        for course in docs_raw:
            course_name = course['course']
            for doc in course['documents']:
                doc['course'] = course_name
                documents.append(doc)
    else:
        print(f"""Documents could not be fetched due to status code 
        {docs_resp.status_code} and issue being: 
        {docs_resp.reason}""")

In [31]:
for doc in tqdm(documents):
    question = doc['question']
    text = doc['text']
    qt = question + ' ' + text

    doc['question_vector'] = model.encode(question)
    doc['text_vector'] = model.encode(text)
    doc['question_text_vector'] = model.encode(qt)

  0%|          | 0/948 [00:00<?, ?it/s]

In [32]:
class ElasticSearch:
    def __init__(self, host: str, index_name: str):
        self.host = host
        self.client = Elasticsearch(self.host)
        self.index_name = index_name
        self.create_index()

    def delete_index(self):
        self.client.indices.delete(index=self.index_name, ignore_unavailable=True)
    
    def create_index(self):
        index_settings = {
            "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 0
            },
            "mappings": {
                "properties": {
                    "text": {"type": "text"},
                    "section": {"type": "text"},
                    "question": {"type": "text"},
                    "course": {"type": "keyword"},
                    "id": {"type": "keyword"},
                    "question_vector": {
                        "type": "dense_vector",
                        "dims": 384,
                        "index": True,
                        "similarity": "cosine"
                    },
                    "text_vector": {
                        "type": "dense_vector",
                        "dims": 384,
                        "index": True,
                        "similarity": "cosine"
                    },
                    "question_text_vector": {
                        "type": "dense_vector",
                        "dims": 384,
                        "index": True,
                        "similarity": "cosine"
                    },
                }
            }
        }
        if not self.client.indices.exists(index=self.index_name):
            response = self.client.indices.create(index=self.index_name, body=index_settings)
            print(f"Index creation response: {response}")
        else:
            print(f"Index '{self.index_name}' already exists.")


    def insert_docs(self, documents: dict):

        for doc in tqdm(documents):
            try:
                self.client.index(index=self.index_name, document=doc, id=generate_document_id(doc=doc))
            except Exception as e:
                print(e)

    
        

In [34]:
es = ElasticSearch(host=HOST, index_name="zoomcamp_qa_docs")

Index creation response: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'zoomcamp_qa_docs'}


In [35]:
es.insert_docs(documents=documents)

  0%|          | 0/948 [00:00<?, ?it/s]

In [36]:
query = 'I just discovered the course. Can I still join it?'
v_q = model.encode(query)

In [41]:
keyword_query = {
    "bool": {
        "must": {
            "multi_match": {
                "query": f"{query}",
                "fields": ["question^3", "text", "section"],
                "type": "best_fields",
                "boost": 0.5
            }
        },
        "filter": {
            "term": {
                "course": "data-engineering-zoomcamp"
            }
        }
    }
}

knn_query = {
    "field": "text_vector",
    "query_vector": v_q,
    "k": 5,
    "num_candidates": 10000,
    "boost": 0.5,
    "filter": {
        "term": {
            "course": "data-engineering-zoomcamp"
        }
    }
}

In [42]:
response = es.client.search(
    index=es.index_name,
    query=keyword_query,
    knn=knn_query,
    size=5)

In [57]:
for res in response["hits"]["hits"]:
    print(res["_id"], ":", res['_source']["text"])
    print()

3762f7e98fd36bd9 : Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

7bc493b31f7e84ed : Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

195959f49fcd0b4e : You can start by installing and setting up all the dependencies and requirements:
Google cloud account
Google Cloud SDK
Python 3 (installed with Anaconda)
Terraform
Git
Look over the prerequisites and syllabus to see if you are comfortable with these subjects.

91c9a4a23d656770 : Yes, the slack channel remains open and you can ask questions there. But always sDocker containers exit code w search the channel first and second, check the FAQ (t

In [48]:
import pandas as pd

In [58]:
gt_data = pd.read_csv("../../03-evaluation/search_evaluation/ground-truth-data-practice.csv")

In [59]:
gt_data.head()

Unnamed: 0,question,course,document
0,Could you let me know the specific date and ti...,data-engineering-zoomcamp,12ba9d1bc6ca27c5
1,What is the starting date for the first live O...,data-engineering-zoomcamp,12ba9d1bc6ca27c5
2,Is there a way to add the course schedule to m...,data-engineering-zoomcamp,12ba9d1bc6ca27c5
3,When exactly should I register for the course ...,data-engineering-zoomcamp,12ba9d1bc6ca27c5
4,How do I stay updated with announcements regar...,data-engineering-zoomcamp,12ba9d1bc6ca27c5


In [60]:
ground_truth = gt_data.to_dict(orient='records')

In [77]:
def elastic_search_hybrid(query, v_q, course, field):
    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": f"{query}",
                    "fields": ["question", "text", "section"],
                    "type": "best_fields",
                    "boost": 0.5
                }
            },
            "filter": {
                "term": {
                    "course": f"{course}"
                }
            }
        }
    }

    knn_query = {
        "field": f"{field}",
        "query_vector": v_q,
        "k": 5,
        "num_candidates": 10000,
        "boost": 0.5,
        "filter": {
            "term": {
                "course": f"{course}"
            }
        }
    }

    search_query = {
        "knn": knn_query,
        "query": keyword_query,
        "size": 5,
        "_source": ["text", "section", "question", "course", "id"]
    }

    response = es.client.search(
        index=es.index_name,
        body=search_query
    )

    result_docs = []
    for res in response["hits"]["hits"]:
        data = res['_source']
        data["id"] = res["_id"]
        result_docs.append(data)

    return result_docs

In [78]:
def question_hybrid(q):
    question = q['question']
    course = q['course']

    v_q = model.encode(question)

    return elastic_search_hybrid(question, v_q, course, 'question_vector')

In [79]:
example = ground_truth[0]

In [80]:
example

{'question': 'Could you let me know the specific date and time when the course will begin?',
 'course': 'data-engineering-zoomcamp',
 'document': '12ba9d1bc6ca27c5'}

In [81]:
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['document']
    results = question_hybrid(q)
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/4119 [00:00<?, ?it/s]

In [82]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt += 1 

    return cnt / len(relevance_total)

In [83]:
hit_rate(relevance_total)

0.8579752367079388

In [84]:
def mrr(relevance_total):
    total_score = 0.0
    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + (1 / (rank + 1))
    return total_score / len(relevance_total)

In [85]:
mrr(relevance_total)

0.7407178117666108

In [86]:
def evaluate(ground_truth: dict, search_function):
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = question_hybrid(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
    return {
        "hit_rate": hit_rate(relevance_total),
        "mrr": mrr(relevance_total)
    }

In [87]:
evaluate(ground_truth, elastic_search_hybrid)

  0%|          | 0/4119 [00:00<?, ?it/s]

{'hit_rate': 0.8579752367079388, 'mrr': 0.7407178117666108}

In [88]:
def elastic_search_hybrid(query, v_q, course, field, boost):
    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": f"{query}",
                    "fields": ["question", "text", "section"],
                    "type": "best_fields",
                    "boost": boost
                }
            },
            "filter": {
                "term": {
                    "course": f"{course}"
                }
            }
        }
    }

    knn_query = {
        "field": f"{field}",
        "query_vector": v_q,
        "k": 5,
        "num_candidates": 10000,
        "boost": 1 - boost,
        "filter": {
            "term": {
                "course": f"{course}"
            }
        }
    }

    search_query = {
        "knn": knn_query,
        "query": keyword_query,
        "size": 5,
        "_source": ["text", "section", "question", "course", "id"]
    }

    response = es.client.search(
        index=es.index_name,
        body=search_query
    )

    result_docs = []
    for res in response["hits"]["hits"]:
        data = res['_source']
        data["id"] = res["_id"]
        result_docs.append(data)

    return result_docs

In [93]:
def question_hybrid_boost(q, boost):
    question = q['question']
    course = q['course']

    v_q = model.encode(question)

    return elastic_search_hybrid(question, v_q, course, 'question_vector', boost)

In [94]:
def evaluate(ground_truth: dict, search_function, boost):
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q, boost)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
    return {
        "hit_rate": hit_rate(relevance_total),
        "mrr": mrr(relevance_total),
        "boost": boost
    }

In [90]:
import numpy as np

In [91]:
boosting_options = np.linspace(0.0, 1.0, num=11)

In [92]:
boosting_options

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

In [95]:
import concurrent.futures

In [96]:
results_list = []
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Submit tasks individually to pass extra arguments
    # We store the future object mapped back to the original item/index if needed for ordering
    future_to_item = {
        executor.submit(evaluate, ground_truth, question_hybrid_boost, val): val
        for val in boosting_options
    }

    # Iterate over futures as they complete
    for future in concurrent.futures.as_completed(future_to_item):
        # Retrieve the original item (key) associated with the completed future
        item = future_to_item[future]
        try:
            # Get the actual result returned by the function
            result = future.result()
            results_list.append(result)
        except Exception as exc:
            print(f'{item} generated an exception: {exc}')


  0%|          | 0/4119 [00:00<?, ?it/s]

  0%|          | 0/4119 [00:00<?, ?it/s]

  0%|          | 0/4119 [00:00<?, ?it/s]

  0%|          | 0/4119 [00:00<?, ?it/s]

  0%|          | 0/4119 [00:00<?, ?it/s]

  0%|          | 0/4119 [00:00<?, ?it/s]

  0%|          | 0/4119 [00:00<?, ?it/s]

  0%|          | 0/4119 [00:00<?, ?it/s]

  0%|          | 0/4119 [00:00<?, ?it/s]

  0%|          | 0/4119 [00:00<?, ?it/s]

  0%|          | 0/4119 [00:00<?, ?it/s]

In [97]:
results_list

[{'hit_rate': 0.7885409079873755,
  'mrr': 0.6708100671684071,
  'boost': np.float64(0.0)},
 {'hit_rate': 0.8436513716921583,
  'mrr': 0.7243384316581699,
  'boost': np.float64(0.8)},
 {'hit_rate': 0.8506919155134741,
  'mrr': 0.7333414259124387,
  'boost': np.float64(0.6000000000000001)},
 {'hit_rate': 0.9249817916970139,
  'mrr': 0.8081087642631709,
  'boost': np.float64(0.1)},
 {'hit_rate': 0.8579752367079388,
  'mrr': 0.7407178117666108,
  'boost': np.float64(0.5)},
 {'hit_rate': 0.8409808205875212,
  'mrr': 0.7214008254430692,
  'boost': np.float64(0.9)},
 {'hit_rate': 0.8470502549162418,
  'mrr': 0.7286072671360366,
  'boost': np.float64(0.7000000000000001)},
 {'hit_rate': 0.8805535324107793,
  'mrr': 0.7636643198187264,
  'boost': np.float64(0.30000000000000004)},
 {'hit_rate': 0.8970623937848993,
  'mrr': 0.7831431577243675,
  'boost': np.float64(0.2)},
 {'hit_rate': 0.8667152221412965,
  'mrr': 0.7503924900865917,
  'boost': np.float64(0.4)},
 {'hit_rate': 0.8375819373634378,


In [98]:
results = pd.DataFrame(results_list)

In [107]:
final_boost = results.sort_values(by='mrr', ascending=False).reset_index()['boost'][0]

In [108]:
evaluate(ground_truth, search_function=question_hybrid_boost, boost=final_boost)

  0%|          | 0/4119 [00:00<?, ?it/s]

{'hit_rate': 0.9249817916970139,
 'mrr': 0.8081087642631709,
 'boost': np.float64(0.1)}

In [109]:
def text_hybrid_boost(q, boost):
    question = q['question']
    course = q['course']

    v_q = model.encode(question)

    return elastic_search_hybrid(question, v_q, course, 'text_vector', boost)

In [110]:
evaluate(ground_truth, search_function=text_hybrid_boost, boost=final_boost)

  0%|          | 0/4119 [00:00<?, ?it/s]

{'hit_rate': 0.9196406894877397,
 'mrr': 0.8003196568746469,
 'boost': np.float64(0.1)}

In [111]:
def question_text_hybrid_boost(q, boost):
    question = q['question']
    course = q['course']

    v_q = model.encode(question)

    return elastic_search_hybrid(question, v_q, course, 'question_text_vector', boost)

In [112]:
evaluate(ground_truth, search_function=question_text_hybrid_boost, boost=final_boost)

  0%|          | 0/4119 [00:00<?, ?it/s]

{'hit_rate': 0.952172857489682,
 'mrr': 0.8389576758112817,
 'boost': np.float64(0.1)}