Execute that in terminal

docker run -it \
    --rm \
    --name elasticsearch \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3 

In [1]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer("multi-qa-distilbert-cos-v1")


In [3]:
user_question = "I just discovered the course. Can I still join it?"

# Question 1

In [4]:
embedding_model.encode(user_question)

array([ 7.82226175e-02, -4.01311554e-02,  3.86136323e-02, -1.79005598e-04,
        8.92346352e-02, -5.04591055e-02, -1.05026914e-02,  3.71055715e-02,
       -4.18714248e-02,  3.48085053e-02, -1.20702200e-02, -2.36942675e-02,
        3.87900211e-02,  1.60988197e-02,  3.50747108e-02,  3.04754637e-03,
        5.79672754e-02, -4.10627425e-02, -3.41552831e-02, -2.56396569e-02,
       -3.55264246e-02,  1.42907687e-02, -1.62800327e-02,  3.21446396e-02,
       -4.66897264e-02,  7.89185986e-02,  4.90160696e-02,  1.56761184e-02,
       -1.69109926e-02,  2.26482246e-02,  5.60206175e-02, -3.98361124e-02,
        6.77409768e-02, -1.20210275e-02,  1.12626399e-03, -1.94394737e-02,
       -2.65951045e-02,  1.06177805e-02,  1.69687476e-02,  1.13488249e-02,
       -2.97063179e-02,  5.25258519e-02, -1.41453370e-02,  4.61700149e-02,
        1.17066465e-02, -2.38053054e-02, -6.32557794e-02, -1.92041900e-02,
       -7.10595492e-03,  3.24167795e-02,  2.49617826e-02, -5.27504086e-03,
        2.01149434e-02, -

In [5]:
v = embedding_model.encode(user_question)

# Question 2

In [6]:
import requests 

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

In [7]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'id': '1f6520ca'}

In [8]:
documents = [doc for doc in documents if doc['course'] == 'machine-learning-zoomcamp']
len(documents)

375

In [9]:
#created the dense vector using the pre-trained model
embeddings = []
for doc in documents:
    qa_text = f"{doc['question']} {doc['text']}"
    embedding = embedding_model.encode(qa_text).tolist()
    embeddings.append(embedding)

In [10]:
import numpy as np

In [11]:
X = np.array(embeddings)
X.shape

(375, 768)

# Question 3 

In [12]:
scores = X.dot(v)


In [13]:
np.max(scores)

0.6506574730745047

# Question 4

In [14]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

search_engine = VectorSearchEngine(documents=documents, embeddings=X)
search_engine.search(v, num_results=5)

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693'},
 {'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\nClick on the links and start watching the videos. Also watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course yyyy}. ML Zoomcamp was first launched in 2021.\nOr you c

In [15]:
import pandas as pd

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [16]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [17]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [18]:
def evaluate(ground_truth, search_engine, num_results=5):
    relevance_total = []
    for q in tqdm(ground_truth):
        if 'document' not in q or 'question' not in q:
            continue  # Saltar entradas que no tienen las claves necesarias
        doc_id = q['document']
        query_text = q['question']
        query_vector = embedding_model.encode(query_text).tolist()
        results = search_engine.search(query_vector, num_results=num_results)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [19]:
from tqdm.auto import tqdm

In [20]:
# Calcular la tasa de aciertos y MRR
metrics = evaluate(ground_truth, search_engine, num_results=5)
print(metrics)

100%|██████████| 1830/1830 [02:09<00:00, 14.15it/s]

{'hit_rate': 0.9398907103825137, 'mrr': 0.8516484517304189}





# Question 5

In [21]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

es_client.info()

ObjectApiResponse({'name': 'bac31eb1ae57', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'RUWGgSf5QLSFA9gLKuninQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [22]:
index_name = 'document_embeddings'
index_settings = {
    "mappings": {
        "properties": {
            "embedding": {
                "type": "dense_vector",
                "dims": 768  # Dimensiones del embedding
            },
            "id": {"type": "keyword"},
            "question": {"type": "text"},
            "text": {"type": "text"},
            "section": {"type": "text"},
            "course": {"type": "text"}
        }
    }
}

# Crear el índice
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)

In [23]:
for doc in tqdm(documents):
    question = doc['question']
    text = doc['text']
    qt = question + ' ' + text

    doc['question_vector'] = embedding_model.encode(question)
    doc['text_vector'] = embedding_model.encode(text)
    doc['question_text_vector'] = embedding_model.encode(qt)

100%|██████████| 375/375 [02:59<00:00,  2.08it/s]


In [24]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 375/375 [00:10<00:00, 37.09it/s]


In [25]:
query = 'how to use docker?'

In [31]:
v_q = embedding_model.encode(query)
v_q

array([ 3.85062955e-02,  2.29738858e-02,  9.52375308e-03, -2.23390535e-02,
        3.21360230e-02,  1.53493993e-02, -2.57815737e-02,  4.82244007e-02,
       -2.86499001e-02, -7.92281181e-02, -1.00249844e-02, -3.97894904e-02,
        4.22686487e-02,  5.83696216e-02,  8.31514150e-02, -4.91285957e-02,
        8.47205613e-03, -3.38121806e-03, -2.56250445e-02,  5.94909266e-02,
       -5.37398644e-03, -1.93727836e-02, -1.95335429e-02,  1.26192076e-02,
        8.45499686e-04, -5.58549017e-02, -3.25437728e-03,  3.21300849e-02,
       -5.08057419e-05, -4.77186665e-02,  3.12293321e-03, -9.62192714e-02,
       -1.36028742e-02,  3.20049413e-02,  2.34329235e-02, -2.65110545e-02,
       -5.56211583e-02, -1.11019481e-02, -6.18364401e-02, -2.62998138e-02,
       -1.52710835e-02,  2.65363324e-02,  5.07975593e-02, -6.09526113e-02,
        3.87651064e-02,  3.00291870e-02,  2.75227278e-02, -3.80145162e-02,
       -3.52059640e-02,  1.82718933e-02,  3.09263505e-02, -2.30730306e-02,
        5.62731642e-03,  

In [27]:
search_query = {
    "size": 1,
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                "params": {"query_vector": v_q}
            }
        }
    }
}

In [28]:
try:
    response = es_client.search(index=index_name, body=search_query)
    print(response)
    
    # Verificar si hay resultados y obtener el ID del documento con el puntaje más alto
    if response['hits']['hits']:
        best_match_id = response['hits']['hits'][0]['_id']
        print(f"ID of the document with the highest score: {best_match_id}")
    else:
        print("No results found.")
except Exception as e:
    print(f"Error: {e}")

Error: BadRequestError(400, 'search_phase_execution_exception', 'runtime error')


In [29]:
simple_search_query = {
    "query": {
        "match_all": {}
    }
}
simple_response = es_client.search(index=index_name, body=simple_search_query)
print(simple_response)

{'took': 14, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 366, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'document_embeddings', '_id': 'GnNYwZABmo4uYDdSY3r8', '_score': 1.0, '_ignored': ['text.keyword'], '_source': {'text': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork', 'section': 'General course-related questions', 'question': 'How do I sign up?', 'course': 'machine-learning-zoomcamp', 'id': '0227b872', 'question_vector': [0.08708027005195618, 0.05983049049973488, 0.04626588895916939, -0.08179920166730881, 0.04933812841773033, 0.06700921803712845

# Question 6

In [30]:
# Calcular hit-rate y MRR para Elasticsearch
def evaluate_elastic(ground_truth, search_engine, num_results=5):
    relevance_total = []
    for q in tqdm(ground_truth):
        if 'document' not in q or 'question' not in q:
            continue  # Saltar entradas que no tienen las claves necesarias
        doc_id = q['document']
        query_text = q['question']
        query_vector = embedding_model.encode(query_text).tolist()
        
        # Realizar la búsqueda en Elasticsearch
        search_query = {
            "size": num_results,
            "query": {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                        "params": {"query_vector": query_vector}
                    }
                }
            }
        }
        
        response = es_client.search(index=index_name, body=search_query)
        
        if 'hits' in response and 'hits' in response['hits']:
            results = response['hits']['hits']
            relevance = [d['_id'] == doc_id for d in results]
            relevance_total.append(relevance)
    
    # Calcular hit-rate
    hit_rate = sum([any(relevance) for relevance in relevance_total]) / len(relevance_total)
    return hit_rate

# Calcular la tasa de aciertos para Elasticsearch
hit_rate_elastic = evaluate_elastic(ground_truth, es_client, num_results=5)
print(f"Hit-rate for Elasticsearch: {hit_rate_elastic}")

  0%|          | 0/1830 [00:00<?, ?it/s]


BadRequestError: BadRequestError(400, 'search_phase_execution_exception', 'runtime error')