In [24]:
import json
import time

import requests
import yaml
from elasticsearch import ConnectionTimeout, Elasticsearch
from mistralai import Mistral
from sentence_transformers import SentenceTransformer  # embeddings
from tqdm.auto import notebook_tqdm

Concepts related to Elasticsearch:
* documents: collections of fields with their associated values
* index: a way of organizing the documents, like a "table of contents", so they can be searched more efficiently

In this case, what we are indexing is the embeddings, not the documents

In [3]:
# Download the FAQ documents and flatten them into a single list of dicts.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging forever if the server never answers.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

documents = []

# Elasticsearch wants everything on a single level, while the original JSON is
# nested (course -> documents), so the course name is copied onto each document.
for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [4]:
# Load the sentence-embedding model whose encode() output is used as the vectors below.
model = SentenceTransformer("all-mpnet-base-v2")

In [5]:
# Encode a sample sentence to inspect what the model returns.
test = model.encode("hello world")

In [6]:
# Show the length of the sample embedding followed by the embedding itself.
print(len(test), test)

768 [ 2.62497254e-02  1.33956019e-02 -4.53314278e-03 -2.17914544e-02
  5.45518659e-02 -4.96648299e-03  6.65560272e-03  3.06262691e-02
 -5.76278474e-03 -4.56204684e-03 -3.31329461e-03 -4.84962501e-02
 -1.13640223e-02  3.50774229e-02  9.30946916e-02 -8.66874084e-02
  5.10865487e-02  9.88610461e-03 -6.35692477e-02 -8.55022110e-03
  7.05439178e-03 -3.86239309e-03  2.47442871e-02  4.28849421e-02
  3.50941271e-02 -2.98482478e-02  1.02525903e-02  2.23449301e-02
  2.08899938e-02  9.49220732e-03 -3.30444127e-02 -1.22841550e-02
  5.35289273e-02  2.54291948e-02  2.02217666e-06 -3.41910198e-02
  9.61000286e-03 -1.64845269e-02  5.60948672e-03 -4.25002817e-03
 -2.28012074e-02  4.03546654e-02  3.05205584e-03  3.13725919e-02
 -1.08123543e-02 -3.55707854e-02  2.22929213e-02  1.68712495e-03
  2.07724981e-03  2.31162217e-02  6.88587269e-03 -6.83094654e-03
 -4.87613045e-02 -2.70107724e-02  1.54910982e-02  3.73168960e-02
  2.72793565e-02  2.64989194e-02 -1.69234653e-03 -2.88223643e-02
  2.56629270e-02 -4.6

In [7]:
# Inspect one flattened document to check its fields.
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [9]:
# Embed the 'text' field of every document.
# The vector must be stored under 'text_vector': that is the field the index
# mapping declares as dense_vector and the field the kNN query searches.
# (It was previously written to 'test_vector', which neither of them uses.)

operations = []

for doc in documents:
    # .tolist() turns the encoder output into a plain Python list for indexing.
    doc['text_vector'] = model.encode(doc['text']).tolist()
    operations.append(doc)

In [None]:
# Connection options for the Elasticsearch instance on localhost:9200.
es_options = {
    'hosts': [{'host': 'localhost', 'port': 9200, 'scheme': 'http'}],
    'request_timeout': 60,
    'max_retries': 10,
    'retry_on_timeout': True,
}
es_client = Elasticsearch(**es_options)

# Test the connection and report the outcome.
is_reachable = es_client.ping()
print("Conectado a Elasticsearch!" if is_reachable else "No se pudo conectar a Elasticsearch.")

Conectado a Elasticsearch!


  es_client = Elasticsearch(


Before defining the index, we need to provide a mapping. The mapping is like a schema for a database: it gives the names of the fields plus their data types so that the index can be established.

In [15]:
# Field definitions for the index: the FAQ text fields plus the embedding.
field_mappings = {
    "text": {"type": "text"},
    "section": {"type": "text"},
    "question": {"type": "text"},
    "course": {"type": "keyword"},
    # dims comes from the embedding model (768 values per vector);
    # similarity is the kind of similarity chosen for the search.
    "text_vector": {
        "type": "dense_vector",
        "dims": 768,
        "index": True,
        "similarity": "cosine",
    },
}

# Full index definition: shard/replica settings plus the field mappings.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {"properties": field_mappings},
}

In [None]:
index_name = "questions_sem4"

# Drop any previous copy of the index so this cell can be re-run from scratch.
if es_client.indices.exists(index=index_name):
    try:
        es_client.indices.delete(index=index_name, ignore_unavailable=True)
        print(f"Index {index_name} deleted successfully.")
    except Exception as e:
        print(f"Error deleting index {index_name}: {e}")

# Wait a short time for the deletion to finish before creating the index again.
# The wait is bounded: if the delete above failed, the index never disappears
# and an unbounded loop would hang the notebook forever.
delete_deadline = time.monotonic() + 30
while es_client.indices.exists(index=index_name):
    if time.monotonic() >= delete_deadline:
        raise TimeoutError(f"Index {index_name} still exists after waiting 30 seconds for its deletion.")
    print("Waiting for index to be fully deleted...")
    time.sleep(0.5)

# Create the index with the settings and mappings defined in index_settings.
try:
    es_client.indices.create(index=index_name, body=index_settings)
    print(f"Index {index_name} created successfully.")
except Exception as e:
    print(f"Error creating index {index_name}: {e}")


Index questions_sem4 created successfully.


In [26]:
# index_info = es_client.indices.get(index=index_name)
# print(index_info)


In [27]:
# es_client.indices.close(index=index_name)


In [28]:
# cluster_health = es_client.cluster.health()
# print(cluster_health)


In [None]:
# Index every document, retrying when Elasticsearch times out.
for doc in operations:
    for attempt in range(3):  # try up to 3 times
        try:
            es_client.index(index=index_name, document=doc)
            break  # leave the retry loop once the operation succeeds
        except ConnectionTimeout:
            if attempt == 2:
                # Retries exhausted: surface the error instead of silently
                # skipping the document and leaving the index incomplete.
                raise
            print(f"Tiempo de espera agotado al indexar el documento. Reintentando ({attempt + 1}/3)...")
            time.sleep(5)

  0%|          | 0/948 [00:00<?, ?it/s]

ConnectionTimeout: Connection timeout caused by: ConnectionTimeout(Connection timeout caused by: ReadTimeoutError(HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=30)))

### query search

In [None]:
# Free-text question to search for, encoded with the same model object used for the documents.
search_term = "windows or mac?"
vec_search = model.encode(search_term)

In [None]:
# kNN search parameters, passed as `knn=` to the search call.
query = dict(
    field="text_vector",  # where to look
    query_vector=vec_search,
    k=5,  # number of responses
    num_candidates=10000,
)

In [None]:
# Keyword (non-vector) alternative: a multi_match over several fields.
# NOTE(review): "title" is not among the fields declared in the index mapping - confirm it is intended.
multi_match_clause = {
    "query": "windows or python?",
    "fields": ["text", "question", "course", "title"],
    "type": "best_fields",
}
query_complex = {"bool": {"must": {"multi_match": multi_match_clause}}}

In [31]:
# Run the kNN search, asking for only the listed fields in each hit's source.
returned_fields = ["text", "section", "question", "course"]
res = es_client.search(index=index_name, knn=query, source=returned_fields)
res["hits"]["hits"]

NameError: name 'query' is not defined