# Step 1: Prepare documents

In [19]:
import json
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [20]:
documents = []
for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)
        

In [21]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [33]:
# !pip install --upgrade sentence-transformers transformers


# Step 2: Create Embeddings using Pretrained Models

In [30]:
from sentence_transformers import SentenceTransformer, models

# Create a SentenceTransformer model with mean pooling
word_embedding_model = models.Transformer(model_name_or_path="/home/mahtab/transformers/all-MiniLM-L6-v2")
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
sentence_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


In [32]:
len(sentence_model.encode("This is my simple sentence"))

384

In [35]:
operations = []
for doc in documents:
    doc["text_vector"] = sentence_model.encode(doc["text"]).tolist()
    operations.append(doc)

# Step 3: Setup ElasticSearch connection

In [49]:
from elasticsearch import Elasticsearch
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

es_client = Elasticsearch('https://localhost:9200',
                           http_auth=('elastic', 'xMR77YWEPbFGsMJvin_7'),
                           verify_certs=False) 

es_client.info()

  _transport = transport_class(
  es_client = Elasticsearch('https://localhost:9200',


ObjectApiResponse({'name': 'mahtab-lap', 'cluster_name': 'elasticsearch', 'cluster_uuid': '-a-4Yjx5TlOB22a_T_PSgg', 'version': {'number': '8.15.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '1a77947f34deddb41af25e6f0ddb8e830159c179', 'build_date': '2024-08-05T10:05:34.233336849Z', 'build_snapshot': False, 'lucene_version': '9.11.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

# Step 4: Create Mappings and Index


In [52]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} ,
            "text_vector": {"type": "dense_vector", "dims": 384, "index": True, "similarity": "cosine"},
        }
    }
}

In [53]:
index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

# Step 5: Add documents into index


In [54]:
for doc in operations:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)
        

# Step 6: Create end user query


In [55]:
search_term = "windows or mac?"
vector_search_term = sentence_model.encode(search_term)

In [56]:
query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000, 
}

In [57]:
res = es_client.search(index=index_name, knn=query, source=["text", "section", "question", "course"])
res["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': 'EX4Zb5EBLJZ_qX1KKA06',
  '_score': 0.71210575,
  '_source': {'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully',
   'section': 'General course-related questions',
   'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp'}},
 {'_index': 'course-questions',
  '_id': 'rH4Zb5EBLJZ_qX1KNw6h',
  '_score': 0.6621313,
  '_source': {'text': 'For the Machine Learning part, all you need is a working laptop with an internet connection. The Deep Learning part is more resource intensive, but for that you can use a cloud (we use Saturn cloud but can be anything else).\n(Rileen Sinha; based on response by Alexey on Slack)',
   'section': 'General course-related questions',
   'question': "Any particular hardware requirements for the course or everything is mostly cloud? TIA! Couldn't really find this in the FAQ.",
   'course': 'machin

# Step 7: Perform Keyword search with Semantic Search (Hybrid/Advanced Search)


In [58]:
knn_query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000
}


In [59]:
response = es_client.search(
    index=index_name,
    query={
        "match": {"section": "General course-related questions"},
    },
    knn=knn_query,
    size=5
)

In [61]:
response["hits"]["hits"]


{'_index': 'course-questions',
 '_id': 'rH4Zb5EBLJZ_qX1KNw6h',
 '_score': 11.562052,
 '_source': {'text': 'For the Machine Learning part, all you need is a working laptop with an internet connection. The Deep Learning part is more resource intensive, but for that you can use a cloud (we use Saturn cloud but can be anything else).\n(Rileen Sinha; based on response by Alexey on Slack)',
  'section': 'General course-related questions',
  'question': "Any particular hardware requirements for the course or everything is mostly cloud? TIA! Couldn't really find this in the FAQ.",
  'course': 'machine-learning-zoomcamp',
  'text_vector': [-0.04022354632616043,
   -0.2121017426252365,
   0.19438740611076355,
   -0.06717463582754135,
   0.04492761567234993,
   0.155565083026886,
   -0.20841869711875916,
   -0.03542742505669594,
   -0.062366217374801636,
   -0.06822192668914795,
   -0.16319969296455383,
   -0.027968276292085648,
   -0.08379203081130981,
   -0.1226072832942009,
   0.05186492949724

# Perform Semantic Search & Advanced Search

In [66]:
knn_query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000
}

In [69]:
response = es_client.search(
       index= index_name,
       query={
           "match":{
               "course": "data-engineering-zoomcamp"
           },
       },
       knn= knn_query,
       size=5,
       explain=True
)

In [70]:
response["hits"]["hits"][1]

{'_shard': '[course-questions][0]',
 '_node': 'oAhEW1LLTC6122cqyK3KrA',
 '_index': 'course-questions',
 '_id': 'An4Zb5EBLJZ_qX1KJw2t',
 '_score': 1.4182067,
 '_source': {'text': 'You can set it up on your laptop or PC if you prefer to work locally from your laptop or PC.\nYou might face some challenges, especially for Windows users. If you face cnd2\nIf you prefer to work on the local machine, you may start with the week 1 Introduction to Docker and follow through.\nHowever, if you prefer to set up a virtual machine, you may start with these first:\nUsing GitHub Codespaces\nSetting up the environment on a cloudV Mcodespace\nI decided to work on a virtual machine because I have different laptops & PCs for my home & office, so I can work on this boot camp virtually anywhere.',
  'section': 'General course-related questions',
  'question': 'Environment - Should I use my local machine, GCP, or GitHub Codespaces for my environment?',
  'course': 'data-engineering-zoomcamp',
  'text_vector':