# Prepare Open AI client object

In [1]:
from openai import OpenAI

In [2]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [3]:
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [4]:
client

<openai.OpenAI at 0x7f2e1b17c640>

# Prepare documents for information retrieval

In [5]:
import json

In [6]:
with open("documents.json", "rt") as f_in:
    docs_raw = json.load(f_in)

In [7]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict["documents"]:
        doc["course"] = course_dict["course"]
        documents.append(doc)

# Create embeddings

In [8]:
from sentence_transformers import SentenceTransformer

In [9]:
model = SentenceTransformer("all-mpnet-base-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
# created the dense vector using the pre-trained model
operations = []
for doc in documents:
    # Transforming the title into an embedding using the model
    doc["text_vector"] = model.encode(doc["text"]).tolist()
    operations.append(doc)

# Elastic Search

In [13]:
from elasticsearch import Elasticsearch

In [14]:
es_client = Elasticsearch("http://localhost:9200")

In [15]:
es_client.info()

ObjectApiResponse({'name': 'd9d355a53607', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'mykHg1ngRYmjQQO5Lei-og', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

### Keyword vs Semantic Search

- In the keyword search we give the input of the user directly to the elastic search engine
- In semantic search we first convert this input to an embedding and then use this embedding for the search.

## Create mappings and index

In [16]:
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine",
            },
        }
    },
}

index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [17]:
# Add documents to the index

for doc in operations:
    try:
        es_client.index(index=index_name, document=doc)

    except Exception as e:
        print(e)

# Perform semantic search 

In [21]:
question = "i just found out about this course, can i still join?"

In [22]:
vector_search_term = model.encode(question)

In [23]:
query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000,
}

In [24]:
res = es_client.search(
    index=index_name, knn=query, source=["text", "section", "question", "course"]
)
res["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': 'vMzK3pABjfuScd5cxk8b',
  '_score': 0.7716128,
  '_source': {'question': 'The course has already started. Can I still join it?',
   'course': 'machine-learning-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.'}},
 {'_index': 'course-questions',
  '_id': 'vczK3pABjfuScd5cxk8h',
  '_score': 0.74707013,
  '_source': {'question': 'When does the next iteration start?',
   'course': 'machine-learning-zoomcamp',
   'section': 'General course-related questions',
   'text': 'The course is available in the self-paced mode too, so you can go through the materials at

In [25]:
knn_query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000,
}

In [26]:
response = es_client.search(
    index=index_name,
    query={
        "match": {"section": "General course-related questions"},
    },
    knn=knn_query,
    size=5,
)

In [27]:
response["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': 'vMzK3pABjfuScd5cxk8b',
  '_score': 11.671534,
  '_source': {'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
   'section': 'General course-related questions',
   'question': 'The course has already started. Can I still join it?',
   'course': 'machine-learning-zoomcamp',
   'text_vector': [-0.05821846053004265,
    0.016635756939649582,
    -0.015453789383172989,
    -0.0009686130797490478,
    0.014512831345200539,
    -0.006483542267233133,
    0.004492341540753841,
    0.002761312760412693,
    -0.021292896941304207,
    -0.030807800590991974,
    0.05864739045500755,
    -0.008818311616778374,
    -0.0363317131

# Perform advanced semantic search

In [28]:
knn_query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000,
}

In [29]:
response = es_client.search(
    index=index_name,
    query={
        "match": {"course": "data-engineering-zoomcamp"},
    },
    knn=knn_query,
    size=5,
)

In [30]:
response["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': 'BszK3pABjfuScd5ctk6K',
  '_score': 1.5061636,
  '_source': {'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
   'section': 'General course-related questions',
   'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?',
   'course': 'data-engineering-zoomcamp',
   'text_vector': [-0.04734233766794205,
    -0.009002341888844967,
    -0.012055711820721626,
    0.014271305873990059,
    -0.00808296911418438,
    -0.016457580029964447,
    0.02150489017367363,
    -0.005105022806674242,
    -0.00390930799767375,
    0.02507534623146057,
    0.0439869724214077,
    0.03641694411635399,
    -0.04019477590918541,
    0.055686306208372116,
    0.010882469825446606,
    0.012