In [57]:
import minsearch
import json
from openai import OpenAI
from elasticsearch import Elasticsearch

In [3]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Read the OpenAI API key from the process environment
api_key = os.environ.get("OPENAI_API_KEY")

# Report whether a non-empty key was found
key_status = (
    "API key loaded successfully!"
    if api_key
    else "Failed to load API key. Check .env file."
)
print(key_status)

API key loaded successfully!


#### Get and load the data

In [None]:
# !wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/refs/heads/main/01-intro/documents.json

In [6]:
# The FAQ text contains non-ASCII characters (e.g. curly quotes), so the
# encoding is pinned to UTF-8 instead of relying on the platform default.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [7]:
docs_raw[0]['documents'][0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?'}

#### Flatten the json

In [8]:
# Flatten the per-course structure into one list of FAQ records, tagging each
# record with the course it belongs to. Every record is a shallow copy, so the
# dicts nested inside docs_raw are left unmodified.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [9]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

#### Index the data

In [10]:
index = minsearch.Index(
    text_fields=['question', 'text', 'section'],
    keyword_fields=['course']
)

In [11]:
index.fit(documents)

<minsearch.Index at 0x7f85ecf9f0b0>

#### Search the index

In [12]:
q = 'The course has already started.  Can I still enroll?'

In [13]:
boost = {'question':3.0, 'section': 0.5}
results = index.search(
    query=q,
    boost_dict=boost,
    filter_dict={'course':'data-engineering-zoomcamp'},
    num_results=5
)

In [14]:
results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

#### Use the OpenAI API to generate a result

In [15]:
client = OpenAI()

In [17]:
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{'role':'user', 'content':q}]
)

In [22]:
response.choices[0].message.content

"It's best to contact the institution or provider offering the course directly to find out if late enrollment is possible. Some courses allow late registration, especially if they are flexible or self-paced. However, others may have strict deadlines. Reach out to the admissions office or the course instructor as soon as possible to inquire about your options."

In [26]:
prompt_template = '''
You are a course teaching assistant.  Answer the QUESTION based on the CONTEXT.
Use only the information contained in the CONTEXT.
If the CONTEXT doesn't contain the answer, output "Hmm.  That information isn't in the documents I have available..."

QUESTION: {question}
CONTEXT: {context}
'''.strip()

In [27]:
# One "section / question / answer" entry per retrieved document; each entry
# ends with a blank line. The entries are concatenated into a single string.
context_entries = []

for doc in results:
    context_entries.append(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    )

context = "".join(context_entries)

In [30]:
prompt = prompt_template.format(question=q, context=context).strip()

In [31]:
print(prompt)

You are a course teaching assistant.  Answer the QUESTION based on the CONTEXT.
Use only the information contained in the CONTEXT.
If the CONTEXT doesn't contain the answer, output "Hmm.  That information isn't in the documents I have available..."

QUESTION: The course has already started.  Can I still enroll?
CONTEXT: section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can a

In [32]:
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{'role':'user', 'content':prompt}]
)

response.choices[0].message.content

"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."

#### Clean and modularize

In [37]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Run a boosted keyword search against the module-level ``index``.

    Args:
        query: Free-text question to look up.
        course: Value the ``course`` field is filtered on. Defaults to the
            course that was previously hard-coded.
        num_results: Maximum number of results requested from the index.
            Defaults to the previously hard-coded value of 5.

    Returns:
        Whatever ``index.search`` returns (in this notebook, a list of FAQ
        record dicts).
    """
    # Per-field weights handed to the index as ``boost_dict``.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        boost_dict=boost,
        filter_dict={'course': course},
        num_results=num_results,
    )

In [39]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from a question and its retrieved FAQ records.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts with ``section``, ``question`` and
            ``text`` keys; each one becomes an entry under ``CONTEXT:``.

    Returns:
        The filled-in prompt with leading and trailing whitespace stripped.
    """
    instructions = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # Each entry ends with a blank line; the final strip() removes the last one.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    filled = instructions.format(question=query, context="".join(entries))
    return filled.strip()

In [45]:
def prompt_LLM(prompt, model='gpt-4o'):
    """Send one user message to the chat-completions API and return the reply text.

    Args:
        prompt: Text sent as the content of a single ``user`` message.
        model: Name of the chat model to call. Defaults to the model that
            was previously hard-coded.

    Returns:
        ``message.content`` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [47]:
query = 'How do I earn a certificate?'
search_results = search(query)
prompt = build_prompt(query, search_results)
answer = prompt_LLM(prompt)

In [48]:
print(answer)

To earn a certificate, you must finish the course with a "live" cohort. Certificates are not awarded for self-paced modes because you need to peer-review capstone projects after submitting a project, which is only possible while the course is actively running.


In [49]:
query = 'How much is that doggie in the window?'
search_results = search(query)
prompt = build_prompt(query, search_results)
answer = prompt_LLM(prompt)
print(answer)

I'm sorry, I cannot provide information about the cost of a doggie in the window as the context provided is related to course details and not about animals or prices.


In [50]:
def rag(query):
    """Answer a question end to end using the ``search`` retriever.

    The question is passed to ``search``, the hits are turned into a prompt
    by ``build_prompt``, and the text returned by ``prompt_LLM`` is returned.
    """
    return prompt_LLM(build_prompt(query, search(query)))

In [55]:
q = 'Do I have to peer review others work?'

response = rag(q)

print(response)

The context does not provide specific information about whether peer reviewing others' work is required in the course. Please check your course syllabus or reach out to your course instructor for clarification on peer review requirements.


#### Upgrade to Elastic Search

In [56]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

We need to start an Elasticsearch instance.  We'll do it locally.  

In a terminal:
```bash
docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3
```
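Note: because of the `--rm` flag, the container is removed when it stops, and the indexed data inside it is removed with it. To keep the index between runs, we would need to mount a volume to the Docker container.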

In [58]:
es_client = Elasticsearch('http://localhost:9200')

In [59]:
es_client.info()

ObjectApiResponse({'name': '3ad582577285', 'cluster_name': 'docker-cluster', 'cluster_uuid': '8h8xDSLpRs2a6dqeOWPfwg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [60]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

In [62]:
index_name = 'course-questions'

# Drop any index left over from an earlier run first: creating an index that
# already exists fails, which would make this cell impossible to re-run.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [63]:
from tqdm.auto import tqdm

In [64]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

#### Query the new index

In [65]:
query = 'How do I earn a course certificate?'

In [66]:
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

In [68]:
response = es_client.search(index=index_name, body=search_query)

In [69]:
# Keep only the stored document (`_source`) of every hit, in response order.
result_docs = [hit['_source'] for hit in response['hits']['hits']]

In [70]:
result_docs

[{'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
  'section': 'General course-related questions',
  'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'After you create a GitHub account, you should clone the course repo to your local machine using the process outlined in this video: Git for Everybody: How to Clone a Repository from GitHub\nHaving this local repository on your computer will make it easy for you to access the instructors’ code and make pull requests (if you want to add your own notes or make changes to the course content).\nYou will probably also create your own repositories that host your notes, versions of your file, to do this. He

#### Functionize the search

In [71]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Run a keyword search against the Elasticsearch index ``index_name``.

    Args:
        query: Free-text question matched against the ``question``, ``text``
            and ``section`` fields.
        course: Value used in the ``term`` filter on the ``course`` field.
            Defaults to the course that was previously hard-coded.
        num_results: Value sent as ``size`` in the request. Defaults to the
            previously hard-coded value of 5.

    Returns:
        A list holding the ``_source`` document of every hit, in the order
        the response lists them.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "^3" is the per-field boost applied to `question`.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields",
                    }
                },
                "filter": {
                    "term": {
                        "course": course,
                    }
                },
            }
        },
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit["_source"] for hit in response["hits"]["hits"]]

In [72]:
elastic_search(query)

[{'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
  'section': 'General course-related questions',
  'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'After you create a GitHub account, you should clone the course repo to your local machine using the process outlined in this video: Git for Everybody: How to Clone a Repository from GitHub\nHaving this local repository on your computer will make it easy for you to access the instructors’ code and make pull requests (if you want to add your own notes or make changes to the course content).\nYou will probably also create your own repositories that host your notes, versions of your file, to do this. He

#### Swap search functions

In [73]:
def rag_elastic(query):
    """Answer a question end to end using ``elastic_search`` as the retriever.

    The question is passed to ``elastic_search``, the hits are turned into a
    prompt by ``build_prompt``, and the text returned by ``prompt_LLM`` is
    returned.
    """
    return prompt_LLM(build_prompt(query, elastic_search(query)))

In [75]:
rag_elastic(query)

'To earn a course certificate, you must finish the course with a “live” cohort. Certificates are not awarded for the self-paced mode because you need to peer-review capstone projects after submitting your own, which requires participation while the course is actively running.'

#### Bonus: Semantic Search  (Module 3)
We've already loaded the raw documents json and flattened it with a for loop.

In [78]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

Previously, we took these documents and created an index, either with minsearch or Elastic Search.  This time, we'll first create embeddings and append them to each document before indexing.

In [79]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-mpnet-base-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [81]:
len(model.encode('How much is that doggie in the window?'))

768

Now we create the vector embeddings using the pretrained model.  We loop through the documents, feeding the text field into the Sentence Transformer and appending the resulting vector back onto each document.

In [82]:
# Build the records to index: each one is a copy of a FAQ document plus a
# "text_vector" holding the embedding of its "text" field as a plain list.
# Copying (instead of adding the key in place) keeps `documents` itself free
# of the vectors, so earlier cells that use `documents` are unaffected on re-run.
operations = [
    {**doc, "text_vector": model.encode(doc["text"]).tolist()}
    for doc in documents
]

Now we'll get these into an Elastic Search index...

In [85]:
es_client.info()

ObjectApiResponse({'name': '3ad582577285', 'cluster_name': 'docker-cluster', 'cluster_uuid': '8h8xDSLpRs2a6dqeOWPfwg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [86]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} ,
            "text_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"},
        }
    }
}

We'll delete and re-add the index.  We should do this every time we need to update the mappings.

In [87]:
index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

Now, we'll add the documents to the index, with their embeddings.

In [89]:
# Index every record. A failure on one document is reported together with the
# document's question (so it can be identified) and does not stop the load.
failed_count = 0

for doc in operations:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        failed_count += 1
        print(f"Failed to index {doc.get('question')!r}: {e}")

# Make a partial load visible instead of leaving it buried in the output.
if failed_count:
    print(f"{failed_count} of {len(operations)} documents were not indexed")

#### Creating embeddings for queries

Queries need embeddings too.  We encode the question with the same Sentence Transformers model that we used for the documents, and put the resulting vector into the search request.

In [105]:
question = 'What is the best platform to complete the course?'
vector_question = model.encode(question)

Instead of passing the question text to the search engine, we pass its embedding vector.  Elasticsearch returns the `k` documents whose `text_vector` is most similar to it (cosine similarity, as configured in the index mapping).

In [109]:
query = {
    "field": "text_vector",
    "query_vector": vector_question,
    "k": 5,
    "num_candidates": 10000, 
}

In [111]:
res = es_client.search(index=index_name, knn=query, source=["text", "section", "question", "course"])
result_docs = []

for hit in res['hits']['hits']:
    result_docs.append(hit['_source'])

result_docs

[{'question': 'Environment - Do we really have to use GitHub codespaces? I already have PostgreSQL & Docker installed.',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': "It's up to you which platform and environment you use for the course.\nGithub codespaces or GCP VM are just possible options, but you can do the entire course from your laptop."},
 {'question': 'Environment - Do we really have to use GitHub codespaces? I already have PostgreSQL & Docker installed.',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': "It's up to you which platform and environment you use for the course.\nGithub codespaces or GCP VM are just possible options, but you can do the entire course from your laptop."},
 {'question': 'Environment - Do I need both GitHub Codespaces and GCP?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Choose the approach th

In [112]:
def semantic_search(question, course=None, num_results=5):
    """Run a kNN vector search on ``text_vector`` for the given question.

    Args:
        question: Free-text question; it is encoded with the module-level
            ``model`` and the resulting vector is used as the query vector.
        course: Optional. When given, a ``term`` filter on the ``course``
            field is added to the kNN query. The default ``None`` sends no
            filter, which is the original behaviour.
        num_results: Value sent as ``k``. Defaults to the previously
            hard-coded value of 5.

    Returns:
        A list holding the ``_source`` of every hit (restricted to the
        ``text``, ``section``, ``question`` and ``course`` fields), in the
        order the response lists them.
    """
    # Convert to a plain list, the same way the document vectors were
    # converted before they were indexed.
    vector_question = model.encode(question).tolist()

    query = {
        "field": "text_vector",
        "query_vector": vector_question,
        "k": num_results,
        "num_candidates": 10000,
    }
    if course is not None:
        query["filter"] = {"term": {"course": course}}

    results = es_client.search(
        index=index_name,
        knn=query,
        source=["text", "section", "question", "course"],
    )

    return [hit["_source"] for hit in results["hits"]["hits"]]

In [113]:
def rag_semantic(query):
    """Answer a question end to end using ``semantic_search`` as the retriever.

    The question is passed to ``semantic_search``, the hits are turned into a
    prompt by ``build_prompt``, and the text returned by ``prompt_LLM`` is
    returned.
    """
    return prompt_LLM(build_prompt(query, semantic_search(query)))

In [114]:
rag_semantic(question)

"It's up to you which platform and environment you use for the course. GitHub Codespaces, GCP VM, or even a setup on your own laptop are all possible options. You should choose the platform that aligns most with your end project goals."

In [115]:
rag_semantic('Can I use linux?')

'Yes, you can use Linux for the course. Linux is considered ideal, but technically, the course should be compatible with Windows and macOS as well, as students have used all three operating systems successfully in the past.'