In [1]:
import minsearch
import json

In [2]:
# Load every FAQ file and flatten it into a single list of documents.
# Each file is a list of course entries with a 'course' name and a 'documents'
# list; every document is tagged with its course so it can be filtered later.
docs_raw = []
documents = []
filenames = ["documents.json", "documents-llm.json"]
for file in filenames:
    # Explicit UTF-8: without it open() falls back to the platform's locale
    # encoding, which can fail on or mis-decode non-ASCII text in the JSON.
    with open(f"../01-intro/{file}", "rt", encoding="utf-8") as f_in:
        docs_raw = json.load(f_in)
    for course_dict in docs_raw:
        for doc in course_dict['documents']:
            doc['course'] = course_dict['course']
            documents.append(doc)

In [3]:
# Build the minsearch index: free-text search over question/text/section,
# exact-match filtering on course.
index = minsearch.Index(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)
index.fit(documents)

<minsearch.minsearch.Index at 0x109645e50>

In [4]:
# Sample student question; reused as the search query and as the question sent to the LLM in the cells below.
query = "The course has started, can I still enrol?"

In [5]:
# Restrict hits to one course, weight question matches at 3.0 and section
# matches at 0.2, and keep the top three results.
filter_dict = {"course": "data-engineering-zoomcamp"}
boost = {"question": 3.0, "section": 0.2}
results = index.search(
    query=query,
    filter_dict=filter_dict,
    boost_dict=boost,
    num_results=3,
)


In [6]:
from openai import OpenAI

In [7]:
# No credentials are passed here — presumably the client picks up the API key from the environment (e.g. OPENAI_API_KEY); TODO confirm.
client = OpenAI()

Without the knowledge base

In [8]:
# Baseline: send the bare question to the model, with no retrieved context.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": query}],
    model="gpt-4.1-nano",
)

In [9]:
# Display the text of the first returned choice.
response.choices[0].message.content

"It depends on the course policies and the enrollment deadlines set by the institution. I recommend checking the course's official website or contacting the course administrator or admissions office directly to get accurate information about late enrollment options."

**Let's add the knowledge base**

In [10]:
# Prompt with two placeholders, {question} and {context}, filled in later via str.format().
# NOTE(review): the last rule tells the model to say it found information "by searching
# the web" when CONTEXT is empty. That conflicts with "Use only the facts from the
# CONTEXT" and may invite made-up answers — consider rewording it to a plain
# "the information is not available" instruction.
prompt_template = """
You are a course assistant, and your goal is to answer questions of students, where QUESTION is provided below and CONTEXT is provided most of the times. 
Rules:
* Answer the QUESTION based on the CONTEXT.
* Use only the facts from the CONTEXT.
* If CONTEXT is empty, please let the student know, the information about their query is not there, however you found the following information on the web, by searching the web

QUESTION: {question}

CONTEXT: {context}
""".strip()

In [11]:
# Render the retrieved FAQ entries as the CONTEXT section of the prompt.
context = ""
# Reset on every run: `course` is only assigned inside the loop, so with no hits
# it would otherwise be undefined (NameError) or hold a stale value left in the
# kernel by an earlier execution.
course = None
for doc in results:
    context += f"Section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    course = doc['course']
if context:
    # Prefix the entries with the course name taken from the last hit.
    context = f"Course: {course}" + "\n\n" + context
# With no hits, context stays empty so the prompt's "If CONTEXT is empty" rule applies.
print(context)

Course: data-engineering-zoomcamp

Section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

Section: General course-related questions
question: Course - When will the course start?
answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 

**Building the prompt**

In [12]:
# Fill the template's {question} and {context} placeholders.
prompt = prompt_template.format(question=query, context=context)

In [13]:
# Print the rendered prompt to inspect it before it is sent to the model.
print(prompt)

You are a course assistant, and your goal is to answer questions of students, where QUESTION is provided below and CONTEXT is provided most of the times. 
Rules:
* Answer the QUESTION based on the CONTEXT.
* Use only the facts from the CONTEXT.
* If CONTEXT is empty, please let the student know, the information about their query is not there, however you found the following information on the web, by searching the web

QUESTION: The course has started, can I still enrol?

CONTEXT: Course: data-engineering-zoomcamp

Section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials a

**The knowledge base is added; RAG with minsearch is complete**

In [14]:
# Ask the model again, this time with the context-augmented prompt.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": prompt}],
    model="gpt-4.1-nano",
)
response.choices[0].message.content

"Yes, you can still join the course after it has started. Even if you register now, you will be eligible to submit the homeworks. Just keep in mind that there will be deadlines for final projects, so it's best not to leave everything for the last minute."

In [15]:
from elasticsearch import Elasticsearch

In [16]:
# Client for the Elasticsearch node expected at localhost:9200 (plain HTTP).
es_client = Elasticsearch("http://localhost:9200")

In [17]:
# Name of the Elasticsearch index that stores the FAQ documents.
index_name = "course-questions"

# Index definition: one shard, no replicas. The three free-text fields are
# mapped as `text`; `course` is mapped as `keyword`.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
        },
    },
}

In [18]:
# Drop any index left over from an earlier run so this cell can be re-executed:
# creating an index that already exists fails, and keeping the old one would
# make the indexing cell below add every document a second time.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
# Kept as the last expression so the cell still displays the create response.
es_client.indices.create(body=index_settings, index=index_name)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [19]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
# Send the documents to Elasticsearch one request at a time; tqdm shows progress.
for faq_entry in tqdm(documents):
    es_client.index(index=index_name, document=faq_entry)

100%|████████████████████████████████████████████████| 1034/1034 [00:02<00:00, 430.09it/s]


In [21]:
# Elasticsearch request body: a best_fields multi_match on the question
# (question boosted x3, section x0.3), filtered to a single course, top 5 hits.
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section^0.3"],
                    "type": "best_fields",
                },
            },
            "filter": {"term": {"course": "data-engineering-zoomcamp"}},
        },
    },
}

In [22]:
# Run the query against the index.
# NOTE(review): this rebinds `results`, which earlier held the minsearch hit list;
# from here on it is the raw Elasticsearch response (hits live under ['hits']['hits']).
results = es_client.search(index=index_name, body=search_query)

In [23]:
# Keep only the stored document (`_source`) of each hit.
extracted_results = [hit['_source'] for hit in results['hits']['hits']]

In [24]:
# Render the Elasticsearch documents as the CONTEXT section and build the prompt.
context = ""
# Reset on every run: `course` is only assigned inside the loop, so with no hits
# the header would otherwise reuse whatever value an earlier cell left behind
# (or raise NameError on a fresh kernel).
course = None
for doc in extracted_results:
    context += f"Section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    course = doc['course']
if context:
    # Prefix the entries with the course name taken from the last hit.
    context = f"Course: {course}" + "\n\n" + context
# With no hits, context stays empty so the prompt's "If CONTEXT is empty" rule applies.
prompt = prompt_template.format(question=query, context=context)

In [25]:
# Ask the model with the prompt built from the Elasticsearch results.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": prompt}],
    model="gpt-4.1-nano",
)
response.choices[0].message.content

"Yes, you can still enroll in the course even after it has started. However, if you don't register, you are still eligible to submit the homeworks. Keep in mind that there will be deadlines for turning in the final projects."