In [1]:
import os
import json
from tqdm.auto import tqdm

import minsearch

# Base search

In [2]:
with open("../assets/documents.json", "r") as fin:
    raw = json.load(fin)

In [3]:
len(raw)

3

In [4]:
raw[0].keys()

dict_keys(['course', 'documents'])

In [5]:
raw[0]["course"]

'data-engineering-zoomcamp'

In [6]:
documents = []

for course in raw:
    course_name = course["course"]
    for doc in course["documents"]:
        doc["course"] = course_name
        documents.append(doc)

In [7]:
len(documents)

948

In [8]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [9]:
index = minsearch.Index(
    text_fields=["section", "question", "text"],
    keyword_fields=["course"]
)

In [10]:
index.fit(documents);

In [11]:
query = 'the course has already started, can I still enroll?'

In [12]:
boost = {'question': 3.0, 'section': 0.5}

results = index.search(
    query=query,
    boost_dict=boost,
    num_results=5
)

In [13]:
results[:2]

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'}]

# OpenAI

In [14]:
from openai import OpenAI

In [15]:
client = OpenAI(api_key=os.environ["OPENAI_KEY"])

In [16]:
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role": "user", "content": query}]
)

response.choices[0].message.content

"Whether you can still enroll in a course that has already started depends on the specific policies of the institution or organization offering the course. Here are some steps you can take to find out:\n\n1. **Check the Course Website:** Look for any information regarding late enrollment or registration deadlines on the course's official webpage.\n\n2. **Contact the Instructor:** Reach out to the instructor or course coordinator via email or campus messaging system. Explain your situation and ask if late enrollment is possible.\n\n3. **Contact the Registrar's Office:** The registrar's office at your institution typically handles course enrollments and academic records. Contact them to inquire about the possibility of late registration.\n\n4. **Review the Institution's Policies:** Some institutions have specific add/drop periods during which students can join or leave courses without penalty. Check if you are still within this period.\n\n5. **Check for Prerequisites or Conditions:** Ens

In [17]:
prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: {question}
    
    CONTEXT: 
    {context}
""".strip()

In [18]:
context = ""

for doc in results:
    context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

prompt = prompt_template.format(question=query, context=context).strip()

In [19]:
prompt

"You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n    Use only the facts from the CONTEXT when answering the QUESTION.\n    \n    QUESTION: the course has already started, can I still enroll?\n    \n    CONTEXT: \n    section: General course-related questions\nquestion: The course has already started. Can I still join it?\nanswer: Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.\n\nsection: General course-related questions\nquestion: Course - Can I still join the course after the start date?\nanswer: Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will b

In [20]:
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role": "user", "content": prompt}]
)

In [21]:
response.choices[0].message.content

"Yes, you can still enroll in the course even though it has already started. You may not be able to submit some of the homeworks, but you can still participate. To obtain a certificate, you need to submit 2 out of 3 course projects and review 3 peers' projects by the deadline, so if you join late but manage to complete the necessary projects and reviews, you will still be eligible for a certificate."

# Elastic Search

In [22]:
from elasticsearch import Elasticsearch

In [23]:
es_client = Elasticsearch('http://elasticsearch:9200') 

In [24]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [25]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [26]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [27]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [28]:
def build_prompt(query, search_results):
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: {question}
    
    CONTEXT: 
    {context}
    """.strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [29]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [30]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [31]:
query = 'I just disovered the course. Can I still join it?'

In [32]:
rag(query)

"Yes, you can still join the course even if you discovered it after the start date. You are eligible to submit the homeworks and participate in the course activities. Just keep in mind the deadlines for the final projects, so it's best not to leave everything for the last minute."