In [17]:
from openai import OpenAI
import minsearch
import json
from tqdm.auto import tqdm
from elasticsearch import Elasticsearch
from dotenv import load_dotenv
import os

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [3]:
# Load the raw FAQ dump. The encoding is given explicitly because the FAQ text
# contains non-ASCII punctuation (curly quotes), which would otherwise be decoded
# with the platform's default encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

# Flatten the per-course nesting into one list of documents, tagging each
# document with the course it belongs to. New dicts are built instead of
# mutating `docs_raw`, so re-running this cell always starts from the raw data.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
# minsearch index over the FAQ documents: `question`, `text` and `section` are
# searched as free text, while `course` is declared as a keyword field (it is
# what `search` passes in `filter_dict`).
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

Filtering on the `course` keyword field is the equivalent of the SQL query `SELECT * FROM documents WHERE course = 'data-engineering-zoomcamp';`

In [5]:
# Sample question, sent to the model as-is (without any FAQ context) further down.
q = 'the course has already started, can I still enroll?'

In [6]:
# Fit the index on the flattened documents; must run before `search` is called.
index.fit(documents)

<minsearch.Index at 0x731088bd3e50>

In [7]:
# OpenAI client built with no explicit arguments.
# NOTE(review): presumably the API key comes from the environment populated by
# `load_dotenv()` in the first cell — confirm a `.env` file is present.
client = OpenAI()

In [9]:
# Baseline without retrieval: the raw question `q` is the only message sent,
# so the model has no FAQ context to draw on.
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role": "user", "content": q}]
)

# Display the text of the first returned choice.
response.choices[0].message.content

"Whether you can still enroll in a course that has already started depends on several factors, including the policies of the institution offering the course, the specific course itself, and how much of the course has already taken place. Here are some steps you can take to see if late enrollment is possible:\n\n1. **Check the Institution's Policies:** Some institutions have a deadline for adding courses. Check the academic calendar or the relevant section of the institution’s website for information about add/drop dates.\n\n2. **Contact the Instructor:** Reach out directly to the course instructor. Explain your situation and express your interest in the course. The instructor may be willing to make an exception and allow you to enroll late, especially if you have a compelling reason.\n\n3. **Consult Academic Advising:** Speak with an academic advisor. They can provide guidance on the institution’s policies and help you navigate the process of late enrollment.\n\n4. **Assess Feasibility

In [10]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Retrieve the best-matching FAQ entries for ``query`` from the minsearch index.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field must equal. Defaults to
            'data-engineering-zoomcamp', the value that used to be hard-coded.
        num_results: Maximum number of entries to request (default 5, the
            value that used to be hard-coded).

    Returns:
        The result of ``index.search``. ``build_prompt`` iterates over it and
        reads the 'section', 'question' and 'text' keys of each item.
    """
    # Matches in `question` weigh most and matches in `section` least.
    # `text` is not listed here (presumably left at minsearch's default weight).
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [11]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user's question and the retrieved FAQ entries.

    Args:
        query: The user's question; substituted into the QUESTION slot.
        search_results: Iterable of mappings, each providing the keys
            'section', 'question' and 'text'.

    Returns:
        The prompt text with leading and trailing whitespace removed.
    """
    # One "section / question / answer" paragraph per retrieved entry, each
    # followed by a blank line.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]
    context = "".join(entries)

    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # The final strip drops the blank line left after the last entry.
    return template.format(question=query, context=context).strip()


In [12]:
def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` to the chat-completions API as a single user message.

    Args:
        prompt: Full prompt text, used as the content of the only message.
        model: Name of the chat model to call. Defaults to 'gpt-4o', the value
            that used to be hard-coded.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    # Only the first choice is read; no other choices are inspected.
    return response.choices[0].message.content

In [13]:
query = 'how do I run kafka?'

def rag(query):
    """Answer ``query`` using the minsearch-backed pipeline.

    Runs the three stages in order: ``search`` retrieves FAQ entries,
    ``build_prompt`` wraps them and the question into a prompt, and ``llm``
    returns the model's answer to that prompt.
    """
    return llm(build_prompt(query, search(query)))

In [14]:
# print() rather than a bare expression so the newlines in the answer are rendered.
print(rag(query))

To run Kafka, you need to perform the following steps depending on the context you are working with. For Java Kafka users, here is how you can run producer, consumer, kstreams, etc., in the terminal:

In the project directory, run:
```sh
java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java
```

For Python Kafka users, you may need to set up a virtual environment and make sure you have the correct dependencies installed. First, create a virtual environment and install the required packages:

To create a virtual environment and install packages:
```sh
python -m venv env
source env/bin/activate
pip install -r ../requirements.txt
```

To activate the virtual environment (you'll need to run it every time you need the virtual environment):
```sh
source env/bin/activate
```

To deactivate the virtual environment:
```sh
deactivate
```

Make sure Docker images are up and running before running the Python Kafka files. 

Additionally, if you encounter an

# With Elasticsearch

In [15]:
# Client for an Elasticsearch server that must already be listening on localhost:9200.
es_client = Elasticsearch('http://localhost:9200')

In [16]:
# Index definition. The three free-text fields are mapped as `text` (analyzed for
# full-text search); `course` is mapped as `keyword` so it can be matched
# exactly by the `term` filter in `elastic_search`.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# `indices.create` raises if the index already exists, which made this cell fail
# on every re-run. Drop any index left over from a previous run first;
# `ignore_unavailable=True` makes the delete a no-op on the very first run.
# WARNING: this discards whatever was previously stored under `index_name`.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [18]:
# Send the documents to Elasticsearch one request per document, with a progress bar.
# NOTE: no `id` is passed to `index`, so running this cell again against the
# same index stores every document a second time.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:04<00:00, 236.87it/s]


In [23]:
# Rebinds `query` (previously the Kafka question) for the comparison cells below.
# NOTE(review): 'disovered' is misspelled; left untouched because the string is
# what gets searched and sent to the model.
query = 'I just disovered the course. Can I still join it?'

In [24]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Retrieve the best-matching FAQ entries for ``query`` from Elasticsearch.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` field must equal. Defaults to
            "data-engineering-zoomcamp", the value that used to be hard-coded.
        num_results: Maximum number of hits to request (default 5, the value
            that used to be hard-coded).

    Returns:
        A list with the ``_source`` of every hit, in the order Elasticsearch
        returned them.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                # Full-text match across the three text fields; "question^3"
                # boosts matches in the question field by a factor of 3.
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Restrict hits to a single course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored documents, dropping the per-hit metadata.
    return [hit['_source'] for hit in response['hits']['hits']]

In [25]:
def rag_es(query):
    """Answer ``query`` using the Elasticsearch-backed pipeline.

    Same stages as ``rag``, except that retrieval goes through
    ``elastic_search`` instead of ``search``.
    """
    return llm(build_prompt(query, elastic_search(query)))

In [26]:
# Reference answer from the minsearch-backed pipeline for the new `query`.
print(rag(query))

Yes, you can still join the course even after the start date. You're eligible to submit the homeworks, but please be aware that there will be deadlines for turning in the final projects, so make sure not to leave everything for the last minute.


In [27]:
# Same question answered through the Elasticsearch-backed pipeline.
print(rag_es(query))

Yes, you can still join the course even if you have just discovered it. You don't need to register to start learning and submitting homework. However, be mindful of the deadlines for turning in the final projects.


In [28]:
# Second run of the identical call: the recorded output is worded differently
# from the previous cell's, i.e. the generated answer is not the same on every run.
print(rag_es(query))

Yes, you can still join the course even if you just discovered it. You don't need to register to be eligible to submit the homeworks. Just be aware of the deadlines for turning in the final projects and try not to leave everything to the last minute.
