In [None]:
!pip install minsearch

Collecting minsearch
  Downloading minsearch-0.0.4-py3-none-any.whl.metadata (8.1 kB)
Downloading minsearch-0.0.4-py3-none-any.whl (11 kB)
Installing collected packages: minsearch
Successfully installed minsearch-0.0.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [None]:
import json

In [None]:
import minsearch

In [None]:
# Load the raw FAQ dump. The encoding is passed explicitly so that the
# non-ASCII punctuation in the answers is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [None]:
# Flatten the per-course groups into a single list, tagging every FAQ entry
# with the course it belongs to. A new dict is built for each entry so the
# records inside docs_raw are not modified.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [None]:
# Display the first flattened record to check which fields it carries.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  â€œOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDonâ€™t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [None]:
# "course" is declared as a keyword field (it is used as a filter by search());
# the remaining fields are declared as text fields.
index = minsearch.Index(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)

In [None]:
# Sample user question; sent to the model as-is further down.
q = 'the course has already started, can I still enroll?'

In [None]:
# Fit the index on the flattened FAQ documents.
index.fit(documents)

<minsearch.minsearch.Index at 0x79d13fbf13d0>

In [None]:
#from openai import OpenAI

In [None]:
#client = OpenAI()

In [None]:
from groq import Groq

In [None]:
# NOTE(review): no credentials are passed here — presumably the SDK picks the
# API key up from the environment; confirm before running on a new machine.
client = Groq() 

In [None]:
# Baseline without retrieval: the bare question is the only message sent,
# so the model gets no FAQ context.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": q}],
    model='qwen/qwen3-32b',
)

response.choices[0].message.content

'<think>\nOkay, the user is asking if they can still enroll in a course that\'s already started. Let me think about how to approach this.\n\nFirst, I need to consider different possible scenarios. Courses can vary a lot in terms of structure and policies. For example, some courses might have strict deadlines for enrollment, while others are more flexible. Maybe the user is in a university setting versus an online course like Coursera or Udemy.\n\nI should start by checking the specific platform or institution\'s policy. If it\'s a university course, they might have a drop/add period. But since I don\'t know the exact context, I need to provide general advice. Let me recall some common policies. Often, you can join a course after it starts, but you might lose some points or not have full access to past materials.\n\nWait, the user might need to contact the instructor or administrator. That\'s a standard step. Also, online platforms usually have an enrollment option. If the course is fre

In [None]:
def search(query, course="data-engineering-zoomcamp", num_results=3):
    """Look up FAQ entries for ``query`` in the minsearch ``index``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field is filtered on. Defaults
            to the course this notebook was originally hard-wired to.
        num_results: Maximum number of hits requested from the index.

    Returns:
        The result of ``index.search`` for these arguments.
    """
    # Field weights passed to the index: question boosted to 3.0, section
    # reduced to 0.5.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={"course": course},
        num_results=num_results,
        boost_dict=boost,
    )

In [None]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the question and the retrieved FAQ entries.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts with ``section``, ``question`` and
            ``text`` keys; each becomes one entry of the CONTEXT section.

    Returns:
        The formatted prompt with surrounding whitespace stripped.
    """
    # Built from adjacent string literals rather than an indented
    # triple-quoted string, so the indentation of this function body does not
    # end up inside the prompt text.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT:\n"
        "{context}"
    )

    # Entries are separated by a blank line.
    context = "\n\n".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [None]:
def llm(prompt, model='qwen/qwen3-32b'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text; sent as the only message, with role "user".
        model: Model identifier passed to the chat-completions call. Defaults
            to the model this notebook uses throughout.

    Returns:
        The ``content`` of the first choice's message.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [None]:
query = 'how to run kafka?'

def rag(query, search_fn=None):
    """Answer ``query`` by retrieving FAQ entries and prompting the LLM with them.

    Args:
        query: The user's question.
        search_fn: Callable taking the query and returning the documents to
            use as context. When omitted, the minsearch-backed ``search`` is
            used, so existing ``rag(query)`` calls behave as before.

    Returns:
        The text returned by ``llm`` for the assembled prompt.
    """
    # Resolved at call time so the default follows whatever ``search`` is
    # currently defined in the notebook.
    if search_fn is None:
        search_fn = search

    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)
    return llm(prompt)

In [None]:
# Run the retrieval-backed pipeline on the sample question and display the answer.
rag(query)

'<think>\nOkay, let\'s tackle the question: "How to run Kafka?" based on the provided context. First, I need to parse through the given CONTEXT sections to extract relevant information.\n\nLooking at the first section under Module 6, there\'s a mention of creating a virtual environment and installing requirements. The user had an issue with the "Module â€˜kafkaâ€™ not found" when running producer.py. The solution involves setting up a virtual environment, activating it, installing dependencies from requirements.txt, and then running the Python files. It also notes that Docker images should be running first.\n\nThe second part under Module 6 talks about Java Kafka, giving a command to run Java classes like JsonProducer.java using a JAR file. The command includes the classpath and the specific Java file. \n\nIn the Project section, there\'s an answer about fixing a ModuleNotFoundError related to \'kafka.vendor.six.moves\', suggesting using kafka-python-ng instead of the standard kafka-py

# Elasticsearch

In [None]:
from elasticsearch import Elasticsearch

In [None]:
# Client pointed at http://localhost:9200.
# NOTE(review): assumes an Elasticsearch node is already listening there — confirm before running.
es_client = Elasticsearch('http://localhost:9200')

In [None]:
#es_client.info()

In [None]:
# Create the index.
# The three free-text fields are mapped as "text"; "course" is mapped as
# "keyword" and is used in a term filter by elastic_search().
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Drop any index left over from a previous run so that this cell, and the
# indexing cell that follows it, can be re-executed without an "already
# exists" error or duplicated documents. A missing index is not an error.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index = index_name, body = index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [None]:
from tqdm.auto import tqdm

In [None]:
# Send the documents to Elasticsearch one request at a time, with tqdm
# showing progress over the whole list.
for doc in tqdm(documents):
    es_client.index(document=doc, index=index_name)

  0%|          | 0/948 [00:00<?, ?it/s]

In [None]:
# Sample question for the Elasticsearch-backed pipeline.
query = "I just discovered the course. Can I still join it?"

In [None]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Retrieve FAQ entries for ``query`` from the Elasticsearch index.

    Args:
        query: Free-text question to match against the question, text and
            section fields.
        course: Exact value the ``course`` field is filtered on. Defaults to
            the course this notebook was originally hard-wired to.
        size: Maximum number of hits to request.

    Returns:
        A list with the ``_source`` document of every hit, in the order the
        hits appear in the response.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "^3" multiplies the score contribution of the
                        # question field by 3 (a linear boost, not an exponent).
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [None]:
def rag(query, search_fn=None):
    """Answer ``query`` by retrieving FAQ entries and prompting the LLM with them.

    Args:
        query: The user's question.
        search_fn: Callable taking the query and returning the documents to
            use as context. When omitted, ``elastic_search`` is used, so
            existing ``rag(query)`` calls behave as before.

    Returns:
        The text returned by ``llm`` for the assembled prompt.
    """
    # Resolved at call time so the default follows whatever
    # ``elastic_search`` is currently defined in the notebook.
    if search_fn is None:
        search_fn = elastic_search

    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)
    return llm(prompt)

In [None]:
# Run the Elasticsearch-backed pipeline on the sample question and display the answer.
rag(query)

"<think>\nOkay, the user is asking if they can join the course now that they've just discovered it. Let me look at the context provided.\n\nFirst, there's a section about joining after the start date. The answer says yes, you can still join and submit homework even if you didn't register. But there's a note about final project deadlines, so they shouldn't procrastinate.\n\nAnother section mentions that materials are kept after the course ends, so you can follow along at your own pace. Also, you can keep working on homework and prepare for the next cohort or start the capstone project.\n\nRegistration isn't required for submission, and confirmation emails aren't sent. The registration is just for interest tracking. So even if they missed the start date, they can still join and participate.\n\nPutting it all together: Yes, they can join now. Registration isn't mandatory, but they can still access materials and submit homework. They should be mindful of deadlines for projects. Also, the s