# 1. Create retrieval engine

In [1]:
from minsearch import Index
import json

In [2]:
with open("documents.json", "rt") as f_in:
    docs_raw = json.load(f_in)

In [3]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [4]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [5]:
index = Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

The code above is equal to `SELECT * WHERE course = ...`

In [6]:
index.fit(documents)

<minsearch.Index at 0x78bea9e83a40>

In [7]:
q = 'the course has alraedy started, can I still enroll?'

In [8]:
boost = {'question': 3.0, 'section': 0.5}

results = index.search(
    query=q,
    filter_dict={"course": "data-engineering-zoomcamp"},
    boost_dict=boost,
    num_results=5
)

In [9]:
results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

# 2. Create generation engine

In [1]:
from groq import Groq
from dotenv import dotenv_values
config = dotenv_values("../.env")
client = Groq(
    api_key=config['GROQ_API_KEY'],
)

Got the response. Now need to build a prompt template for the model.

In [16]:
context = ""

for doc in results:
    context += f"section: {doc["section"]}\nquestion: {doc["question"]}\nanswer:: {doc["text"]}\n\n"

prompt_template = """You're a course teaching assistant. You are supposed to help students by answering QUESTION using information from CONTEXT only.

QUESTION: {question}
CONTEXT:
{context}
"""
prompt = prompt_template.format(question=q, context=context).strip()

In [17]:
prompt

"You're a course teaching assistant. You are supposed to help students by answering QUESTION using information from CONTEXT only.\n\nQUESTION: the course has alraedy started, can I still enroll?\nCONTEXT:\nsection: General course-related questions\nquestion: Course - Can I still join the course after the start date?\nanswer:: Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.\n\nsection: General course-related questions\nquestion: Course - Can I follow the course after it finishes?\nanswer:: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.\n\nsection: General course-related questions\nquestion: 

In [18]:
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": prompt,
        }
    ],
    model="llama3-70b-8192",
)

print(chat_completion.choices[0].message.content)

According to the context, the answer to your question is:

Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.


# 3. Modularization of the code

In [21]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    search_results = index.search(
        query=query,
        filter_dict={"course": "data-engineering-zoomcamp"},
        boost_dict=boost,
        num_results=5
    )
    return search_results

In [24]:
def build_prompt(query, search_results):
    context = ""

    for doc in search_results:
        context += f"section: {doc["section"]}\nquestion: {doc["question"]}\nanswer:: {doc["text"]}\n\n"

    prompt_template = """You're a course teaching assistant. You will answer QUESTION using information from CONTEXT only.

    QUESTION: {question}

    CONTEXT:
    {context}
    """
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

'You\'re a course teaching assistant. You will answer QUESTION using information from CONTEXT only.\n\n    QUESTION: How do I run kafka\n\n    CONTEXT:\n    section: Module 6: streaming with kafka\nquestion: Java Kafka: How to run producer/consumer/kstreams/etc in terminal\nanswer:: In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n\nsection: Module 6: streaming with kafka\nquestion: Module “kafka” not found when trying to run producer.py\nanswer:: Solution from Alexey: create a virtual environment and run requirements.txt and the python files in that environment.\nTo create a virtual env and install packages (run only once)\npython -m venv env\nsource env/bin/activate\npip install -r ../requirements.txt\nTo activate it (you\'ll need to run it every time you need the virtual env):\nsource env/bin/activate\nTo deactivate it:\ndeactivate\nThis works on MacOS, Linux and Windows - but for Windows the path is sli

In [34]:
def llm(prompt):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="llama3-8b-8192",
    )

    return chat_completion.choices[0].message.content

In [26]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    return llm(prompt)

In [30]:
query = "How do I run kafka?"

rag(query)

"To run Kafka, you can use the following command in the project directory:\n\n`java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java`\n\nNote: This is based on the information provided in the context, and it's assumed that you have a Java-based Kafka project set up. If you're using Python, you may need to create a virtual environment, install the required packages, and then run the Python files in that environment."

In [32]:
rag('the course has already started, can I still enroll?')

"According to the context, the answer is: Yes, even if you don't register, you're still eligible to submit the homeworks."

# 4. Using ElasticSearch

All the function above is also moved to a new Python file that is imported here.

Now, to do it properly, we can run an ElasticSearch cluster instead of the minimal `minsearch.Index`

To run ElasticSearch, the easiest way is to spin a Docker container (wow, Docker is the *easiest way* for me now, truly a character development) and ingest + index the documents first before RAG-ging on it.
```bash
docker run -it \
    --name elasticsearch \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3
```

In [5]:
from elasticsearch import Elasticsearch
from rag import get_documents, build_prompt, llm
from tqdm.auto import tqdm

In [2]:
es_client = Elasticsearch("http://127.0.0.1:9200")
es_client.info()

ObjectApiResponse({'name': '866794e4e2a2', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'ueY2ScRMS7mrRNOwCG073w', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [None]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}
index_name = "course_questions"
es_client.indices.create(index=index_name, body=index_settings)

In [6]:
documents = get_documents()
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:20<00:00, 47.37it/s]


In [7]:
def elastic_search(query: str) -> list:
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name, body=search_query)
    result_docs = []

    for hit in response["hits"]["hits"]:
        result_docs.append(hit["_source"])
    return result_docs

In [10]:
def rag(query, model):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    return llm(prompt, model)

In [11]:
rag("How do I run Kafka?", "llama3-8b-8192")

'To run Kafka, you can use the instructions provided in the "Java Kafka" section. The instructions for running producer/consumer/kstreams, etc. in the terminal is as follows:\n\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java'