### Load data

In [1]:
import minsearch
from elasticsearch import Elasticsearch
import json
from tqdm import tqdm
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

True

In [18]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [19]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

### Use minsearch to retrieve information

In [20]:
index = minsearch.Index(
    text_fields = ["question", "text", "section"],
    keyword_fields = ["course"]
)

q = "the course has aleardy started, can i still enroll?"

index.fit(documents)

<minsearch.Index at 0x19afee20b00>

In [21]:
boost = {"question" : 3.0}

resutls = index.search(
    query=q,
    boost_dict=boost,
    num_results=5
)

resutls

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the cour

### Generate answer with GPT 3.5

In [22]:
client = OpenAI()

response = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[{"role": "user", "content": q}]
 )
response.choices[0].message.content

'It depends on the specific course and the policies of the institution offering it. Some courses may allow late enrollment or may have a grace period for students to join after the official start date. It is recommended to contact the course instructor or institution to inquire about the possibility of enrolling after the course has already started.'

### Building prompt

In [23]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT. Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
"""

In [24]:
context = ""

for doc in resutls:
    context = context + f"section: {doc['section']}\nquestion: {doc['question']}, \nanswer: {doc['text']}"

print(context)

section: General course-related questions
question: The course has already started. Can I still join it?, 
answer: Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.
In order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.section: General course-related questions
question: Course - Can I still join the course after the start date?, 
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.section: General course-related questions
question: Course - Can I follow the course after it finishes?, 
answer: Yes, we will keep all the materials after the course finishes, so you

In [25]:
prompt = prompt_template.format(question=q, context=context)

In [26]:
client = OpenAI()

response = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[{"role": "user", "content": prompt}]
 )
response.choices[0].message.content

"Based on the context provided, it seems that you can still enroll in the course after it has already started. You may not be able to submit some of the homework assignments, but you can still take part in the course. Additionally, you can work on two out of three course projects and review peers' projects to be eligible for a certificate. Therefore, you can still participate in the course even if it has already begun."

### Turning above code into functions

In [27]:
def search(query):
    boost = {"question" : 3.0}

    results = index.search(
        query=query,
        boost_dict=boost,
        num_results=5
    )

    return results


def build_prompt(query, context):
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT. Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: {question}

    CONTEXT:
    {context}
    """.strip()

    context = ""

    for doc in resutls:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}, \nanswer: {doc['text']}\n\n"

    prompt = prompt_template.format(question=query, context=context)
    return prompt


def llm(prompt):
    response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

In [28]:
query = "how do I run kafka?"

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [29]:
rag(query)

'To run Kafka, you would need to install and set up all the dependencies and requirements, which may include Google cloud account, Google Cloud SDK, Python 3 (installed with Anaconda), Terraform, and Git. It would be best to look over the prerequisites and syllabus to ensure you are comfortable with these subjects before starting the course.'

In [30]:
rag("how can I build docker image?")

'To build a Docker image, you can start by installing and setting up all the dependencies and requirements such as Git. After that, you can follow the Docker documentation to create your Dockerfile and build the image based on your specifications. Remember to push your image to a Docker registry if needed.'

### Setup Elasticsearch client

In [5]:
es_client = Elasticsearch('http://localhost:9200')
es_client.info()

ObjectApiResponse({'name': '9304585cd0fc', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'MFtQhQxrSXGSR3WQ8on0RQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

### Indexing documents

In [None]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

In [7]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:47<00:00, 19.98it/s]


### Searching with Elasticsearch

In [13]:
query = "How can I run Kafka?"
results_docs = []
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

response = es_client.search(index=index_name, body=search_query)
for hit in response['hits']['hits']:
    results_docs.append(hit['_source'])

results_docs


[{'text': 'In Confluent Cloud:\nEnvironment → default (or whatever you named your environment as) → The right navigation bar →  “Stream Governance API” →  The URL under “Endpoint”\nAnd create credentials from Credentials section below it',
  'section': 'Module 6: streaming with kafka',
  'question': 'Confluent Kafka: Where can I find schema registry URL?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'In Confluent Cloud:\nEnvironment → default (or whatever you named your environment as) → The right navigation bar →  “Stream Governance API” →  The URL under “Endpoint”\nAnd create credentials from Credentials section below it',
  'section': 'Module 6: streaming with kafka',
  'question': 'Confluent Kafka: Where can I find schema registry URL?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java',
  'section': 'Module 6: streaming with kafka',
  'questio

In [15]:
def elastic_search(query):
    results_docs = []
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    for hit in response['hits']['hits']:
        results_docs.append(hit['_source'])

    return results_docs

In [16]:
elastic_search(query)

[{'text': 'In Confluent Cloud:\nEnvironment → default (or whatever you named your environment as) → The right navigation bar →  “Stream Governance API” →  The URL under “Endpoint”\nAnd create credentials from Credentials section below it',
  'section': 'Module 6: streaming with kafka',
  'question': 'Confluent Kafka: Where can I find schema registry URL?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'In Confluent Cloud:\nEnvironment → default (or whatever you named your environment as) → The right navigation bar →  “Stream Governance API” →  The URL under “Endpoint”\nAnd create credentials from Credentials section below it',
  'section': 'Module 6: streaming with kafka',
  'question': 'Confluent Kafka: Where can I find schema registry URL?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java',
  'section': 'Module 6: streaming with kafka',
  'questio

In [31]:
def rag_els(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

rag_els(query)

'To run Kafka, before the course starts you should install and set up the following dependencies and requirements:\n- Google cloud account\n- Google Cloud SDK\n- Python 3 (installed with Anaconda)\n- Terraform\n- Git\nMake sure to also review the prerequisites and syllabus to ensure you are comfortable with the subjects covered in the course.'