In [1]:
import json

In [2]:
with open('./documents.json', 'rt') as f_in:
    documents_file = json.load(f_in)

In [3]:

documents_file[0]['documents'][0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?'}

In [4]:

documents = []

In [5]:
for course in documents_file:
    course_name = course['course']
     
    for doc in course['documents']:
       doc['course'] = course_name
       documents.append(doc)

In [6]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [7]:
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
es.info()

ObjectApiResponse({'name': 'e21286a6d9de', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'NDEcdqssQMOB-96y-9k49w', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [11]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "more-course-questions"
response = es.indices.create(index=index_name, body=index_settings)

response

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'more-course-questions'})

In [None]:
index_name = "more-course-questions"

In [None]:
from tqdm.auto import tqdm

In [None]:
for doc in tqdm(documents):
    es.index(index=index_name, document=doc)

In [None]:
user_question = "How do I join the course after it has started?"

search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": user_question,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

In [None]:
response = es.search(index=index_name, body=search_query)

for hit in response['hits']['hits']:
    doc = hit['_source']
    print(f"Section: {doc['section']}\nQuestion: {doc['question']}\nAnswer: {doc['text']}\n\n")

In [13]:
def retrieve_documents(query, index_name="more-course-questions", max_results=5):
    es = Elasticsearch("http://localhost:9200")

    search_query = {
        "size": max_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    response = es.search(index=index_name, body=search_query)
    documents = [hit['_source'] for hit in response['hits']['hits']]
    return documents

In [14]:
user_question = "How do I join the course once it has already started?"

response = retrieve_documents(user_question)

for doc in response:
    print(f"Section: {doc['section']}\nQuestion: {doc['question']}\nAnswer: {doc['text']}\n\n")

In [15]:
context =""

for doc in response:
    doc_str = f"Section: {doc['section']}\nQuestion: {doc['question']}\nAnswer: {doc['text']}\n\n"
    context+=doc_str
    
context=context.strip()
print(context)




In [16]:
import os
from groq import Groq

client = Groq(
    api_key=os.getenv('GROQ_API_KEY')

)

In [17]:

response = client.chat.completions.create(
    model="llama3-8b-8192",
    messages=[{"role": "user", "content": "What's the formula for Energy?"}]
)
print(response.choices[0].message.content)

The formula for energy depends on the type of energy being measured. Here are some common formulas:

1. **Kinetic Energy**:

Kinetic energy (KE) is the energy of motion. It is defined as:

KE = 0.5 × m × v^2

Where:

* KE is the kinetic energy (in joules, J)
* m is the mass of the object (in kilograms, kg)
* v is the velocity of the object (in meters per second, m/s)

2. **Potential Energy**:

Potential energy (PE) is the energy of position or stored energy. It is defined as:

PE = m × g × h

Where:

* PE is the potential energy (in joules, J)
* m is the mass of the object (in kilograms, kg)
* g is the acceleration due to gravity (about 9.8 m/s^2 on Earth)
* h is the height of the object (in meters, m)

3. **Mechanical Energy**:

Mechanical energy (ME) is the sum of kinetic energy and potential energy. It is defined as:

ME = KE + PE

4. **Thermal Energy**:

Thermal energy (TE) is the energy of heat. It is defined as:

TE = mc × ΔT

Where:

* TE is the thermal energy (in joules, J)
* m

In [18]:
prompt = f"""
You're a course teaching assistant. Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database. 
Only use the facts from the CONTEXT. If the CONTEXT doesn't contan the answer, return "NONE"

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()

print(prompt)

You're a course teaching assistant. Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database. 
Only use the facts from the CONTEXT. If the CONTEXT doesn't contan the answer, return "NONE"

QUESTION: How do I join the course once it has already started?

CONTEXT:


In [19]:
response = client.chat.completions.create(
    model="llama3-8b-8192",
    messages=[{"role": "user", "content": prompt}]
)

answer = response.choices[0].message.content
print(answer)

Please provide the context so I can help you with the question. 

What is the context about the course? Please provide any relevant information about the course, such as its schedule, duration, or relevant documents.


In [20]:
context_template = """
Section: {section}
Question: {question}
Answer: {text}
""".strip()

prompt_template = """
You're a course teaching assistant.
Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database.
Don't use other information outside of the provided CONTEXT.  

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()

In [23]:
def build_context(documents):
    context_result = ""
    
    for doc in documents:
        doc_str = context_template.format(**doc)
        context_result += ("\n\n" + doc_str)
    
    return context_result.strip()


def build_prompt(user_question, documents):
    context = build_context(documents)
    prompt = prompt_template.format(
        user_question=user_question,
        context=context
    )
    return prompt

def ask_groq(prompt, model="llama3-8b-8192"):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    answer = response.choices[0].message.content
    return answer

def qa_bot(user_question):
    context_docs = retrieve_documents(user_question)
    prompt = build_prompt(user_question, context_docs)
    answer = ask_groq(prompt)
    return answer

In [24]:
qa_bot("I'm getting invalid reference format: repository name must be lowercase")

'According to our FAQ database, this error message usually occurs when the repository name in your reference is not in lowercase. To fix this, please ensure that the repository name in your reference is written in lowercase letters. For example, instead of " GitLab ", use " gitlab ".'

In [26]:
qa_bot("What is the meaning of life?")

'I\'m happy to help! However, I have to advise that the question "What is the meaning of life?" is not something that can be answered based on the provided CONTEXT, which does not seem to provide any specific information or relevance to the question.\n\nThe provided CONTEXT is empty, which means I don\'t have any relevant information to draw from when trying to answer this question. Therefore, I won\'t be able to provide a meaningful answer within the given parameters.\n\nIf you could provide more context or clarify the relevance of the question to the course or topic we are discussing, I\'ll be happy to help answer it to the best of my abilities!'