In [16]:
import minsearch

In [17]:
import json

In [18]:
with open('documents.json', 'rt') as file_input:
    docs_raw = json.load(file_input)

In [19]:
docs_raw[0]

documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [20]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [6]:
# define index

index = minsearch.Index(
    text_fields=["question", "text", "section"], # for answer search
    keyword_fields=["course"] # for filtering
)

In [7]:
# fit or load documents into index

index.fit(documents)

<minsearch.Index at 0x7524ee10f460>

In [21]:
import openai

from openai import OpenAI

client = OpenAI(
  organization='org-w4jeVzjvrfGGXxvKo2qmUg7Y',
  project='proj_DgpkFSatCGFL72AURTTlS3A8',
)



In [22]:
def search(query):
    boost = {"question": 3.0, "section": 0.5} # to provide search priority weights for text fields
    results = index.search(
        query=q,
        filter_dict={"course": "data-engineering-zoomcamp"},
        boost_dict=boost,
        num_results=3
    )
    return results

In [23]:
def build_prompt(query, search_results):
    prompt_template = """
    you are a course teaching assistant. Answer the QUESTION based on the CONTEXT from FAQ database.
    Use only the factss from the context when answeing the QUESTIONS.
    If the context does not contain the answer, output NONE.
    Question: {question}
    
    CONTEXT: {context}
    """.strip()
    
    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion:{doc['question']}\nanswer: {doc['text']}\n\n"
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [24]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content
    

In [25]:
from elasticsearch import Elasticsearch

In [26]:
es_client = Elasticsearch("http://localhost:9200")

In [12]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [27]:
from tqdm.auto import tqdm

In [28]:
for doc in documents:
    es_client.index(index=index_name, document=doc)

In [29]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name, body=search_query)

    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs 




In [30]:
query = "will i get completion certificate"

In [31]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [32]:
rag(query)

"No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running."

In [33]:
query = "How do I execute a command in a running docker container?"

In [34]:
def test_elastic_search(query):
    search_query = {
        "size": 3,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name, body=search_query)

    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs 

results = test_elastic_search(query)

In [35]:
def test_build_prompt(query, search_results):
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: {question}
    
    CONTEXT:
    {context}
    """.strip()
    
    context = ""
    
    for doc in search_results:
        context = context + f"Q: {doc['question']}\nA: {doc['text']}\n\n"
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [38]:
prompt = test_build_prompt(query, results)
len(prompt)

1486

In [5]:
# %pip install --upgrade tiktoken
# %pip install --upgrade openai

In [39]:
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-4o")

In [43]:
len(encoding.encode(prompt))

328