In [64]:
import json
import time

import requests
from minsearch import AppendableIndex
from openai import OpenAI

In [14]:
# Location of the raw JSON dump of course FAQ documents downloaded below.
DOCS_URL = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

In [18]:
# Download the FAQ dump and flatten it into a single list of documents, tagging
# every document with the name of the course it belongs to.
with requests.get(DOCS_URL, timeout=30) as docs_resp:
    # Fail loudly right here: if the download fails, `documents` would otherwise
    # never be defined and a later cell would die with a confusing NameError.
    docs_resp.raise_for_status()
    docs_raw = docs_resp.json()

documents = []
for course in docs_raw:
    course_name = course['course']
    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [170]:
class IndexMinsearch:
    """Builds and holds an ``AppendableIndex`` fitted on a list of documents.

    Attributes:
        documents: the documents the index was fitted on.
        text_fields: field names passed to the index as ``text_fields``.
        keyword_fields: field names passed to the index as ``keyword_fields``.
        index: the fitted ``AppendableIndex``.
    """

    def __init__(self, documents: list[dict], text_fields: list=None, keyword_fields: list=None):
        """Store the configuration and build the index immediately.

        Args:
            documents: documents to index.
            text_fields: names of the text fields; ``None`` means no text fields.
            keyword_fields: names of the keyword fields; ``None`` means none.
        """
        self.documents = documents
        # Normalise a missing field list to an empty list so callers can always
        # iterate or do membership tests (e.g. ``"course" in obj.keyword_fields``)
        # without having to special-case None.
        self.text_fields = text_fields if text_fields is not None else []
        self.keyword_fields = keyword_fields if keyword_fields is not None else []
        self.index = self.index_documents()

    def index_documents(self):
        """Create an ``AppendableIndex`` for the configured fields and fit it.

        Returns:
            The index object after ``fit`` has been called with ``self.documents``.
        """
        index = AppendableIndex(
            text_fields=self.text_fields,
            keyword_fields=self.keyword_fields,
        )
        index.fit(docs=self.documents)
        return index

class SearchMinsearch:
    """Thin query wrapper around an already fitted search index."""

    # A query must be strictly longer than this many characters to be searched.
    MIN_QUERY_LENGTH = 5

    def __init__(self, index):
        """Keep a reference to the index that ``search_query`` will query."""
        self.index = index

    def search_query(self, query: str="", results: int=5, filter_dict: dict=None):
        """Search the index and return the matching documents.

        Args:
            query: free-text query. Empty or too-short queries are not searched.
            results: maximum number of documents to return.
            filter_dict: keyword filters forwarded to the index; ``None`` means
                no filtering.

        Returns:
            Whatever ``index.search`` returns for a valid query, otherwise an
            empty list. Always returning something iterable lets callers loop
            over or ``extend`` with the result without special-casing.
        """
        if not query:
            return []
        if len(query) <= self.MIN_QUERY_LENGTH:
            print(f"Minimum number of characters need to be more than {self.MIN_QUERY_LENGTH}")
            return []
        return self.index.search(
            query=query,
            # Build a fresh dict per call instead of sharing a mutable default.
            filter_dict=filter_dict if filter_dict is not None else {},
            num_results=results,
            output_ids=True)

class OpenAIClient:
    """Minimal wrapper that sends one user prompt to the chat completions API."""

    def __init__(self):
        """Create the underlying ``OpenAI`` SDK client."""
        self.client = OpenAI()

    def llm(self, prompt: str=""):
        """Send ``prompt`` as a single user message and return the reply text.

        Returns:
            The ``content`` of the first choice's message.
        """
        user_message = {"role": "user", "content": prompt}
        request_kwargs = dict(
            model='gpt-4o-mini',
            messages=[user_message],
            temperature=0.1,
            seed=42,
        )
        completion = self.client.chat.completions.create(**request_kwargs)
        first_choice = completion.choices[0]
        return first_choice.message.content

# Traditional RAG

In [58]:
# Field names handed to the index as text fields and keyword fields respectively.
text_fields = ["text", "section", "question"]
keyword_fields = ["course"]

In [251]:
# Build the index once, then share it with the search wrapper; the LLM client
# is independent of both.
index_search = IndexMinsearch(documents=documents, text_fields=text_fields, keyword_fields=keyword_fields)
search = SearchMinsearch(index=index_search.index)
open_ai_client = OpenAIClient()

In [60]:
query = "When does the course start?"

# Only filter by course when the index was built with "course" as a keyword field.
if "course" in index_search.keyword_fields:
    filter_dict = {"course": "data-engineering-zoomcamp"}
else:
    filter_dict = {}

In [76]:
# Retrieve the documents for the sample query, restricted by the course filter.
results = search.search_query(query=query, filter_dict=filter_dict)

In [72]:
# Prompt for the single-shot RAG flow. It is filled in with str.format using two
# placeholders: `question` (the user query) and `context` (the retrieved FAQ text).
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

In [78]:
def build_prompt(query: str, search_results: list[dict]):
    """Render ``prompt_template`` for ``query`` using the retrieved documents.

    Args:
        query: the user's question.
        search_results: documents with ``section``, ``question`` and ``text``
            keys. ``None`` or an empty list yields an empty context instead of
            raising, so a skipped search does not crash the pipeline.

    Returns:
        The formatted prompt with surrounding whitespace stripped.
    """
    # One "section/question/answer" paragraph per document, joined in one pass.
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in (search_results or [])
    )
    return prompt_template.format(question=query, context=context).strip()

In [79]:
def rag(query: str, filter_dict: dict):
    """Answer ``query`` with one retrieval step followed by one LLM call.

    Args:
        query: the user's question.
        filter_dict: keyword filters forwarded to the search.

    Returns:
        The LLM's answer text.
    """
    hits = search.search_query(query=query, filter_dict=filter_dict)
    return open_ai_client.llm(prompt=build_prompt(query, hits))

In [127]:
# Ask something the FAQ does not cover to see how the plain RAG flow responds.
rag(query="What is WASD in llm rag", filter_dict=filter_dict)

'I\'m sorry, but there is no information provided in the CONTEXT regarding "WASD" in "llm rag." Please provide more details or check the FAQ database for relevant information.'

# Agentic RAG

In [136]:
# Prompt template for the agentic loop. It is rendered with str.format using the
# placeholders question, context, iteration_count, max_iterations,
# search_queries and session_history. The literal JSON braces in the output
# examples are doubled ({{ }}) so that format() leaves them in place.
qa_prompt = """You are an Agentic FAQ Assistant. Your job is to answer user questions using:
1. Retrieved FAQ context from MinSearch (authoritative)
2. If no relevant FAQ context exists, use your own general reasoning and knowledge,
   especially when the iteration limit is reached.

You operate with short iterative steps. You remember:
- previous user queries
- your own previous final answers
- previous retrieval/search attempts
- iteration count for the current query

#############################
## CORE BEHAVIOUR RULES
#############################

1. Always prefer FAQ knowledge over your own knowledge.
   If FAQ context is relevant, treat it as the correct source of truth.

2. If FAQ context is empty, low-confidence, or irrelevant:
   - If current iteration number {iteration_count} < {max_iterations} iterations:
       perform a MinSearch lookup with topic expansion.
   - If current iteration number {iteration_count} == {max_iterations} iterations:
       DO NOT search again; answer using your own internal knowledge.

3. Never hallucinate FAQ entries or contradict retrieved FAQ context.

4. Keep answers concise unless the user requests detail.

5. Maintain consistency with wording used in the FAQ database.

6. Do NOT reveal internal chain-of-thought, retrieval decisions,
   topic expansions, or reasoning. Only reveal the final answer.

7. If a similar question exists in session_history:
   Reuse the prior answer unless new FAQ context changes the meaning.

#############################
## TOPIC EXPANSION FOR MINSEARCH
#############################

Before performing a search action:
- Expand the user query with 2–4 additional terms:
  • synonyms
  • domain-specific variations
  • common FAQ phrasing
- Only include terms logically related to the user’s words.
- Do NOT invent new entities, products, or facts.

Format the MinSearch query as:
"<original user query> | <topic1> | <topic2> | <topic3>"

This expansion is mandatory for all search actions.

#############################
## DECISION PROCESS
#############################

For each user turn, you will be provided:
- question
- session_history
- iteration_count
- search_queries
- context

Follow these rules:

1. If FAQ context is relevant → answer with source="faq".
2. If FAQ context is empty, low-confidence, or irrelevant:
   - If current iteration number {iteration_count} < {max_iterations} iterations:
       perform a MinSearch lookup with topic expansion.
   - If current iteration number {iteration_count} == {max_iterations} iterations:
       DO NOT search again; answer using your own internal knowledge.

#############################
## OUTPUT RULES
#############################

Respond strictly in the JSON formats below.

### When more retrieval is needed:
{{
  "action": "SEARCH",
  "reasoning": "<add your reasoning here>",
  "search_query": "original_query | topic1 | topic2 | topic3"
}}

### When a final answer is available:
{{
  "action": "ANSWER",
  "source": "faq",
  "answer": "<your answer>"
}}

### If no context is found after max iterations, use your own knowledge to answer the question
{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}


<QUESTION>
{question}
</QUESTION>

<SEARCH_QUERIES>
{search_queries}
</SEARCH_QUERIES>

<CONTEXT> 
{context}
</CONTEXT>

<PREVIOUS_ACTIONS>
{session_history}
</PREVIOUS_ACTIONS>


Never output anything else.
Do not include explanations, chain-of-thought, or debugging notes.

""".strip()

In [110]:
# NOTE: this cell repeats the SearchMinsearch definition from earlier in the
# notebook; re-running it rebinds the class name to this version.
class SearchMinsearch:
    """Thin query wrapper around an already fitted search index."""

    # A query must be strictly longer than this many characters to be searched.
    MIN_QUERY_LENGTH = 5

    def __init__(self, index):
        """Keep a reference to the index that ``search_query`` will query."""
        self.index = index

    def search_query(self, query: str="", results: int=5, filter_dict: dict=None):
        """Search the index and return the matching documents.

        Args:
            query: free-text query. Empty or too-short queries are not searched.
            results: maximum number of documents to return.
            filter_dict: keyword filters forwarded to the index; ``None`` means
                no filtering.

        Returns:
            Whatever ``index.search`` returns for a valid query, otherwise an
            empty list, so callers can always iterate over the result.
        """
        if not query:
            return []
        if len(query) <= self.MIN_QUERY_LENGTH:
            print(f"Minimum number of characters need to be more than {self.MIN_QUERY_LENGTH}")
            return []
        return self.index.search(
            query=query,
            # Build a fresh dict per call instead of sharing a mutable default.
            filter_dict=filter_dict if filter_dict is not None else {},
            num_results=results,
            output_ids=True)

# NOTE: this cell repeats the OpenAIClient definition from earlier in the notebook.
class OpenAIClient:
    """Wrapper around the OpenAI SDK for single-turn chat completions."""

    def __init__(self):
        """Instantiate the SDK client."""
        self.client = OpenAI()

    def llm(self, prompt: str=""):
        """Return the model's reply text for a prompt sent as one user message."""
        chat_api = self.client.chat.completions
        messages = [{"role": "user", "content": prompt}]
        reply = chat_api.create(model='gpt-4o-mini', messages=messages, temperature=0.1, seed=42)
        return reply.choices[0].message.content

In [190]:
# Fresh state for stepping through one agent iteration by hand.
query = "when does the engineering zoomcamp start?"
context = "EMPTY"
search_queries, session_history = [], []

In [191]:
# Render the agent prompt for the very first iteration (nothing searched yet).
prompt_fields = {
    "question": query,
    "context": context,
    "iteration_count": 0,
    "max_iterations": 3,
    "search_queries": search_queries,
    "session_history": session_history,
}
prompt = qa_prompt.format(**prompt_fields).strip()

In [192]:
# Inspect the fully rendered first-iteration prompt.
print(prompt)

You are an Agentic FAQ Assistant. Your job is to answer user questions using:
1. Retrieved FAQ context from MinSearch (authoritative)
2. If no relevant FAQ context exists, use your own general reasoning and knowledge,
   especially when the iteration limit is reached.

You operate with short iterative steps. You remember:
- previous user queries
- your own previous final answers
- previous retrieval/search attempts
- iteration count for the current query

#############################
## CORE BEHAVIOUR RULES
#############################

1. Always prefer FAQ knowledge over your own knowledge.
   If FAQ context is relevant, treat it as the correct source of truth.

2. If FAQ context is empty, low-confidence, or irrelevant:
   - If current iteration number 0 < 3 iterations:
       perform a MinSearch lookup with topic expansion.
   - If current iteration number 0 == 3 iterations:
       DO NOT search again; answer using your own internal knowledge.

3. Never hallucinate FAQ entries or c

In [193]:
# Ask the model for its first action using the rendered prompt.
result = open_ai_client.llm(prompt=prompt)

In [194]:
# Parse the model's JSON reply once, then run one search per "|"-separated topic.
search_results = []
parsed_result = json.loads(result)
if parsed_result['action'] == 'SEARCH':
    for topic in parsed_result['search_query'].split("|"):
        topic = topic.strip()
        if not topic:
            # Skip empty segments such as the middle of "a | | b".
            continue
        # A too-short topic may yield no result list; treat that as "no hits"
        # instead of letting extend() fail.
        sr = search.search_query(query=topic, filter_dict=filter_dict) or []
        search_queries.append(topic)
        search_results.extend(sr)

In [195]:
def dedup(sequence: list[dict]) -> list[dict]:
    """Drop repeated documents, keeping the first occurrence of each ``_id``.

    Args:
        sequence: documents that each carry an ``_id`` key.

    Returns:
        A new list with one document per ``_id``, in first-seen order.
    """
    first_by_id = {}
    for item in sequence:
        # setdefault only stores the item the first time an id is seen.
        first_by_id.setdefault(item['_id'], item)
    return list(first_by_id.values())

def build_context(search_results: list[dict]):
    """Format documents as the text block used for the prompt context.

    Args:
        search_results: documents with ``section``, ``question`` and ``text`` keys.

    Returns:
        One "section/question/answer" paragraph per document, each followed by
        a blank line; an empty string when there are no documents.
    """
    return "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

In [196]:
# Fold the first round of search hits into the state used for the next prompt.
search_results = dedup(search_results)
context = build_context(search_results=search_results)
iteration_count, max_iterations = 1, 3
session_history.append(json.loads(result))

In [197]:
# Re-render the prompt, this time with the retrieved context and the history
# serialised one JSON object per line.
history_lines = [json.dumps(a) for a in session_history]
prompt = qa_prompt.format(
    question=query,
    context=context,
    iteration_count=iteration_count,
    max_iterations=max_iterations,
    search_queries="\n".join(search_queries),
    session_history="\n".join(history_lines),
).strip()
    

In [198]:
# Second model call, using the prompt that now carries context and history.
result = open_ai_client.llm(prompt=prompt)

In [200]:
import time

In [224]:
def run_agentic_rag(query: str, max_iterations: int = 3):
    """Run the search/answer agent loop for ``query`` and return its last action.

    Each turn renders ``qa_prompt`` with the current state, asks the LLM for a
    JSON action, and either stops (any action other than ``SEARCH``) or runs the
    requested searches and feeds the results back in as context.

    Args:
        query: the user's question.
        max_iterations: maximum number of search rounds. It is also the value
            shown to the model in the prompt, so the loop bound and the prompt
            can no longer disagree.

    Returns:
        The parsed JSON dict of the model's final reply. If the model still asks
        for a search after the budget is used up, that ``SEARCH`` action is
        returned as-is.

    Raises:
        json.JSONDecodeError: if the model's reply is not valid JSON.
    """
    start_time = time.perf_counter()
    context = "EMPTY"
    search_queries = []
    session_history = []
    iteration_count = 0

    while True:
        prompt = qa_prompt.format(
            question=query,
            context=context,
            iteration_count=iteration_count,
            max_iterations=max_iterations,
            search_queries="\n".join(search_queries),
            session_history="\n".join(json.dumps(a) for a in session_history),
        ).strip()
        # Parse the reply exactly once and reuse the dict everywhere below.
        action = json.loads(open_ai_client.llm(prompt=prompt))
        session_history.append(action)

        # Stop on a final answer, or when the search budget is exhausted; in the
        # latter case running another search would be wasted work because its
        # results would never be shown to the model.
        if action.get('action') != 'SEARCH' or iteration_count >= max_iterations:
            break

        search_results = []
        for topic in action['search_query'].split("|"):
            topic = topic.strip()
            if not topic:
                continue
            search_queries.append(topic)
            # Treat "no result list" (e.g. topic too short) as zero hits.
            search_results.extend(
                search.search_query(query=topic, filter_dict=filter_dict) or []
            )
        context = build_context(search_results=dedup(search_results))
        # Deduplicate while keeping first-seen order so the prompt is stable
        # from run to run (a set would scramble the order).
        search_queries = list(dict.fromkeys(search_queries))
        iteration_count += 1

    elapsed_time = time.perf_counter() - start_time
    print(f"Time taken {elapsed_time:.4f} seconds")
    print(f"Iteration number {iteration_count}")
    return action
    


In [232]:
# Try the agentic loop end to end on a sample question.
run_agentic_rag("How long will it take to finish data engineering zoomcamp?")

Time taken 5.2198 seconds
Iteration number 2


{'action': 'ANSWER',
 'answer': "The Data Engineering Zoomcamp generally runs from January to April each year, but you can complete it at your own pace if you're not pursuing certification.",
 'source': 'OWN_KNOWLEDGE'}

# Function calling

In [246]:
def min_search(query: str, search, filter_dict):
    """Tool entry point: run ``query`` through ``search`` with ``filter_dict``.

    Args:
        query: the search text.
        search: object exposing ``search_query(query=..., filter_dict=...)``.
        filter_dict: keyword filters forwarded unchanged.

    Returns:
        Whatever ``search.search_query`` returns.
    """
    return search.search_query(query=query, filter_dict=filter_dict)
        

In [298]:
# Tool schema advertised to the model for `min_search`. Only `query` is supplied
# by the model; `search` and `filter_dict` are injected locally by `do_call`, so
# they must not be listed as required (they are not declared properties and
# `additionalProperties` is False).
search_tool = {
    "type": "function",
    "name": "min_search",
    "description": "Search the FAQ database",
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search query text to look up in the course FAQ."
            }
        },
        "required": ["query"],
        "additionalProperties": False
    }
}

In [299]:
def do_call(tool_call_response):
    """Execute a tool call requested by the model and package its result.

    Args:
        tool_call_response: object with ``name``, ``arguments`` (a JSON string)
            and ``call_id`` attributes.

    Returns:
        A ``function_call_output`` dict whose ``output`` is the JSON-encoded
        return value of the tool, or a JSON error object when the requested
        name is not a callable defined in this module.
    """
    function_name = tool_call_response.name
    arguments = json.loads(tool_call_response.arguments)
    if function_name == "min_search":
        # Local objects the model cannot supply; inject them here.
        arguments["search"] = search
        arguments["filter_dict"] = filter_dict

    # NOTE(review): the function name comes from the model, and looking it up in
    # globals() lets it reach any module-level callable. Consider replacing this
    # with an explicit allow-list of tool functions.
    f = globals().get(function_name)
    if callable(f):
        result = f(**arguments)
    else:
        # Report the problem back to the model instead of crashing the chat
        # loop with a KeyError.
        result = {"error": f"Unknown tool: {function_name}"}

    return {
        "type": "function_call_output",
        "call_id": tool_call_response.call_id,
        "output": json.dumps(result, indent=2)
    }

In [253]:
# The Responses API calls in this section use a bare `client`, which is never
# defined anywhere in the notebook; reuse the SDK client that the
# OpenAIClient wrapper already created.
client = open_ai_client.client

question = "How do I do well in module 1?"

developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.
""".strip()

tools = [search_tool]

chat_messages = [
    {"role": "developer", "content": developer_prompt},
    {"role": "user", "content": question}
]

response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)
response.output

[ResponseFunctionToolCall(arguments='{"query":"How to do well in module 1"}', call_id='call_cSdyyO2NnB4OA7xdQ7WFv82R', name='min_search', type='function_call', id='fc_093bfc4ea782c01f0069268fd3ec788198840fb7275bd26fd4', status='completed')]

In [262]:
# Execute every tool call the model asked for and show the raw tool outputs.
for tool_call in response.output:
    print(do_call(tool_call))

{'type': 'function_call_output', 'call_id': 'call_cSdyyO2NnB4OA7xdQ7WFv82R', 'output': '[\n  {\n    "text": "Even after installing pyspark correctly on linux machine (VM ) as per course instructions, faced a module not found error in jupyter notebook .\\nThe solution which worked for me(use following in jupyter notebook) :\\n!pip install findspark\\nimport findspark\\nfindspark.init()\\nThereafter , import pyspark and create spark contex<<t as usual\\nNone of the solutions above worked for me till I ran !pip3 install pyspark instead !pip install pyspark.\\nFilter based on conditions based on multiple columns\\nfrom pyspark.sql.functions import col\\nnew_final.filter((new_final.a_zone==\\"Murray Hill\\") & (new_final.b_zone==\\"Midwood\\")).show()\\nKrishna Anand",\n    "section": "Module 5: pyspark",\n    "question": "Module Not Found Error in Jupyter Notebook .",\n    "course": "data-engineering-zoomcamp",\n    "_id": 322\n  },\n  {\n    "text": "Error raised during the jupyter notebo

In [271]:
question = "How do I do well in module 1?"

# Same assistant prompt as before, plus an instruction to fan the question out
# into several FAQ queries.
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

If you look up something in FAQ, convert the student question into multiple queries.
""".strip()

tools = [search_tool]
chat_messages = [
    dict(role="developer", content=developer_prompt),
    dict(role="user", content=question),
]

response = client.responses.create(model='gpt-4o-mini', input=chat_messages, tools=tools)
calls = response.output

In [272]:
# Record each tool call followed by its output so the model sees both next turn.
for call in calls:
    result = do_call(call)
    chat_messages += [call, result]

In [273]:
# Second round trip: the conversation now includes the tool outputs.
response = client.responses.create(model='gpt-4o-mini', input=chat_messages, tools=tools)
response.output

[ResponseOutputMessage(id='msg_0ee23314897fc0030069278e2076ac8199980f831e0a6fee07', content=[ResponseOutputText(annotations=[], text='To excel in Module 1 of your course, here\'s a summary of key tips and any relevant issues others have encountered:\n\n1. **Understand the Prerequisites**: Ensure you have the necessary tools installed, such as Docker and PostgreSQL. If you encounter problems with modules (e.g., `psycopg2`), you can resolve them by running the following commands:\n   - Install with pip: \n     ```bash\n     pip install psycopg2-binary\n     ```\n   - If issues persist, update:\n     ```bash\n     pip install psycopg2-binary --upgrade\n     ```\n\n2. **Set Up Your Environment**:\n   - Verify that your Python environment is configured properly. If you\'re getting "ModuleNotFoundError," it may indicate that libraries are not installed correctly or paths are not set.\n   - Ensure that PostgreSQL is installed if you encounter `pg_config not found`.\n\n3. **Utilize Resources**

In [281]:
# Walk the model output: keep everything in the history, print text messages,
# and run any further tool calls.
for entry in response.output:
    chat_messages.append(entry)
    entry_type = entry.type
    print(entry_type)

    if entry_type == 'message':
        print(entry.content[0].text)
    elif entry_type == 'function_call':
        result = do_call(entry)
        chat_messages.append(result)

message
To excel in Module 1 of your course, here's a summary of key tips and any relevant issues others have encountered:

1. **Understand the Prerequisites**: Ensure you have the necessary tools installed, such as Docker and PostgreSQL. If you encounter problems with modules (e.g., `psycopg2`), you can resolve them by running the following commands:
   - Install with pip: 
     ```bash
     pip install psycopg2-binary
     ```
   - If issues persist, update:
     ```bash
     pip install psycopg2-binary --upgrade
     ```

2. **Set Up Your Environment**:
   - Verify that your Python environment is configured properly. If you're getting "ModuleNotFoundError," it may indicate that libraries are not installed correctly or paths are not set.
   - Ensure that PostgreSQL is installed if you encounter `pg_config not found`.

3. **Utilize Resources**: Follow along with Docker setups and other instructions provided in the course materials. If you face errors, they may often be resolved by con

In [310]:
def add_entry(question, answer):
    """Append a question/answer pair to the live FAQ index.

    The new document is stored under the "user added" section of the
    "data-engineering-zoomcamp" course.

    Args:
        question: the question text.
        answer: the answer text, stored in the document's ``text`` field.
    """
    index_search.index.append(dict(
        question=question,
        text=answer,
        section="user added",
        course="data-engineering-zoomcamp",
    ))

In [311]:
# Tool schema advertised to the model for `add_entry`: two required string
# arguments, `question` and `answer`, and nothing else.
add_entry_description = dict(
    type="function",
    name="add_entry",
    description="Add an entry to the FAQ database",
    parameters=dict(
        type="object",
        properties=dict(
            question=dict(
                type="string",
                description="The question to be added to the FAQ database",
            ),
            answer=dict(
                type="string",
                description="The answer to the question",
            ),
        ),
        required=["question", "answer"],
        additionalProperties=False,
    ),
)

In [312]:
# Expose both the search tool and the add-entry tool to the model.
tools = [search_tool, add_entry_description]

In [313]:
# System-level instructions for the interactive chat; the conversation starts
# with only this developer message.
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.
When using FAQ, perform deep topic exploration: make one request to FAQ,
and then based on the results, make more requests, which means if you look up 
something in FAQ, convert the student question into multiple queries.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

chat_messages = [dict(role="developer", content=developer_prompt)]

In [307]:
# Interactive chat: read a question, then keep calling the model (running any
# tool calls it requests) until it produces at least one text message.
# Typing "stop" ends the session.
while True:
    question = input("What do you wanna know")
    if question == "stop":
        break

    chat_messages.append({"role": "user", "content": question})

    has_messages = False
    while not has_messages:
        response = client.responses.create(model='gpt-4o-mini', input=chat_messages, tools=tools)
        for entry in response.output:
            chat_messages.append(entry)

            if entry.type == 'message':
                print(entry.content[0].text)
                print()
                has_messages = True
            elif entry.type == 'function_call':
                result = do_call(entry)
                chat_messages.append(result)

What do you wanna know How to do well in module 1


To excel in Module 1 of the course, here are some essential tips:

1. **Familiarize Yourself with Docker and Terraform**: Understanding the foundations of Docker and Terraform is key. Make sure to explore their official documentation and tutorials.

2. **Install Required Libraries**: Ensure that any necessary libraries are correctly installed, especially `psycopg2` if you're working with PostgreSQL. For example, you can use:
   ```
   pip install psycopg2-binary
   ```

3. **Practice Hands-On**: Implement the concepts as you learn them. Spin up Docker containers and use Terraform to manage any configurations or deployments.

4. **Check Your Code**:
   - If you encounter errors (like "ModuleNotFoundError"), double-check your installations.
   - Make sure to run your code in the correct environment (e.g., Jupyter notebook).

5. **Engage with Peers**: If you have questions, discuss them within your course community or with classmates. Collaboration often leads to better understanding.

6.

What do you wanna know Lets add this to faq database


The information has been successfully added to the FAQ database. 

If you have any more questions or need further assistance, feel free to ask! Would you like to explore another topic?



What do you wanna know stop


In [308]:
# Show the last document in the index's `docs` list.
index_search.index.docs[-1]

{'question': 'How to do well in module 1?',
 'text': '1. **Familiarize Yourself with Docker and Terraform**: Understanding the foundations of Docker and Terraform is key. Explore their official documentation and tutorials.\n\n2. **Install Required Libraries**: Ensure that any necessary libraries, like `psycopg2` for PostgreSQL, are correctly installed. Use:\n   ```\n   pip install psycopg2-binary\n   ```\n\n3. **Practice Hands-On**: Implement concepts as you learn. Spin up Docker containers and use Terraform for configurations.\n\n4. **Check Your Code**:\n   - If you encounter errors (like "ModuleNotFoundError"), double-check installations.\n   - Ensure you\'re running code in the correct environment (e.g., Jupyter notebook).\n\n5. **Engage with Peers**: Discuss questions within the course community or with classmates for better understanding.\n\n6. **Utilize Office Hours**: If available, attend office hours to clarify complex topics.\n\n7. **Review Past Lectures and Assignments**: Rev