In [51]:
import requests 

# Location of the FAQ dump. The loop below reads it as a list of courses,
# each with a 'course' name and a list of 'documents'.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into a single list, tagging every document
# with the name of the course it came from.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [52]:
from minsearch import AppendableIndex

# Index over the FAQ documents: question/text/section are registered as text
# fields, and 'course' as a keyword field (used by search() via filter_dict).
index = AppendableIndex(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

# Fit the index on the flattened document list built in the previous cell.
index.fit(documents)

<minsearch.append.AppendableIndex at 0x777bff2fa020>

In [53]:
def search(query):
    """Query the FAQ index for `query`, limited to the data-engineering-zoomcamp course.

    The index is called with boost weights of 3.0 for `question` and 0.5 for
    `section`, asks for 5 results, and requests document ids in the output.
    """
    course_filter = {'course': 'data-engineering-zoomcamp'}
    field_boosts = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict=course_filter,
        boost_dict=field_boosts,
        num_results=5,
        output_ids=True,
    )

In [54]:
# Prompt for the plain RAG flow. `{question}` and `{context}` are filled in
# with str.format; the trailing .strip() removes the newlines that the
# triple-quoted literal adds at both ends.
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(query, search_results):
    """Fill `prompt_template` with the question and the retrieved FAQ entries.

    Each document in `search_results` contributes a section/question/answer
    block followed by a blank line; the concatenated blocks go into the
    template's `{context}` placeholder and `query` into `{question}`.
    The formatted prompt is returned with surrounding whitespace stripped.
    """
    entries = [
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    ]
    filled = prompt_template.format(question=query, context="".join(entries))
    return filled.strip()

In [55]:
import os
from dotenv import load_dotenv

# Load .env
load_dotenv()
# Read the OpenAI API key from the environment (os.getenv gives None if it is unset).
api_key = os.getenv("OPENAI_API_KEY")

In [56]:
from openai import OpenAI
client = OpenAI(api_key=api_key)

def llm(prompt):
    """Send `prompt` as a single user message to gpt-4o-mini and return the reply text."""
    conversation = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def rag(query):
    """Answer `query` in one retrieve-then-generate pass.

    Searches the FAQ, builds the prompt from the hits, and returns the
    model's reply.
    """
    hits = search(query)
    return llm(build_prompt(query, hits))

In [57]:
query = 'how do i join the course?'
rag(query)

"To join the course, you need to register before it starts using the provided link. The course begins on January 15th, 2024, at 17:00. Additionally, you can join the course's Telegram channel for announcements and register in DataTalks.Club's Slack to access the relevant channels. Even if you miss the registration deadline, you can still submit homework and participate, but be mindful of the final project deadlines."

In [77]:
# Prompt for the first agentic variant: the model replies with a JSON object
# whose "action" is either SEARCH or ANSWER.
# `{question}` and `{context}` are str.format placeholders; the doubled braces
# `{{` / `}}` are escapes that come out as literal `{` / `}` in the JSON examples.
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
{question}
</QUESTION>

<CONTEXT> 
{context}
</CONTEXT>

IF you have an acceptable response, proceed to answer, or If CONTEXT is EMPTY and you dont have an accurate answer, you SHOULD use our FAQ database.
In this case, use the following output template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>"
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}
""".strip()

In [59]:
question = "how do I run docker on gentoo?"
context = "EMPTY"

prompt = prompt_template.format(question=question, context=context)
print(prompt)

answer = llm(prompt)
print(answer)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
how do I run docker on gentoo?
</QUESTION>

<CONTEXT> 
EMPTY
</CONTEXT>

If CONTEXT is EMPTY, you can use our FAQ database.
In this case, use the following output template:

{
"action": "SEARCH",
"reasoning": "<add your reasoning here>"
}

If you can answer the QUESTION using CONTEXT, use this template:

{
"action": "ANSWER",
"answer": "<your answer>",
"source": "CONTEXT"
}

If the context doesn't contain the answer, use your own knowledge to answer the question

{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}
{
"action": "ANSWER",
"answer": "To run Docker on Gentoo, you need to follow these steps: 1. First, ensure that your system is up to date. You can do this by running `emerge --sync` and then `emerge -uDNav world`. 2. Next, install Docker using the

In [60]:
question = "how do I join the course?"
context = "EMPTY"

prompt = prompt_template.format(question=question, context=context)
answer = llm(prompt)
print(answer)

{
"action": "ANSWER",
"answer": "To join the course, you typically need to visit the course website or platform where it is offered. Look for a 'Sign Up' or 'Enroll' button, and follow the instructions to create an account or enroll directly. You may also need to provide some personal information and agree to any terms or fees associated with the course.",
"source": "OWN_KNOWLEDGE"
}


In [61]:
def build_context(search_results):
    """Format FAQ search results as the text for a prompt's CONTEXT section.

    Every document becomes a section/question/answer entry followed by a
    blank line; the concatenation is returned with leading and trailing
    whitespace stripped. An empty `search_results` gives an empty string.
    """
    entries = [
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    ]
    return "".join(entries).strip()

In [62]:
search_results = search(question)
context = build_context(search_results)
prompt = prompt_template.format(question=question, context=context)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
how do I join the course?
</QUESTION>

<CONTEXT> 
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - When will the course start?
answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1
Subscribe to course public Google Calendar (it works from Desktop only).
Register before the course start

In [63]:
answer = llm(prompt)
print(answer)

{
"action": "ANSWER",
"answer": "To join the course, you need to register before the course starts using the provided registration link. The course begins on January 15th, 2024, at 17:00, so make sure to complete your registration before that date. Additionally, it is beneficial to join the course Telegram channel for announcements and subscribe to the course public Google Calendar for updates.",
"source": "CONTEXT"
}


In [64]:
import json

In [70]:
def agentic_rag_v1(question):
    """Answer `question`, searching the FAQ only if the model asks for it.

    The model is first prompted with the literal context "EMPTY". If its JSON
    reply has action "SEARCH", the FAQ is searched with the original question
    and the model is prompted once more with the retrieved context. Every
    parsed reply is printed; the last one is returned.
    """
    def ask(context):
        # Fill the template, call the model and parse its JSON reply.
        prompt = prompt_template.format(question=question, context=context)
        reply = json.loads(llm(prompt))
        print(reply)
        return reply

    first_reply = ask("EMPTY")
    if first_reply['action'] != 'SEARCH':
        return first_reply

    print('need to perform search...')
    retrieved = search(question)
    return ask(build_context(retrieved))

In [74]:
agentic_rag_v1('how do I join the course?')

{'action': 'SEARCH', 'reasoning': 'I do not have specific information about how to join the course in the context provided, so I will refer to the FAQ database to find the answer.'}
need to perform search...
{'action': 'ANSWER', 'answer': "To join the course, you need to register before the course starts using the provided link. Even if you register after the start date, you can still participate by submitting homeworks. Make sure to also join the course's Telegram channel and sign up for the Slack channel on DataTalks.Club for announcements and discussions.", 'source': 'CONTEXT'}


{'action': 'ANSWER',
 'answer': "To join the course, you need to register before the course starts using the provided link. Even if you register after the start date, you can still participate by submitting homeworks. Make sure to also join the course's Telegram channel and sign up for the Slack channel on DataTalks.Club for announcements and discussions.",
 'source': 'CONTEXT'}

In [78]:
agentic_rag_v1('how patch KDE under FreeBSD?')

{'action': 'ANSWER', 'answer': "To patch KDE under FreeBSD, you typically need to download the source code of the version you want to patch. After applying your patch file to the relevant source files, you can use the FreeBSD ports system or the 'make' command to build the software. Here are the steps involved:\n\n1. **Obtain the source code**: You can either use the ports collection (`/usr/ports/x11/kde5`) or download the KDE source tarball from the official KDE website.\n\n2. **Apply the patch**: Navigate to the directory where the source code resides and apply your patch file using the `patch` command. For example:\n   ```\n   patch < /path/to/your/patchfile.patch\n   ```\n\n3. **Compile**: After the patch has been applied, build the package by running:\n   ```\n   make install clean\n   ```\n\n4. **Install**: This will compile the code and install the patched version of KDE.\n\n5. **Test**: Finally, make sure to test the newly compiled KDE to ensure that everything works as expecte

{'action': 'ANSWER',
 'answer': "To patch KDE under FreeBSD, you typically need to download the source code of the version you want to patch. After applying your patch file to the relevant source files, you can use the FreeBSD ports system or the 'make' command to build the software. Here are the steps involved:\n\n1. **Obtain the source code**: You can either use the ports collection (`/usr/ports/x11/kde5`) or download the KDE source tarball from the official KDE website.\n\n2. **Apply the patch**: Navigate to the directory where the source code resides and apply your patch file using the `patch` command. For example:\n   ```\n   patch < /path/to/your/patchfile.patch\n   ```\n\n3. **Compile**: After the patch has been applied, build the package by running:\n   ```\n   make install clean\n   ```\n\n4. **Install**: This will compile the code and install the patched version of KDE.\n\n5. **Test**: Finally, make sure to test the newly compiled KDE to ensure that everything works as expect

In [79]:
# Prompt for the iterative agent. str.format placeholders:
#   {question}, {context}, {search_queries}, {previous_actions},
#   {max_iterations}, {iteration_number}
# The doubled braces `{{` / `}}` are escapes that come out as literal braces in
# the JSON output templates. A SEARCH reply carries a "keywords" list.
prompt_template = """
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than {max_iterations} iterations for a given student question.
The current iteration number: {iteration_number}. If we exceed the allowed number 
of iterations, give the best possible answer with the provided information.

Output templates:

If you want to perform search, use this template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>",
"keywords": ["search query 1", "search query 2", ...]
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER_CONTEXT",
"answer": "<your answer>",
"source": "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action": "ANSWER",
"answer": "<your answer>",
"source": "OWN_KNOWLEDGE"
}}

<QUESTION>
{question}
</QUESTION>

<SEARCH_QUERIES>
{search_queries}
</SEARCH_QUERIES>

<CONTEXT> 
{context}
</CONTEXT>

<PREVIOUS_ACTIONS>
{previous_actions}
</PREVIOUS_ACTIONS>
""".strip()

In [80]:
question = "how do I join the course?"

search_queries = []
search_results = []
previous_actions = []
context = build_context(search_results)

prompt = prompt_template.format(
    question=question,
    context=context,
    search_queries="\n".join(search_queries),
    previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
    max_iterations=3,
    iteration_number=1
)
print(prompt)

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current iteration number

In [81]:
answer_json = llm(prompt)
answer = json.loads(answer_json)
print(json.dumps(answer, indent=2))

{
  "action": "SEARCH",
  "reasoning": "To provide accurate information on how to join the course, I will search for any specific instructions, requirements, or links related to enrollment in the FAQ database.",
  "keywords": [
    "how to join the course",
    "course enrollment",
    "registration process"
  ]
}


In [82]:
previous_actions.append(answer)

In [83]:
keywords = answer['keywords']
search_queries.extend(keywords)

In [84]:
for k in keywords:
    res = search(k)
    search_results.extend(res)

In [85]:
search_results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  '_id': 2},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'dat

In [86]:
def dedup(seq):
    """Return the elements of `seq` without repeated `_id` values.

    The first element seen for each `_id` is kept, and the elements stay in
    their original order.
    """
    first_by_id = {}
    for item in seq:
        # setdefault stores the item only if this _id has not been seen yet;
        # dict insertion order keeps the first-seen ordering.
        first_by_id.setdefault(item['_id'], item)
    return list(first_by_id.values())

search_results = dedup(search_results)

In [87]:
# question = "how do I join the course?"

# search_queries = []
# search_results = []
# previous_actions = []

context = build_context(search_results)

prompt = prompt_template.format(
    question=question,
    context=context,
    search_queries="\n".join(search_queries),
    previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
    max_iterations=3,
    iteration_number=2
)
print(prompt)

answer_json = llm(prompt)
answer = json.loads(answer_json)
print(json.dumps(answer, indent=2))

You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current iteration number

In [88]:
question = "what do I need to do to be successful at module 1?"

search_queries = []
search_results = []
previous_actions = []


iteration = 0

while True:
    print(f'ITERATION #{iteration}...')

    context = build_context(search_results)
    prompt = prompt_template.format(
        question=question,
        context=context,
        search_queries="\n".join(search_queries),
        previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
        max_iterations=3,
        iteration_number=iteration
    )

    print(prompt)

    answer_json = llm(prompt)
    answer = json.loads(answer_json)
    print(json.dumps(answer, indent=2))

    previous_actions.append(answer)

    action = answer['action']
    if action != 'SEARCH':
        break

    keywords = answer['keywords']
    search_queries = list(set(search_queries) | set(keywords))
    
    for k in keywords:
        res = search(k)
        search_results.extend(res)

    search_results = dedup(search_results)
    
    iteration = iteration + 1
    if iteration >= 4:
        break

    print()

ITERATION #0...
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current 

In [89]:
def agentic_search(question, max_iterations=3):
    """Answer `question` by letting the model alternate between FAQ searches and answering.

    On every pass the model receives the question, the context accumulated
    from earlier searches, the queries already used and its previous actions,
    and replies with a JSON action. A "SEARCH" action runs each of its
    keywords through `search` and adds the de-duplicated hits to the context;
    any other action ends the loop. Prompts and replies are printed.

    Args:
        question: The student's question.
        max_iterations: Iteration budget reported to the model in the prompt.
            Iterations are numbered from 0 and the loop makes at most
            ``max_iterations + 1`` model calls, so the default of 3 matches
            the previously hard-coded cap of 4 passes.

    Returns:
        The parsed JSON of the model's last reply. This can still be a
        "SEARCH" action if the model never answered within the budget.

    Raises:
        ValueError: If `max_iterations` is negative.
        json.JSONDecodeError: If a model reply is not valid JSON.
    """
    if max_iterations < 0:
        raise ValueError("max_iterations must be non-negative")

    search_queries = []
    search_results = []
    previous_actions = []

    for iteration in range(max_iterations + 1):
        print(f'ITERATION #{iteration}...')

        context = build_context(search_results)
        prompt = prompt_template.format(
            question=question,
            context=context,
            search_queries="\n".join(search_queries),
            previous_actions='\n'.join([json.dumps(a) for a in previous_actions]),
            max_iterations=max_iterations,
            iteration_number=iteration
        )

        print(prompt)

        answer_json = llm(prompt)
        answer = json.loads(answer_json)
        print(json.dumps(answer, indent=2))

        previous_actions.append(answer)

        if answer['action'] != 'SEARCH':
            break

        # On the last allowed pass no further prompt will be built, so running
        # the requested searches would be wasted work.
        if iteration == max_iterations:
            break

        keywords = answer['keywords']
        # dict.fromkeys removes repeated queries while keeping first-seen
        # order, so the prompt text is the same from run to run.
        search_queries = list(dict.fromkeys([*search_queries, *keywords]))

        for k in keywords:
            search_results.extend(search(k))

        # The same document can be returned for several keywords.
        search_results = dedup(search_results)

        print()

    return answer

In [93]:
agentic_search('how do I prepare for the course?')

ITERATION #0...
You're a course teaching assistant.

You're given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic. 

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current 

{'action': 'ANSWER',
 'answer': "To prepare for the course effectively, you should consider the following strategies:\n1. **Register**: Ensure you are registered for the course before it starts, using the provided registration link.\n2. **Join Communication Channels**: Subscribe to the course public Google Calendar, join the course Telegram channel for announcements, and register on DataTalks.Club's Slack to stay updated.\n3. **Familiarize Yourself with Tools**: If you're expected to use specific tools or technologies (like Git or a particular data stack), take the time to learn the basics or review relevant resources.\n4. **Plan Your Schedule**: Since the course starts on January 15, 2024, at 17h00, allocate specific times in your schedule to attend live sessions and complete coursework. \n5. **Preparation Material**: Look for any preliminary materials or guidelines that might be offered prior to the start date, so you can come prepared.\n6. **Review Past Cohort Materials**: If availa

In [94]:
def search(query):
    """Search the data-engineering-zoomcamp FAQ for `query`.

    This is the function behind the `search` tool: tool calls are dispatched
    by name, so the function name has to match the tool's "name". Returns
    what `index.search` gives for 5 results with document ids included,
    using boost weights of 3.0 for `question` and 0.5 for `section`.
    """
    return index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5,
        output_ids=True,
    )

In [95]:
# Function-tool description handed to the model via `tools=[...]`.
# "name" must match the Python function `search`, because a tool call is
# resolved by looking its name up in globals(); the single required "query"
# string is passed to it as a keyword argument.
search_tool = {
    "type": "function",
    "name": "search",
    "description": "Search the FAQ database",
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search query text to look up in the course FAQ."
            }
        },
        "required": ["query"],
        # Reject arguments other than the ones declared above.
        "additionalProperties": False
    }
}

In [96]:
question = "How do I do well in module 1?"

developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.
""".strip()

tools = [search_tool]

chat_messages = [
    {"role": "developer", "content": developer_prompt},
    {"role": "user", "content": question}
]

response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)
response.output

[ResponseFunctionToolCall(arguments='{"query":"module 1 tips"}', call_id='call_QJb4iQ942phHlMYZOvlboXnH', name='search', type='function_call', id='fc_6877e90ebb7c8198850ba6405e1ce5fe01f3f725dd2a3590', status='completed')]

In [97]:
calls = response.output
call = calls[0]
call

call_id = call.call_id
call_id

f_name = call.name
f_name

arguments = json.loads(call.arguments)
arguments

{'query': 'module 1 tips'}

In [98]:
f = globals()[f_name]

In [99]:
results = f(**arguments)

In [100]:
search_results = json.dumps(results, indent=2)
print(search_results)

[
  {
    "text": "Even after installing pyspark correctly on linux machine (VM ) as per course instructions, faced a module not found error in jupyter notebook .\nThe solution which worked for me(use following in jupyter notebook) :\n!pip install findspark\nimport findspark\nfindspark.init()\nThereafter , import pyspark and create spark contex<<t as usual\nNone of the solutions above worked for me till I ran !pip3 install pyspark instead !pip install pyspark.\nFilter based on conditions based on multiple columns\nfrom pyspark.sql.functions import col\nnew_final.filter((new_final.a_zone==\"Murray Hill\") & (new_final.b_zone==\"Midwood\")).show()\nKrishna Anand",
    "section": "Module 5: pyspark",
    "question": "Module Not Found Error in Jupyter Notebook .",
    "course": "data-engineering-zoomcamp",
    "_id": 322
  },
  {
    "text": "Following dbt with BigQuery on Docker readme.md, after `docker-compose build` and `docker-compose run dbt-bq-dtc init`, encountered error `ModuleNotF

In [101]:
chat_messages.append(call)

chat_messages.append({
    "type": "function_call_output",
    "call_id": call.call_id,
    "output": search_results,
})

In [102]:
response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)

In [103]:
r = response.output[0]
print(r.content[0].text)

To excel in Module 1 of your course, here are some tips based on the content and common issues faced:

1. **Understand the Requirements**: Make sure you thoroughly read the module instructions and know what tools (like Docker and PostgreSQL) you need to have installed.

2. **Install Dependencies Properly**: If you encounter errors (like `ModuleNotFoundError: No module named 'psycopg2'`), ensure you've installed all required packages correctly:
   - Use `pip install psycopg2-binary` to install the psycopg2 package.
   - If you continue to face issues, try updating your installation with `pip install psycopg2-binary --upgrade`.

3. **Follow Docker Setup Instructions Carefully**: If you’re using Docker, follow the provided setup instructions in detail, ensuring that all commands execute successfully.

4. **Practice SQL Alchemy**: Familiarize yourself with SQL Alchemy usage, such as connection strings. For example, use:
   ```python
   conn_string = "postgresql+psycopg://root:root@localhos

In [104]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.
If you look up something in FAQ, convert the student question into multiple queries.
""".strip()

chat_messages = [
    {"role": "developer", "content": developer_prompt},
    {"role": "user", "content": question}
]

response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)

In [105]:
def do_call(tool_call_response):
    """Execute a function call requested by the model and wrap its result.

    Args:
        tool_call_response: A function-call entry from `response.output`;
            its `name`, JSON-encoded `arguments` and `call_id` are read.

    Returns:
        A "function_call_output" dict carrying the same `call_id` and the
        JSON-serialized result, ready to be appended to the chat messages.

    Raises:
        ValueError: If the requested name is not one of the tools declared
            in the module-level `tools` list.
    """
    function_name = tool_call_response.name

    # The name comes from model output: only dispatch to functions that were
    # actually offered as tools, never to an arbitrary global.
    allowed_names = {tool.get("name") for tool in tools}
    if function_name not in allowed_names:
        raise ValueError(f"Unknown tool requested: {function_name!r}")

    arguments = json.loads(tool_call_response.arguments)

    f = globals()[function_name]
    result = f(**arguments)

    return {
        "type": "function_call_output",
        "call_id": tool_call_response.call_id,
        "output": json.dumps(result, indent=2),
    }

In [106]:
# Process every item of the model's response: each one is added to the
# conversation, function calls are executed and their outputs added as well.
for entry in response.output:
    chat_messages.append(entry)
    print(entry.type)

    if entry.type == 'function_call':
        result = do_call(entry)
        chat_messages.append(result)
    elif entry.type == 'message':
        # The text of a message entry lives on its first content part,
        # not on the entry itself.
        print(entry.content[0].text)

function_call
function_call
function_call


In [107]:
response = client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=tools
)

for entry in response.output:
    chat_messages.append(entry)
    print(entry.type)
    print()

    if entry.type == 'function_call':      
        result = do_call(entry)
        chat_messages.append(result)
    elif entry.type == 'message':
        print(entry.content[0].text) 

message

To excel in Module 1, here are some strategies and tips:

### Key Strategies for Success:

1. **Understand the Basics**: Make sure you have a solid grasp of the foundational concepts related to Docker and Terraform, as these are crucial for the module.

2. **Hands-On Practice**: Engage with practical exercises. Set up your local environment using Docker. Follow the guides and instructions carefully to avoid common pitfalls.

3. **Troubleshoot Common Errors**:
   - **SQLAlchemy Issues**:
     - If you encounter a `TypeError`: Ensure your connection string is formatted correctly. Use:
       ```python
       conn_string = "postgresql+psycopg://root:root@localhost:5432/ny_taxi"
       engine = create_engine(conn_string)
       ```
     - For `No module named 'psycopg2'`: Install it via pip:
       ```
       pip install psycopg2-binary
       ```
       If issues persist, consider updating or reinstalling the package.

4. **Utilize Community Resources**: Engage with forums and di

In [108]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.
When using FAQ, perform deep topic exploration: make one request to FAQ,
and then based on the results, make more requests.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

chat_messages = [
    {"role": "developer", "content": developer_prompt},
]

In [None]:
while True: # main Q&A loop
    question = input() # How do I do my best for module 1?
    # Typing exactly 'stop' ends the conversation.
    if question == 'stop':
        break

    message = {"role": "user", "content": question}
    chat_messages.append(message)

    while True: # request-response loop - query API till get a message
        # The whole conversation so far is sent on every request.
        response = client.responses.create(
            model='gpt-4o-mini',
            input=chat_messages,
            tools=tools
        )

        has_messages = False
        
        for entry in response.output:
            # Keep every output item in the history, including function calls.
            chat_messages.append(entry)
        
            if entry.type == 'function_call':      
                print('function_call:', entry)
                print()
                # Run the requested function and add its output to the
                # history so the next request can use it.
                result = do_call(entry)
                chat_messages.append(result)
            elif entry.type == 'message':
                print(entry.content[0].text)
                print()
                has_messages = True

        # Once the model has produced a message, go back to waiting for user
        # input; otherwise query again with the function outputs added.
        if has_messages:
            break

function_call: ResponseFunctionToolCall(arguments='{"query":"how to do best in module 1"}', call_id='call_N4GvJc29JgHNo5tNyMwLumZg', name='search', type='function_call', id='fc_6877ebc07a9c81998af14a3034e222be0b424739f391ba46', status='completed')

function_call: ResponseFunctionToolCall(arguments='{"query":"tips for success in module 1 docker and terraform"}', call_id='call_XdX8WuEusrJdr9YH0VTMq4wk', name='search', type='function_call', id='fc_6877ebc2cc408199bc94cddddc048c1a0b424739f391ba46', status='completed')

To excel in Module 1, which focuses on Docker and Terraform, here are some key tips based on information gathered:

1. **Follow Best Practices for Docker**:
   - Store all code in your default Linux distro to maximize file system performance. This is particularly important for users on Windows 10 or Windows 11, as Docker runs on WSL2 by default.

2. **Ensure Proper Setup for Terraform**:
   - Always navigate to the correct working directory that contains your Terraform confi

In [126]:
def shorten(text, max_length=50):
    """Return `text` limited to at most `max_length` characters.

    Text that already fits is returned unchanged. Longer text is truncated
    and ends in "..." (the ellipsis counts toward `max_length`); when
    `max_length` is too small to hold an ellipsis the text is simply cut.
    """
    if len(text) <= max_length:
        return text

    ellipsis = "..."
    if max_length <= len(ellipsis):
        return text[:max(max_length, 0)]

    # Previously this path fell through and returned None.
    return text[:max_length - len(ellipsis)] + ellipsis

def display_function_call(entry, result):
    """Render a tool call and its output as a collapsible HTML block.

    Args:
        entry: The function-call item taken from the model response. Its
            ``name`` and ``arguments`` attributes are shown in the summary
            line, and its text form is shown in the "Call" section.
        result: Mapping whose ``'output'`` value is shown in the "Output"
            section.
    """
    # Function-scope import so this cell does not depend on another cell
    # having imported the stdlib ``html`` module.
    from html import escape

    # Everything interpolated below is data, not markup: escape it so that
    # characters such as "<" or "&" in tool arguments or FAQ text cannot
    # break (or inject into) the rendered HTML.
    name = escape(str(entry.name), quote=False)
    arguments = escape(str(shorten(entry.arguments)), quote=False)
    call_text = escape(str(entry), quote=False)
    output_text = escape(str(result['output']), quote=False)

    call_html = f"""
        <details>
        <summary>Function call: <tt>{name}({arguments})</tt></summary>
        <div>
            <b>Call</b>
            <pre>{call_text}</pre>
        </div>
        <div>
            <b>Output</b>
            <pre>{output_text}</pre>
        </div>
        
        </details>
    """
    display(HTML(call_html))

def display_response(entry):
    """Show an assistant message in the notebook as formatted HTML.

    The text of the first content part of ``entry`` is treated as Markdown,
    converted to HTML, and displayed under an "Assistant:" label.
    """
    markdown_text = entry.content[0].text
    rendered = markdown.markdown(markdown_text)

    # Wrap the rendered answer in a small labelled container.
    block = f"""
        <div>
            <div><b>Assistant:</b></div>
            <div>{rendered}</div>
        </div>
    """
    display(HTML(block))

In [128]:
from IPython.display import display, HTML
import markdown # pip install markdown


# Instructions given to the model in the "developer" role.
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

# Conversation history; it starts with only the developer instructions.
chat_messages = [{"role": "developer", "content": developer_prompt}]

# Outer loop: one iteration per user turn, until the user types "stop".
while True:
    question = input()  # e.g. "How do I do my best for module 1?"

    if question.strip().lower() == 'stop':
        print("Chat ended.")
        break
    print()

    message = {"role": "user", "content": question}
    chat_messages.append(message)

    # Inner loop: keep querying the API until the response contains at least
    # one assistant message. Responses consisting only of function calls are
    # executed and their results fed back in on the next request.
    has_messages = False
    while not has_messages:
        response = client.responses.create(
            model='gpt-4o-mini',
            input=chat_messages,
            tools=tools,
        )

        for entry in response.output:
            # Every output item is recorded in the history first.
            chat_messages.append(entry)

            if entry.type == "message":
                display_response(entry)
                has_messages = True
            elif entry.type == "function_call":
                result = do_call(entry)
                chat_messages.append(result)
                display_function_call(entry, result)

Chat ended.


In [129]:
def add_entry(question, answer):
    """Append a new question/answer document to the FAQ index.

    The document is stored with section 'user added' and course
    'data-engineering-zoomcamp'.

    Args:
        question: Stored under the document's 'question' key.
        answer: Stored under the document's 'text' key.
    """
    index.append(dict(
        question=question,
        text=answer,
        section='user added',
        course='data-engineering-zoomcamp',
    ))

In [130]:
# Tool description for `add_entry`: a function taking two required string
# arguments, "question" and "answer", and no additional properties.
add_entry_description = dict(
    type="function",
    name="add_entry",
    description="Add an entry to the FAQ database",
    parameters=dict(
        type="object",
        properties=dict(
            question=dict(
                type="string",
                description="The question to be added to the FAQ database",
            ),
            answer=dict(
                type="string",
                description="The answer to the question",
            ),
        ),
        required=["question", "answer"],
        additionalProperties=False,
    ),
)

In [120]:
# Download the helper module chat_assistant.py into the working directory so
# that it can be imported by a later cell. The URL follows the repository's
# `main` branch rather than a fixed commit.
!wget https://raw.githubusercontent.com/alexeygrigorev/rag-agents-workshop/refs/heads/main/chat_assistant.py

--2025-07-16 12:32:29--  https://raw.githubusercontent.com/alexeygrigorev/rag-agents-workshop/refs/heads/main/chat_assistant.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3485 (3.4K) [text/plain]
Saving to: ‘chat_assistant.py’


2025-07-16 12:32:29 (3.71 MB/s) - ‘chat_assistant.py’ saved [3485/3485]



In [131]:
import chat_assistant

# Instructions given to the assistant in the developer role.
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

Use FAQ if your own knowledge is not sufficient to answer the question.

At the end of each response, ask the user a follow up question based on your answer.
""".strip()

# Tool registry from the downloaded helper module; only FAQ search is
# registered at this point.
tools = chat_assistant.Tools()
tools.add_tool(search, search_tool)
tools.get_tools()  # not the cell's last expression, so nothing is displayed

# Wire the registry, prompt, UI helper and API client into one assistant.
chat_interface = chat_assistant.ChatInterface()
chat = chat_assistant.ChatAssistant(
    client=client,
    developer_prompt=developer_prompt,
    tools=tools,
    chat_interface=chat_interface,
)

In [132]:
# Start the chat session with the assistant built in the previous cell.
# NOTE(review): run() is defined in chat_assistant.py (not shown here);
# presumably it loops on user input until the user ends the chat — confirm.
chat.run()

Chat ended.


In [133]:
# Register add_entry as a second tool next to search.
tools.add_tool(add_entry, add_entry_description)
# Last expression of the cell: shows the descriptions of all registered tools.
tools.get_tools()

[{'type': 'function',
  'name': 'search',
  'description': 'Search the FAQ database',
  'parameters': {'type': 'object',
   'properties': {'query': {'type': 'string',
     'description': 'Search query text to look up in the course FAQ.'}},
   'required': ['query'],
   'additionalProperties': False}},
 {'type': 'function',
  'name': 'add_entry',
  'description': 'Add an entry to the FAQ database',
  'parameters': {'type': 'object',
   'properties': {'question': {'type': 'string',
     'description': 'The question to be added to the FAQ database'},
    'answer': {'type': 'string', 'description': 'The answer to the question'}},
   'required': ['question', 'answer'],
   'additionalProperties': False}}]

In [134]:
# Run the chat again after add_entry has been registered on `tools`.
# NOTE(review): presumably the assistant sees the new tool because it was
# constructed with this same `tools` object — confirm in chat_assistant.py.
chat.run()

Chat ended.


In [135]:
# Display the last document in the index, to check what the preceding chat
# session appended through add_entry.
index.docs[-1]

{'question': 'How do I install the new version of Kestra?',
 'text': "1. **Download the latest version** from the [Kestra GitHub releases page](https://github.com/kestra-io/kestra/releases). 2. **Extract the files and navigate to the directory** where you extracted Kestra. 3. **Run the installation command** according to the specific environment you're using (Docker, Kubernetes, etc.). 4. **Follow any additional setup instructions** for your specific platform as detailed in the documentation.",
 'section': 'user added',
 'course': 'data-engineering-zoomcamp'}