In [1]:
import gitsource
import os
from pathlib import Path

In [2]:
# Step 1: Prepare documents from markdown files
books_dir = Path("data/books/markdown")

# Encodings tried in order when reading a book.
# - utf-8: the expected encoding.
# - cp1252: Windows-1252, where bytes such as 0x92 are typographic punctuation
#   (0x92 -> U+2019, right single quote). Decoding those bytes as latin-1
#   instead yields invisible C1 control characters like '\x92' in the text.
# - latin-1: last resort; it maps every byte to a code point, so it never fails
#   (cp1252 leaves a handful of bytes undefined and can still raise).
FALLBACK_ENCODINGS = ("utf-8", "cp1252", "latin-1")


def read_text_with_fallback(path, encodings=FALLBACK_ENCODINGS):
    """Read a text file, trying each encoding in turn until one decodes.

    Args:
        path: File to read (str or Path).
        encodings: Encoding names to try, in order.

    Returns:
        The decoded file content as a single string.

    Raises:
        ValueError: If ``encodings`` is empty.
        UnicodeDecodeError: If none of the encodings can decode the file
            (cannot happen while "latin-1" is in the list).
    """
    last_error = None
    for encoding in encodings:
        try:
            return Path(path).read_text(encoding=encoding)
        except UnicodeDecodeError as exc:
            last_error = exc
    if last_error is None:
        raise ValueError("encodings must not be empty")
    raise last_error


# Read each markdown file and prepare documents
documents = []

for md_file in sorted(books_dir.glob("*.md")):
    content = read_text_with_fallback(md_file)

    # Keep only lines that contain something other than whitespace
    non_empty_lines = [line for line in content.split('\n') if line.strip()]

    # One document per book: the file name plus its non-empty lines
    documents.append({
        "source": md_file.name,
        "content": non_empty_lines
    })
    print(f"Loaded {md_file.name}: {len(non_empty_lines)} lines")

print(f"\nTotal documents prepared: {len(documents)}")

Loaded 001_think_python_2e_thinkpython2.md: 10709 lines
Loaded 002_think_dsp_thinkdsp.md: 4241 lines
Loaded 003_think_complexity_2e_thinkcomplexity2.md: 6474 lines
Loaded 004_think_java_2e_thinkjava2.md: 10812 lines
Loaded 005_physical_modeling_in_matlab_PhysicalModelingInMatlab4.md: 5296 lines
Loaded 006_think_os_thinkos.md: 3145 lines
Loaded 007_think_c_Think-C.md: 5465 lines

Total documents prepared: 7


In [3]:
# Step 2: Chunk documents using gitsource
CHUNK_SIZE = 100  # number of items per chunk
CHUNK_STEP = 50   # how many items to move forward for each chunk

# With a step of half the chunk size, neighbouring chunks are expected to
# share 50 items (per the parameter meanings noted above).
raw_chunks = gitsource.chunk_documents(documents, size=CHUNK_SIZE, step=CHUNK_STEP)


In [4]:
# Sanity check: count the chunks that came from the first book.
think_python_chunk_count = sum(
    1
    for chunk in raw_chunks
    if chunk['source'] == '001_think_python_2e_thinkpython2.md'
)
print(f"Number of chunks from '001_think_python_2e_thinkpython2.md': {think_python_chunk_count}")

Number of chunks from '001_think_python_2e_thinkpython2.md': 214


In [5]:
from minsearch import Index

def prepare_documents(chunks):
    """Return copies of ``chunks`` whose "content" is one newline-joined string.

    The input dictionaries are left untouched, so the cell can be re-run
    safely. Content that is already a string is passed through unchanged;
    joining it again would insert a newline between every character.

    Args:
        chunks: Iterable of dicts, each with a "content" entry holding either
            a sequence of line strings or an already-joined string.

    Returns:
        A new list of new dicts with the same keys as the inputs.
    """
    prepared = []
    for chunk in chunks:
        content = chunk["content"]
        if not isinstance(content, str):
            content = "\n".join(content)
        # Shallow copy with the content replaced; other keys are carried over.
        prepared.append({**chunk, "content": content})
    return prepared

# Turn each chunk's list of lines into a single newline-joined string for indexing.
chunks = prepare_documents(raw_chunks)

In [6]:
# Create a minsearch Index with "content" as its only text field and fit it
# on the prepared chunks. `index` is the module-level object used by search().
index = Index(text_fields=["content"])
index.fit(chunks)

<minsearch.minsearch.Index at 0x756d81ef9dc0>

In [8]:
# Smoke-test the index: request up to 5 results for a sample query and display them.
results = index.search("python function definition", num_results=5)
results

[{'start': 1900,
  'content': 'when you are comfortable with Python, I\x92ll make suggestions for installing Python on your\ncomputer.\nThere are a number of web pages you can use to run Python. If you already have a fa-\nvorite, go ahead and use it. Otherwise I recommend PythonAnywhere. I provide detailed\ninstructions for getting started at http://tinyurl.com/thinkpython2e.\nThere are two versions of Python, called Python 2 and Python 3. They are very similar, so\nif you learn one, it is easy to switch to the other. In fact, there are only a few differences you\nwill encounter as a beginner. This book is written for Python 3, but I include some notes\nabout Python 2.\nThe Python interpreter is a program that reads and executes Python code. Depending\non your environment, you might start the interpreter by clicking on an icon, or by typing\npython on a command line. When it starts, you should see output like this:\nPython 3.4.0 (default, Jun 19 2015, 14:20:21)\n[GCC 4.8.2] on linux\nT

In [9]:
from openai import OpenAI

# Shared client used by llm() and llm_structured(). No credentials are passed
# here -- presumably the SDK reads the API key from the environment; confirm.
openai_client = OpenAI()

In [34]:
from pydantic import BaseModel, Field
from typing import Literal

# Schema for structured answers; llm_structured() passes this class as
# `text_format`, and the parsed result is read from `response.output_parsed`.
class RAGResponse(BaseModel):
    answer: str = Field(description="The main answer to the user's question in markdown")
    found_answer: bool = Field(description="True if relevant information was found in the documentation")
    # NOTE(review): the 0.0-1.0 range is stated only in the description text;
    # no numeric bound is declared on the field itself.
    confidence: float = Field(description="Confidence score from 0.0 to 1.0")
    confidence_explanation: str = Field(description="Explanation about the confidence level")
    # Restricted to exactly these five category labels.
    answer_type: Literal["how-to", "explanation", "troubleshooting", "comparison", "reference"] = Field(description="The category of the answer")
    followup_questions: list[str] = Field(description="Suggested follow-up questions")

In [35]:
import json

# System message sent with every request (see llm / llm_structured).
instructions = """
You're a course assistant, your task is to answer the QUESTION from the
course students using the provided CONTEXT
"""

# User-message template; build_prompt() fills {question} and {context}.
# Leading/trailing newlines of the literal are removed by .strip().
prompt_template = """
<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(question, search_results):
    """Render the user prompt for a question and its retrieved context.

    Args:
        question: The user's question, inserted verbatim.
        search_results: JSON-serializable search hits; dumped with a
            2-space indent and used as the context section.

    Returns:
        The filled-in ``prompt_template`` with surrounding whitespace removed.
    """
    serialized_results = json.dumps(search_results, indent=2)
    filled = prompt_template.format(question=question, context=serialized_results)
    return filled.strip()

def search(question, num_results=5):
    """Query the module-level ``index`` for chunks matching ``question``.

    Args:
        question: Free-text query string.
        num_results: Maximum number of hits to request. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        Whatever ``index.search`` returns for the query.
    """
    return index.search(question, num_results=num_results)

def llm(user_prompt, instructions, model='gpt-4o-mini'):
    """Send a system + user message pair to the Responses API.

    Args:
        user_prompt: Text of the user message.
        instructions: Text of the system message.
        model: Model name forwarded to the API.

    Returns:
        The response object returned by ``openai_client.responses.create``,
        unmodified.
    """
    system_message = {"role": "system", "content": instructions}
    user_message = {"role": "user", "content": user_prompt}
    return openai_client.responses.create(
        model=model,
        input=[system_message, user_message],
    )

def llm_structured(user_prompt, instructions, model='gpt-4o-mini'):
    """Like ``llm`` but asks the API to parse the reply as ``RAGResponse``.

    Args:
        user_prompt: Text of the user message.
        instructions: Text of the system message.
        model: Model name forwarded to the API.

    Returns:
        The response object returned by ``openai_client.responses.parse``,
        unmodified.
    """
    conversation = [
        dict(role="system", content=instructions),
        dict(role="user", content=user_prompt),
    ]
    return openai_client.responses.parse(
        model=model,
        input=conversation,
        text_format=RAGResponse,
    )


def rag(query):
    """Answer ``query`` with retrieval-augmented generation (unstructured).

    Retrieves matching chunks with ``search``, renders them into a prompt
    with ``build_prompt``, and sends that prompt plus the module-level
    ``instructions`` to ``llm``.

    Returns:
        The response object returned by ``llm``.
    """
    return llm(build_prompt(query, search(query)), instructions)

def rag_structured(query):
    """Answer ``query`` with retrieval-augmented generation (structured).

    Same pipeline as ``rag`` -- ``search`` then ``build_prompt`` -- but the
    prompt is sent through ``llm_structured``.

    Returns:
        The response object returned by ``llm_structured``.
    """
    hits = search(query)
    user_prompt = build_prompt(query, hits)
    return llm_structured(user_prompt, instructions)


In [16]:
# Unstructured RAG call; the raw API response object is bound to `response`.
# NOTE(review): a later cell rebinds `response` to the structured result, so on
# a top-to-bottom run this value is overwritten before any cell inspects it.
response = rag("python function definition")

In [36]:
# Structured RAG call for the same query; replaces any earlier `response`.
response = rag_structured("python function definition")

In [32]:
from pprint import pprint

# Round-trip the response through JSON to get a plain dict, then show token usage.
# NOTE(review): this reads whichever `response` was assigned most recently.
dump = json.loads(response.model_dump_json())
pprint(dump['usage'])

{'input_tokens': 6800,
 'input_tokens_details': {'cached_tokens': 0},
 'output_tokens': 320,
 'output_tokens_details': {'reasoning_tokens': 0},
 'total_tokens': 7120}


In [37]:
# Token usage of the current `response`, via a JSON round-trip to a plain dict.
dump = json.loads(response.model_dump_json())
pprint(dump['usage'])

{'input_tokens': 6986,
 'input_tokens_details': {'cached_tokens': 0},
 'output_tokens': 455,
 'output_tokens_details': {'reasoning_tokens': 0},
 'total_tokens': 7441}


In [None]:
# Parsed structured output -- presumably a RAGResponse, since llm_structured
# passes text_format=RAGResponse. Requires `response` to come from rag_structured().
rag_response = response.output_parsed

In [48]:
# Whether the model reported finding relevant information.
print(rag_response.found_answer)

True


In [46]:
# Model's self-reported confidence (described as 0.0 to 1.0 in the schema).
print(rag_response.confidence)

0.9


In [45]:
# Model's explanation of its confidence level.
print(rag_response.confidence_explanation)

The answer is based on standard practices for defining functions in Python, as backed by the context provided. The examples and explanation align well with fundamental programming concepts introduced in Python literature.


In [47]:
# Follow-up questions suggested by the model.
print(rag_response.followup_questions)

['What are parameters and arguments in a function?', 'How do I return multiple values from a function in Python?', 'Can you explain recursive functions in Python?']


In [None]:
# The main answer text (markdown, per the schema description).
print(rag_response.answer)

### Python Function Definition

In Python, a function is defined using the `def` keyword, followed by the name of the function and parentheses containing any parameters. The function body consists of a sequence of statements that define its functionality. Here’s a basic structure:

```python
def function_name(parameters):
    """Optional docstring"""
    # Body of the function
    # More code to define what the function does
    return result  # Optional return statement
```

#### Example
Here’s a simple example of a function that adds two numbers:

```python
def add_numbers(a, b):
    return a + b
```

You can call this function like so:

```python
result = add_numbers(5, 3)  # result will be 8
``` 

### Key Components:
1. **Function Name**: `add_numbers` is the name identifying the function.
2. **Parameters**: `a` and `b` are parameters that the function takes.
3. **Body**: The indented lines form the body of the function, executing the return statement when it’s called.
4. **Return 