In [1]:
import os
import requests
import pandas as pd

url = 'https://raw.githubusercontent.com/alexeygrigorev/ai-engineering-buildcamp-code/main/01-foundation/homework/books.csv'
df = pd.read_csv(url)

display(df)

output_dir = 'data'
os.makedirs(output_dir, exist_ok=True)

for _, row in df.iterrows():
    title = row['title']
    pdf_url = row['pdf_url']

    filename = title.replace(' ', '_') + '.pdf'
    filepath = os.path.join(output_dir, filename)

    if pd.isna(pdf_url):
        print(f"No PDF URL for '{title}'")
        continue

    try:
        response = requests.get(url = pdf_url, timeout=30)
        response.raise_for_status()

        with open(filepath, 'wb') as f:
            f.write(response.content)
            
        print(f'Downloaded {title} successfully')
    
    except Exception as e:
        print(f"Failed to download {title}: {e}")

Unnamed: 0,title,book_url,pdf_url
0,Think Python 2e,https://greenteapress.com/wp/think-python-2e/,http://greenteapress.com/thinkpython2/thinkpyt...
1,Think DSP,https://greenteapress.com/wp/think-dsp/,http://greenteapress.com/thinkdsp/thinkdsp.pdf
2,Think Complexity 2e,https://greenteapress.com/wp/think-complexity/,http://greenteapress.com/complexity2/thinkcomp...
3,Think Java 2e,https://greenteapress.com/wp/think-java-2e/,http://greenteapress.com/thinkjava7/thinkjava2...
4,Physical Modeling in MATLAB,https://greenteapress.com/wp/physical-modeling...,https://github.com/AllenDowney/PhysicalModelin...
5,Think OS,https://greenteapress.com/wp/think-os/,http://greenteapress.com/thinkos/thinkos.pdf
6,Think C++,https://greenteapress.com/wp/think-c/,https://raw.githubusercontent.com/tscheffl/Thi...


Downloaded Think Python 2e successfully
Downloaded Think DSP successfully
Downloaded Think Complexity 2e successfully
Downloaded Think Java 2e successfully
Downloaded Physical Modeling in MATLAB successfully
Downloaded Think OS successfully
Downloaded Think C++ successfully


## Convert to Markdown

In [85]:
# Question 1
# convert to txt format

from markitdown import MarkItDown

md = MarkItDown()
result = md.convert('data/Think_Python_2e.pdf')
markdown_text = result.markdown

# print(markdown_text[:100])

num_lines = markdown_text.count('\n') # similar to !wc -l books_text/Think_Python_2e.md
print(f'Think_Python_2e have {num_lines} lines')

Think_Python_2e have 16268 lines


In [None]:
#  Clean text

def clean_line(line):
    line = line.strip()
    return line if line else None

In [87]:
documents = []

for file in os.listdir(output_dir):
    if file.endswith('.pdf'):
        pdf_path = os.path.join(output_dir, file)
        
        result = md.convert(pdf_path)
        lines_file = result.markdown.splitlines()
        markdown_lines_clean = [l for l in (clean_line(line) for line in lines_file) if l]
        markdown_text = "\n".join(markdown_lines_clean)

        documents.append({
            'filename': file.replace('.pdf', ''),
            'content': markdown_lines_clean
        })

        print(f'Converted {file} to markdown format')

Converted Think_DSP.pdf to markdown format
Converted Think_C++.pdf to markdown format
Converted Think_OS.pdf to markdown format
Converted Physical_Modeling_in_MATLAB.pdf to markdown format
Converted Think_Java_2e.pdf to markdown format
Converted Think_Complexity_2e.pdf to markdown format
Converted Think_Python_2e.pdf to markdown format


## Chunking

In [88]:
from gitsource import chunk_documents

chunks = chunk_documents(documents, size=100, step=50)

In [89]:
chunks[0].keys()

dict_keys(['start', 'content', 'filename'])

In [90]:
# Question 2
# Chunks for the Think Python book

think_python_chunks = [
    c for c in chunks
    if c['filename'] == 'Think_Python_2e'
]

len(think_python_chunks)

214

## Indexing

In [91]:
from minsearch import Index

In [92]:
for c in chunks:
    c["content"] = "\n".join(c["content"])

In [93]:
index = Index(
        text_fields=["content"],
        keyword_fields=["filename"]
    )

In [None]:
from minsearch import Index

index.fit(chunks)

<minsearch.minsearch.Index at 0x784cc01b1b80>

In [95]:
# question 3
len(chunks)

923

## Searching and RAG

In [96]:
# question 4

results = index.search("python function definition", num_results=5)
results

[{'start': 1900,
  'content': 'when you are comfortable with Python, I’ll make suggestions for installing Python on your\ncomputer.\nThere are a number of web pages you can use to run Python. If you already have a fa-\nvorite, go ahead and use it. Otherwise I recommend PythonAnywhere. I provide detailed\ninstructions for getting started at http://tinyurl.com/thinkpython2e.\nThere are two versions of Python, called Python 2 and Python 3. They are very similar, so\nif you learn one, it is easy to switch to the other. In fact, there are only a few differences you\nwill encounter as a beginner. This book is written for Python 3, but I include some notes\nabout Python 2.\nThe Python interpreter is a program that reads and executes Python code. Depending\non your environment, you might start the interpreter by clicking on an icon, or by typing\npython on a command line. When it starts, you should see output like this:\nPython 3.4.0 (default, Jun 19 2015, 14:20:21)\n[GCC 4.8.2] on linux\nType

## Full RAG

In [97]:
from openai import OpenAI

openai_client = OpenAI()

In [98]:
import json

instructions = """
You're a course assistant, your task is to answer the QUESTION from the
course students using the provided CONTEXT
"""

prompt_template = """
<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(question, search_results):
    context = json.dumps(search_results, indent=2)
    prompt = prompt_template.format(
        question=question,
        context=context
    ).strip()
    return prompt

def search(question):
    return index.search(question, num_results=5)

def llm(user_prompt, instructions, model='gpt-4o-mini'):
    messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": user_prompt}
    ]

    response = openai_client.responses.create(
        model=model,
        input=messages
    )

    return {
        "answer": response.output_text,
        "input_tokens": response.usage.input_tokens,
        "output_tokens": response.usage.output_tokens
    }

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt, instructions)
    return answer

In [99]:
# question 5

answer = rag('python function definition')
answer

{'answer': 'In Python, a function is defined using the `def` keyword followed by the function name and parentheses, which may include parameters. The body of the function is indented. Here’s a basic structure:\n\n```python\ndef function_name(parameters):\n    # body of the function\n```\n\nFor example, here’s a simple function that prints "Hello, World!":\n\n```python\ndef greet():\n    print(\'Hello, World!\')\n```\n\nTo call the function, simply use its name followed by parentheses:\n\n```python\ngreet()  # This will output: Hello, World!\n```\n\nFunctions can also return values. For instance:\n\n```python\ndef add(a, b):\n    return a + b\n```\n\nNow, calling `add(2, 3)` will return `5`. \n\nFor more details on functions, including concepts like parameters, return values, and function objects, you may want to explore Chapter 3 of your course material.',
 'input_tokens': 6947,
 'output_tokens': 195}

## Structured Output

In [100]:
from pydantic import BaseModel, Field
from typing import Literal

class RAGResponse(BaseModel):
    answer: str = Field(description="The main answer to the user's question in markdown")
    found_answer: bool = Field(description="True if relevant information was found in the documentation")
    confidence: float = Field(description="Confidence score from 0.0 to 1.0")
    confidence_explanation: str = Field(description="Explanation about the confidence level")
    answer_type: Literal["how-to", "explanation", "troubleshooting", "comparison", "reference"] = Field(description="The category of the answer")
    followup_questions: list[str] = Field(description="Suggested follow-up questions")

In [101]:
def llm_structured(
        user_prompt,
        output_type =None,
        instructions=None,
        model='gpt-4o-mini',
        ):
    
    messages = []

    if instructions:
        messages.append({
            "role": "system",
            "content": instructions
        })

    messages.append({
        "role": "user",
        "content": user_prompt
    })

    response = openai_client.responses.parse(
        model=model,
        input=messages,
        text_format=output_type
    )

    return {
        "answer": response.output_text,
        "input_tokens": response.usage.input_tokens,
        "output_tokens": response.usage.output_tokens
    }

In [102]:
def rag_structured(query, output_type = RAGResponse):
    search_results = search(query)
    prompt = build_prompt(query, search_results)

    return llm_structured(instructions = instructions,
                           user_prompt= prompt, 
                           output_type = output_type)

In [103]:
answer_structured= rag_structured('python function definition', output_type=RAGResponse)
answer_structured


{'answer': '{"answer":"In Python, a function is defined using the `def` keyword, followed by the function name and parentheses that may include parameters. The body of the function is indented and contains the statements that perform the desired operations. Here’s a basic structure for defining a function:\\n\\n```python\\n\\ndef function_name(parameters):\\n    \\"\\"\\"Docstring explaining the function\\"\\"\\"\\n    # statements\\n    return value  # optional\\n```\\n\\n### Example:\\n\\nHere’s how you would define a simple function that adds two numbers:\\n\\n```python\\n\\ndef add_numbers(a, b):\\n    \\"\\"\\"Returns the sum of a and b.\\"\\"\\"\\n    return a + b\\n```\\n\\nYou can then call this function by using its name followed by arguments in parentheses:\\n\\n```python\\n\\result = add_numbers(3, 5)\\nprint(result)  # Output will be 8\\n```\\n\\nThis basic definition structure is common across Python, and understanding it is crucial for performing more complex operations u

In [104]:
# question 6

answer_structured['input_tokens'] - answer['input_tokens']

186