## Homework 1 

In [22]:
import os
import json
import csv
import tiktoken
import requests
from pathlib import Path
from urllib.parse import urlparse
from markitdown import MarkItDown
from gitsource import chunk_documents
from typing import List, Dict, Any, Literal
from pydantic import BaseModel, Field
from minsearch import Index

# CSV file listing the books; one row per book
CSV_FILE = Path("books.csv")
# Directory the downloaded PDF files are written to
PDF_DIR = Path("downloads")
# Directory the converted Markdown files are written to
MARKDOWN_DIR = Path("books_text")
# Name of the CSV column that holds each book's download URL
URL_COLUMN = "pdf_url"

In [2]:
# Test LLM call
# load_dotenv() loads variables from a local .env file into the environment;
# presumably that is where the OpenAI API key comes from — TODO confirm
from dotenv import load_dotenv
load_dotenv()

from openai import OpenAI
# Shared client; the llm() helpers defined later in the notebook reuse it
openai_client = OpenAI()
# One-off smoke test: send a plain-text prompt and print the model's text reply
response = openai_client.responses.create(
    model="gpt-4o-mini",
    input="What is neuro-symbolic AI? Give me a three sentence definition."
)
print(response.output_text)

Neuro-symbolic AI is an approach that combines neural networks with symbolic reasoning to leverage the strengths of both paradigms. This integration allows for the learning capabilities of neural networks to be enhanced by the structured logic and interpretability of symbolic systems. As a result, neuro-symbolic AI aims to create more robust and explainable artificial intelligence solutions, capable of reasoning about complex data while still learning from it.


#### Download the books listed in books.csv

In [3]:
# Create the download directory (and any missing parents); no error if it already exists
PDF_DIR.mkdir(parents=True, exist_ok=True)

def filename_from_url(url: str) -> str:
    """Derive a local file name from the path component of ``url``.

    Query strings and fragments are ignored because only the parsed URL path
    is inspected. When the path has no final component (for example a bare
    host or a trailing slash), the placeholder "downloaded_file.pdf" is
    returned instead.
    """
    basename = Path(urlparse(url).path).name
    if not basename:
        return "downloaded_file.pdf"
    return basename

# Download every book listed in CSV_FILE into PDF_DIR.
# A failure for one book is reported and skipped so the remaining books still download.
with CSV_FILE.open(newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)

    for row in reader:
        # A missing column still raises KeyError (configuration error);
        # a blank cell is skipped instead of triggering a pointless request.
        url = (row[URL_COLUMN] or "").strip()
        if not url:
            print(f"⚠️ Skipping row with empty '{URL_COLUMN}': {row}")
            continue

        # Set only once the target path is known, so the except-branch knows
        # whether there is a partial file to clean up.
        partial_path = None

        try:
            filename = filename_from_url(url)
            filepath = PDF_DIR / filename
            # Stream into a ".part" file first: an interrupted download must not
            # leave a truncated "*.pdf" for the conversion step to pick up.
            partial_path = filepath.with_name(filename + ".part")

            # The context manager releases the streamed connection even when
            # the body is not fully consumed (e.g. on an exception mid-download).
            with requests.get(url, stream=True, timeout=30) as response:
                response.raise_for_status()

                with partial_path.open("wb") as out:
                    for chunk in response.iter_content(chunk_size=8192):
                        out.write(chunk)

            # Promote the finished download to its final name
            partial_path.replace(filepath)

            print(f"✅ Downloaded: {filename}")

        except Exception as e:
            if partial_path is not None:
                partial_path.unlink(missing_ok=True)
            print(f"❌ Failed to download {url}: {e}")

✅ Downloaded: thinkpython2.pdf
✅ Downloaded: thinkdsp.pdf
✅ Downloaded: thinkcomplexity2.pdf
✅ Downloaded: thinkjava2.pdf
✅ Downloaded: PhysicalModelingInMatlab4.pdf
✅ Downloaded: thinkos.pdf
✅ Downloaded: Think-C.pdf


#### PDF to Markdown Conversion

In [4]:
# Create the output directory (and any missing parents); no error if it already exists
MARKDOWN_DIR.mkdir(parents=True, exist_ok=True)

md = MarkItDown()

# Sorted so the conversion order (and the log below) is the same on every platform
pdf_files = sorted(PDF_DIR.glob("*.pdf"))

if not pdf_files:
    # Do not call exit() here: inside a notebook it shuts the kernel down and
    # discards all state. With an empty list the loop below is simply a no-op.
    print("No PDF files found.")

# Convert each PDF to Markdown text; a failure for one book does not stop the others
for pdf_path in pdf_files:
    try:
        result = md.convert(pdf_path)

        # Same base name as the PDF, with a .md extension
        output_file = MARKDOWN_DIR / f"{pdf_path.stem}.md"
        output_file.write_text(result.text_content, encoding="utf-8")

        print(f"✅ Converted: {pdf_path.name} → {output_file.name}")

    except Exception as e:
        print(f"❌ Failed to convert {pdf_path.name}: {e}")

✅ Converted: PhysicalModelingInMatlab4.pdf → PhysicalModelingInMatlab4.md
✅ Converted: Think-C.pdf → Think-C.md
✅ Converted: thinkcomplexity2.pdf → thinkcomplexity2.md
✅ Converted: thinkdsp.pdf → thinkdsp.md
✅ Converted: thinkjava2.pdf → thinkjava2.md
✅ Converted: thinkos.pdf → thinkos.md
✅ Converted: thinkpython2.pdf → thinkpython2.md


#### Chunking for RAG

In [5]:
def load_book_as_line_doc(md_path: Path) -> Dict[str, Any]:
    """Read one Markdown file and return it as a line-based document.

    The file is decoded as UTF-8 (undecodable bytes are replaced), every line
    is stripped of surrounding whitespace, and lines that end up empty are
    dropped.

    Returns:
      {
        "source": "<filename.md>",
        "content": [<non-empty stripped line>, ...]
      }
    """
    raw_text = md_path.read_text(encoding="utf-8", errors="replace")
    stripped_lines = (raw_line.strip() for raw_line in raw_text.splitlines())

    return {
        "source": md_path.name,
        "content": [kept for kept in stripped_lines if kept],
    }

def to_gitsource_document(line_doc: Dict[str, Any]) -> Dict[str, Any]:
    """Rename the keys of a line-based document to the shape passed to chunk_documents.

    Converts:
      {
        "source": "...",
        "content": [lines]
      }
    → {
        "filename": "...",
        "content": [lines]
      }

    The "content" list is passed through as-is (not copied). The return type
    is Dict[str, Any], not Dict[str, str], because "content" is a list of lines.

    Raises:
      KeyError: if "source" or "content" is missing from ``line_doc``.
    """
    return {
        "filename": line_doc["source"],
        "content": line_doc["content"],
    }

In [6]:
# Sliding-window parameters for item-based chunking (one item = one line):
# each chunk holds CHUNK_SIZE items and the window advances CHUNK_STEP items
# at a time.
CHUNK_SIZE = 100
CHUNK_STEP = 50

all_chunks = []

for md_path in sorted(MARKDOWN_DIR.glob("*.md")):
    # Load one book
    line_doc = load_book_as_line_doc(md_path)

    # Convert to gitsource format (done once and reused below)
    gs_doc = to_gitsource_document(line_doc)

    # Item-based chunking of the book with a sliding window
    chunks = chunk_documents(documents=[gs_doc],
                             size=CHUNK_SIZE, step=CHUNK_STEP)

    # Append chunks
    all_chunks.extend(chunks)
    print(f"✅ {md_path.name}: {len(chunks)} chunks")

print(f"\nTotal chunks created: {len(all_chunks)}")

✅ PhysicalModelingInMatlab4.md: 106 chunks
✅ Think-C.md: 109 chunks
✅ thinkcomplexity2.md: 130 chunks
✅ thinkdsp.md: 86 chunks
✅ thinkjava2.md: 216 chunks
✅ thinkos.md: 62 chunks
✅ thinkpython2.md: 214 chunks

Total chunks created: 923


In [7]:
# How many of the chunks came from Think Python 2?
count = len([
    chunk for chunk in all_chunks
    if chunk.get("filename") == "thinkpython2.md"
])
print("thinkpython2.md", count, "chunks")

thinkpython2.md 214 chunks


#### Indexing with minsearch

In [8]:
def prepare_documents_for_search(all_chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Turn gitsource chunks into flat documents for minsearch.Index.fit().

    Each chunk's "content" (a list of lines, from item-based chunking) is
    joined with newlines into a single "text" field. The "id" combines the
    chunk's filename (or "doc" when the key is absent) with the chunk's
    position in ``all_chunks``; "source" is the filename, or None when absent.
    """
    return [
        {
            "id": f"{piece.get('filename', 'doc')}_{position}",
            "text": "\n".join(piece["content"]),
            "source": piece.get("filename"),
        }
        for position, piece in enumerate(all_chunks)
    ]

In [9]:
# Convert each chunk (list of lines) into an indexed block of text
documents = prepare_documents_for_search(all_chunks)

# Use indexing from the minsearch library
# Only the "text" field is registered as a text field of the index
index = Index(text_fields=["text"])
# Last expression of the cell, so the notebook displays the value fit() returns
index.fit(documents)

<minsearch.minsearch.Index at 0x1a55c750620>

#### Retrieval Augmented Generation

In [10]:
# Lexical (rather than semantic) search with the minsearch library
# Ask the index for the 5 best matches for this query
results = index.search("python function definition", num_results=5)

In [11]:
# minsearch returns results sorted by descending relevance
# The most relevant match for the above user-query is:
# (bare expression, so the notebook displays the source file of the first result)
results[0]['source']

'thinkpython2.md'

In [20]:
# Full RAG pipeline
instructions = """
You're a course assistant, your task is to answer the QUESTION from the
course students using the provided CONTEXT
"""

prompt_template = """
<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(question, search_results):
    """Render the user prompt for one RAG query.

    ``search_results`` is serialized as indented JSON and inserted as the
    context, together with ``question``, into ``prompt_template``. Leading and
    trailing whitespace is stripped from the rendered prompt.
    """
    serialized_results = json.dumps(search_results, indent=2)
    rendered = prompt_template.format(question=question, context=serialized_results)
    return rendered.strip()

def search(question, num_results=5):
    """Query the notebook's minsearch ``index`` for ``question``.

    Args:
      question: free-text query string.
      num_results: maximum number of matches requested from the index
        (defaults to 5, the value this function previously hard-coded).

    Returns:
      Whatever ``index.search`` returns for the query.
    """
    return index.search(question, num_results=num_results)

def llm(user_prompt, instructions, model='gpt-4o-mini'):
    """Send one system + user message pair to the model and return its text reply.

    ``instructions`` becomes the "system" message and ``user_prompt`` the
    "user" message; the request goes through the shared ``openai_client``.
    """
    system_message = {"role": "system", "content": instructions}
    user_message = {"role": "user", "content": user_prompt}

    completion = openai_client.responses.create(
        model=model,
        input=[system_message, user_message],
    )
    return completion.output_text

def count_input_tokens(instructions, user_prompt, model="gpt-4o-mini"):
    """Count the tokens in the system and user prompts for ``model``.

    Only the two raw strings are tokenized; any per-message overhead added by
    the API is not included, so the numbers are an estimate of the input size.

    Args:
      instructions: system prompt text.
      user_prompt: user prompt text.
      model: model name used to pick the tokenizer.

    Returns:
      A dict with "system_tokens", "user_tokens" and "total_input_tokens".
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for model names it cannot map to a tokenizer.
        # Fall back to a general-purpose encoding so the (approximate) count is
        # still reported instead of aborting the whole RAG call.
        encoding = tiktoken.get_encoding("o200k_base")

    system_tokens = len(encoding.encode(instructions))
    user_tokens = len(encoding.encode(user_prompt))

    return {
        "system_tokens": system_tokens,
        "user_tokens": user_tokens,
        "total_input_tokens": system_tokens + user_tokens,
    }

def rag(query):
    """Answer ``query`` with retrieval-augmented generation.

    Steps: lexical search for context, prompt construction, a printed count of
    the input tokens for this query, then the LLM call. Returns the LLM output.
    """
    # Retrieve context and embed it in the user prompt
    hits = search(query)
    user_message = build_prompt(query, hits)

    # Report the number of input tokens for this one RAG query
    print(count_input_tokens(instructions, user_message))

    # Hand the LLM output straight back to the caller
    return llm(user_message, instructions)

In [21]:
# Run the pipeline end to end; rag() prints the token statistics and, as the
# last expression of the cell, its return value is displayed by the notebook
user_prompt = "python function definition"
rag(user_prompt)

{'system_tokens': 24, 'user_tokens': 6926, 'total_input_tokens': 6950}


'A Python function is defined using the `def` keyword, followed by the function name and parentheses that may contain parameters. The function\'s body, which contains the statements that perform the operation, is indented beneath the definition line. \n\nHere\'s a basic structure for a function definition:\n\n```python\ndef function_name(parameters):\n    # body of the function\n    statement(s)\n    return value  # optional\n```\n\n### Example:\nHere\'s a simple function that takes a parameter and prints a message:\n\n```python\ndef greet(name):\n    print("Hello, " + name + "!")\n```\n\nYou can call this function with an argument:\n\n```python\ngreet(\'Alice\')\n```\n\nThis would output:\n```\nHello, Alice!\n```\n\nFunctions may return values using the `return` statement, allowing for value processing and manipulation. Functions can also take multiple parameters and perform various operations based on the logic defined within the body.'

#### RAG with Structured Outputs

In [23]:
# Structured shape of a RAG answer. The llm() defined after this class puts the
# model's JSON schema into the prompt and validates the reply against the model.
# NOTE(review): documented with comments rather than a class docstring on purpose —
# pydantic includes a model docstring in the generated JSON schema, which would
# change the prompt text sent to the LLM. Confirm before adding a docstring.
class RAGResponse(BaseModel):
    # Each Field description below is part of the JSON schema shown to the LLM
    answer: str = Field(description="The main answer to the user's question in markdown")
    found_answer: bool = Field(description="True if relevant information was found in the documentation")
    # NOTE(review): the 0.0–1.0 range is stated only in the description; no ge/le constraint enforces it
    confidence: float = Field(description="Confidence score from 0.0 to 1.0")
    confidence_explanation: str = Field(description="Explanation about the confidence level")
    # Restricted to exactly these five category labels
    answer_type: Literal["how-to", "explanation", "troubleshooting", "comparison", "reference"] = Field(description="The category of the answer")
    followup_questions: list[str] = Field(description="Suggested follow-up questions")

In [24]:
def llm(user_prompt, instructions, model="gpt-4o-mini"):
    """Call the model and return its reply parsed into a RAGResponse.

    Three messages are sent: the system ``instructions``, the ``user_prompt``,
    and a final user message that asks for JSON matching the RAGResponse
    schema. The reply text is parsed as JSON and validated against the model.

    Raises:
      json.JSONDecodeError: if the reply text is not valid JSON.
      pydantic.ValidationError: if the JSON does not match RAGResponse.
    """
    # Serialize the schema as real JSON. Interpolating the dict directly would
    # embed its Python repr (single quotes, True/False), which is not JSON.
    schema_json = json.dumps(RAGResponse.model_json_schema())

    messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": user_prompt},
        {"role": "user", "content": (
            "Respond ONLY as valid JSON matching this schema:\n"
            f"{schema_json}\n"
            "Do not include any extra keys. Do not wrap in markdown."
        )},
    ]

    response = openai_client.responses.create(
        model=model,
        input=messages,
        # This nudges the API to return JSON (if supported in your client/version)
        text={"format": {"type": "json_object"}},
    )

    # Parse JSON text into Pydantic model
    data = json.loads(response.output_text)
    return RAGResponse.model_validate(data)

In [25]:
def rag(query):
    """Answer ``query`` with retrieval-augmented generation, returning structured output.

    Same flow as before (search, prompt construction, printed token
    statistics, LLM call); the value returned is whatever ``llm`` returns,
    which is now a RAGResponse.
    """
    retrieved = search(query)
    user_message = build_prompt(query, retrieved)

    # Report the input-token statistics for the instructions and the built prompt
    print(count_input_tokens(instructions, user_message))

    # llm() now returns a RAGResponse
    return llm(user_message, instructions)

In [26]:
# Run the structured pipeline; rag() prints the token statistics and returns
# the validated RAGResponse
result = rag("python function definition")

print(result)                 # shows the full model
print(result.answer)          # just the markdown answer
print(result.confidence)      # confidence number
print(result.found_answer)    # True/False

{'system_tokens': 24, 'user_tokens': 6926, 'total_input_tokens': 6950}
answer='In Python, a function is defined using the `def` keyword, followed by the function name and parentheses containing any parameters. The function body is indented and contains the statements that will be executed when the function is called. Here is an example:\n\n```python\ndef my_function(parameter1, parameter2):\n    # Function body\n    return parameter1 + parameter2\n```\n\nThe above function takes two parameters and returns their sum.' found_answer=True confidence=0.9 confidence_explanation='The explanation includes a clear and concise definition of a function, along with an example, which reflects common practices in Python programming.' answer_type='how-to' followup_questions=['Can you explain the difference between a function and a method in Python?', 'What are parameters and arguments in Python functions?', 'How can I return multiple values from a function in Python?']
In Python, a function is define