## Homework 1 

In [32]:
import csv
import os
import requests
from pathlib import Path
from urllib.parse import urlparse
from markitdown import MarkItDown
from gitsource import chunk_documents
from typing import List, Dict, Any
from minsearch import Index

CSV_FILE = Path("books.csv")
PDF_DIR = Path("downloads")
MARKDOWN_DIR = Path("books_text")
URL_COLUMN = "pdf_url"

In [46]:
# Test LLM call
from openai import OpenAI
openai_client = OpenAI()
response = openai_client.responses.create(
    model="gpt-4o-mini",
    input="What is AI?"
)
print(response.output_text)

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

#### Download the books listed in books.csv

In [2]:
# Create directory
PDF_DIR.mkdir(parents=True, exist_ok=True)

def filename_from_url(url: str) -> str:
    path = urlparse(url).path
    name = Path(path).name
    return name if name else "downloaded_file.pdf"

with CSV_FILE.open(newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)

    for row in reader:
        url = row[URL_COLUMN]

        try:
            response = requests.get(url, stream=True, timeout=30)
            response.raise_for_status()

            filename = filename_from_url(url)
            filepath = PDF_DIR / filename   # ✅ Path join

            with filepath.open("wb") as out:
                for chunk in response.iter_content(chunk_size=8192):
                    out.write(chunk)

            print(f"✅ Downloaded: {filename}")

        except Exception as e:
            print(f"❌ Failed to download {url}: {e}")

✅ Downloaded: thinkpython2.pdf
✅ Downloaded: thinkdsp.pdf
✅ Downloaded: thinkcomplexity2.pdf
✅ Downloaded: thinkjava2.pdf
✅ Downloaded: PhysicalModelingInMatlab4.pdf
✅ Downloaded: thinkos.pdf
✅ Downloaded: Think-C.pdf


#### PDF to Markdown Conversion

In [3]:
# Create directory
MARKDOWN_DIR.mkdir(parents=True, exist_ok=True)

md = MarkItDown()

pdf_files = list(PDF_DIR.glob("*.pdf"))

if not pdf_files:
    print("No PDF files found.")
    exit(0)

for pdf_path in pdf_files:
    try:
        result = md.convert(pdf_path)

        output_file = MARKDOWN_DIR / f"{pdf_path.stem}.md"
        output_file.write_text(result.text_content, encoding="utf-8")

        print(f"✅ Converted: {pdf_path.name} → {output_file.name}")

    except Exception as e:
        print(f"❌ Failed to convert {pdf_path.name}: {e}")

✅ Converted: PhysicalModelingInMatlab4.pdf → PhysicalModelingInMatlab4.md
✅ Converted: Think-C.pdf → Think-C.md
✅ Converted: thinkcomplexity2.pdf → thinkcomplexity2.md
✅ Converted: thinkdsp.pdf → thinkdsp.md
✅ Converted: thinkjava2.pdf → thinkjava2.md
✅ Converted: thinkos.pdf → thinkos.md
✅ Converted: thinkpython2.pdf → thinkpython2.md


#### Chunking for RAG

In [20]:
def load_book_as_line_doc(md_path: Path) -> Dict[str, Any]:
    """
    Input: one markdown file
    Output:
      {
        "source": "<filename.md>",
        "content": [<non-empty line>, ...]
      }
    """
    text = md_path.read_text(encoding="utf-8", errors="replace")

    lines = [line.strip() for line in text.splitlines()]
    lines = [line for line in lines if line]

    return {
        "source": md_path.name,
        "content": lines,
    }

def to_gitsource_document(line_doc: Dict[str, Any]) -> Dict[str, str]:
    """
    Converts:
      {
        "source": "...",
        "content": [lines]
      }
    → {
        "filename": "...",
        "content": [lines]
      }
    """
    return {
        "filename": line_doc["source"],
        "content": line_doc["content"],
    }

In [25]:
all_chunks = []

for md_path in sorted(MARKDOWN_DIR.glob("*.md")):
    # Load one book
    line_doc = load_book_as_line_doc(md_path)

    # Convert to gitsource format
    gs_doc = to_gitsource_document(line_doc)

    # Item-based chunking of the book (one item = one line)
    # Slidding window with 100 items per chunk, step size of 50
    chunks = chunk_documents(documents=[to_gitsource_document(line_doc)],
                             size=100, step=50)

    # Append chunks
    all_chunks.extend(chunks)
    print(f"✅ {md_path.name}: {len(chunks)} chunks")

print(f"\nTotal chunks created: {len(all_chunks)}")

✅ PhysicalModelingInMatlab4.md: 106 chunks
✅ Think-C.md: 109 chunks
✅ thinkcomplexity2.md: 130 chunks
✅ thinkdsp.md: 86 chunks
✅ thinkjava2.md: 216 chunks
✅ thinkos.md: 62 chunks
✅ thinkpython2.md: 214 chunks

Total chunks created: 923


In [26]:
# Number of chunks in Think Python 2
count = sum(
    1 for c in all_chunks
    if c.get("filename") == "thinkpython2.md"
)
print("thinkpython2.md", count, "chunks")

thinkpython2.md 214 chunks


#### Indexing with minsearch

In [30]:
def prepare_documents_for_search(all_chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Converts gitsource chunks into documents suitable for minsearch.Index.fit()
    """
    documents = []

    for i, chunk in enumerate(all_chunks):
        # chunk["content"] is a LIST (item-based chunking)
        text = "\n".join(chunk["content"])

        doc = {
            "id": f"{chunk.get('filename', 'doc')}_{i}",
            "text": text,
            "source": chunk.get("filename"),
        }

        documents.append(doc)

    return documents

In [35]:
# Convert each chunk (list of lines) into an indexed block of text
documents = prepare_documents_for_search(all_chunks)

# Use indexing from the minsearch library
index = Index(text_fields=["text"])
index.fit(documents)

<minsearch.minsearch.Index at 0x24630b2a360>

#### Retrieval Augmented Generation

In [37]:
# Lexical (rather than semantic) search with the minsearch library
results = index.search("python function definition", num_results=5)

In [43]:
# minsearch returns results sorted by descending relevance
# The most relevant match for the above user-query is:
results[0]['source']

'thinkpython2.md'

In [44]:
# Full RAG pipeline
import json

instructions = """
You're a course assistant, your task is to answer the QUESTION from the
course students using the provided CONTEXT
"""

prompt_template = """
<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(question, search_results):
    context = json.dumps(search_results, indent=2)
    prompt = prompt_template.format(
        question=question,
        context=context
    ).strip()
    return prompt

def search(question):
    return index.search(question, num_results=5)

def llm(user_prompt, instructions, model='gpt-4o-mini'):
    messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": user_prompt}
    ]

    response = openai_client.responses.create(
        model=model,
        input=messages
    )

    return response.output_text

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt, instructions)
    return answer

In [45]:
rag("python function definition")

NameError: name 'openai_client' is not defined