# 🧠 Technical PDF Knowledge Agent

**Agent Architecture:**
```
PDF Files (Microcontrollers)
        ↓
Document Loader
        ↓
Text Splitter
        ↓
Embedding Model
        ↓
Vector Store (ChromaDB)
        ↓
Retriever
        ↓
Agent (DeepSeek R1 via Ollama)
        ↓
Structured JSON Output
```

This agent reads from technical PDFs and returns structured JSON with:
- `title`, `content`, `summary`, `facts`, `quiz`, `key_notes`

In [None]:
# ============================================================================
# Cell 1 — Configuration
# ============================================================================

# Ollama chat model used for reasoning and answer generation (DeepSeek R1)
OLLAMA_LLM_MODEL = "deepseek-r1"

# Ollama embedding model used for semantic search
OLLAMA_EMBED_MODEL = "mxbai-embed-large"

# Directory where ChromaDB keeps its persistent data
CHROMA_DB_PATH = "./vector_db"

# Directory that is scanned for PDF files
PDF_DATA_PATH = "./.docs"

# Text splitting parameters
CHUNK_SIZE = 800       # Size of each text chunk
CHUNK_OVERLAP = 150    # Overlap shared by neighbouring chunks for continuity

# Echo the active settings so each notebook run records what it used
print("‚úÖ Configuration loaded")
for _label, _value in (
    ("LLM", OLLAMA_LLM_MODEL),
    ("Embeddings", OLLAMA_EMBED_MODEL),
    ("Vector DB", CHROMA_DB_PATH),
    ("PDFs", PDF_DATA_PATH),
):
    print(f"   {_label}: {_value}")

In [None]:
# ============================================================================
# Cell 2 — Load PDFs
# ============================================================================

from langchain_community.document_loaders import PyPDFLoader
import os

def load_documents(path: str):
    """
    Load every PDF file found directly inside a directory.

    Entries are visited in sorted name order so repeated runs yield the
    pages in the same sequence. The ``.pdf`` extension is matched
    case-insensitively, and entries that are not regular files (for
    example a sub-directory whose name ends in ``.pdf``) are skipped.

    Args:
        path: Directory containing PDF files

    Returns:
        List of Document objects from all PDFs (empty if none are found)

    Raises:
        OSError: If ``path`` cannot be listed, e.g. it does not exist or
            is not a directory.
    """
    docs = []

    # os.listdir returns entries in arbitrary order; sort for reproducibility
    for file in sorted(os.listdir(path)):
        file_path = os.path.join(path, file)

        # Accept ".pdf" / ".PDF" alike, but only for real files
        if not file.lower().endswith(".pdf") or not os.path.isfile(file_path):
            continue

        print(f"   Loading: {file}")

        # PyPDFLoader extracts text from each page
        loader = PyPDFLoader(file_path)
        docs.extend(loader.load())

    return docs

# Read every PDF under the configured data directory
print("üìÑ Loading PDF documents...")
documents = load_documents(PDF_DATA_PATH)
_page_total = len(documents)
print(f"‚úÖ Loaded {_page_total} pages from PDFs")

In [None]:
# ============================================================================
# Cell 3 — Split Text into Chunks
# ============================================================================

from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(documents: list):
    """
    Break documents into smaller chunks before they are embedded.

    The splitter is configured from the module-level CHUNK_SIZE and
    CHUNK_OVERLAP settings.

    Args:
        documents: List of Document objects

    Returns:
        List of chunked Document objects
    """
    # The recursive splitter prefers natural boundaries (paragraphs,
    # sentences, words) and only then falls back to a raw character count
    chunker = RecursiveCharacterTextSplitter(
        chunk_overlap=CHUNK_OVERLAP,  # Characters shared by adjacent chunks
        chunk_size=CHUNK_SIZE,        # Upper bound on characters per chunk
    )
    pieces = chunker.split_documents(documents)
    return pieces

# Split documents into chunks
print("‚úÇÔ∏è Splitting documents into chunks...")
chunks = split_documents(documents)
print(f"‚úÖ Created {len(chunks)} chunks")

# Only preview when there is something to show: indexing chunks[0] on an
# empty list would raise IndexError and hide the real problem (no input text)
if chunks:
    print(f"   Sample chunk preview: {chunks[0].page_content[:100]}...")
else:
    print("   No chunks were produced - check that PDF_DATA_PATH contains readable PDFs")

In [None]:
# ============================================================================
# Cell 4 — Embeddings + Vector Database
# ============================================================================

from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma

def build_vector_store(chunks: list):
    """
    Convert text chunks into embeddings and store them in ChromaDB.

    Every chunk is written under a deterministic ID built from its
    ``source`` and ``page`` metadata plus its position on that page, so
    running this again over the same PDFs replaces the stored entries
    instead of appending a second copy of each chunk to the persistent
    store at CHROMA_DB_PATH.

    Args:
        chunks: List of chunked Document objects

    Returns:
        Chroma vector store instance
    """
    # Initialize the embedding model (runs locally via Ollama)
    embedding = OllamaEmbeddings(model=OLLAMA_EMBED_MODEL)

    # Build one stable, unique ID per chunk: "<source>:<page>:<n>", where n
    # counts chunks that share the same source and page.
    # NOTE(review): assumes the loader fills "source"/"page" metadata (as
    # PyPDFLoader is documented to do); missing keys fall back to defaults.
    per_page_count = {}
    ids = []
    for chunk in chunks:
        meta = getattr(chunk, "metadata", None) or {}
        page_key = f"{meta.get('source', 'unknown')}:{meta.get('page', 0)}"
        position = per_page_count.get(page_key, 0)
        per_page_count[page_key] = position + 1
        ids.append(f"{page_key}:{position}")

    # Embed all chunks and store them persistently.
    # NOTE(review): de-duplication relies on langchain_chroma writing with
    # upsert semantics for explicit ids - confirm for the installed version.
    db = Chroma.from_documents(
        documents=chunks,
        embedding=embedding,
        ids=ids,
        persist_directory=CHROMA_DB_PATH
    )

    return db

# Embed the chunks and write the resulting index to disk
print("üî¢ Creating embeddings and storing in ChromaDB...")
vectorstore = build_vector_store(chunks)
_store_msg = f"‚úÖ Vector store created at: {CHROMA_DB_PATH}"
print(_store_msg)

In [None]:
# ============================================================================
# Cell 5 — Retriever Setup
# ============================================================================

from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma

def get_retriever(k: int = 5):
    """
    Load the existing vector store and create a retriever.

    Args:
        k: Number of most similar chunks returned per query (default 5)

    Returns:
        Retriever for similarity search over the vector store

    Raises:
        ValueError: If ``k`` is smaller than 1
    """
    # Reject a meaningless result size before opening the database
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Load the persisted vector store; the embedding model must match the
    # one used when the store was built
    db = Chroma(
        persist_directory=CHROMA_DB_PATH,
        embedding_function=OllamaEmbeddings(model=OLLAMA_EMBED_MODEL)
    )

    # Create retriever that returns the top-k most similar chunks
    return db.as_retriever(search_kwargs={"k": k})

# Open the persisted store and keep a module-level retriever for later cells
print("üîç Initializing retriever...")
retriever = get_retriever()
_ready_msg = "‚úÖ Retriever ready for similarity search"
print(_ready_msg)

In [None]:
# ============================================================================
# Cell 6 — Structured JSON System Prompt
# ============================================================================

# Instruction text for the LLM. It asks for a single JSON object with the
# keys title, content, summary, facts, quiz and key_notes, and lists the
# rules the answer must follow. The embedded example contains literal
# "{" and "}" characters.
SYSTEM_PROMPT = """
You are a Microcontroller Learning Assistant.
Use the retrieved PDF content and respond ONLY in valid JSON format.

Your response MUST be a valid JSON object with this exact structure:

{
  "title": "Topic title based on the question",
  "content": "Detailed explanation from the PDF content",
  "summary": "A concise 2-3 sentence summary",
  "facts": "Key facts, figures, and specifications",
  "quiz": {
    "topic": "Quiz topic",
    "difficulty": "easy/medium/hard",
    "questions": [
      {
        "question": "Question text",
        "options": ["A) ...", "B) ...", "C) ...", "D) ..."],
        "correct_answer": "A",
        "explanation": "Why this is correct"
      }
    ]
  },
  "key_notes": {
    "main_points": ["Point 1", "Point 2"],
    "definitions": [{"term": "...", "definition": "..."}],
    "formulas": ["Formula 1", "Formula 2"],
    "diagrams_mentioned": ["Diagram descriptions if any"]
  }
}

IMPORTANT RULES:
1. ONLY output valid JSON - no markdown, no explanations outside JSON
2. Base ALL content on the retrieved PDF context
3. If information is not found, use null or empty arrays []
4. Generate 2-3 quiz questions to test understanding
5. Extract any formulas, definitions, and key terms
"""

# Confirm that the prompt constant has been defined
print("üìù System prompt configured for structured JSON output")

In [None]:
# ============================================================================
# Cell 7 — Agent Setup with RAG + Memory
# ============================================================================

from langchain_ollama import ChatOllama
from langchain.agents import initialize_agent, Tool, AgentType
from langchain.memory import ConversationBufferMemory

def retrieve_context(query: str) -> str:
    """
    Look up the chunks most relevant to a query in the vector store.

    Args:
        query: User's question or search query

    Returns:
        Text of the matching documents, joined with a "---" divider
    """
    # Similarity search through the module-level retriever
    matches = retriever.invoke(query)

    # Keep only the text of each hit and glue the passages together
    passages = [match.page_content for match in matches]
    divider = "\n\n---\n\n"
    return divider.join(passages)

# Create the PDF search tool
pdf_search_tool = Tool(
    name="PDFSearch",
    func=retrieve_context,
    description="Search technical PDF documents for microcontroller concepts, circuits, and specifications"
)

# Initialize conversation memory for multi-turn chat.
# return_messages=True hands the history over as message objects, which is
# the form the chat-style conversational agent below consumes.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

# Initialize the LLM (DeepSeek R1 via Ollama)
llm = ChatOllama(
    model=OLLAMA_LLM_MODEL,
    temperature=0.3  # Lower for more focused/consistent output
)

# Create the conversational agent.
# "system_message" is an option of the CHAT conversational agent; the plain
# CONVERSATIONAL_REACT_DESCRIPTION agent is configured through prefix/suffix
# instead, so SYSTEM_PROMPT never reached the model with that agent type.
# The system message is compiled as a prompt template, so the literal braces
# of the JSON example are doubled to keep them from being read as template
# variables.
# NOTE(review): this agent type asks the model to wrap its reply in its own
# action/action_input envelope - confirm that the JSON-only rules in
# SYSTEM_PROMPT coexist with that format for the chosen model.
agent = initialize_agent(
    tools=[pdf_search_tool],
    llm=llm,
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    memory=memory,
    verbose=True,
    handle_parsing_errors=True,
    agent_kwargs={
        "system_message": SYSTEM_PROMPT.replace("{", "{{").replace("}", "}}")
    }
)

print("ü§ñ Agent initialized!")
print(f"   Model: {OLLAMA_LLM_MODEL}")
print("   Tools: PDFSearch")
print("   Memory: ConversationBufferMemory")

In [None]:
# ============================================================================
# Cell 8 — Run Agent Query
# ============================================================================

import json

# Example query - change this to ask about any microcontroller topic
query = "Explain ADC in microcontrollers"

print(f"‚ùì Query: {query}")
print("="*60)

# Run the agent
response = agent.invoke({"input": query})

print("\n" + "="*60)
print("üì§ Agent Response:")
print("="*60)

# Try to parse and pretty-print the JSON part of the response
try:
    output = response["output"]

    if isinstance(output, (dict, list)):
        # The agent already returned structured data; no text parsing needed
        print(json.dumps(output, indent=2))
    else:
        text = str(output)

        # Reasoning models such as deepseek-r1 may emit a <think>...</think>
        # block before the answer; braces inside it would corrupt extraction
        if "</think>" in text:
            text = text.rpartition("</think>")[2]

        start = text.find("{")
        if start != -1:
            # raw_decode reads exactly one JSON value starting at `start`,
            # so trailing text after the object does not break parsing
            parsed, _ = json.JSONDecoder().raw_decode(text, start)
            print(json.dumps(parsed, indent=2))
        else:
            print(output)
except json.JSONDecodeError:
    # If JSON parsing fails, print raw response
    print(response["output"])
except Exception as e:
    print(f"Error: {e}")
    print(response)

In [None]:
# ============================================================================
# Cell 9 — Interactive Chat (Optional)
# ============================================================================

import json

def chat(query: str):
    """
    Send a query to the agent and print its response.

    The JSON part of the answer is pretty-printed when it can be parsed;
    otherwise the raw output is printed unchanged. Nothing is returned.

    Args:
        query: Your question about microcontrollers
    """
    print(f"\n‚ùì Query: {query}")
    print("-"*60)

    response = agent.invoke({"input": query})
    output = response["output"]

    parsed = None
    if isinstance(output, (dict, list)):
        # The agent already returned structured data; no text parsing needed
        parsed = output
    else:
        text = str(output)

        # Reasoning models such as deepseek-r1 may emit a <think>...</think>
        # block before the answer; braces inside it would corrupt extraction
        if "</think>" in text:
            text = text.rpartition("</think>")[2]

        start = text.find("{")
        if start != -1:
            try:
                # raw_decode reads exactly one JSON value starting at `start`,
                # so trailing text after the object does not break parsing
                parsed, _ = json.JSONDecoder().raw_decode(text, start)
            except json.JSONDecodeError:
                parsed = None

    if parsed is None:
        print(output)
        return

    try:
        print(json.dumps(parsed, indent=2))
    except (TypeError, ValueError):
        # Structured output holding values json cannot serialize
        print(output)

# Sample calls (uncomment one to try it):
# chat("What are the different types of memory in microcontrollers?")
# chat("Explain PWM and its applications")
# chat("How does I2C communication work?")

# Tell the notebook user how to start asking questions
_usage_hint = "üí° Use chat('your question') to interact with the agent"
print(_usage_hint)