Exercise 01: First Workflow - Starter Code
==========================================
Build a document processing workflow with conditional routing.

LEARNING GOALS:
- Import and use StateGraph, START, END
- Add nodes and edges to build a workflow
- Implement conditional routing

In [26]:
import os
from dotenv import load_dotenv
from typing import TypedDict, Literal
from langgraph.graph import StateGraph, START, END
from langchain.chat_models import init_chat_model

load_dotenv()

True

In [27]:

# =============================================================================
# TODO 1: Define the State
# =============================================================================
# Create a TypedDict with fields for:
# - document_text: str (the input document)
# - document_type: str (invoice, receipt, contract, unknown)
# - extracted_data: dict (extracted information)
# - formatted_output: str (final output)
#
# EXPERIMENT: Add a confidence_score field
# =============================================================================

class DocumentState(TypedDict):
    """State for document processing workflow.

    A single dict of this shape is passed to every node function in this
    file; each node returns only the keys it wants to update.
    """
    # Raw input text; interpolated into the classification and extraction prompts.
    document_text: str
    # Label written by classify_document and read by route_by_document_type
    # and format_output.
    document_type: Literal["invoice", "receipt", "contract", "unknown"]
    # Fields parsed from the LLM reply by the extract_* nodes; format_output
    # iterates over its items.
    extracted_data: dict
    # Human-readable summary written by format_output.
    formatted_output: str
    # Added for the "EXPERIMENT" in TODO 1. No node visible in this file reads
    # or writes it; callers only seed it in the initial state.
    confidence_score: float

In [28]:

# =============================================================================
# TODO 2: Implement Node Functions
# =============================================================================
# Each node function:
# - Takes state as parameter
# - Returns a dict with fields to update
# - Only returns the fields that change (not the whole state)
#
# classify_document:
# - Use LLM to classify as invoice/receipt/contract/unknown
# - Return {"document_type": "..."}
#
# extract_invoice_data, extract_receipt_data, extract_contract_data:
# - Use LLM to extract relevant fields
# - Return {"extracted_data": {...}}
#
# format_output:
# - Create a formatted string from the data
# - Return {"formatted_output": "..."}
#
# EXPERIMENT: What happens if a node returns nothing?
# =============================================================================

import json
import re

def parse_json_response(response_text: str) -> dict:
    """Extract a JSON object from an LLM response, handling markdown code blocks.

    The lookup order is:

    1. the first ``{...}`` found inside a fenced block (three backticks,
       optionally tagged ``json``),
    2. otherwise the widest ``{...}`` span anywhere in the text,
    3. otherwise the stripped text itself.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The decoded JSON object, or an empty dict when nothing decodable is
        found or when the decoded value is not a JSON object (for example a
        bare number, string, list or ``null``).
    """
    text = response_text.strip()

    fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
    if fenced:
        text = fenced.group(1)
    else:
        # Greedy on purpose: spans from the first "{" to the last "}" so that
        # nested objects are kept whole.
        braced = re.search(r'(\{.*\})', text, re.DOTALL)
        if braced:
            text = braced.group(1)

    try:
        parsed = json.loads(text)
    except (json.JSONDecodeError, RecursionError):
        # Best-effort parser: malformed (or absurdly nested) JSON yields an
        # empty result rather than an exception.
        return {}

    # Callers treat the result as a mapping (e.g. ``.items()``), so never hand
    # back a scalar or list even if the reply was technically valid JSON.
    return parsed if isinstance(parsed, dict) else {}

def classify_document(state: DocumentState) -> dict:
    """Classify the document type using an LLM.

    Sends ``state['document_text']`` to the chat model and normalises the
    reply to one of the four supported labels.

    Args:
        state: Current workflow state; only ``document_text`` is read.

    Returns:
        ``{"document_type": label}`` where ``label`` is ``"invoice"``,
        ``"receipt"``, ``"contract"`` or ``"unknown"``. Any reply that does
        not reduce to one of those words becomes ``"unknown"``.
    """
    import string

    valid_types = ("invoice", "receipt", "contract", "unknown")

    model = init_chat_model("gpt-4o-mini", model_provider="openai")

    # The prompt text (including the whitespace-only second line) is kept
    # exactly as it was; only its spelling as adjacent literals is new.
    prompt = (
        "Classify the following document as one of: invoice, receipt, contract, or unknown.\n"
        "    \n"
        "Document text:\n"
        f"{state['document_text']}\n"
        "\n"
        "Respond with only one word: invoice, receipt, contract, or unknown."
    )

    response = model.invoke(prompt)

    # Models often decorate a one-word answer ("Invoice.", '"receipt"',
    # "**contract**"). Strip surrounding whitespace and punctuation so those
    # replies are not misreported as "unknown".
    # NOTE(review): assumes response.content is a str — confirm for providers
    # that return structured content.
    document_type = response.content.strip(
        string.whitespace + string.punctuation
    ).lower()

    if document_type not in valid_types:
        document_type = "unknown"

    return {"document_type": document_type}


def extract_invoice_data(state: DocumentState) -> dict:
    """Ask the LLM for the invoice fields: amount, date and vendor.

    Reads ``state['document_text']``, prompts the chat model for a JSON
    object and decodes the reply with ``parse_json_response``.

    Returns:
        ``{"extracted_data": fields}``. When the decoded reply is empty,
        ``fields`` is a placeholder dict with ``amount``, ``date`` and
        ``vendor`` all set to ``None``.
    """
    llm = init_chat_model("gpt-4o-mini", model_provider="openai")

    request = (
        "Extract invoice information from the following document. "
        "Return a JSON object with keys: amount, date, vendor.\n"
        "\n"
        "Document text:\n"
        f"{state['document_text']}\n"
        "\n"
        "Return only valid JSON."
    )

    reply = llm.invoke(request)

    # An empty parse result falls back to the all-None placeholder.
    fields = parse_json_response(reply.content) or {
        "amount": None,
        "date": None,
        "vendor": None,
    }
    return {"extracted_data": fields}


def extract_receipt_data(state: DocumentState) -> dict:
    """Ask the LLM for the receipt fields: total, store and items.

    Reads ``state['document_text']``, prompts the chat model for a JSON
    object and decodes the reply with ``parse_json_response``.

    Returns:
        ``{"extracted_data": fields}``. When the decoded reply is empty,
        ``fields`` is a placeholder dict with ``total`` and ``store`` set to
        ``None`` and ``items`` set to an empty list.
    """
    llm = init_chat_model("gpt-4o-mini", model_provider="openai")

    request = (
        "Extract receipt information from the following document. "
        "Return a JSON object with keys: total, store, items (as a list).\n"
        "\n"
        "Document text:\n"
        f"{state['document_text']}\n"
        "\n"
        "Return only valid JSON."
    )

    reply = llm.invoke(request)

    # An empty parse result falls back to the placeholder record.
    fields = parse_json_response(reply.content) or {
        "total": None,
        "store": None,
        "items": [],
    }
    return {"extracted_data": fields}


def extract_contract_data(state: DocumentState) -> dict:
    """Ask the LLM for the contract fields: parties, terms and date.

    Reads ``state['document_text']``, prompts the chat model for a JSON
    object and decodes the reply with ``parse_json_response``.

    Returns:
        ``{"extracted_data": fields}``. When the decoded reply is empty,
        ``fields`` is a placeholder dict with ``parties`` set to an empty
        list and ``terms`` and ``date`` set to ``None``.
    """
    llm = init_chat_model("gpt-4o-mini", model_provider="openai")

    request = (
        "Extract contract information from the following document. "
        "Return a JSON object with keys: parties (as a list), terms, date.\n"
        "\n"
        "Document text:\n"
        f"{state['document_text']}\n"
        "\n"
        "Return only valid JSON."
    )

    reply = llm.invoke(request)

    # An empty parse result falls back to the placeholder record.
    fields = parse_json_response(reply.content) or {
        "parties": [],
        "terms": None,
        "date": None,
    }
    return {"extracted_data": fields}


def format_output(state: DocumentState) -> dict:
    """Render the document type and extracted fields as plain text.

    The text starts with a ``Document Type:`` line and an ``Extracted Data:``
    heading, followed by one two-space-indented ``key: value`` line per item
    of ``state['extracted_data']``. Every line ends with a newline.

    Returns:
        ``{"formatted_output": text}``.
    """
    pieces = [
        f"Document Type: {state['document_type']}\n",
        "Extracted Data:\n",
    ]
    pieces.extend(
        f"  {field}: {content}\n"
        for field, content in state['extracted_data'].items()
    )
    return {"formatted_output": "".join(pieces)}


In [29]:
# =============================================================================
# TODO 3: Implement Routing Function
# =============================================================================
# The routing function:
# - Takes state as parameter
# - Returns a STRING with the name of the next node
# - Called by add_conditional_edges()
#
# EXPERIMENT: What happens if you return an invalid node name?
# =============================================================================

def route_by_document_type(state: DocumentState) -> str:
    """Return the routing key for the conditional edge.

    The key is simply the current ``state['document_type']``; translating it
    to a node name is left to the mapping given to ``add_conditional_edges``.
    """
    return state['document_type']

In [30]:
# =============================================================================
# TODO 4: Build the Graph
# =============================================================================
# Steps:
# 1. Create StateGraph with your state class
# 2. Add nodes with add_node(name, function)
# 3. Add edge from START to classify
# 4. Add conditional edges from classify
# 5. Add edges from extraction nodes to format
# 6. Add edge from format to END
# 7. Compile the graph
#
# EXPERIMENT: Add a "validate" node between extraction and formatting
# =============================================================================

workflow = StateGraph(DocumentState)

# Register the nodes: one classifier, three extractors, one formatter.
for _node_name, _node_fn in (
    ("classify", classify_document),
    ("extract_invoice", extract_invoice_data),
    ("extract_receipt", extract_receipt_data),
    ("extract_contract", extract_contract_data),
    ("format", format_output),
):
    workflow.add_node(_node_name, _node_fn)

# Every run starts with classification.
workflow.add_edge(START, "classify")

# route_by_document_type returns the document type; the mapping below turns
# that key into the extraction node to run next.
# NOTE(review): "unknown" documents are sent to the receipt extractor —
# confirm this fallback is intended.
workflow.add_conditional_edges(
    "classify",
    route_by_document_type,
    {
        "invoice": "extract_invoice",
        "receipt": "extract_receipt",
        "contract": "extract_contract",
        "unknown": "extract_receipt",
    },
)

# All extraction branches converge on the formatter, which ends the run.
for _extractor in ("extract_invoice", "extract_receipt", "extract_contract"):
    workflow.add_edge(_extractor, "format")

workflow.add_edge("format", END)

agent = workflow.compile()



In [31]:
# =============================================================================
# Testing
# =============================================================================
if __name__ == "__main__":
    # Show the sample documents to try once the TODOs are complete; the
    # doubled backslashes print a literal "\n" inside the quoted samples.
    print(
        "Test documents after completing TODOs:",
        "\nInvoice: 'Invoice #12345\\nAmount: $500'",
        "Receipt: 'Store: Coffee Shop\\nTotal: $8'",
        "Contract: 'Agreement between A and B dated Jan 1'",
        sep="\n",
    )

Test documents after completing TODOs:

Invoice: 'Invoice #12345\nAmount: $500'
Receipt: 'Store: Coffee Shop\nTotal: $8'
Contract: 'Agreement between A and B dated Jan 1'


In [32]:
# Test the workflow with sample documents
# Run the compiled workflow on one sample document of each kind.
# Every initial state starts as "unknown" with empty results; the graph's
# nodes fill in document_type, extracted_data and formatted_output.
test_invoice = dict(
    document_text="Invoice #12345\nAmount: $500",
    document_type="unknown",
    extracted_data={},
    formatted_output="",
    confidence_score=0.0,
)

test_receipt = dict(
    document_text="Store: Coffee Shop\nTotal: $8",
    document_type="unknown",
    extracted_data={},
    formatted_output="",
    confidence_score=0.0,
)

test_contract = dict(
    document_text="Agreement between A and B dated Jan 1",
    document_type="unknown",
    extracted_data={},
    formatted_output="",
    confidence_score=0.0,
)

print("Testing Invoice:")
result_invoice = agent.invoke(test_invoice)
# The formatted text and the separator rule are printed on consecutive lines.
print(result_invoice["formatted_output"], "\n" + "="*50 + "\n", sep="\n")

print("Testing Receipt:")
result_receipt = agent.invoke(test_receipt)
print(result_receipt["formatted_output"], "\n" + "="*50 + "\n", sep="\n")

print("Testing Contract:")
result_contract = agent.invoke(test_contract)
print(result_contract["formatted_output"])

Testing Invoice:
Document Type: invoice
Extracted Data:
  amount: $500
  date: None
  vendor: None



Testing Receipt:
Document Type: receipt
Extracted Data:
  total: $8
  store: Coffee Shop
  items: []



Testing Contract:
Document Type: contract
Extracted Data:
  parties: ['A', 'B']
  terms: 
  date: Jan 1

