In [None]:
from unstract.llmwhisperer.client import LLMWhispererClient
from langchain_openai import AzureChatOpenAI
from typing import Dict, List, TypedDict, Any, Optional
from pydantic import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_community.callbacks import get_openai_callback
from langchain_core.output_parsers import PydanticOutputParser, JsonOutputParser
from langchain.output_parsers import OutputFixingParser

In [None]:
# Initialize LLM Whisperer client
# Both values below are placeholders and must be replaced before running.
client = LLMWhispererClient(
    api_key="YOUR_LLM_WHISPERER_API_KEY",
    base_url="YOUR_BASE_URL",
)

In [None]:
# Process PDF document
# Send the PDF at `file_path` to the whisper endpoint using the "ocr"
# processing mode and the "line-printer" output mode.
whisper = client.whisper(
    file_path=r"PATH_TO_YOUR_PDF_FILE",
    processing_mode="ocr",
    output_mode="line-printer",
    # Real bool, not the string "false": a non-empty string is truthy in
    # Python, so any truthiness check would read "false" as enabled.
    force_text_processing=False,
    # Numeric tolerance rather than its string form "0.5".
    line_splitter_tolerance=0.5,
)

In [None]:
# Extract raw text
# Raise an actionable KeyError (same exception type as a plain lookup) when
# the response has no "extracted_text" entry, listing what was returned.
# NOTE(review): presumably this happens when processing has not finished
# within the client timeout -- confirm against the LLMWhisperer API docs.
if "extracted_text" not in whisper:
    raise KeyError(
        "'extracted_text' missing from whisper response; "
        f"keys present: {list(whisper)}"
    )
raw_text = whisper["extracted_text"]
print("PDF text extracted successfully!")

In [None]:
# Define data models for extraction
class ProductLineItems(BaseModel):
    """Information about product line items."""
    # Every field defaults to None. Under Pydantic v2 an `Optional[...]`
    # annotation only permits None as a value; without an explicit default
    # the field is still required, so an item missing any one attribute
    # would fail validation.
    currency: Optional[str] = Field(default=None, description="Product currency")
    quantity: Optional[int] = Field(default=None, description="Product quantity")
    unit_price: Optional[float] = Field(default=None, description="Unit price")
    product_description: Optional[str] = Field(default=None, description="Product description")
    spell_corrected_product_description: Optional[str] = Field(default=None, description="Corrected description")

class Header(BaseModel):
    """Information about document header fields."""
    # Every field defaults to None. Under Pydantic v2 an `Optional[...]`
    # annotation only permits None as a value; without an explicit default
    # the field is still required, so a header missing any one attribute
    # would fail validation.
    purchase_order: Optional[str] = Field(default=None, description="Purchase order number")
    invoice_order: Optional[str] = Field(default=None, description="Invoice number")
    ship_to: Optional[str] = Field(default=None, description="Shipping address")
    bill_to: Optional[str] = Field(default=None, description="Billing address")
    vendor: Optional[str] = Field(default=None, description="Vendor information")

class Data(BaseModel):
    """Complete extraction data model."""
    # Top-level container combining both record types defined in this file.
    # Neither field declares a default, so both lists must be supplied.
    # List of ProductLineItems records.
    queries: List[ProductLineItems]
    # List of Header records.
    headers: List[Header]

In [None]:
# Configure Azure OpenAI
# All string values are placeholders and must be replaced before running.
llm = AzureChatOpenAI(
    # Generation settings.
    temperature=0,
    max_tokens=4096,
    # Connection / deployment settings.
    api_key="YOUR_AZURE_OPENAI_API_KEY",
    api_version="YOUR_API_VERSION",
    azure_endpoint="YOUR_AZURE_ENDPOINT",
    deployment_name="YOUR_DEPLOYMENT_NAME",
)

In [None]:
# Create extraction prompt
# Two-message chat template: a fixed system instruction, then a human
# message rendered from the `text` template variable.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. "
            "Extract product line items and headers from the document.",
        ),
        ("human", "{text}"),
    ]
)

In [None]:
# Example extraction (replace with actual implementation)
# Print the readiness banner with one call; the emitted text is unchanged.
print(
    "\n".join(
        (
            "PDF Extraction System Ready!",
            "Configure your API keys and PDF path to run extraction",
            "Sample output structure:",
            "- Product line items with quantities, prices, descriptions",
            "- Document headers with PO numbers, addresses, vendor info",
            "- Token usage tracking and cost analysis",
        )
    )
)