In [2]:
from unstract.llmwhisperer.client import LLMWhispererClient

from langchain_openai import AzureChatOpenAI
from typing import Dict, List, TypedDict, Any, Optional
from pydantic import BaseModel, Field

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

from langchain_community.callbacks import get_openai_callback

from langchain_core.output_parsers import PydanticOutputParser, JsonOutputParser

from langchain.output_parsers import OutputFixingParser

In [ ]:
# LLMWhisperer client whose `whisper` method is called on the PDF in the next cell.
# Both arguments are placeholder values.
client = LLMWhispererClient(
    base_url="YOUR_BASE_URL",
    api_key="YOUR_LLM_WHISPERER_API_KEY",
)

In [ ]:
 # Run the PDF at the placeholder path through the client's `whisper` call.
 # The response is indexed with "extracted_text" in the following cell.
 # NOTE(review): `force_text_processing` and `line_splitter_tolerance` are passed
 # as strings here - confirm the client accepts string values for them.
 whisper = client.whisper(
     file_path=r"PATH_TO_YOUR_PDF_FILE",
     processing_mode="ocr",
     output_mode="line-printer",
     force_text_processing="false",
     line_splitter_tolerance="0.5",
 )

In [18]:
# Pull the value stored under the "extracted_text" key of the whisper response.
raw_text = whisper["extracted_text"]

In [14]:
class ProductLineItems(BaseModel):
    """Information about product line items."""

    # Every field is declared with `Field(...)`, i.e. the key must be present in
    # the parsed output, but its value may be null (`Optional[...]`).
    # Each `description` is the extraction instruction for that field.

    # Currency of the line item as a string.
    currency: Optional[str] = Field(
        ...,
        description="Extract the currency of the product for the product line item. The currency should be represented as a string, if no currency is mentioned, return an empty string.",
    )

    # Quantity as an integer. A missing value must be null: an empty string
    # cannot be validated as an int and would contradict the system prompt,
    # which asks for null on unknown attributes.
    quantity: Optional[int] = Field(
        ...,
        description="Extract the quantity of the product for the product line item. Convert alphabetical numbers to numerical values for ease of processing. If not found, return null.",
    )

    # Unit price as a float, without the currency symbol. Missing values must be
    # null for the same reason as `quantity`.
    unit_price: Optional[float] = Field(
        ...,
        description="Extract the unit price of the product for the product line item. The unit price should be represented as a number, excluding the currency symbol. If not found, return null.",
    )

    # Product description including name and attributes, comma-separated.
    product_description: Optional[str] = Field(
        ...,
        description="Extract the product description for the product line item. The product description should include the product name, product description and attributes (size, colour, material, type, length, Type Designation, MNFG, Dimension, and other product attributes such as Height, Width, Voltage type etc.), include all these, separating them with a comma if there are multiple. If not found, return an empty string.",
    )

    # Cleaned-up variant of the product description (spelling, abbreviations, jargon).
    spell_corrected_product_description: Optional[str] = Field(
        ...,
        description="""Correct any spelling errors, common typing errors, abbreviations, and product-specific jargons in the extracted product details. Use the context of the email to determine the correct spelling. This field is to ensure the accuracy and readability of the product details.""",
    )

class Header(BaseModel):
    # Order-level (header) fields of a document. Every field is a nullable
    # string; its `description` is the extraction instruction for that field.

    # --- Identifiers ---
    purchase_order: Optional[str] = Field(
        description="Extract the purchase order number from the email. Look for labels such as 'order number' or 'PO number'. If not found, return an empty string.",
    )
    invoice_order: Optional[str] = Field(
        description="Extract the invoice number from the email. Look for labels such as 'invoice number'. If not found, return an empty string.",
    )

    # --- Addresses ---
    ship_to: Optional[str] = Field(
        description="Extract the 'ship to' address details from the email, which are specified for the customer's location. If 'ship to' is not found, do not default to any other address. Include the name of the location, address, or company, the address lines, city, state, country, zip code, and country code. If not found, return an empty string.",
    )
    bill_to: Optional[str] = Field(
        description="Extract the 'bill to' address details from the email. Prioritize 'bill to' information when present. If these are not found, use the company address, but do not consider the vendor's address. If no 'bill to' section is found, consider any address details found in the salutation following 'Thanks!', 'Best', 'Regards,', 'best regards,' or similar sign-offs as potential 'bill to' addresses. Include the name of the location, address, or company, the address lines, city, state, country, zip code, and country code. If not found, return an empty string.",
    )
    vendor: Optional[str] = Field(
        description="Extract the 'to' or the 'vendor' address details from the email. If the address is not specified as vendor address, exclude it. Include the name of the location, address, or company, the address lines, city, state, country, zip code, and country code.",
    )

    # --- Dates (requested as 'mm/dd/yyyy' strings) ---
    delivery_date: Optional[str] = Field(
        description="Identify and extract the requested delivery date from the email. Look for labels such as 'DATE REQUESTED' and ensure the format is 'mm/dd/yyyy'. Convert relative dates based on the document's date. If not found, return an empty string.",
    )
    invoice_date: Optional[str] = Field(
        description="Identify and extract the date when the order was invoiced from the text. Convert to 'mm/dd/yyyy' format if necessary and look for labels such as 'invoice date', 'order date'. Convert relative dates based on the document's date. If not found, return an empty string.",
    )

    # --- Free-text fields ---
    shipping_instructions: Optional[str] = Field(
        description="Identify and extract the shipping instructions from the email. Look for labels such as 'Shipping method', 'Delivery Instructions', 'ROUTING INSTRUCTIONS', or 'Special Instructions'. If not found, return an empty string.",
    )
    notes: Optional[str] = Field(
        description="Identify and extract any comments or notes related to the order from the email. Exclude irrelevant phrases or conversational messages that are not directly related to the order. If not found, return an empty string.",
    )


# Top-level extraction result: the container model handed to the Pydantic
# output parser in the chain cell.
# NOTE(review): the docstring below mentions "part number", but
# `ProductLineItems` declares no such field - confirm which one is intended.
class Data(BaseModel):
    """
       Extract data about products.
       Extract all line item information from the text.
       Information includes product details, part number, spell-corrected product description, quantity, unit price, and currency.
       The information should be extracted in the order it appears and should include all relevant details for each line item without mixing or omitting information.
       Provide the output in JSON format.
    """

    # Product line items found in the document.
    queries: List[ProductLineItems]
    # Header records (order-level fields) found in the document.
    headers: List[Header]


In [9]:
# Two-message chat prompt: a fixed system instruction followed by the document
# text, supplied at invoke time through the `text` variable.
#
# The system message is assembled by implicit string concatenation, so every
# fragment except the last ends with a space; without it adjacent sentences are
# glued together (e.g. "description.Extract").
# NOTE(review): the header names listed here (purchase_order_number,
# invoice_number, ...) differ from the `Header` field names (purchase_order,
# invoice_order, ...) - confirm whether they should match.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. "
            "Extract only the following fields for each product line item: "
            "quantity, unit price, currency, product description, spell corrected product description. "
            "Extract the headers: purchase_order_number, invoice_number, ship_to, bill_to, vendor address, delivery date, invoice date, shipping instructions, notes. "
            "There can be multiple headers, if found extract the headers and respective line items. "
            "If you do not know the value of an attribute asked to extract, "
            "return null for the attribute's value. "
            "Give the output in Json format.",
        ),
        ("human", "{text}"),
    ]
)

In [ ]:
# Azure OpenAI chat model; used both as the extraction step of the chain and as
# the model behind the output-fixing parser. All connection values are placeholders.
llm = AzureChatOpenAI(
    # Connection settings.
    azure_endpoint="YOUR_AZURE_ENDPOINT",
    api_key="YOUR_AZURE_OPENAI_API_KEY",
    api_version="YOUR_API_VERSION",
    deployment_name="YOUR_DEPLOYMENT_NAME",
    # Generation settings.
    temperature=0,
    top_p=1.0,
    max_tokens=4096,
    frequency_penalty=0,
    presence_penalty=0,
)

In [16]:
# Alias the extracted document text under the name passed to the chain's `invoke` call.
text = raw_text

In [15]:
# Extraction chain: prompt -> LLM -> parser.
# The parser validates the model reply against `Data`; when parsing fails it
# asks the same LLM to repair the output, retrying at most 2 times.
# `from_llm` is a classmethod, so it is called on the class itself rather than
# on a throwaway instance constructed with `retry_chain=None`.
# NOTE(review): the prompt defined earlier has no placeholder for the parser's
# format instructions, so the first model reply is produced without the schema -
# confirm that this is intended.
combine_extraction = (
    prompt
    | llm
    | OutputFixingParser.from_llm(
        parser=PydanticOutputParser(pydantic_object=Data),
        llm=llm,
        max_retries=2,
    )
)

# Run the chain inside the OpenAI callback so token usage and cost are recorded.
with get_openai_callback() as cb:
    res = combine_extraction.invoke({"text": text})
    print(res)
    print("---")
print()

# Usage totals accumulated by the callback while the chain ran.
print(f"Total Tokens: {cb.total_tokens}")
print(f"Prompt Tokens: {cb.prompt_tokens}")
print(f"Completion Tokens: {cb.completion_tokens}")
print(f"Total Cost (USD): ${cb.total_cost}")


queries=[ProductLineItems(currency='USD', quantity=1, unit_price=100.0, product_description='Front and rear brake cables', spell_corrected_product_description='Front and rear brake cables'), ProductLineItems(currency='USD', quantity=2, unit_price=15.0, product_description='New set of pedal arms', spell_corrected_product_description='New set of pedal arms'), ProductLineItems(currency='USD', quantity=3, unit_price=5.0, product_description='Labor 3hrs', spell_corrected_product_description='Labor 3hrs')] headers=[Header(purchase_order='US-001', invoice_order=None, ship_to='John Smith, 3787 Pineview Drive, Cambridge, MA 12210', bill_to=None, vendor='John Smith, 2 Court Square, New York, NY 12210', delivery_date=None, invoice_date='11/02/2019', shipping_instructions=None, notes='Payment is due within 15 days. Please make checks payable to: East Repair Inc.')]
---

Total Tokens: 3277
Prompt Tokens: 2673
Completion Tokens: 604
Total Cost (USD): $0.0127225
