In [1]:
import base64
import os
from mistralai import Mistral

def encode_pdf(pdf_path):
    """Encode the pdf to base64."""
    try:
        with open(pdf_path, "rb") as pdf_file:
            return base64.b64encode(pdf_file.read()).decode('utf-8')
    except FileNotFoundError:
        print(f"Error: The file {pdf_path} was not found.")
        return None
    except Exception as e:  # Added general exception handling
        print(f"Error: {e}")
        return None

# Path to your pdf
pdf_path = "tests/data/test1.pdf"

# Getting the base64 string
base64_pdf = encode_pdf(pdf_path)


api_key = os.environ["MISTRAL_API_KEY"]
client = Mistral(api_key=api_key)



In [2]:
from pydantic import BaseModel, Field
from typing import Optional

class Bill(BaseModel):
    previous_date: str = Field(
        ..., description="The previous date of the bill reading e.g. 2022-06-30"
    )
    current_date: str = Field(
        ..., description="The current date of the bill e.g. 03-01-2023  "
    )
    consumption: float = Field(
        ..., description="The total consumption of water in m3, e.g. 2268.89 "
    )
    total_bill: float = Field(
        ..., description="The total current bill of water, e.g. $10,475.69 "
    )
    sewage: Optional[float] = Field(
        None, description="The sewage amount if available"
    )

In [15]:
from mistralai.extra import response_format_from_pydantic_model

results = []
for i in range(14):
    base64_pdf = encode_pdf(f"tests/data/test{i+1}.pdf")
    ocr_response = client.ocr.process(
        model="mistral-ocr-latest",
        pages=list(range(4)),
        document={
            "type": "document_url",
            "document_url": f"data:application/pdf;base64,{base64_pdf}" 
        },
        bbox_annotation_format=response_format_from_pydantic_model(Bill),
        document_annotation_format=response_format_from_pydantic_model(Bill),

    )
    results.append(ocr_response.document_annotation)

In [16]:
from pprint import pprint

pprint(results)


['{\n'
 '  "previous_date": "2022-06-30",\n'
 '  "current_date": "2022-07-31",\n'
 '  "consumption": 1969.94,\n'
 '  "total_bill": 8830.65,\n'
 '  "sewage": null\n'
 '}',
 '{\n'
 '  "previous_date": "07/31/2022",\n'
 '  "current_date": "08/31/2022",\n'
 '  "consumption": 2354.46,\n'
 '  "total_bill": 10554.34,\n'
 '  "sewage": null\n'
 '}',
 '{\n'
 '  "previous_date": "08/31/2022",\n'
 '  "current_date": "09/30/2022",\n'
 '  "consumption": 2589.86,\n'
 '  "total_bill": 11609.56,\n'
 '  "sewage": null\n'
 '}',
 '{\n'
 '  "previous_date": "09/30/2022",\n'
 '  "current_date": "10/31/2022",\n'
 '  "consumption": 2241.52,\n'
 '  "total_bill": 10.04807,\n'
 '  "sewage": null\n'
 '}',
 '{\n'
 '  "previous_date": "2022-10-31",\n'
 '  "current_date": "2022-11-30",\n'
 '  "consumption": 2046.21,\n'
 '  "total_bill": 9172.54,\n'
 '  "sewage": null\n'
 '}',
 '{\n'
 '  "previous_date": "11/30/2022",\n'
 '  "current_date": "12/31/2022",\n'
 '  "consumption": 2237.92,\n'
 '  "total_bill": 10031.93,\n