In [2]:
import base64
import os
from mistralai import Mistral

def encode_pdf(pdf_path):
    """Return the contents of the file at ``pdf_path`` as a base64 text string.

    The file is read in binary mode and the base64 bytes are decoded as UTF-8.
    On failure a message is printed and ``None`` is returned instead of
    raising: a missing file gets a dedicated message, any other exception is
    reported with its own text.
    """
    try:
        with open(pdf_path, "rb") as handle:
            raw_bytes = handle.read()
        encoded = base64.b64encode(raw_bytes)
        return encoded.decode("utf-8")
    except FileNotFoundError:
        print(f"Error: The file {pdf_path} was not found.")
    except Exception as exc:  # any other read/encode problem is reported, not raised
        print(f"Error: {exc}")
    # Both failure paths fall through to the same "no result" value.
    return None



# Read the Mistral API key from the environment; indexing os.environ raises
# KeyError when MISTRAL_API_KEY is not set, so the notebook stops here rather
# than failing later with a less obvious error.
api_key = os.environ["MISTRAL_API_KEY"]
# Client object used by the later cells for the file upload and the batch job calls.
client = Mistral(api_key=api_key)



In [4]:
from pydantic import BaseModel, Field
from typing import Optional


class Date(BaseModel):
    # Calendar date expressed as three required integer parts (day, month, year).
    # Documented with comments rather than a class docstring on purpose:
    # NOTE(review): pydantic is understood to copy a class docstring into the
    # generated JSON schema, which would alter the schema built from `Bill`.
    # The ranges named in the descriptions are guidance text only; no numeric
    # bounds are enforced by these field definitions.
    day: int = Field(
        default=...,
        description="The number of day of the month of the date, can range from 1 to 31",
    )
    month: int = Field(
        default=...,
        description="The number of month of the year of the date, can range from 1 to 12",
    )
    year: int = Field(
        default=...,
        description="The year of the date",
    )


class Bill(BaseModel):
    # Fields to extract from one water bill. Every field is required except
    # `sewage`, which defaults to None. The description strings are part of the
    # schema produced by `Bill.model_json_schema()` and are kept verbatim.
    # Documented with comments rather than a class docstring on purpose:
    # NOTE(review): pydantic is understood to copy a class docstring into the
    # generated JSON schema, which would alter what is sent with each request.
    previous_date: Date = Field(
        default=...,
        description="The previous date of the bill reading e.g. 2022-06-30",
    )
    current_date: Date = Field(
        default=...,
        description="The current date of the bill e.g. 03-01-2023  ",
    )
    consumption: float = Field(
        default=...,
        description="The total consumption of water in m3, e.g. 2268.89 ",
    )
    total_bill: float = Field(
        default=...,
        description="The total current bill of water, e.g. $10,475.69 ",
    )
    sewage: Optional[float] = Field(
        default=None,
        description="The sewage amount if available",
    )
    bill_no: str = Field(
        default=...,
        description="The serial number of the bill",
    )



In [None]:

# Base64-encode the ten sample bills tests/data/test1.pdf .. test10.pdf, in order.
# encode_pdf returns None for a file it could not read, so the list always has
# ten entries and position k corresponds to test{k+1}.pdf.
encoded_bills = [
    encode_pdf(f"tests/data/test{bill_number}.pdf")
    for bill_number in range(1, 11)
]
   

In [32]:
import json
from mistralai.extra import response_format_from_pydantic_model

def create_batch_file(encode_bills, output_file):
    """Write one OCR batch request per encoded bill to a JSON Lines file.

    Args:
        encode_bills: Iterable of base64-encoded PDF strings. Empty or ``None``
            entries (``encode_pdf`` returns ``None`` when a file cannot be
            read) are skipped with a printed notice instead of being written
            as an unusable ``...;base64,None`` document URL.
        output_file: Path of the ``.jsonl`` file to create or overwrite.

    Each written line is a JSON object with a ``custom_id`` (the entry's
    position in ``encode_bills``, as a string, so ids still map back to the
    input even when some entries are skipped) and a ``body`` that carries the
    PDF as a data URL plus the ``Bill`` JSON schema as the annotation format.
    """
    # Built once: the schema is identical for every request line.
    document_annotation_format = {
        "type": "json_schema",
        "json_schema": {
            "name": "Bill",
            "description": "Water bill extraction schema",
            "schema": Bill.model_json_schema(),
            "strict": True
        }
    }
    with open(output_file, "w", encoding="utf-8") as file:
        for index, encode_bill in enumerate(encode_bills):
            if not encode_bill:
                # Nothing to send for this position; keep going with the rest.
                print(f"Skipping bill {index}: no encoded content.")
                continue
            entry = {
                "custom_id": str(index),
                "body": {
                    "document": {
                        "type": "document_url",
                        "document_url": f"data:application/pdf;base64,{encode_bill}",
                    },
                    "document_annotation_format": document_annotation_format
                },
            }
            file.write(json.dumps(entry) + "\n")


In [33]:
    # TODO(review): this cell's lines carry a stray 4-space indent; it is kept
    # as-is here, but dedenting would make the cell valid as plain Python too.
    # Path of the JSON Lines request file; the upload cell reads this same variable.
    batch_file = "batch_file.jsonl"
    # Write one request line per entry of encoded_bills into that file.
    create_batch_file(encoded_bills, batch_file)

In [34]:
# Upload the JSONL request file for batch processing. The returned object is
# kept in `batch_data`; the job-creation cell passes `batch_data.id` as input.
# The file is opened in a `with` block so the handle is closed once the upload
# call returns instead of staying open for the life of the kernel.
with open(batch_file, "rb") as batch_file_handle:
    batch_data = client.files.upload(
        file={
            "file_name": batch_file,
            "content": batch_file_handle,
        },
        purpose="batch",
    )

In [35]:
# Model identifier passed to the batch job below.
ocr_model = "mistral-ocr-latest"
# Create a batch job over the uploaded JSONL file, targeting the /v1/ocr endpoint.
# The job object is kept in `created_job`; the polling cell reads `created_job.id`.
# The output recorded under this cell shows the call being rejected with
# HTTP 402 on a free-trial account ("cannot launch batch jobs this big").
created_job = client.batch.jobs.create(
    input_files=[batch_data.id],
    model=ocr_model,
    endpoint="/v1/ocr",
    metadata={"job_type": "testing"}
)

SDKError: API error occurred: Status 402
{"detail": "You cannot launch batch jobs this big with your free trial. Reduce the number of steps in your configuration or subscribe via the console."}

In [None]:
import time
from IPython.display import clear_output

# Poll the batch job until it leaves the QUEUED/RUNNING states, redrawing a
# short progress summary on every pass.
# Fetch the job once before looping so `retrieved_job` exists on a fresh kernel
# (the loop condition used to read it before anything had assigned it).
retrieved_job = client.batch.jobs.get(job_id=created_job.id)

while True:
    completed_requests = retrieved_job.succeeded_requests + retrieved_job.failed_requests
    # A job with no requests counted yet would otherwise divide by zero.
    if retrieved_job.total_requests:
        # Scale to percent before rounding to avoid artefacts such as 33.329999999999995.
        percent_done = round(completed_requests / retrieved_job.total_requests * 100, 2)
    else:
        percent_done = 0.0

    clear_output(wait=True)  # Clear the previous output ( User Friendly )
    print(f"Status: {retrieved_job.status}")
    print(f"Total requests: {retrieved_job.total_requests}")
    print(f"Failed requests: {retrieved_job.failed_requests}")
    print(f"Successful requests: {retrieved_job.succeeded_requests}")
    print(f"Percent done: {percent_done}%")

    # Stop as soon as the job is no longer pending, without a trailing sleep.
    if retrieved_job.status not in ("QUEUED", "RUNNING"):
        break
    time.sleep(2)
    retrieved_job = client.batch.jobs.get(job_id=created_job.id)