In [21]:
# Requires the Mistral SDK. If it is missing, install it into this kernel with: %pip install mistralai



## Mistral OCR API

In [66]:
from mistralai import Mistral
from mistralai.models import OCRResponse
from IPython.display import Markdown, display
from dotenv import load_dotenv
import json
import os

In [23]:
# Load variables from a local .env file into the process environment.
# The trailing semicolon suppresses the echoed return value; assigning to `_`
# instead would overwrite the name IPython uses for the last cell output.
load_dotenv();

In [24]:
# Read the Mistral API key from the environment; it is None when the variable is unset.
api_key = os.environ.get("MISTRAL_API_KEY")
# Report only whether the key is present. Ending the cell with a bare `api_key`
# would store the secret in the saved notebook output.
"MISTRAL_API_KEY loaded" if api_key else "MISTRAL_API_KEY is not set"

'<redacted>'

In [84]:
# Create the API client with the key read from the environment.
client = Mistral(api_key=api_key)

# Upload the PDF for OCR. The context manager closes the file handle as soon as
# the upload call returns, instead of leaving it open for the kernel's lifetime.
pdf_filename = "screenshot_text_and_image.pdf"
with open(pdf_filename, "rb") as pdf_file:
    uploaded_pdf = client.files.upload(
        file={
            "file_name": pdf_filename,
            "content": pdf_file,
        },
        purpose="ocr",
    )

In [85]:
# Ask the files API for a signed URL pointing at the PDF uploaded above
signed_url = client.files.get_signed_url(
    file_id=uploaded_pdf.id,
)

In [86]:
# Run OCR on the uploaded PDF, referenced through its signed URL
ocr_response = client.ocr.process(
    model="mistral-ocr-latest",
    document={"type": "document_url", "document_url": signed_url.url},
    include_image_base64=True,
)

In [87]:
# Serialize the response to JSON and parse it back, yielding plain Python
# containers that are easy to inspect in the notebook.
response_dict = json.loads(
    ocr_response.model_dump_json()
)
response_dict

{'pages': [{'index': 0,
   'markdown': '# Sample our Education Journals \n\n>> Sign in here to start your access to the latest two volumes for 14 days',
   'images': [],
   'dimensions': {'dpi': 200, 'height': 1653, 'width': 2339}}],
 'model': 'mistral-ocr-2503-completion',
 'usage_info': {'pages_processed': 1, 'doc_size_bytes': 60557}}

In [83]:
def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """
    Replace image placeholders in markdown with base64-encoded images.

    A placeholder has the form ``![<id>](<id>)``; its link target is swapped
    for the corresponding base64 string.

    Args:
        markdown_str: Markdown text containing image placeholders
        images_dict: Dictionary mapping image IDs to base64 strings. Entries
            whose value is None or empty are skipped.

    Returns:
        Markdown text with images replaced by base64 data. Placeholders that
        have no available data are left unchanged.
    """
    for img_name, base64_str in images_dict.items():
        if not base64_str:
            # No payload for this image: keep the placeholder rather than
            # emitting a broken "![id](None)" link.
            continue
        markdown_str = markdown_str.replace(
            f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
        )
    return markdown_str

def get_combined_markdown(ocr_response: OCRResponse) -> str:
    """
    Build one markdown document from every page of an OCR response.

    Args:
        ocr_response: OCR result whose pages each carry markdown text and a
            list of images (each with an id and base64 data)

    Returns:
        The per-page markdown, with image placeholders filled in, joined by
        blank lines
    """
    rendered_pages: list[str] = []
    for page in ocr_response.pages:
        # Map each image id on this page to its base64 payload
        base64_by_id = {img.id: img.image_base64 for img in page.images}
        # Substitute the payloads into this page's markdown
        page_markdown = replace_images_in_markdown(page.markdown, base64_by_id)
        rendered_pages.append(page_markdown)

    return "\n\n".join(rendered_pages)

# Render the merged OCR markdown (text with images inlined) as rich output
display(
    Markdown(get_combined_markdown(ocr_response))
)

# Sample our Education Journals 

>> Sign in here to start your access to the latest two volumes for 14 days