In [None]:
import os

# Prefer the MISTRAL_API_KEY environment variable so a real secret never has
# to be saved inside the notebook; the literal below is only a placeholder
# fallback and is not a working key.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "your_api_key_here")

In [2]:
from mistralai import Mistral
from dotenv import load_dotenv
import datauri
import os
from tqdm import tqdm

In [6]:
def save_image_to_dir(image, output_dir):
    """Decode one OCR image from its data URI and write it into ``output_dir``.

    Args:
        image: Object exposing ``image_base64`` (a data URI string, parsed
            with ``datauri.parse``) and ``id`` (used as the output file name).
        output_dir: Directory the image file is written to; it must already
            exist.

    Returns:
        The path of the file that was written.

    Raises:
        ValueError: If ``image.id`` has no usable file-name component.
    """
    parsed = datauri.parse(image.image_base64)

    # The id comes from a remote API response, so keep only its final path
    # component: a value such as "../x.png" must not escape output_dir.
    file_name = os.path.basename(image.id)
    if not file_name:
        raise ValueError(f"Image id {image.id!r} does not contain a file name")

    image_path = os.path.join(output_dir, file_name)
    with open(image_path, "wb") as f:
        f.write(parsed.data)

    return image_path

def process_pdfs(input_dir: str, output_dir: str, client):
    """Run Mistral OCR over every not-yet-processed PDF in ``input_dir``.

    A PDF counts as already processed when ``output_dir`` contains a
    sub-directory named after the PDF's base name (file name without its
    extension, surrounding whitespace stripped). For every other PDF:

    - the file is uploaded with purpose ``"ocr"``;
    - a signed URL for the upload is passed to ``client.ocr.process``;
    - each page's markdown is written to ``<output_dir>/<base>/<base>.md``,
      followed by a ``<!-- Page N End -->`` marker;
    - each page image is saved next to the markdown via ``save_image_to_dir``;
    - the uploaded file is deleted again, whether or not OCR succeeded.

    The per-PDF output directory is created only after the OCR call returns,
    so a failed request does not leave behind an empty directory that would
    make the PDF look processed on the next run.

    Args:
        input_dir: Directory that is scanned (non-recursively) for ``.pdf`` files.
        output_dir: Directory receiving one sub-directory per PDF; created if
            it does not exist.
        client: Mistral client exposing ``files.upload``,
            ``files.get_signed_url``, ``files.delete`` and ``ocr.process``.

    Raises:
        Any exception raised by the client or by file I/O propagates to the
        caller; PDFs after the failing one are not processed.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Base names that already have an output sub-directory.
    processed = {
        name for name in os.listdir(output_dir)
        if os.path.isdir(os.path.join(output_dir, name))
    }

    # Collect (file name, base name) pairs that still need OCR. The base name
    # is derived with os.path.splitext in exactly one place, so a file such as
    # "report.v2.pdf" is matched against "report.v2" and never against "report".
    pending = []
    for file_name in sorted(os.listdir(input_dir)):
        if not file_name.lower().endswith(".pdf"):
            continue
        if not os.path.isfile(os.path.join(input_dir, file_name)):
            continue  # e.g. a directory whose name happens to end in ".pdf"
        base_name = os.path.splitext(file_name)[0].strip()
        if base_name in processed:
            continue
        pending.append((file_name, base_name))

    for pdf, base_name in tqdm(pending):
        input_path = os.path.join(input_dir, pdf)
        pdf_output_dir = os.path.join(output_dir, base_name)

        # Upload inside a context manager so the local handle is closed as
        # soon as the upload call returns.
        with open(input_path, "rb") as pdf_handle:
            uploaded = client.files.upload(
                file={"file_name": pdf, "content": pdf_handle},
                purpose="ocr"
            )

        try:
            signed_url = client.files.get_signed_url(file_id=uploaded.id)

            ocr_response = client.ocr.process(
                model="mistral-ocr-latest",
                document={"type": "document_url", "document_url": signed_url.url},
                include_image_base64=True
            )

            # Only now is there something to save, so create the directory
            # that marks this PDF as processed.
            os.makedirs(pdf_output_dir, exist_ok=True)

            md_path = os.path.join(pdf_output_dir, f"{base_name}.md")
            with open(md_path, "w", encoding="utf-8") as md_file:
                for idx, page in enumerate(ocr_response.pages):
                    md_file.write(page.markdown)
                    md_file.write(f"\n\n<!-- Page {idx + 1} End -->\n\n")

                    for image in page.images:
                        save_image_to_dir(image, pdf_output_dir)

        finally:
            # Always remove the uploaded copy from the remote file store.
            client.files.delete(file_id=uploaded.id)


In [None]:
api_key = MISTRAL_API_KEY
client = Mistral(api_key=api_key)

# Build the paths with os.path.join so they resolve on every OS; a literal
# backslash is only a path separator on Windows.
input_dir = os.path.join("data", "resources_raw")  # Directory containing input PDFs
output_dir = os.path.join("data", "resources_md")  # Directory that will contain output markdown + images

# process_pdfs creates output_dir when needed and skips PDFs that already have
# an output sub-directory, so no pre-scan is required in this cell.
process_pdfs(
    input_dir=input_dir,
    output_dir=output_dir,
    client=client
)

100%|██████████| 1/1 [00:02<00:00,  2.73s/it]
