In [1]:
import requests
import json
import base64
import os
from pathlib import Path
import time

In [6]:
MISTRAL_API_KEY = "wyUBTiyDJlUvrQuVZAo3Nd9L0vCz9fwD"  # Replace with your actual API key
MISTRAL_OCR_ENDPOINT = "https://api.mistral.ai/v1/chat/completions"

def encode_pdf_to_base64(pdf_path):
    """
    Encode a PDF file to base64 string for API upload
    
    Args:
        pdf_path (str): Path to the PDF file
        
    Returns:
        str: Base64 encoded PDF content
    """
    try:
        with open(pdf_path, "rb") as pdf_file:
            encoded_pdf = base64.b64encode(pdf_file.read()).decode('utf-8')
        print(f"✓ Successfully encoded PDF: {pdf_path}")
        return encoded_pdf
    except FileNotFoundError:
        print(f"✗ Error: PDF file not found at {pdf_path}")
        return None
    except Exception as e:
        print(f"✗ Error encoding PDF: {str(e)}")
        return None

In [3]:
def convert_pdf_to_markdown(pdf_path, output_path=None):
    """
    Convert PDF to Markdown using Mistral's OCR API
    
    Args:
        pdf_path (str): Path to the input PDF file
        output_path (str, optional): Path for output markdown file
        
    Returns:
        str: Converted markdown content or None if failed
    """
    
    # Encode PDF to base64
    encoded_pdf = encode_pdf_to_base64(pdf_path)
    if not encoded_pdf:
        return None
    
    # Prepare the API request
    headers = {
        "Authorization": f"Bearer {MISTRAL_API_KEY}",
        "Content-Type": "application/json"
    }
    
    # Create the payload for Mistral API
    payload = {
        "model": "pixtral-12b-2409",  # Mistral's vision model for OCR
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Please extract all text from this PDF document and convert it to clean, well-formatted Markdown. Preserve the document structure, headings, and formatting as much as possible."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:application/pdf;base64,{encoded_pdf}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 4000,
        "temperature": 0.1
    }
    
    try:
        print("🔄 Sending request to Mistral OCR API...")
        response = requests.post(MISTRAL_OCR_ENDPOINT, headers=headers, json=payload)
        
        if response.status_code == 200:
            result = response.json()
            markdown_content = result['choices'][0]['message']['content']
            
            print("✓ Successfully converted PDF to Markdown")
            
            # Save to file if output path is provided
            if output_path:
                try:
                    with open(output_path, 'w', encoding='utf-8') as f:
                        f.write(markdown_content)
                    print(f"✓ Markdown saved to: {output_path}")
                except Exception as e:
                    print(f"✗ Error saving file: {str(e)}")
            
            return markdown_content
            
        else:
            print(f"✗ API Error: {response.status_code}")
            print(f"Response: {response.text}")
            return None
            
    except requests.exceptions.RequestException as e:
        print(f"✗ Request Error: {str(e)}")
        return None
    except Exception as e:
        print(f"✗ Unexpected Error: {str(e)}")
        return None

In [4]:
def batch_convert_pdfs(input_directory, output_directory):
    """
    Convert multiple PDF files in a directory to Markdown
    
    Args:
        input_directory (str): Directory containing PDF files
        output_directory (str): Directory to save converted Markdown files
    """
    
    input_path = Path(input_directory)
    output_path = Path(output_directory)
    
    # Create output directory if it doesn't exist
    output_path.mkdir(parents=True, exist_ok=True)
    
    # Find all PDF files
    pdf_files = list(input_path.glob("*.pdf"))
    
    if not pdf_files:
        print(f"No PDF files found in {input_directory}")
        return
    
    print(f"Found {len(pdf_files)} PDF files to convert...")
    
    for pdf_file in pdf_files:
        print(f"\n📄 Processing: {pdf_file.name}")
        
        # Generate output filename
        markdown_filename = pdf_file.stem + ".md"
        output_file = output_path / markdown_filename
        
        # Convert PDF to Markdown
        result = convert_pdf_to_markdown(str(pdf_file), str(output_file))
        
        if result:
            print(f"✓ Converted: {pdf_file.name} → {markdown_filename}")
        else:
            print(f"✗ Failed to convert: {pdf_file.name}")
        
        # Add a small delay to respect API rate limits
        time.sleep(1)

In [5]:
# Example Usage

# 1. Set your API key (replace with your actual key)
MISTRAL_API_KEY = "fVzAa04gFTtwQOVkxFAwKDrZUi3KqfQr"

# 2. Convert a single PDF file
pdf_file_path = "Discharge-Summary-Joint-Commission.pdf"  # Replace with your PDF path
output_markdown_path = "converted_document.md"

print("=== Single PDF Conversion ===")
markdown_content = convert_pdf_to_markdown(pdf_file_path, output_markdown_path)

if markdown_content:
    print("\n📝 Preview of converted content:")
    print("-" * 50)
    print(markdown_content[:500] + "..." if len(markdown_content) > 500 else markdown_content)
    print("-" * 50)

# 3. Batch convert multiple PDFs
print("\n=== Batch PDF Conversion ===")
input_dir = "input_pdfs"      # Directory with PDF files
output_dir = "output_markdown" # Directory for converted files

# Uncomment the line below to run batch conversion
# batch_convert_pdfs(input_dir, output_dir)

# 4. Quick test function to check API connectivity
def test_api_connection():
    """Test if the API key and endpoint are working"""
    headers = {
        "Authorization": f"Bearer {MISTRAL_API_KEY}",
        "Content-Type": "application/json"
    }
    
    # Simple test payload
    test_payload = {
        "model": "pixtral-12b-2409",
        "messages": [{"role": "user", "content": "Hello, this is a test."}],
        "max_tokens": 10
    }
    
    try:
        response = requests.post(MISTRAL_OCR_ENDPOINT, headers=headers, json=test_payload)
        if response.status_code == 200:
            print("✓ API connection successful!")
            return True
        else:
            print(f"✗ API test failed: {response.status_code}")
            print(f"Response: {response.text}")
            return False
    except Exception as e:
        print(f"✗ Connection error: {str(e)}")
        return False

=== Single PDF Conversion ===
✓ Successfully encoded PDF: Discharge-Summary-Joint-Commission.pdf
🔄 Sending request to Mistral OCR API...
✗ API Error: 400
Response: {"object":"error","message":"Image content must be a URL (starting with 'https') or base64 encoded image (starting with 'data:image/<format>;base64,<image-base64>'). Received: data:application/pdf;base64,JVBERi0xLjQKJdPr6eEKMS...IKL0luZm8gMSAwIFI+PgpzdGFydHhyZWYKNjU5NjUKJSVFT0YK","type":"invalid_request_invalid_args","param":null,"code":"3051"}

=== Batch PDF Conversion ===
