# Palateful OCR Testing

This notebook tests the HunyuanOCR model for recipe extraction.

In [None]:
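# Install dependencies if needed (uncomment the last line of this cell).
# %pip installs into the running kernel's environment, which !pip does not guarantee.
# %pip install torch transformers accelerate pillow httpx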

In [None]:
import gc
import io
import mimetypes
import os

import httpx
import torch
from PIL import Image

# Report which accelerator backends this PyTorch build can see
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"MPS available: {torch.backends.mps.is_available()}")

# When a CUDA GPU is present, also show its name and total memory in GB
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"CUDA memory: {total_gb:.2f} GB")

## Option 1: Direct Model Usage

Load and use the model directly in the notebook.

In [None]:
from transformers import AutoProcessor, AutoModelForVision2Seq

MODEL_NAME = "tencent/HunyuanOCR"

# Pick the compute backend in order of preference: CUDA, then Apple MPS,
# then CPU. Both accelerators use half precision; the CPU fallback uses
# float32.
if torch.cuda.is_available():
    device, dtype = "cuda", torch.float16
elif torch.backends.mps.is_available():
    device, dtype = "mps", torch.float16
else:
    device, dtype = "cpu", torch.float32

print(f"Using device: {device}, dtype: {dtype}")

In [None]:
# Load the processor and the model weights for MODEL_NAME
print(f"Loading {MODEL_NAME}...")

processor = AutoProcessor.from_pretrained(MODEL_NAME)

# On an accelerator the target device is passed as device_map at load time;
# on CPU no device_map is given and the model is moved explicitly afterwards.
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=dtype,
    device_map=None if device == "cpu" else device,
)
if device == "cpu":
    model = model.to(device)

print("Model loaded!")

In [None]:
def run_ocr(image_path_or_url: str, timeout: float = 30.0) -> str:
    """Run OCR on an image given as a local path or an HTTP(S) URL.

    Args:
        image_path_or_url: Local filesystem path, or a URL starting with
            ``http://`` or ``https://``.
        timeout: Seconds to wait for the image download (URL inputs only).

    Returns:
        The first decoded sequence generated by the model, with special
        tokens stripped.

    Raises:
        httpx.HTTPStatusError: If the image URL answers with a 4xx/5xx status.
        httpx.RequestError: If the download fails at the transport level.
    """
    if image_path_or_url.startswith(("http://", "https://")):
        # httpx does not follow redirects unless asked to, and image hosts
        # and CDNs redirect frequently; without this the body would be empty.
        response = httpx.get(
            image_path_or_url,
            follow_redirects=True,
            timeout=timeout,
        )
        # Fail here with a clear HTTP error instead of letting PIL choke on
        # an HTML error page further down.
        response.raise_for_status()
        source = io.BytesIO(response.content)
    else:
        source = image_path_or_url

    # convert() loads the pixel data and returns a new image, so the context
    # manager can release the underlying file handle right away.
    with Image.open(source) as raw_image:
        image = raw_image.convert("RGB")

    # Prepare inputs and move them to the accelerator when one is in use
    inputs = processor(images=image, return_tensors="pt")
    if device != "cpu":
        inputs = {k: v.to(device) for k, v in inputs.items()}

    # Greedy decoding (do_sample=False), no gradient tracking
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=2048,
            do_sample=False,
        )

    # Decode the batch and return its single entry
    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return text

In [None]:
# Test with a sample recipe image
# Replace with your own image path or URL
TEST_IMAGE = "path/to/your/recipe/image.jpg"  # or URL

# Run OCR, then print the extracted text under a banner
result = run_ocr(TEST_IMAGE)
print("=" * 50, "OCR Result:", "=" * 50, result, sep="\n")

## Option 2: Using the OCR Service API

If the OCR service is running, use the HTTP API.

In [None]:
OCR_SERVICE_URL = "http://localhost:8001"

# Check if service is running.
# A non-2xx answer means the process is up but unhealthy, so it is reported
# separately from "not running" (connection refused, timeout, bad payload).
try:
    response = httpx.get(f"{OCR_SERVICE_URL}/health", timeout=5.0)
    response.raise_for_status()
    print(f"OCR Service Status: {response.json()}")
except httpx.HTTPStatusError as e:
    print(f"OCR Service unhealthy (HTTP {e.response.status_code}): {e.response.text}")
except Exception as e:
    # Deliberately broad: this cell is a best-effort probe and should never
    # stop the notebook, whatever went wrong while contacting the service.
    print(f"OCR Service not running: {e}")
    print("\nTo start the service, run:")
    print("cd services/ocr && poetry install && poetry run uvicorn src.main:app --port 8001")

In [None]:
def ocr_via_api(image_path: str, timeout: float = 120.0) -> dict:
    """Upload a local image to the OCR service and return its JSON reply.

    Args:
        image_path: Path of the image file to upload.
        timeout: Seconds to wait for the service to answer.

    Returns:
        The parsed JSON body of the response.

    Raises:
        httpx.HTTPStatusError: If the service answers with a 4xx/5xx status.
    """
    # Derive the MIME type from the file extension so PNG/WebP/etc. uploads
    # are labelled correctly; unknown extensions keep the previous JPEG label.
    content_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"

    with open(image_path, "rb") as f:
        # Send only the base name: the service has no use for the local
        # directory layout, and the extension is preserved.
        files = {"file": (os.path.basename(image_path), f, content_type)}
        response = httpx.post(
            f"{OCR_SERVICE_URL}/ocr",
            files=files,
            timeout=timeout,
        )
    response.raise_for_status()
    return response.json()

def ocr_via_api_url(image_url: str) -> dict:
    """POST an image URL to the OCR service and return the parsed JSON reply.

    The URL is passed as the ``url`` query parameter of ``/ocr/url``.
    A non-2xx response raises via ``raise_for_status()``.
    """
    endpoint = f"{OCR_SERVICE_URL}/ocr/url"
    reply = httpx.post(endpoint, params={"url": image_url}, timeout=120.0)
    reply.raise_for_status()
    return reply.json()

In [None]:
# Test via API with a local file
# result = ocr_via_api("path/to/your/recipe/image.jpg")
# print(result["extracted_markdown"])

## Display Results

In [None]:
from IPython.display import display, Markdown, Image as IPImage

def display_result(image_path_or_url: str, ocr_text: str):
    """Render the source image, then the OCR text as Markdown beneath it."""
    # IPython's Image takes `url=` for remote images and `filename=` for
    # local files; choose the keyword from the input's prefix.
    is_remote = image_path_or_url.startswith(("http://", "https://"))
    source_kwarg = "url" if is_remote else "filename"
    display(IPImage(width=400, **{source_kwarg: image_path_or_url}))

    # Heading first, then the OCR text rendered as markdown
    display(Markdown("### Extracted Text"))
    display(Markdown(ocr_text))

In [None]:
# Display your result
# display_result(TEST_IMAGE, result)

## Cleanup

In [None]:
# Free up GPU memory if needed.
# pop() rather than `del`, so this cell can be re-run, or run before the
# model was ever loaded, without raising NameError.
globals().pop("model", None)
globals().pop("processor", None)

# Collect first so objects kept alive only by reference cycles are released
# before the allocator caches are emptied.
gc.collect()

if torch.cuda.is_available():
    torch.cuda.empty_cache()
elif torch.backends.mps.is_available():
    # torch.mps.empty_cache is missing on older PyTorch builds; skip it there.
    mps_empty_cache = getattr(getattr(torch, "mps", None), "empty_cache", None)
    if mps_empty_cache is not None:
        mps_empty_cache()
print("Memory cleared")