# OCR Baseline Testing
## AI Document Intelligence - Week 1

This notebook tests OCR engines (PaddleOCR and Tesseract) on synthetic invoices.

## Setup

In [None]:
import sys
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path so
# the `src` package imported below can be resolved (assumes the notebook is
# launched from a directory directly under the project root — TODO confirm).
sys.path.insert(0, str(Path.cwd().parent))

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
import numpy as np

from src.ocr import OCREngine
from src.utils.file_utils import list_files

# Configure matplotlib: render figures inline in the notebook output and use a
# 14x10 default figure size for every plot that does not set its own.
%matplotlib inline
plt.rcParams['figure.figsize'] = (14, 10)

## Initialize OCR Engines

In [None]:
def _start_engine(label, **engine_options):
    """Construct an OCREngine and report the outcome under *label*.

    Returns the engine on success, or None when construction raises.
    """
    try:
        engine = OCREngine(**engine_options)
        print(f"✓ {label} initialized")
        return engine
    except Exception as init_error:
        print(f"✗ {label} failed: {init_error}")
        return None


# Bring up both engines; one that fails is left as None so later cells skip it.
print("Initializing OCR engines...")
paddle_engine = _start_engine("PaddleOCR", engine="paddle", use_gpu=False)
tesseract_engine = _start_engine("Tesseract", engine="tesseract")

## Process Sample Images

In [None]:
# Collect the PNG files directly inside <parent of cwd>/dataset/raw
dataset_path = Path.cwd().parent / "dataset"
raw_dir = dataset_path / "raw"
image_files = list_files(raw_dir, pattern="*.png", recursive=False)

if len(image_files) > 0:
    # The first file serves as the sample for the single-image cells below
    first_image = image_files[0]
    print(f"Found {len(image_files)} images")
    sample_image = str(first_image)
    print(f"Processing: {first_image.name}")
else:
    print("No images found. Run: python scripts/generate_dataset.py --num-samples 200")

## PaddleOCR Processing

In [None]:
# Run PaddleOCR on the sample image and print a short report
if not paddle_engine or len(image_files) == 0:
    print("PaddleOCR not available or no images found")
else:
    divider = "="*60

    print("\nRunning PaddleOCR...")
    paddle_result = paddle_engine.extract_text_paddle(sample_image)

    block_total = paddle_result['num_blocks']
    print(f"\nDetected {block_total} text blocks")
    mean_confidence = paddle_result['avg_confidence']
    print(f"Average confidence: {mean_confidence:.2%}")

    print("\nExtracted Text:")
    print(divider)
    print(paddle_result['full_text'])
    print(divider)

## Tesseract Processing

In [None]:
# Run Tesseract on the sample image and print a short report
if not tesseract_engine or len(image_files) == 0:
    print("Tesseract not available or no images found")
else:
    divider = "="*60

    print("\nRunning Tesseract...")
    tesseract_result = tesseract_engine.extract_text_tesseract(sample_image)

    block_total = tesseract_result['num_blocks']
    print(f"\nDetected {block_total} text blocks")
    mean_confidence = tesseract_result['avg_confidence']
    print(f"Average confidence: {mean_confidence:.2%}")

    print("\nExtracted Text:")
    print(divider)
    print(tesseract_result['full_text'])
    print(divider)

## Visualize OCR Results with Bounding Boxes

In [None]:
def _confidence_color(confidence):
    """Map a confidence score to a box color: >0.9 green, >0.7 yellow, else red."""
    if confidence > 0.9:
        return 'green'
    if confidence > 0.7:
        return 'yellow'
    return 'red'


def visualize_ocr_result(image_path, ocr_result, title="OCR Result", max_blocks=20):
    """Display an image with its OCR bounding boxes drawn on top.

    Args:
        image_path: Path of the image file to display.
        ocr_result: Mapping with an optional 'text_blocks' list. Each block
            must provide 'confidence' and a 'bbox' mapping with 'x_min',
            'y_min', 'x_max' and 'y_max'.
        title: Prefix of the figure title; the total block count is appended.
        max_blocks: Maximum number of boxes to draw. Defaults to 20, the
            limit that used to be hard-coded. Pass None to draw every block.
    """
    text_blocks = ocr_result.get('text_blocks', [])

    # Image.open is lazy and keeps the file open; the context manager releases
    # the handle as soon as imshow has taken its copy of the pixel data.
    with Image.open(image_path) as img:
        _, ax = plt.subplots(figsize=(12, 14))
        ax.imshow(img)

    # Draw one rectangle per block, colored by confidence
    for block in text_blocks[:max_blocks]:
        bbox = block['bbox']
        x_min, y_min = bbox['x_min'], bbox['y_min']
        width = bbox['x_max'] - x_min
        height = bbox['y_max'] - y_min

        rect = patches.Rectangle(
            (x_min, y_min), width, height,
            linewidth=2, edgecolor=_confidence_color(block['confidence']),
            facecolor='none'
        )
        ax.add_patch(rect)

    # The title reports every detected block, not only the ones drawn
    ax.set_title(f"{title} - {len(text_blocks)} blocks detected")
    ax.axis('off')
    plt.tight_layout()
    plt.show()

# Overlay each available engine's result on the sample image
paddle_ready = bool(paddle_engine) and len(image_files) > 0
if paddle_ready:
    visualize_ocr_result(sample_image, paddle_result, "PaddleOCR")

tesseract_ready = bool(tesseract_engine) and len(image_files) > 0
if tesseract_ready:
    visualize_ocr_result(sample_image, tesseract_result, "Tesseract")

## Batch Processing Test

In [None]:
# Run PaddleOCR over the first BATCH_SIZE images and summarize confidence and
# block-count statistics.
BATCH_SIZE = 10

if not paddle_engine:
    # Report the real cause instead of blaming the image count
    print("Batch processing skipped - PaddleOCR not available")
elif len(image_files) < BATCH_SIZE:
    print(f"Batch processing skipped - need at least {BATCH_SIZE} images")
else:
    print(f"\nBatch Processing Test (First {BATCH_SIZE} images)...")

    batch_results = [
        paddle_engine.extract_text_paddle(str(img_file))
        for img_file in image_files[:BATCH_SIZE]
    ]
    confidences = [result['avg_confidence'] for result in batch_results]
    block_counts = [result['num_blocks'] for result in batch_results]

    print(f"\nAverage confidence: {np.mean(confidences):.2%}")
    print(f"Average blocks per image: {np.mean(block_counts):.1f}")
    print(f"Min/Max confidence: {np.min(confidences):.2%} / {np.max(confidences):.2%}")
    print(f"Min/Max blocks: {np.min(block_counts)} / {np.max(block_counts)}")

## Compare Engines

In [None]:
def _comparison_row(label, paddle_value, tesseract_value, value_spec='<15'):
    """Format one table row: a 30-wide label plus two values using *value_spec*."""
    return f"{label:<30} {paddle_value:{value_spec}} {tesseract_value:{value_spec}}"


# Side-by-side table of both engines' results on the sample image
if not (paddle_engine and tesseract_engine and len(image_files) > 0):
    print("Both engines needed for comparison")
else:
    rule = "="*60

    print("\nEngine Comparison:")
    print(rule)
    print(_comparison_row('Metric', 'PaddleOCR', 'Tesseract'))
    print(rule)
    print(_comparison_row('Text blocks detected', paddle_result['num_blocks'], tesseract_result['num_blocks']))
    print(_comparison_row('Average confidence', paddle_result['avg_confidence'], tesseract_result['avg_confidence'], '<15.2%'))
    print(_comparison_row('Text length (chars)', len(paddle_result['full_text']), len(tesseract_result['full_text'])))
    print(rule)

## Save OCR Results

In [None]:
# Save PaddleOCR results for up to the first 10 images for later analysis
if not paddle_engine:
    # Distinguish a missing engine from a missing dataset
    print("PaddleOCR not available - nothing saved")
elif len(image_files) == 0:
    print("No images to process")
else:
    output_dir = dataset_path / "ocr_text" / "paddle"
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"\nProcessing {min(10, len(image_files))} images and saving to {output_dir}...")

    for img_file in image_files[:10]:
        # Called for its side effect (presumably writing under output_dir —
        # verify against OCREngine.process_image); the return value is unused.
        paddle_engine.process_image(str(img_file), output_dir=str(output_dir))

    print("✓ OCR results saved")

## Conclusion

This notebook tested OCR baselines:
- Both PaddleOCR and Tesseract can extract text from synthetic invoices
- Confidence scores are generally high for clean synthetic images
- Bounding boxes line up with the text when inspected visually (the overlay draws only the first 20 blocks per image)

Next: Run error analysis in notebook 03