# OCR Evaluation Pipeline - AI Model Comparison

This notebook processes all compressed images, evaluates OCR accuracy using multiple vision models via OpenRouter, and generates results for the paper.

## Overview
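- **Models**: Gemini 2.5 Pro, Claude Opus 4.5 Low, GPT-4o-mini (via OpenRouter). The model IDs set in the `MODELS` dictionary in Cell 2 must be updated to match these models before running.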
- **Workflows**: EXCEL, GIT, JIRA, TEKKEN
- **Quality Levels**: PNG (original) and JPEG at quality settings 25, 5, and 1 (q25, q5, q1)
- **Evaluation**: Quantitative metrics (character/word accuracy) + Task-critical phrase detection

## Task-Critical Phrases
- **GIT**: ["Commit changes", "Push", "Pull", "Repository"]
- **JIRA**: ["Create", "Issue", "Project", "Summary", "Description"]
- **EXCEL**: ["File", "Home", "Insert", "Data", "Formula"]
- **TEKKEN**: ["Menu", "Start", "Options", "Character"]

## Setup Instructions
1. Install dependencies: `pip install -r requirements.txt`
2. Copy `.env.example` to `.env` and add your OpenRouter API key
3. Verify the model IDs at [openrouter.ai/models](https://openrouter.ai/models) and update the `MODELS` dictionary in Cell 2 if needed

## OpenRouter API Documentation
This notebook uses the OpenRouter API format as documented at https://openrouter.ai/docs/quickstart

In [None]:
# Cell 1: Setup and imports
import os
import base64
import json
import time
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import difflib
from Levenshtein import distance as levenshtein_distance
from dotenv import load_dotenv

# Populate the process environment from the project's .env file
load_dotenv()

# Project locations: anchor on this file when executed as a script,
# otherwise (e.g. in a notebook, where __file__ is undefined) on the
# current working directory
if '__file__' in globals():
    BASE_DIR = Path(__file__).parent
else:
    BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data"
RESULTS_DIR = BASE_DIR

# Experiment grid: every workflow is evaluated at every quality level
WORKFLOWS = [
    "EXCEL",
    "GIT",
    "JIRA",
    "TEKKEN",
]
QUALITY_LEVELS = [
    "PNG",
    25,
    5,
    1,
]

# Phrases checked for in the OCR output, keyed by workflow name
TASK_CRITICAL_PHRASES = {
    "GIT": [
        "Commit changes",
        "Push",
        "Pull",
        "Repository",
    ],
    "JIRA": [
        "Create",
        "Issue",
        "Project",
        "Summary",
        "Description",
    ],
    "EXCEL": [
        "File",
        "Home",
        "Insert",
        "Data",
        "Formula",
    ],
    "TEKKEN": [
        "Menu",
        "Start",
        "Options",
        "Character",
    ],
}

# Echo the resolved configuration, one item per line
print(
    "✓ Setup complete!",
    f"Data directory: {DATA_DIR}",
    f"Workflows: {WORKFLOWS}",
    f"Quality levels: {QUALITY_LEVELS}",
    sep="\n",
)

In [None]:
# Cell 2: OpenRouter API setup and OCR extraction
# Using OpenRouter API format from https://openrouter.ai/docs/quickstart
import requests

# Endpoint and credentials for the OpenRouter chat-completions API
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"
OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY')

# Short label -> OpenRouter model ID.
# NOTE: confirm each ID against https://openrouter.ai/models and replace it
# with the exact identifier of the model you intend to evaluate.
MODELS = {
    'gemini': 'google/gemini-2.0-flash-exp',
    'claude': 'anthropic/claude-3-opus-20240229',
    'gpt': 'openai/gpt-4o-mini',
}

# Fail fast when the key is missing or empty, before any request is attempted
if OPENROUTER_API_KEY is None or OPENROUTER_API_KEY == "":
    raise ValueError("OPENROUTER_API_KEY not found in environment variables. Please set it in .env file.")

def encode_image_to_base64(image_path: Path) -> str:
    """Return the raw bytes of the file at *image_path* as a base64 string."""
    raw_bytes = Path(image_path).read_bytes()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def _guess_image_mime_type(image_path: Path) -> str:
    """Return the image MIME type implied by *image_path*'s file extension.

    Falls back to ``image/jpeg`` when the extension is unknown or does not
    map to an image type.
    """
    import mimetypes  # local import keeps this block self-contained

    mime_type, _ = mimetypes.guess_type(str(image_path))
    if mime_type and mime_type.startswith("image/"):
        return mime_type
    return "image/jpeg"


def extract_ocr_openrouter(image_path: Path, model_name: str, api_key: Optional[str] = None, max_retries: int = 3) -> Tuple[str, bool]:
    """
    Extract text from an image using an OpenRouter vision model.

    Uses the OpenRouter API format as documented at https://openrouter.ai/docs/quickstart

    The image is sent inline as a base64 data URL whose MIME type is derived
    from the file extension, so PNG inputs are labelled ``image/png`` rather
    than being mislabelled as JPEG.

    Args:
        image_path: Path to image file
        model_name: Model identifier from OpenRouter (e.g., 'openai/gpt-4o')
        api_key: API key (defaults to the module-level OPENROUTER_API_KEY)
        max_retries: Maximum number of request attempts

    Returns:
        Tuple of (extracted_text, success_status). On failure the first
        element is an error description and the second is False.
    """
    api_key = api_key or OPENROUTER_API_KEY
    base64_image = encode_image_to_base64(image_path)
    mime_type = _guess_image_mime_type(image_path)

    # OpenRouter API headers (following documentation format)
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://github.com/your-repo",  # Optional: Site URL for rankings on openrouter.ai
        "X-Title": "OCR Evaluation Pipeline"  # Optional: Site title for rankings on openrouter.ai
    }

    # OpenRouter API payload format
    payload = {
        "model": model_name,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Extract all visible text from this screenshot. Return only the text content, preserving line breaks and structure. Do not add explanations or formatting."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            # The declared media type must match the actual image bytes
                            "url": f"data:{mime_type};base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 4000
    }

    # Retry logic with exponential backoff
    for attempt in range(max_retries):
        try:
            response = requests.post(
                OPENROUTER_API_URL,
                headers=headers,
                json=payload,
                timeout=60
            )
            response.raise_for_status()
            result = response.json()
            text = result['choices'][0]['message']['content']

        except requests.exceptions.HTTPError as e:
            # Read the status from the exception's own response. Compare with
            # `is not None`: a Response object is falsy for error statuses.
            status_code = e.response.status_code if e.response is not None else None
            if status_code == 429:  # Rate limit
                wait_time = 2 ** attempt
                print(f"Rate limited. Waiting {wait_time} seconds...")
                time.sleep(wait_time)
                continue
            else:
                # Other HTTP errors are not retried
                return f"HTTP Error {status_code}: {str(e)}", False
        except Exception as e:
            # Network failures, timeouts, malformed JSON or an unexpected
            # response shape: back off and retry until attempts run out
            if attempt == max_retries - 1:
                return f"Error: {str(e)}", False
            time.sleep(2 ** attempt)
        else:
            # Rate limiting: wait 1.5 seconds between requests
            time.sleep(1.5)

            return text, True

    return "Max retries exceeded", False

# Report what this cell configured, one item per line
print(
    "✓ OpenRouter API functions defined",
    f"Models configured: {list(MODELS.keys())}",
    f"API URL: {OPENROUTER_API_URL}",
    sep="\n",
)