SETUP

In [ ]:
# pandas is imported by the notebook, so it is installed here as well.
# NOTE: pytesseract is only a wrapper; the Tesseract OCR binary must be installed separately.
%pip install langchain-groq langchain-core python-dotenv matplotlib pillow tqdm numpy pandas pytesseract

In [None]:
# =============================================================================
# IMPORTS AND DEPENDENCIES - TESSERACT OCR VERSION
# =============================================================================

import csv
import gc
import json
import os
import string
import time
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytesseract
from dotenv import load_dotenv
from PIL import Image
from tqdm import tqdm

# LLM imports
from langchain_groq import ChatGroq

In [ ]:
# Runtime configuration
# Identifier of the chat model that the notebook passes to the LLM client.
GEN_MODEL_ID = "llama-3.1-8b-instant"
# NOTE(review): presumably silences tokenizer fork warnings; no tokenizer is
# loaded in the visible cells, so confirm this is still needed.
os.environ.update(TOKENIZERS_PARALLELISM="false")

# Evaluation metrics
def normalize(text):
    """Lower-case ``text``, drop ASCII punctuation, and trim outer whitespace."""
    # Map every punctuation code point to None so str.translate deletes it.
    punctuation_eraser = dict.fromkeys(map(ord, string.punctuation))
    lowered = text.lower()
    without_punctuation = lowered.translate(punctuation_eraser)
    return without_punctuation.strip()

def exact_match(pred, ground_truths):
    """Return True if the normalized prediction equals any normalized reference.

    Args:
        pred: Predicted answer string.
        ground_truths: Iterable of reference answer strings.

    Returns:
        True on the first reference whose normalized form matches the
        normalized prediction; False if none match (or there are none).
    """
    target = normalize(pred)
    for reference in ground_truths:
        if normalize(reference) == target:
            return True
    return False

def f1_score(pred, ground_truths):
    """Token-level F1 between a prediction and its best-matching reference.

    Both strings are normalized and split on whitespace. The overlap is a
    multiset intersection, so a token repeated in both strings counts once
    per shared occurrence. A plain ``set`` intersection would under-count
    repeated tokens while precision/recall still divide by the full token
    counts, making identical answers such as "new york new york" score
    below 1.0.

    Args:
        pred: Predicted answer string.
        ground_truths: Iterable of reference answer strings.

    Returns:
        The highest F1 (float in [0.0, 1.0]) over all references.

    Raises:
        ValueError: If ``ground_truths`` is empty (raised by ``max``).
    """
    # The prediction is the same for every reference, so tokenize it once.
    pred_tokens = normalize(pred).split()
    pred_counts = Counter(pred_tokens)

    def score(gt):
        gt_tokens = normalize(gt).split()
        # Counter & Counter keeps the minimum count of each shared token.
        overlap = sum((pred_counts & Counter(gt_tokens)).values())
        if overlap == 0:
            return 0.0
        # overlap > 0 implies both token lists are non-empty.
        precision = overlap / len(pred_tokens)
        recall = overlap / len(gt_tokens)
        return 2 * precision * recall / (precision + recall)

    return max(score(gt) for gt in ground_truths)

# Dataset locations and metadata
data_dir = 'docvqa_samples_300'
image_dir = os.path.join(data_dir, "images")            # page images
metadata_file = os.path.join(data_dir, "metadata.json")  # sample records
output_csv = "OCR_results_tesseract.csv"                 # per-sample results are written here

# Read the whole metadata file as UTF-8 text and parse it as JSON.
docvqa_metadata = json.loads(Path(metadata_file).read_text(encoding="utf-8"))

In [ ]:
# Tesseract OCR Setup
def extract_text_with_tesseract(image_path):
    """Extract text from an image file using Tesseract OCR.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognized text with surrounding whitespace stripped, or an
        empty string if the image cannot be opened or OCR fails (the error
        is printed, not raised).
    """
    try:
        # The context manager closes the image's file handle as soon as OCR
        # is done, instead of leaving one open handle per processed image.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
        return text.strip()
    except Exception as e:
        # Best effort: report the failure and let the caller skip the sample.
        print(f"OCR error: {e}")
        return ""

# Status message only: this cell defines the OCR helper but does not check
# that Tesseract is actually available.
print("Tesseract OCR initialized")

Document loading, question answering, and evaluation

In [ ]:
# LLM Setup and Main Processing Pipeline
# Read a local .env file (if one exists) so that GROQ_API_KEY stored there is
# visible to os.getenv below. Variables already set in the environment are
# left untouched, and a missing .env file is not an error.
load_dotenv()

llm = ChatGroq(
    groq_api_key=os.getenv('GROQ_API_KEY'),
    model_name=GEN_MODEL_ID,
    temperature=0,   # deterministic decoding for evaluation
    max_tokens=512,
    timeout=15,      # seconds per request
    max_retries=2,
)

def process_document_fast(sample, image_dir, llm, max_ocr_chars=1500):
    """Run OCR + LLM question answering on one sample and score the answer.

    Args:
        sample: Metadata record providing the keys 'id', 'image_filename',
            'question' and 'answers' (an iterable of reference answer
            strings).
        image_dir: Directory containing the file named by 'image_filename'.
        llm: Chat model object exposing ``invoke(prompt)``; the result's
            ``content`` attribute is used as the answer.
        max_ocr_chars: Maximum number of OCR characters inserted into the
            prompt (defaults to 1500).

    Returns:
        A dict with the keys 'id', 'image_filename', 'question',
        'ground_truth', 'ocr_content', 'predicted_answer', 'exact_match' and
        'f1_score', or None when OCR yields no text or the sample cannot be
        processed. Failures are printed rather than raised so a single bad
        sample does not abort a batch run.
    """
    try:
        doc_id = sample['id']
        image_filename = sample['image_filename']
        question = sample['question']
        ground_truth = sample['answers']

        image_path = os.path.join(image_dir, image_filename)

        # Fast OCR with Tesseract
        ocr_content = extract_text_with_tesseract(image_path)
        if not ocr_content:
            # Nothing to ask the LLM about; the caller skips this sample.
            return None

        # LLM QA (OCR text is truncated to keep the prompt short)
        prompt = f"""Answer the question using only the relevant number, word, or phrase — no extra text.

        OCR Content:
        {ocr_content[:max_ocr_chars]}

        Question: {question}
        Answer:"""

        try:
            response = llm.invoke(prompt)
            predicted_answer = str(response.content).strip()
        except Exception as e:
            # Keep the sample (scored with an empty prediction) but make the
            # failure visible so API errors are not mistaken for wrong answers.
            print(f"LLM error for sample {doc_id}: {e}")
            predicted_answer = ""

        # Evaluation
        em = exact_match(predicted_answer, ground_truth)
        f1_val = f1_score(predicted_answer, ground_truth)

        return {
            "id": doc_id,
            "image_filename": image_filename,
            "question": question,
            "ground_truth": " | ".join(ground_truth),
            "ocr_content": ocr_content,
            "predicted_answer": predicted_answer,
            "exact_match": em,
            "f1_score": round(f1_val, 2)
        }
    except Exception as e:
        # The failure may have happened before 'id' was read, so look it up
        # defensively for the message.
        sample_ref = sample.get('id', '<unknown>') if isinstance(sample, dict) else '<unknown>'
        print(f"Processing error for sample {sample_ref}: {e}")
        return None

# Main processing pipeline: run every sample and write successful results to
# the CSV row by row.
print(f"Processing {len(docvqa_metadata)} documents...")
fieldnames = ["id", "image_filename", "question", "ground_truth", "ocr_content", "predicted_answer", "exact_match", "f1_score"]
all_results = []
processed_count = 0

with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for sample in tqdm(docvqa_metadata, desc="Processing"):
        result = process_document_fast(sample, image_dir, llm)
        if result is None:
            # Sample produced no result; leave it out of the CSV and summary.
            continue
        writer.writerow(result)
        all_results.append(result)
        processed_count = len(all_results)

# Results summary: averages are taken over successfully processed samples only.
if all_results:
    em_scores = [r['exact_match'] for r in all_results]
    f1_scores = [r['f1_score'] for r in all_results]

    print(f"Processed: {processed_count}/{len(docvqa_metadata)}")
    print(f"Exact Match: {np.mean(em_scores)*100:.1f}%")
    print(f"F1 Score: {np.mean(f1_scores)*100:.1f}%")
    print(f"Results saved to: {output_csv}")
else:
    # Without this branch a run where every sample failed prints nothing.
    print(f"Processed: 0/{len(docvqa_metadata)} - no results to summarize; check the OCR and LLM setup.")