# Text Detection and OCR Comparison

This notebook compares three different OCR engines:
1. **Tesseract OCR**: An open-source OCR engine.
2. **EasyOCR**: A ready-to-use OCR library that supports 80+ languages and all popular writing scripts.
3. **AWS Textract**: A fully managed machine learning service that automatically extracts text and data from scanned documents.

## Setup and Installation

In [None]:
# Install dependencies if not already installed.
# Use %pip (not !pip) so the packages are installed into this kernel's environment.
# %pip install -r requirements.txt

import os
import cv2
import pytesseract
from PIL import Image
from easyocr import Reader
import boto3
import matplotlib.pyplot as plt
import kagglehub
import numpy as np

## 1. Data Preparation

Download the COCO-Text dataset using `kagglehub`.

In [None]:
# Fetch the most recent copy of the dataset through kagglehub. If the download
# fails for any reason, report the error and use the local 'data' folder.
try:
    path = kagglehub.dataset_download("c7934597/cocotext-v20")
except Exception as download_error:
    print(f"Error downloading dataset: {download_error}")
    path = 'data'  # Fallback to local data folder
else:
    print("Path to dataset files:", path)

## 2. Initialize OCR Engines

In [None]:
# Initialize the EasyOCR reader for English text.
reader = Reader(['en'])

# AWS credentials are read from environment variables only -- never hardcode
# secrets in the notebook. AWS_SESSION_TOKEN is optional and is only present
# when temporary (STS / SSO) credentials are in use.
access_key = os.environ.get('AWS_ACCESS_KEY_ID')
secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
session_token = os.environ.get('AWS_SESSION_TOKEN')
# Honor the standard region variable; fall back to the previous default when
# it is unset or empty.
region_name = os.environ.get('AWS_DEFAULT_REGION') or 'us-east-1'

# Stays None when no credentials are available, which downstream cells use as
# the signal to skip Textract.
textract_client = None
if access_key and secret_access_key:
    textract_client = boto3.client(
        'textract',
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_access_key,
        # The key pair is passed explicitly, so the session token has to be
        # forwarded explicitly as well; otherwise temporary credentials are
        # sent without their token and AWS rejects the request.
        aws_session_token=session_token,
        region_name=region_name,
    )
else:
    print("AWS credentials not found. Textract will be skipped.")

def read_text_tesseract(image_path):
    """Run Tesseract OCR (English) on an image file.

    Args:
        image_path: Path of the image to read.

    Returns:
        The recognized text with surrounding whitespace stripped, or an empty
        string if the image could not be opened or OCR failed.
    """
    try:
        # The context manager closes the underlying file handle promptly; a
        # bare Image.open() keeps it open until garbage collection.
        with Image.open(image_path) as img:
            text = pytesseract.image_to_string(img, lang='eng')
        return text.strip()
    except Exception as e:
        # Best-effort: keep the evaluation running, but report the failure so
        # a broken setup is not mistaken for an empty OCR result.
        print(f"Tesseract failed on {image_path}: {e}")
        return ""

def read_text_easyocr(image_path):
    """Run EasyOCR on an image file using the module-level `reader`.

    Args:
        image_path: Path of the image to read.

    Returns:
        The text of all detections joined with single spaces and stripped, or
        an empty string if recognition failed.
    """
    try:
        results = reader.readtext(image_path)
        # Index 1 of each detection holds the recognized text.
        text = ' '.join(result[1] for result in results)
        return text.strip()
    except Exception as e:
        # Best-effort: keep the evaluation running, but report the failure so
        # it is not mistaken for an image without text.
        print(f"EasyOCR failed on {image_path}: {e}")
        return ""

def read_text_textract(image_path):
    """Run AWS Textract text detection on an image file.

    Uses the module-level `textract_client`; when that client is not
    configured the function returns immediately.

    Args:
        image_path: Path of the image to read.

    Returns:
        The text of all LINE blocks joined with single spaces and stripped, or
        an empty string if no client is configured or the request failed.
    """
    if not textract_client:
        return ""
    try:
        with open(image_path, 'rb') as im:
            response = textract_client.detect_document_text(Document={'Bytes': im.read()})

        # Keep only LINE blocks from the response.
        text = ' '.join(item['Text'] for item in response['Blocks'] if item['BlockType'] == 'LINE')
        return text.strip()
    except Exception as e:
        # Best-effort: keep the evaluation running, but report the failure so
        # a failed request is not mistaken for an image without text.
        print(f"Textract failed on {image_path}: {e}")
        return ""

## 3. Performance Evaluation

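We use word-level Jaccard similarity to compare the detected text with the ground truth: both strings are lowercased and split into sets of words, and the score is the size of their intersection divided by the size of their union. In this example the ground truth is extracted from the image filenames, which is only a placeholder.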

In [None]:
def jaccard_similarity(sentence1, sentence2):
    """Return the word-level Jaccard similarity of two sentences.

    Both sentences are lowercased and split on whitespace into sets of words.
    The score is |intersection| / |union|; two sentences with no words at all
    are treated as identical and score 1.0.
    """
    words_a = set(sentence1.lower().split())
    words_b = set(sentence2.lower().split())

    combined = words_a | words_b
    # The union is empty only when both sentences contain no words.
    if not combined:
        return 1.0

    shared = words_a & words_b
    return len(shared) / len(combined)

def clean_text(text):
    """Normalize text for comparison.

    Lowercases the text, turns newlines into spaces, strips surrounding
    whitespace, and then deletes every '!', '?' and '.' character.
    """
    flattened = text.lower().replace('\n', ' ').strip()
    # Punctuation is removed after stripping, so whitespace that was next to a
    # leading or trailing punctuation mark is left in place.
    drop_punctuation = str.maketrans('', '', '!?.')
    return flattened.translate(drop_punctuation)

# Evaluation path - change this to where your images are stored.
# `path` is set by the dataset download step; fall back to 'data' if that step
# has not been run in this kernel.
eval_data_path = path if 'path' in locals() else 'data'

if os.path.exists(eval_data_path):
    # Recursively find images, since kagglehub hands back a base directory.
    # os.walk yields entries in an arbitrary, filesystem-dependent order, so the
    # list is sorted to make the evaluated subset reproducible across runs.
    image_files = sorted(
        os.path.join(root, f)
        for root, dirs, files in os.walk(eval_data_path)
        for f in files
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

    if not image_files:
        print(f"No images found in {eval_data_path}")
    else:
        # Take a subset for quick testing if there are too many
        subset_limit = 20
        image_files = image_files[:subset_limit]

        # Running sums of the per-image Jaccard scores for each engine.
        total_tesseract = 0
        total_easyocr = 0
        total_textract = 0
        count = 0

        for image_path in image_files:
            img_name = os.path.basename(image_path)

            # Assuming ground truth is in the filename (placeholder logic).
            # splitext removes only the extension, so a name that contains
            # extra dots is not cut off at its first dot.
            gt = clean_text(os.path.splitext(img_name)[0].replace('_', ' '))

            res_tess = clean_text(read_text_tesseract(image_path))
            res_easy = clean_text(read_text_easyocr(image_path))

            total_tesseract += jaccard_similarity(gt, res_tess)
            total_easyocr += jaccard_similarity(gt, res_easy)
            # Textract is only queried and scored when a client is configured.
            if textract_client:
                res_text = clean_text(read_text_textract(image_path))
                total_textract += jaccard_similarity(gt, res_text)

            count += 1

        # count is at least 1 here because image_files is non-empty.
        print(f"--- Evaluation Results (n={count}) ---")
        print(f"Tesseract Score: {total_tesseract / count:.4f}")
        print(f"EasyOCR Score:   {total_easyocr / count:.4f}")
        if textract_client:
            print(f"AWS Textract:    {total_textract / count:.4f}")
else:
    print(f"Path {eval_data_path} does not exist.")

## 4. Visualization

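Visualize EasyOCR's detections on a sample image: each detection above the confidence threshold is drawn as a bounding box labelled with the recognized text.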

In [None]:
def visualize_easyocr(image_path, threshold=0.25):
    """Draw EasyOCR detections on an image and display it with matplotlib.

    Uses the module-level `reader`. Each detection whose score is above
    `threshold` is drawn as a rectangle with the recognized text next to its
    first corner.

    Args:
        image_path: Path of the image to visualize.
        threshold: Minimum detection score (exclusive) for a box to be drawn.

    Returns:
        None. Prints a message and returns early if the file does not exist or
        cannot be decoded as an image.
    """
    if not os.path.exists(image_path):
        print(f"Image {image_path} not found.")
        return

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread does not raise on an unreadable or unsupported file; it
        # returns None, which would crash the OCR and color-conversion calls.
        print(f"Image {image_path} could not be read.")
        return

    results = reader.readtext(img)

    for (bbox, text, score) in results:
        if score > threshold:
            # Draw the bounding box from the first and third corner points.
            top_left = tuple(map(int, bbox[0]))
            bottom_right = tuple(map(int, bbox[2]))
            cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 2)
            cv2.putText(img, text, top_left, cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 0, 0), 2)

    plt.figure(figsize=(10, 10))
    # OpenCV loads images as BGR; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.show()

# Example usage: show the first evaluated image, provided the evaluation step
# produced a non-empty `image_files` list (replace with an actual image path).
if 'image_files' not in locals() or not image_files:
    print("No sample image available for visualization.")
else:
    sample_img = image_files[0]
    visualize_easyocr(sample_img)