### imports

In [26]:
import easyocr
from PIL import Image
import pytesseract
import pandas as pd

import os
from pathlib import Path
from PIL import Image
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment so the
# os.getenv(...) lookups later in the notebook can see them. The call
# parentheses matter: a bare `load_dotenv` only references the function.
load_dotenv()
from jiwer import cer, wer


In [27]:
import sys
from pathlib import Path

# Absolute path of the parent of the current working directory; it is put on
# sys.path so that the `src` package can be imported from this notebook.
PROJECT_ROOT = Path("..").resolve()

# Guard against duplicate entries so re-running this cell stays idempotent.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

print("Project root added:", PROJECT_ROOT)


Project root added: C:\Users\ahmed_abdulhakeem\Documents\smart-ca-ai


In [5]:
from src.text_normalization import normalize_arabic_text

from src.ocr.azure_ocr import azure_ocr, create_ocr_client

### Full-contract OCR helper

In [6]:
def ocr_contract_directory(contract_dir: Path, ocr_function, client=None):
    """Run an OCR function over every file in a directory and join the results.

    Files directly inside ``contract_dir`` are processed in natural order
    (``page_2`` before ``page_10``) so that numbered files keep their numeric
    sequence in the concatenated text. Sub-directories are skipped.

    Args:
        contract_dir: Directory whose files are OCR'd (presumably one image
            per contract page -- confirm against the data layout).
        ocr_function: Callable invoked as ``ocr_function(path)`` or, when
            ``client`` is given, ``ocr_function(client, path)``. ``path`` is
            passed as a ``str`` and the callable must return a ``str``.
        client: Optional object forwarded as the first positional argument.

    Returns:
        The per-file texts joined with ``"\\n"``; an empty string when the
        directory contains no files.
    """
    import re  # local import: only needed for the natural-sort key below

    def natural_key(path):
        # re.split with a capturing group alternates text / digit-run, so the
        # odd positions are always the digit runs; compare those numerically.
        parts = re.split(r"([0-9]+)", path.name)
        numeric_aware = [int(p) if i % 2 else p for i, p in enumerate(parts)]
        # The raw name breaks ties such as "01.png" vs "1.png" deterministically.
        return numeric_aware, path.name

    page_files = sorted(
        (entry for entry in Path(contract_dir).iterdir() if entry.is_file()),
        key=natural_key,
    )

    texts = []
    for file_path in page_files:
        if client is not None:
            texts.append(ocr_function(client, str(file_path)))
        else:
            texts.append(ocr_function(str(file_path)))

    return "\n".join(texts)


In [7]:
# Inputs for this run: the directory of the contract being OCR'd and the
# UTF-8 reference transcript that the OCR output is scored against.
contract_path = Path("../data/contracts/3")
gt_path = "../data/contracts/groundtruth.txt"

ground_truth = Path(gt_path).read_text(encoding="utf-8")

### tesseract

In [8]:
# Location of the Tesseract executable. Set the TESSERACT_CMD environment
# variable to override it on other machines; the default Windows install
# path is kept as the fallback so existing setups keep working.
pytesseract.pytesseract.tesseract_cmd = os.getenv(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

def tesseract_ocr(file_path: str, lang="ara"):
    """Extract text from a single image file with Tesseract.

    Args:
        file_path: Path of the image to read.
        lang: Tesseract language code passed to ``image_to_string``
            (defaults to ``"ara"``).

    Returns:
        The value returned by ``pytesseract.image_to_string`` for the image.
    """
    # The context manager closes the image's file handle as soon as OCR is
    # done instead of leaving one open handle per processed page.
    with Image.open(file_path) as img:
        return pytesseract.image_to_string(img, lang=lang)

In [9]:
# Full-contract text as read by Tesseract.
tesseract_text = ocr_contract_directory(
    contract_dir=contract_path,
    ocr_function=tesseract_ocr,
)

### easyocr

In [10]:
# Arabic reader, constructed once and shared by every easyocr_ocr() call.
easy_reader = easyocr.Reader(lang_list=["ar"])


def easyocr_ocr(file_path: str):
    """Extract text from a single image file with EasyOCR.

    Args:
        file_path: Path of the image to read.

    Returns:
        The items returned by ``easy_reader.readtext(..., detail=0)`` joined
        with newlines.
    """
    fragments = easy_reader.readtext(file_path, detail=0)
    return "\n".join(fragments)



Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [11]:
# Full-contract text as read by EasyOCR.
easyocr_text = ocr_contract_directory(
    contract_dir=contract_path,
    ocr_function=easyocr_ocr,
)

  super().__init__(loader)
  super().__init__(loader)
  super().__init__(loader)


### Azure OCR

In [15]:
# Azure credentials are read from the ENDPOINT / KEY environment variables;
# either value is None when the corresponding variable is not set.
endpoint, key = (os.environ.get(var_name) for var_name in ("ENDPOINT", "KEY"))

azure_client = create_ocr_client(endpoint=endpoint, key=key)

In [16]:
# Full-contract text as read by Azure OCR; the client is forwarded to azure_ocr.
azure_text = ocr_contract_directory(
    contract_dir=contract_path,
    ocr_function=azure_ocr,
    client=azure_client,
)

In [30]:
def evaluate_ocr(gt_text, ocr_text):
    """Score OCR output against a reference transcript.

    Args:
        gt_text: Reference (ground-truth) text.
        ocr_text: Text produced by an OCR engine.

    Returns:
        A dict with keys ``"CER"`` and ``"WER"`` (the values of jiwer's
        ``cer`` and ``wer``) and ``"Accuracy"``, computed as ``1 - CER``.
    """
    char_error = cer(gt_text, ocr_text)
    word_error = wer(gt_text, ocr_text)
    return dict(CER=char_error, WER=word_error, Accuracy=1 - char_error)

In [31]:
def evaluate_norm_ocr(gt_text, ocr_text):
    """Score OCR output against a reference after text normalization.

    Both inputs are passed through ``normalize_arabic_text`` and the results
    are scored by ``evaluate_ocr``, so the raw and normalized evaluations
    always share the same metric definitions.

    Args:
        gt_text: Reference (ground-truth) text.
        ocr_text: Text produced by an OCR engine.

    Returns:
        The dict returned by ``evaluate_ocr`` for the normalized texts
        (keys ``"CER"``, ``"WER"``, ``"Accuracy"``).
    """
    return evaluate_ocr(
        normalize_arabic_text(gt_text),
        normalize_arabic_text(ocr_text),
    )

In [32]:
# Score each engine's unnormalized output against the ground truth.
tesseract_eval = evaluate_ocr(ground_truth, tesseract_text)
easyocr_eval = evaluate_ocr(ground_truth, easyocr_text)
azure_eval = evaluate_ocr(ground_truth, azure_text)

# One row per engine, one column per metric.
_raw_rows = {
    "easyocr": easyocr_eval,
    "tesseract": tesseract_eval,
    "azure": azure_eval,
}
raw_evaluation = pd.DataFrame(list(_raw_rows.values()), index=list(_raw_rows))

In [33]:
# Score each engine's output again, this time after text normalization.
tesseract_norm_eval = evaluate_norm_ocr(ground_truth, tesseract_text)
easyocr_norm_eval = evaluate_norm_ocr(ground_truth, easyocr_text)
azure_norm_eval = evaluate_norm_ocr(ground_truth, azure_text)

# One row per engine, one column per metric.
_norm_rows = {
    "easyocr": easyocr_norm_eval,
    "tesseract": tesseract_norm_eval,
    "azure": azure_norm_eval,
}
norm_evaluation = pd.DataFrame(list(_norm_rows.values()), index=list(_norm_rows))

In [34]:
# Show the metrics computed on the unnormalized text (one row per OCR engine).
raw_evaluation

Unnamed: 0,CER,WER,Accuracy
easyocr,0.330971,0.599005,0.669029
tesseract,0.081231,0.247761,0.918769
azure,0.015382,0.035821,0.984618


In [35]:
# Show the metrics computed after text normalization (one row per OCR engine).
norm_evaluation

Unnamed: 0,CER,WER,Accuracy
easyocr,0.32359,0.58408,0.67641
tesseract,0.078347,0.241791,0.921653
azure,0.012279,0.030846,0.987721
