In [36]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Tesseract OCR

In [37]:
!sudo apt-get install tesseract-ocr
!pip install pytesseract

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


# Easy OCR

In [38]:
!pip install easyocr



# Evaluate

In [39]:
pip install python-Levenshtein



# Output

In [58]:
import os
import cv2
import easyocr
import pytesseract
from PIL import Image
import Levenshtein
import csv

# -------------------------
# OCR FUNCTIONS
# -------------------------

def easyocr_ocr(image, save_path):
    reader = easyocr.Reader(['en'])
    result = reader.readtext(image)

    # sort from top to bottom
    result = sorted(result, key=lambda x: x[0][0][1])
    lines = [text for (_, text, _) in result]
    paragraph = "\n".join(lines)

    with open(save_path, "w", encoding="utf-8") as f:
        f.write(paragraph)

    return paragraph


def tesseract_ocr(image, save_path):
    pil_img = Image.fromarray(image)
    config = '--psm 3'
    text = pytesseract.image_to_string(pil_img, lang='eng', config=config)

    with open(save_path, "w", encoding="utf-8") as f:
        f.write(text)

    return text


# -------------------------
# EVALUATE
# -------------------------

def load_text(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        return f.read().strip()


def normalized_accuracy(gt, pred):
    if len(gt) == 0:
        return 0
    dist = Levenshtein.distance(gt, pred)
    return (1 - dist / max(len(gt), len(pred))) * 100


# -------------------------
# PROCESS ONE FOLDER
# -------------------------

def process_folder(folder_name, image_folder, gt, output_root, writer):

    print(f"\n================= PROCESSING: {folder_name} =================")

    # สร้างโฟลเดอร์ย่อย เช่น output/raw/tesseract
    folder_out = os.path.join(output_root, folder_name)
    tess_folder = os.path.join(folder_out, "tesseract")
    easy_folder = os.path.join(folder_out, "easyocr")

    os.makedirs(tess_folder, exist_ok=True)
    os.makedirs(easy_folder, exist_ok=True)

    # loop ผ่านภาพทั้งหมด
    for filename in os.listdir(image_folder):

        if filename.lower().endswith((".png", ".jpg", ".jpeg")):

            print(f"\n----- {folder_name}: {filename} -----")

            img_path = os.path.join(image_folder, filename)
            image = cv2.imread(img_path)

            if image is None:
                print("Error reading image:", img_path)
                continue

            base = os.path.splitext(filename)[0]

            # output files
            tess_out = os.path.join(tess_folder, f"{base}_tesseract.txt")
            easy_out = os.path.join(easy_folder, f"{base}_easyocr.txt")

            # Run OCR
            tesseract_pred = tesseract_ocr(image, tess_out)
            easy_pred = easyocr_ocr(image, easy_out)

            # Accuracy
            tess_acc = normalized_accuracy(gt, tesseract_pred)
            easy_acc = normalized_accuracy(gt, easy_pred)

            print(f"Tesseract: {tess_acc:.2f}% | EasyOCR: {easy_acc:.2f}%")

            # Save row → summary.csv
            writer.writerow([
                folder_name,
                filename,
                f"{tess_acc:.2f}",
                f"{easy_acc:.2f}"
            ])


# -------------------------
# MAIN PIPELINE (รับ 2 folder)
# -------------------------

def evaluate_two_folders(raw_folder, processed_folder, gt_path, output_root):

    # โหลด ground truth
    gt = load_text(gt_path)

    # summary path
    summary_csv = os.path.join(output_root, "summary.csv")
    os.makedirs(output_root, exist_ok=True)

    with open(summary_csv, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["folder", "filename", "tesseract_accuracy", "easyocr_accuracy"])

        # Evaluate RAW folder
        process_folder("raw", raw_folder, gt, output_root, writer)

        # Evaluate PROCESSED folder
        process_folder("processed", processed_folder, gt, output_root, writer)

    print("\nDONE! Summary saved at:", summary_csv)
    return summary_csv


In [66]:
path = '/content/drive/My Drive/ocr_dataset/set_1/'

evaluate_two_folders(
    raw_folder= path + "1_raw/",
    processed_folder= path + "1_processed/",
    gt_path= path + "groundtruth.txt",
    output_root= path
)



----- raw: 2.JPG -----
Tesseract: 15.08% | EasyOCR: 27.77%

----- raw: 10.JPG -----
Tesseract: 45.53% | EasyOCR: 42.46%

----- raw: 1.JPG -----
Tesseract: 97.88% | EasyOCR: 80.22%

----- raw: 6.JPG -----
Tesseract: 72.27% | EasyOCR: 45.20%

----- raw: 5.JPG -----
Tesseract: 21.90% | EasyOCR: 68.60%

----- raw: 4.JPG -----
Tesseract: 97.26% | EasyOCR: 81.12%

----- raw: 3.JPG -----
Tesseract: 82.47% | EasyOCR: 80.11%

----- raw: 9.JPG -----
Tesseract: 91.05% | EasyOCR: 83.91%

----- raw: 8.JPG -----
Tesseract: 94.13% | EasyOCR: 67.37%

----- raw: 7.JPG -----
Tesseract: 97.93% | EasyOCR: 69.94%


----- processed: 7_processed.JPG -----
Tesseract: 97.93% | EasyOCR: 79.55%

----- processed: 8_processed.JPG -----
Tesseract: 92.18% | EasyOCR: 72.29%

----- processed: 6_processed.JPG -----
Tesseract: 97.99% | EasyOCR: 80.73%

----- processed: 5_processed.JPG -----
Tesseract: 97.15% | EasyOCR: 70.84%

----- processed: 4_processed.JPG -----
Tesseract: 98.21% | EasyOCR: 71.34%

----- processed:

'/content/drive/My Drive/ocr_dataset/set_1/summary.csv'

In [67]:
path = '/content/drive/My Drive/ocr_dataset/set_2/'

evaluate_two_folders(
    raw_folder= path + "2_raw/",
    processed_folder= path + "2_processed/",
    gt_path= path + "groundtruth.txt",
    output_root= path
)



----- raw: 10.JPG -----
Tesseract: 66.27% | EasyOCR: 56.97%

----- raw: 1.JPG -----
Tesseract: 69.27% | EasyOCR: 50.59%

----- raw: 2.JPG -----
Tesseract: 53.39% | EasyOCR: 44.89%

----- raw: 9.JPG -----
Tesseract: 59.47% | EasyOCR: 55.81%

----- raw: 8.JPG -----
Tesseract: 84.82% | EasyOCR: 66.13%

----- raw: 7.JPG -----
Tesseract: 50.91% | EasyOCR: 52.74%

----- raw: 6.JPG -----
Tesseract: 53.23% | EasyOCR: 55.27%

----- raw: 5.JPG -----
Tesseract: 23.47% | EasyOCR: 43.38%

----- raw: 4.JPG -----
Tesseract: 52.42% | EasyOCR: 51.13%

----- raw: 3.JPG -----
Tesseract: 93.81% | EasyOCR: 71.57%


----- processed: 5_processed.JPG -----
Tesseract: 61.52% | EasyOCR: 53.15%

----- processed: 3_processed.JPG -----
Tesseract: 96.30% | EasyOCR: 57.89%

----- processed: 4_processed.JPG -----
Tesseract: 60.28% | EasyOCR: 49.19%

----- processed: 2_processed.JPG -----
Tesseract: 5.87% | EasyOCR: 7.80%

----- processed: 1_processed.JPG -----
Tesseract: 64.43% | EasyOCR: 50.32%

----- processed: 1

'/content/drive/My Drive/ocr_dataset/set_2/summary.csv'

In [68]:
path = '/content/drive/My Drive/ocr_dataset/set_3/'

evaluate_two_folders(
    raw_folder= path + "3_raw/",
    processed_folder= path + "3_processed/",
    gt_path= path + "groundtruth.txt",
    output_root= path
)



----- raw: 1.JPG -----
Tesseract: 96.89% | EasyOCR: 80.29%

----- raw: 10.JPG -----
Tesseract: 80.76% | EasyOCR: 35.27%

----- raw: 4.JPG -----
Tesseract: 43.55% | EasyOCR: 78.73%

----- raw: 2.JPG -----
Tesseract: 69.61% | EasyOCR: 85.15%

----- raw: 3.JPG -----
Tesseract: 94.08% | EasyOCR: 76.70%

----- raw: 9.JPG -----
Tesseract: 39.87% | EasyOCR: 55.43%

----- raw: 8.JPG -----
Tesseract: 40.70% | EasyOCR: 45.12%

----- raw: 7.JPG -----
Tesseract: 96.55% | EasyOCR: 89.32%

----- raw: 6.JPG -----
Tesseract: 20.72% | EasyOCR: 70.26%

----- raw: 5.JPG -----
Tesseract: 0.00% | EasyOCR: 32.41%


----- processed: 8_processed.JPG -----
Tesseract: 73.79% | EasyOCR: 73.85%

----- processed: 7_processed.JPG -----
Tesseract: 96.89% | EasyOCR: 74.31%

----- processed: 1_processed.JPG -----
Tesseract: 97.08% | EasyOCR: 84.71%

----- processed: 6_processed.JPG -----
Tesseract: 97.26% | EasyOCR: 70.53%

----- processed: 4_processed.JPG -----
Tesseract: 97.26% | EasyOCR: 82.50%

----- processed: 

'/content/drive/My Drive/ocr_dataset/set_3/summary.csv'

In [69]:
path = '/content/drive/My Drive/ocr_dataset/set_4/'

evaluate_two_folders(
    raw_folder= path + "4_raw/",
    processed_folder= path + "4_processed/",
    gt_path= path + "groundtruth.txt",
    output_root= path
)



----- raw: 10.JPG -----
Tesseract: 69.07% | EasyOCR: 62.60%

----- raw: 1.JPG -----
Tesseract: 95.73% | EasyOCR: 73.22%

----- raw: 5.JPG -----
Tesseract: 91.40% | EasyOCR: 80.34%

----- raw: 4.JPG -----
Tesseract: 69.59% | EasyOCR: 78.63%

----- raw: 3.JPG -----
Tesseract: 95.07% | EasyOCR: 58.05%

----- raw: 2.JPG -----
Tesseract: 86.87% | EasyOCR: 65.50%

----- raw: 6.JPG -----
Tesseract: 82.50% | EasyOCR: 56.73%

----- raw: 9.JPG -----
Tesseract: 41.09% | EasyOCR: 54.55%

----- raw: 8.JPG -----
Tesseract: 75.22% | EasyOCR: 76.58%

----- raw: 7.JPG -----
Tesseract: 88.39% | EasyOCR: 80.61%


----- processed: 1_processed.JPG -----
Tesseract: 96.04% | EasyOCR: 58.25%

----- processed: 2_processed.JPG -----
Tesseract: 95.93% | EasyOCR: 72.63%

----- processed: 10_processed.JPG -----
Tesseract: 29.54% | EasyOCR: 27.04%

----- processed: 8_processed.JPG -----
Tesseract: 92.73% | EasyOCR: 56.27%

----- processed: 7_processed.JPG -----
Tesseract: 93.70% | EasyOCR: 71.44%

----- processed

'/content/drive/My Drive/ocr_dataset/set_4/summary.csv'

In [70]:
path = '/content/drive/My Drive/ocr_dataset/set_5/'

evaluate_two_folders(
    raw_folder= path + "5_raw/",
    processed_folder= path + "5_processed/",
    gt_path= path + "groundtruth.txt",
    output_root= path
)



----- raw: 10.JPG -----
Tesseract: 94.69% | EasyOCR: 77.68%

----- raw: 1.JPG -----
Tesseract: 63.86% | EasyOCR: 32.63%

----- raw: 4.JPG -----
Tesseract: 96.44% | EasyOCR: 66.27%

----- raw: 3.JPG -----
Tesseract: 95.23% | EasyOCR: 55.89%

----- raw: 2.JPG -----
Tesseract: 60.03% | EasyOCR: 27.08%

----- raw: 9.JPG -----
Tesseract: 85.12% | EasyOCR: 81.40%

----- raw: 8.JPG -----
Tesseract: 30.70% | EasyOCR: 82.08%

----- raw: 7.JPG -----
Tesseract: 52.96% | EasyOCR: 73.18%

----- raw: 6.JPG -----
Tesseract: 95.29% | EasyOCR: 57.52%

----- raw: 5.JPG -----
Tesseract: 20.53% | EasyOCR: 47.88%


----- processed: 2_processed.JPG -----
Tesseract: 64.53% | EasyOCR: 25.09%

----- processed: 3_processed.JPG -----
Tesseract: 95.93% | EasyOCR: 78.10%

----- processed: 1_processed.JPG -----
Tesseract: 93.82% | EasyOCR: 63.17%

----- processed: 10_processed.JPG -----
Tesseract: 96.24% | EasyOCR: 74.91%

----- processed: 7_processed.JPG -----
Tesseract: 96.93% | EasyOCR: 70.30%

----- processed

'/content/drive/My Drive/ocr_dataset/set_5/summary.csv'

In [71]:
path = '/content/drive/My Drive/ocr_dataset/set_6/'

evaluate_two_folders(
    raw_folder= path + "6_raw/",
    processed_folder= path + "6_processed/",
    gt_path= path + "groundtruth.txt",
    output_root= path
)



----- raw: 7.JPG -----
Tesseract: 93.15% | EasyOCR: 72.27%

----- raw: 6.JPG -----
Tesseract: 90.27% | EasyOCR: 64.47%

----- raw: 5.JPG -----
Tesseract: 23.20% | EasyOCR: 54.69%

----- raw: 4.JPG -----
Tesseract: 91.43% | EasyOCR: 72.75%

----- raw: 3.JPG -----
Tesseract: 30.88% | EasyOCR: 66.28%

----- raw: 2.JPG -----
Tesseract: 58.08% | EasyOCR: 50.34%

----- raw: 10.JPG -----
Tesseract: 45.86% | EasyOCR: 50.76%

----- raw: 1.JPG -----
Tesseract: 60.54% | EasyOCR: 42.66%

----- raw: 9.JPG -----
Tesseract: 58.73% | EasyOCR: 47.98%

----- raw: 8.JPG -----
Tesseract: 93.05% | EasyOCR: 45.50%


----- processed: 2_processed.JPG -----
Tesseract: 95.76% | EasyOCR: 74.08%

----- processed: 1_processed.JPG -----
Tesseract: 65.92% | EasyOCR: 48.46%

----- processed: 10_processed.JPG -----
Tesseract: 36.85% | EasyOCR: 21.69%

----- processed: 7_processed.JPG -----
Tesseract: 95.40% | EasyOCR: 73.11%

----- processed: 6_processed.JPG -----
Tesseract: 82.30% | EasyOCR: 77.46%

----- processed

'/content/drive/My Drive/ocr_dataset/set_6/summary.csv'

In [72]:
path = '/content/drive/My Drive/ocr_dataset/set_7/'

evaluate_two_folders(
    raw_folder= path + "7_raw/",
    processed_folder= path + "7_processed/",
    gt_path= path + "groundtruth.txt",
    output_root= path
)



----- raw: 4.JPG -----
Tesseract: 90.30% | EasyOCR: 51.31%

----- raw: 3.JPG -----
Tesseract: 85.20% | EasyOCR: 65.55%

----- raw: 2.JPG -----
Tesseract: 34.00% | EasyOCR: 48.36%

----- raw: 10.JPG -----
Tesseract: 82.75% | EasyOCR: 71.34%

----- raw: 1.JPG -----
Tesseract: 67.09% | EasyOCR: 50.31%

----- raw: 5.JPG -----
Tesseract: 35.00% | EasyOCR: 51.64%

----- raw: 8.JPG -----
Tesseract: 96.18% | EasyOCR: 73.07%

----- raw: 7.JPG -----
Tesseract: 92.21% | EasyOCR: 33.06%

----- raw: 6.JPG -----
Tesseract: 95.59% | EasyOCR: 63.11%

----- raw: 9.JPG -----
Tesseract: 55.43% | EasyOCR: 48.41%


----- processed: 2_processed.JPG -----
Tesseract: 88.65% | EasyOCR: 71.45%

----- processed: 3_processed.JPG -----
Tesseract: 85.96% | EasyOCR: 61.16%

----- processed: 1_processed.JPG -----
Tesseract: 68.17% | EasyOCR: 54.87%

----- processed: 10_processed.JPG -----
Tesseract: 95.53% | EasyOCR: 72.57%

----- processed: 8_processed.JPG -----
Tesseract: 95.58% | EasyOCR: 59.60%

----- processed

'/content/drive/My Drive/ocr_dataset/set_7/summary.csv'

In [73]:
path = '/content/drive/My Drive/ocr_dataset/set_8/'

evaluate_two_folders(
    raw_folder= path + "8_raw/",
    processed_folder= path + "8_processed/",
    gt_path= path + "groundtruth.txt",
    output_root= path
)



----- raw: 10.JPG -----
Tesseract: 61.63% | EasyOCR: 50.90%

----- raw: 1.JPG -----
Tesseract: 76.23% | EasyOCR: 66.09%

----- raw: 5.JPG -----
Tesseract: 68.93% | EasyOCR: 61.82%

----- raw: 4.JPG -----
Tesseract: 46.03% | EasyOCR: 56.72%

----- raw: 3.JPG -----
Tesseract: 76.23% | EasyOCR: 63.89%

----- raw: 2.JPG -----
Tesseract: 53.10% | EasyOCR: 58.33%

----- raw: 8.JPG -----
Tesseract: 76.00% | EasyOCR: 68.67%

----- raw: 7.JPG -----
Tesseract: 63.11% | EasyOCR: 59.04%

----- raw: 6.JPG -----
Tesseract: 27.45% | EasyOCR: 56.27%

----- raw: 9.JPG -----
Tesseract: 58.14% | EasyOCR: 57.69%


----- processed: 4_processed.JPG -----
Tesseract: 50.00% | EasyOCR: 61.30%

----- processed: 3_processed.JPG -----
Tesseract: 77.41% | EasyOCR: 65.05%

----- processed: 2_processed.JPG -----
Tesseract: 61.18% | EasyOCR: 57.17%

----- processed: 1_processed.JPG -----
Tesseract: 77.10% | EasyOCR: 62.02%

----- processed: 10_processed.JPG -----
Tesseract: 87.23% | EasyOCR: 53.29%

----- processed

'/content/drive/My Drive/ocr_dataset/set_8/summary.csv'

In [74]:
path = '/content/drive/My Drive/ocr_dataset/set_9/'

evaluate_two_folders(
    raw_folder= path + "9_raw/",
    processed_folder= path + "9_processed/",
    gt_path= path + "groundtruth.txt",
    output_root= path
)



----- raw: 6.JPG -----
Tesseract: 14.29% | EasyOCR: 69.05%

----- raw: 5.JPG -----
Tesseract: 92.49% | EasyOCR: 80.48%

----- raw: 4.JPG -----
Tesseract: 10.48% | EasyOCR: 59.05%

----- raw: 3.JPG -----
Tesseract: 0.95% | EasyOCR: 19.52%

----- raw: 2.JPG -----
Tesseract: 0.95% | EasyOCR: 41.90%

----- raw: 10.JPG -----
Tesseract: 20.00% | EasyOCR: 67.14%

----- raw: 1.JPG -----
Tesseract: 43.32% | EasyOCR: 42.86%

----- raw: 9.JPG -----
Tesseract: 8.57% | EasyOCR: 68.10%

----- raw: 8.JPG -----
Tesseract: 21.43% | EasyOCR: 50.00%

----- raw: 7.JPG -----
Tesseract: 41.90% | EasyOCR: 30.61%


----- processed: 5_processed.JPG -----
Tesseract: 60.42% | EasyOCR: 70.00%

----- processed: 3_processed.JPG -----
Tesseract: 3.81% | EasyOCR: 13.81%

----- processed: 4_processed.JPG -----
Tesseract: 23.89% | EasyOCR: 17.14%

----- processed: 9_processed.JPG -----
Tesseract: 93.93% | EasyOCR: 82.38%

----- processed: 7_processed.JPG -----
Tesseract: 93.06% | EasyOCR: 83.81%

----- processed: 10_

'/content/drive/My Drive/ocr_dataset/set_9/summary.csv'

In [76]:
path = '/content/drive/My Drive/ocr_dataset/set_10/'

evaluate_two_folders(
    raw_folder= path + "10_raw/",
    processed_folder= path + "10_processed/",
    gt_path= path + "groundtruth.txt",
    output_root= path
)



----- raw: 1.JPG -----
Tesseract: 94.56% | EasyOCR: 66.67%

----- raw: 6.JPG -----
Tesseract: 5.73% | EasyOCR: 53.80%

----- raw: 5.JPG -----
Tesseract: 21.35% | EasyOCR: 61.81%

----- raw: 4.JPG -----
Tesseract: 92.71% | EasyOCR: 80.35%

----- raw: 3.JPG -----
Tesseract: 87.86% | EasyOCR: 72.69%

----- raw: 2.JPG -----
Tesseract: 34.85% | EasyOCR: 65.38%

----- raw: 10.JPG -----
Tesseract: 49.24% | EasyOCR: 33.39%

----- raw: 9.JPG -----
Tesseract: 57.60% | EasyOCR: 55.38%

----- raw: 8.JPG -----
Tesseract: 33.57% | EasyOCR: 44.09%

----- raw: 7.JPG -----
Tesseract: 39.24% | EasyOCR: 48.77%


----- processed: 4_processed.JPG -----
Tesseract: 96.06% | EasyOCR: 71.11%

----- processed: 3_processed.JPG -----
Tesseract: 96.21% | EasyOCR: 74.44%

----- processed: 2_processed.JPG -----
Tesseract: 96.96% | EasyOCR: 72.81%

----- processed: 1_processed.JPG -----
Tesseract: 96.61% | EasyOCR: 74.91%

----- processed: 10_processed.JPG -----
Tesseract: 60.00% | EasyOCR: 37.07%

----- processed:

'/content/drive/My Drive/ocr_dataset/set_10/summary.csv'