In [None]:
# Arabic OCR Tool -- Prototype V2
# Author: Hicham Yezza
# Updated: March 2025
# Purpose: Extract text from Arabic documents using Tesseract OCR

In [None]:
# Install dependencies
!sudo apt-get update
!sudo apt install tesseract-ocr -y
!pip install pytesseract Pillow opencv-python numpy tqdm

# Add Arabic language support
!sudo apt-get install tesseract-ocr-ara -y

# Verify language files are properly installed
!ls /usr/share/tesseract-ocr/4.00/tessdata/ | grep ara

import pytesseract
from PIL import Image
import cv2
import numpy as np
import os
import time
from tqdm.notebook import tqdm
from google.colab import files
import re


def preprocess_image(image, scale_percent=200, close_kernel=1):
    """Enhance image quality for better Arabic OCR results.

    The image is reduced to 8-bit grayscale, optionally rescaled, binarized
    with adaptive Gaussian thresholding and optionally cleaned up with a
    morphological closing.

    Args:
        image: A PIL image or a NumPy array. PIL images of any mode are
            converted with ``Image.convert("L")``. Arrays with three
            dimensions are treated as OpenCV-style BGR(A) data; arrays with
            two dimensions are used as-is.
        scale_percent: Resize factor in percent; 100 disables resizing.
            Must be positive.
        close_kernel: Side length of the square structuring element used for
            the closing step. The default of 1 leaves the binarized image
            untouched (a 1x1 closing is the identity).

    Returns:
        PIL.Image.Image: the binarized, single-channel image.

    Raises:
        ValueError: if ``scale_percent`` is not positive.
    """
    if scale_percent <= 0:
        raise ValueError("scale_percent must be positive")

    if isinstance(image, Image.Image):
        # PIL stores colour data as RGB, so cv2.COLOR_BGR2GRAY would apply the
        # red and blue weights the wrong way round. Letting Pillow do the
        # conversion also copes with palette, alpha and 1-bit images.
        gray = np.array(image.convert("L"))
    else:
        pixels = np.asarray(image)
        if pixels.ndim == 3:
            gray = cv2.cvtColor(pixels, cv2.COLOR_BGR2GRAY)
        else:
            gray = pixels

    # Sometimes upscaling helps with small or low-quality text
    # (especially with diacritics in Arabic)
    if scale_percent != 100:
        height, width = gray.shape[:2]
        # Clamp to one pixel so tiny inputs never produce an empty target size
        new_size = (
            max(1, int(width * scale_percent / 100)),
            max(1, int(height * scale_percent / 100)),
        )
        # Cubic interpolation for enlarging, area averaging for shrinking
        interpolation = cv2.INTER_CUBIC if scale_percent > 100 else cv2.INTER_AREA
        gray = cv2.resize(gray, new_size, interpolation=interpolation)

    # Adaptive thresholding - works better than global for mixed lighting
    binary = cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 11, 2
    )

    # Optional closing; skipped for a 1x1 kernel because it would change nothing
    if close_kernel > 1:
        kernel = np.ones((close_kernel, close_kernel), np.uint8)
        binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

    # Convert back to PIL for pytesseract
    return Image.fromarray(binary)


def perform_ocr(image, lang='ara'):
    """Run OCR on an image with settings tuned for Arabic text.

    The image is preprocessed and recognised with the LSTM engine, then the
    result is passed through ``clean_ocr_output``. If any of those steps
    fails, OCR is retried on the untouched image with default settings and
    that raw result is returned without cleanup.

    Args:
        image: Image to recognise; passed to ``preprocess_image`` and, on the
            fallback path, directly to pytesseract.
        lang: Tesseract language code(s) used on both the main and fallback
            paths. Note that ``clean_ocr_output`` strips non-Arabic letters
            regardless of this value.

    Returns:
        str: the recognised text, or the literal string
        ``"ERROR: OCR processing failed"`` when both attempts fail.
    """
    # LSTM neural network (OEM 1) with single text block assumption (PSM 6).
    # The language is passed via ``lang`` so the parameter is actually honoured.
    config = r'--oem 1 --psm 6'

    try:
        # Preprocessing lives inside the try so that a preprocessing failure
        # still reaches the fallback below.
        processed_image = preprocess_image(image)
        text = pytesseract.image_to_string(processed_image, lang=lang, config=config)

        # Sometimes Tesseract adds garbage chars - clean them
        return clean_ocr_output(text)
    except Exception as e:
        print(f"OCR failed: {e}")
        # Fallback to standard settings on the original image
        try:
            return pytesseract.image_to_string(image, lang=lang)
        except Exception as fallback_error:
            print(f"Fallback OCR failed: {fallback_error}")
            return "ERROR: OCR processing failed"


def clean_ocr_output(text):
    """Remove common OCR errors and formatting issues.

    Characters outside the Arabic block (U+0600-U+06FF), digits, whitespace
    and a small set of punctuation marks are dropped. Runs of spaces/tabs are
    collapsed to one space, spaces next to line breaks are trimmed, and blank
    lines are removed. Line breaks between non-empty lines are preserved.

    Args:
        text: Raw text returned by the OCR engine.

    Returns:
        str: the cleaned text.
    """
    # Remove non-Arabic/non-punctuation characters that are likely errors
    # Keep Arabic chars, numbers, punctuation and whitespace
    text = re.sub(r'[^\u0600-\u06FF\s\d\.,;:!?()[\]{}\'\"/-]', '', text)

    # Collapse horizontal whitespace only. Matching newlines here as well
    # would glue separate lines together and leave nothing for the
    # blank-line removal below to find.
    text = re.sub(r'[^\S\n]+', ' ', text)

    # Drop the single space that may remain on either side of a line break
    text = re.sub(r' ?\n ?', '\n', text)

    # Remove empty lines
    text = re.sub(r'\n{2,}', '\n', text)

    return text


def process_file(image_path):
    """Process a single image file and return the extracted text.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        tuple: ``(text, processing_time, error)``. On success ``error`` is
        ``None``. On failure ``text`` is ``""`` and ``error`` is a non-empty
        message. ``processing_time`` is the elapsed time in seconds.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()

    try:
        # The context manager closes the file even when OCR raises
        with Image.open(image_path) as img:
            text = perform_ocr(img)
    except Exception as e:
        # Fall back to the exception type so the message is never empty;
        # callers treat a falsy error as success.
        return "", time.perf_counter() - start, str(e) or type(e).__name__

    processing_time = time.perf_counter() - start

    # perform_ocr reports total failure through this sentinel string rather
    # than an exception; surface it as an error so it is not saved as text.
    if text == "ERROR: OCR processing failed":
        return "", processing_time, "OCR processing failed"

    return text, processing_time, None


def process_single_file():
    """Interactive workflow: upload one image, OCR it, print and save the text.

    Prompts for an upload through ``files.upload()``, runs ``process_file`` on
    the first uploaded name, prints the extracted text between separator
    lines and writes it to ``<name without extension>_text.txt``. Returns
    ``None`` in every case; problems are reported by printing.
    """
    print("Upload an Arabic document image:")
    uploaded = files.upload()
    if not uploaded:
        print("No file uploaded.")
        return

    # Only the first uploaded entry is processed, even if several were sent
    image_path = next(iter(uploaded.keys()))
    print(f"Processing '{image_path}'...")

    text, proc_time, error = process_file(image_path)
    if error:
        print(f"❌ Error: {error}")
        return

    print(f"✓ Processing completed in {proc_time:.2f} seconds")

    # Display results framed by separator lines
    rule = "=" * 40
    print("\n" + rule)
    print("EXTRACTED TEXT:")
    print(rule)
    print(text)
    print(rule)

    # Save next to the upload, swapping the extension for a "_text.txt" suffix
    stem, _ = os.path.splitext(image_path)
    output_file = f"{stem}_text.txt"
    with open(output_file, 'w', encoding='utf-8') as handle:
        handle.write(text)

    print(f"\nText saved to {output_file}")


def process_multiple_files():
    """Batch workflow: upload several images and OCR each of them.

    Every uploaded name is run through ``process_file``. Results (or the
    error message) are appended to ``all_extracted_texts.txt``; each
    successful file additionally gets its own
    ``<name without extension>_text.txt``. A summary is printed at the end.
    Returns ``None``.
    """
    print("Upload one or more Arabic document images:")
    uploaded = files.upload()
    if not uploaded:
        print("No files uploaded.")
        return

    combined_file = "all_extracted_texts.txt"
    succeeded = 0
    failed = 0

    # Separator lines reused for every entry in the combined report
    entry_rule = "-" * 50 + "\n"
    closing_rule = "=" * 50 + "\n\n"

    with open(combined_file, 'w', encoding='utf-8') as outfile:
        outfile.write(f"ARABIC OCR RESULTS - {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        for filename in tqdm(uploaded.keys(), desc="Processing files"):
            # Header for this entry is written before OCR starts
            outfile.write(f"FILE: {filename}\n")
            outfile.write(entry_rule)

            text, proc_time, error = process_file(filename)

            if not error:
                outfile.write(f"{text}\n\n")
                print(f"✓ {filename}: Completed in {proc_time:.2f}s")

                # Per-image output file alongside the combined report
                stem, _ = os.path.splitext(filename)
                with open(f"{stem}_text.txt", 'w', encoding='utf-8') as single:
                    single.write(text)

                succeeded += 1
            else:
                print(f"❌ {filename}: {error}")
                outfile.write(f"Error processing file: {error}\n\n")
                failed += 1

            outfile.write(closing_rule)

    # Show summary
    print(f"\nProcessing complete: {succeeded} succeeded, {failed} failed")
    print(f"Combined results saved to '{combined_file}'")


def verify_installation():
    """Check that Tesseract, its Arabic language data and OpenCV are usable.

    Prints one status line per check.

    Returns:
        bool: ``True`` when every check passes; ``False`` when the ``ara``
        language is not listed or any check raises an exception.
    """
    try:
        # Tesseract binary reachable through pytesseract
        version = pytesseract.get_tesseract_version()
        print(f"✓ Tesseract OCR v{version} detected")

        # Arabic must be among the languages Tesseract reports
        if 'ara' not in pytesseract.get_languages():
            print("❌ Arabic language pack NOT FOUND")
            print("Try reinstalling with: sudo apt-get install tesseract-ocr-ara")
            return False
        print("✓ Arabic language pack installed")

        # Smoke-test OpenCV with a trivial threshold on a blank image
        blank = np.zeros((10, 10), dtype=np.uint8)
        cv2.threshold(blank, 127, 255, cv2.THRESH_BINARY)
        print("✓ OpenCV is working")
    except Exception as exc:
        print(f"❌ Setup verification failed: {exc}")
        return False

    return True


# Program entry point: banner, one installation check, then a menu loop
if __name__ == "__main__":
    print("\n📄 ARABIC OCR TOOL 📄")
    print("---------------------")

    # A failed check only prints a warning; the menu is still shown and
    # option 3 re-runs the check.
    if not verify_installation():
        print("\nPlease fix installation issues before continuing.")

    # Menu choices that map straight onto a function call
    menu_actions = {
        '1': process_single_file,
        '2': process_multiple_files,
        '3': verify_installation,
    }
    menu_lines = (
        "\nOptions:",
        "  1. Process single image",
        "  2. Process multiple images",
        "  3. Verify installation",
        "  4. Exit",
    )

    while True:
        for line in menu_lines:
            print(line)

        choice = input("\nEnter choice (1-4): ").strip()

        if choice == '4':
            print("Exiting program.")
            break

        action = menu_actions.get(choice)
        if action is None:
            print("Invalid choice, please try again.")
        else:
            action()

# Uncomment the option you want to run directly
# process_single_file()
# process_multiple_files()

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,692 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,236 kB]
Get:13 https://cloud.r-project.org/bin/linux/ubuntu ja

Saving BBCM - Arabic OCR - Test Image 1.png to BBCM - Arabic OCR - Test Image 1.png
Processing 'BBCM - Arabic OCR - Test Image 1.png'...
✓ Processing completed in 1.16 seconds

EXTRACTED TEXT:
إن مه شاريج المحركة يي بحاجهة َ
1 ولابغفي أن الافة هي مذد المجاهدين ١1
ا 0 وإالكخه سقو يوبا لخد وي أ 1 0
37 

Text saved to BBCM - Arabic OCR - Test Image 1_text.txt

Options:
  1. Process single image
  2. Process multiple images
  3. Verify installation
  4. Exit

Enter choice (1-4): 4
Exiting program.
