In [2]:
import os
import json
import cv2
from PIL import Image

Visualization of Bounding Boxes

In [3]:
def find_image_path(base_folder, base_filename, extensions):
    for ext in extensions:
        image_path = os.path.join(base_folder, f"{base_filename}.{ext}")
        if os.path.exists(image_path):
            return image_path, ext  # Return both path and extension
    return None, None

def draw_bounding_boxes(image_folder, annotations_folder, output_folder):
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    # Supported extensions
    extensions = ['jpg', 'jpeg', 'png']

    for filename in os.listdir(annotations_folder):
        if filename.endswith(".json"):
            json_path = os.path.join(annotations_folder, filename)
            with open(json_path, 'r') as file:
                data = json.load(file)

                base_filename = os.path.splitext(filename)[0]
                image_path, image_ext = find_image_path(image_folder, base_filename, extensions)

                if image_path:
                    image = cv2.imread(image_path)

                    for obj in data['objects']:
                        points = obj['points']['exterior']
                        if points:
                            start_point = (points[0][0], points[0][1])
                            end_point = (points[1][0], points[1][1])
                            color = (255, 0, 0)  # Blue color in RGB
                            thickness = 2
                            image = cv2.rectangle(image, start_point, end_point, color, thickness)

                    output_filename = f"{base_filename}.{image_ext}"  # Use the same extension as input image
                    output_path = os.path.join(output_folder, output_filename)
                    cv2.imwrite(output_path, image)
                else:
                    print(f"No matching image file found for JSON file: {filename}")

# Output images with bounding boxes to output folder

image_folder = "Book/img/"
annotations_folder = "Book/ann/"
output_folder = "Book/output/"
draw_bounding_boxes(image_folder, annotations_folder, output_folder)


Preprocessing: Cropping Bounding Boxes

In [13]:
def find_image_path(base_folder, base_filename, extensions):
    for ext in extensions:
        image_path = os.path.join(base_folder, f"{base_filename}.{ext}")
        if os.path.exists(image_path):
            return image_path, ext  # Return both path and extension
    return None, None

def crop_bounding_boxes(image_folder, annotations_folder, output_folder):
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    # Supported extensions
    extensions = ['jpg', 'jpeg', 'png']

    for filename in os.listdir(annotations_folder):
        if filename.endswith(".json"):
            json_path = os.path.join(annotations_folder, filename)
            with open(json_path, 'r') as file:
                data = json.load(file)

                base_filename = os.path.splitext(filename)[0]
                image_path, image_ext = find_image_path(image_folder, base_filename, extensions)

                if image_path:
                    image = cv2.imread(image_path)
                    if image is None or not image.size:
                        print(f"Failed to load image: {image_path}")
                        continue

                    # Convert OpenCV image format to Pillow format
                    image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

                    for i, obj in enumerate(data['objects']):
                        points = obj['points']['exterior']
                        if points:
                            # Sort points to ensure proper ordering for cropping
                            x_min, y_min = min(points[0][0], points[1][0]), min(points[0][1], points[1][1])
                            x_max, y_max = max(points[0][0], points[1][0]), max(points[0][1], points[1][1])

                            # Ensure cropping coordinates are within the image dimensions
                            if 0 <= x_min < x_max <= image.width and 0 <= y_min < y_max <= image.height:
                                # Crop the image using Pillow
                                cropped_image = image.crop((x_min, y_min, x_max, y_max))
                                object_name = obj['classTitle']
                                output_filename = f"{base_filename}_{object_name}_{i}.{image_ext}"  # Save with the original extension
                                output_path = os.path.join(output_folder, output_filename)

                                # Save the cropped image with Pillow
                                cropped_image.save(output_path)
                            else:
                                print(f"Invalid cropping coordinates for file: {filename}")
                else:
                    print(f"No matching image file found for JSON file: {filename}")

# Output cropped bounding boxes to cropped folder

image_folder = "Book/img/"
annotations_folder = "Book/ann/"
output_folder = "Book/cropped/"
crop_bounding_boxes(image_folder, annotations_folder, output_folder)


Invalid cropping coordinates for file: 00672.json
Invalid cropping coordinates for file: 00799.json
Invalid cropping coordinates for file: 00432.json
Invalid cropping coordinates for file: 00384.json
Invalid cropping coordinates for file: 00783.json
Invalid cropping coordinates for file: 00306.json
Invalid cropping coordinates for file: 00214.json
Invalid cropping coordinates for file: 0068.json
Invalid cropping coordinates for file: 0068.json
Invalid cropping coordinates for file: 00468.json
Invalid cropping coordinates for file: 0069.json
Invalid cropping coordinates for file: 00716.json
Invalid cropping coordinates for file: 00645.json
Invalid cropping coordinates for file: 00215.json
Invalid cropping coordinates for file: 00491.json
Invalid cropping coordinates for file: 00307.json
Invalid cropping coordinates for file: 00540.json
Invalid cropping coordinates for file: 00110.json
Invalid cropping coordinates for file: 00686.json
Invalid cropping coordinates for file: 0065.json
Inva

Arabic OCR Model

In [None]:
'''brew install tesseract
brew install tesseract-lang  # This will install language support including Arabic'''

# !pip install pytesseract

import pytesseract
pytesseract.pytesseract.tesseract_cmd = '/opt/homebrew/bin/tesseract'  # use the path found from your terminal

# test a single image

from PIL import Image

# Load an image from file
image_path = 'Book/cropped/001_Body text_1.png'
image = Image.open(image_path)

# Use Tesseract to do OCR on the image
text = pytesseract.image_to_string(image, lang='ara')

print(text)

In [None]:
from IPython.display import display

# Display the image
display(image)

# Print the extracted text
print(text)

In [None]:
# Further preprocessing
# OCR results exported to txt file

import cv2
import os
from PIL import Image, ImageFilter, ImageEnhance

def preprocess_image_for_ocr(image):
    # Convert image to grayscale
    image = image.convert('L')
    
    # Enhance the image
    contrast = ImageEnhance.Contrast(image)
    image = contrast.enhance(2)
    
    sharpness = ImageEnhance.Sharpness(image)
    image = sharpness.enhance(2)
    
    # Apply thresholding to get a binary image
    image = image.point(lambda x: 0 if x < 128 else 255)
    
    # Remove noise and smooth the image
    image = image.filter(ImageFilter.MedianFilter())
    
    return image

def ocr_arabic(image_path):
    # Load the image using Pillow
    image = Image.open(image_path)
    image = preprocess_image_for_ocr(image)
    
    # Perform OCR using Tesseract with Arabic language and appropriate configurations
    custom_config = r'--oem 3 --psm 6'
    text = pytesseract.image_to_string(image, lang='ara', config=custom_config)
    
    return text

# OCR results exported to txt file
cropped_images_folder = "Book/cropped/"  # Folder where cropped images are located
book_folder = "Book"  # The Book folder path where you want the OCR results text file
ocr_results_filename = "all_ocr_results.txt"
ocr_results_path = os.path.join(book_folder, ocr_results_filename)

# Make sure the Book directory exists, create if not
os.makedirs(book_folder, exist_ok=True)

# Open the file once, and write all OCR results to it
with open(ocr_results_path, 'w', encoding='utf-8') as results_file:
    for cropped_image_filename in os.listdir(cropped_images_folder):
        if cropped_image_filename.endswith(('.jpg', '.jpeg', '.png')):
            image_path = os.path.join(cropped_images_folder, cropped_image_filename)
            text = ocr_arabic(image_path)
            # Write the filename and its OCR result to the text file
            results_file.write(f"Extracted Text from {cropped_image_filename}:\n{text}\n\n")
            print(f"Processed OCR for {cropped_image_filename}")

# Inform the user where the results have been written
print(f"All OCR results have been written to: {ocr_results_path}")


In [None]:
# training the model with our own data

# create .box files from JSON

import json
import os

def create_box_files(annotations_folder, images_folder, output_folder):
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    for annotation_filename in os.listdir(annotations_folder):
        if annotation_filename.endswith('.json'):
            # Read JSON file
            json_path = os.path.join(annotations_folder, annotation_filename)
            with open(json_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

            # Construct box file content
            box_content = ''
            for obj in data['objects']:
                character = obj['classTitle']
                points = obj['points']['exterior']
                x_min, y_min = points[0]
                x_max, y_max = points[1]
                # Box file format: char x_min y_min x_max y_max 0
                box_content += f"{character} {x_min} {y_min} {x_max} {y_max} 0\n"

            # Write to box file
            base_filename = os.path.splitext(annotation_filename)[0]
            box_file_path = os.path.join(output_folder, f"{base_filename}.box")
            with open(box_file_path, 'w', encoding='utf-8') as box_file:
                box_file.write(box_content)

# Example usage
image_folder = "Book/img/"
annotations_folder = "Book/ann/"
output_folder = "Book/box/"
create_box_files(annotations_folder, image_folder, output_folder)


In [3]:
# Use Tesstrain to train the model with our images

'''
git clone https://github.com/tesseract-ocr/tesstrain.git
cd tesstrain
pip install -r requirements.txt
'''

# convert images to tiff format

# Directory where your current images are stored
image_directory = "Book/img/"

# Directory where you want to save .tiff images
tif_directory = "Book/tiff/"

# Create the 'tiff' directory if it doesn't exist
if not os.path.exists(tif_directory):
    os.makedirs(tif_directory)

# Function to convert images
def convert_images_to_tif(source_dir, target_dir):
    # Supported formats
    formats = ('.png', '.jpeg', '.jpg')

    for image_filename in os.listdir(source_dir):
        if image_filename.endswith(formats):
            # Define the source and target file paths
            source_file_path = os.path.join(source_dir, image_filename)
            target_file_path = os.path.join(target_dir, os.path.splitext(image_filename)[0] + '.tif')

            # Open the image using PIL
            image = Image.open(source_file_path)
            # Convert the image to 'L' mode (this converts it to grayscale)
            image = image.convert('L')
            # Save the image in TIFF format
            image.save(target_file_path, 'TIFF')

# Run the conversion
convert_images_to_tif(image_directory, tif_directory)

print("Conversion complete. TIFF files are saved in", tif_directory)

Conversion complete. TIFF files are saved in Book/tiff/


In [10]:
# convert .box to .gt.txt

'''make tesseract-langdata'''
'''python generate_gt_from_box.py --help'''
'''python generate_gt_from_box.py -t -b''' # output not as expected

import json
import os

import json
import os

def json_to_gt(input_directory, output_directory, image_extension=".tif"):
    # Create the output directory if it doesn't exist
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # List to hold the names of the .gt.txt files created
    gt_files_created = []

    # Loop through all files in the provided input directory
    for filename in os.listdir(input_directory):
        # Check for JSON file extension
        if filename.endswith(".json"):
            # Full path to the json file
            json_file_path = os.path.join(input_directory, filename)

            # Remove the file extension to get the base name
            base_name = os.path.splitext(filename)[0]

            # Open and load the JSON file
            with open(json_file_path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
            
            # Loop through each object in the JSON file
            for obj in json_data["objects"]:
                # Look for the transcription tag
                transcription_tags = [tag for tag in obj["tags"] if tag.get("name") == "Transcription"]
                if transcription_tags:
                    # Use the object ID to create a corresponding .gt.txt filename
                    gt_filename = f"{base_name}_{obj['id']}.gt.txt"
                    gt_file_path = os.path.join(output_directory, gt_filename)
                    
                    # Write the transcription to the .gt.txt file
                    with open(gt_file_path, 'w', encoding='utf-8') as gt_file:
                        gt_file.write(transcription_tags[0]["value"])
                    
                    # Add the created file name to the list
                    gt_files_created.append(gt_file_path)

    # Return the list of .gt.txt files created
    return gt_files_created

gt_files = json_to_gt('Book/ann/', 'Book/gt/')
print(gt_files)  # This will print out the list of .gt.txt files created


['Book/gt/0048_972014901.gt.txt', 'Book/gt/00808_972020930.gt.txt', 'Book/gt/00808_972020931.gt.txt', 'Book/gt/00222_972015802.gt.txt', 'Book/gt/00222_972015804.gt.txt', 'Book/gt/00672_972012959.gt.txt', 'Book/gt/00672_972012954.gt.txt', 'Book/gt/00388_972019955.gt.txt', 'Book/gt/00367_972023031.gt.txt', 'Book/gt/00367_972023037.gt.txt', 'Book/gt/00737_972012316.gt.txt', 'Book/gt/00449_972021501.gt.txt', 'Book/gt/00449_972021504.gt.txt', 'Book/gt/00449_972021494.gt.txt', 'Book/gt/00625_972013590.gt.txt', 'Book/gt/00625_972013589.gt.txt', 'Book/gt/00760_972012777.gt.txt', 'Book/gt/00330_972019794.gt.txt', 'Book/gt/007_972021051.gt.txt', 'Book/gt/00799_972022600.gt.txt', 'Book/gt/00633_972013526.gt.txt', 'Book/gt/00326_972019785.gt.txt', 'Book/gt/00776_972022733.gt.txt', 'Book/gt/00408_972022184.gt.txt', 'Book/gt/00664_972013030.gt.txt', 'Book/gt/00721_972012182.gt.txt', 'Book/gt/00371_972023117.gt.txt', 'Book/gt/00371_972023081.gt.txt', 'Book/gt/00166_972019195.gt.txt', 'Book/gt/00536_9

In [None]:
# make training model

%%bash 

cd tesstrain
make training MODEL_NAME=arabic_model START_MODEL=ara PSM=7 TESSDATA=/path/to/tessdata
