# Run OCR Pipeline

## Init
Define all constant variables (machine-specific paths) here.

In [None]:
# Machine-specific paths used by the whole pipeline; adjust them before running.

# Root folder of this project; it is appended to sys.path so the project packages can be imported.
sys_path = r'C:\Users\sgala\PycharmProjects\OCR\OCR'
# Local copy of tesstrain: download it from https://github.com/tesseract-ocr/tesstrain/tree/main and follow its instructions.
# Recommended - watch: https://www.youtube.com/watch?v=SvhoBT-PnME&ab_channel=SL7Tech
path_to_tesstrain_code = r"C:/Users/sgala/Documents/python_projects/tesseract/tesstrain"
# Default tesstrain data folder; passed as TESSDATA to the training make command.
relative_path_to_starting_model = '../tessdata/'
# Tesseract executable; install Tesseract from: https://github.com/UB-Mannheim/tesseract/wiki
tesseract_exe_path = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Located inside the Tesseract folder after install; exported as TESSDATA_PREFIX.
tessdata_path = r"C:\Program Files\Tesseract-OCR\tessdata\script"
# Path to a bash executable on this computer (mostly a problem on Windows); used to run the make commands.
bash_path = r"C:/Program Files/Git/bin/bash.exe"

In [None]:
import shutil
import sys
sys.path += [sys_path]
from generate_pdfs.create_pdf_dataset import create_pdf_dataset
from tesseract.prepare_train_data import pdf_to_train_images
from tesseract.prepare_validation_data import process_pdf_to_images_and_data
from tesseract.metrics import evaluate_ocr_batch
from tesseract.match_gt_pred import match_predictions_to_gt
import pytesseract
import pandas as pd
import os
import cv2
import subprocess
import numpy as np


# Point pytesseract at the Tesseract executable configured in the Init cell.
pytesseract.pytesseract.tesseract_cmd = tesseract_exe_path
# Export the tessdata folder through TESSDATA_PREFIX
# (the variable Tesseract consults for its data files - see the Tesseract docs).
os.environ["TESSDATA_PREFIX"] = tessdata_path

## Pipeline config
    Define all the pipeline parameters:
    for every stage - whether to run that stage, and all of its parameters.

In [None]:
# --- Stage: generate PDFs ---
generate_pdfs = False  # whether to run the PDF generation stage
# Passed to create_pdf_dataset as pdfs_path and globbed for *.pdf when preparing data.
pdfs_dir = r"C:\Users\sgala\OCR\pipeline_2\random_pdfs"
# Passed to create_pdf_dataset as corpus_path.
corpus_path = os.path.join(sys_path,"generate_pdfs","top_500_words.csv")
n_files = 100  # passed to create_pdf_dataset as n_files
# --- Stage: prepare train / eval data ---
# Fraction of the PDFs assigned to the training split; the remainder goes to the eval split.
train_eval_ratio = 0.9
prepare_train_data = False
train_data_dir = r"C:\Users\sgala\OCR\pipeline_2\train_dataset"  # output folder for the training images
augment = False  # forwarded to pdf_to_train_images
prepare_eval_data = False
eval_data_dir = r"C:\Users\sgala\OCR\pipeline_2\eval_dataset"  # output folder for the eval images and data
# --- Stage: train ---
train_model = False
model_name = "heb_random_top_five_hundred_two" # apparently cannot contain numbers, go figure
# The trained model folder is moved here (under <model_dir>/<model_name>) after training.
model_dir = r"C:\Users\sgala\OCR\pipeline_2"

# --- Stage: evaluate ---
eval_model = True




## Generate pdfs

In [None]:
# Optional stage: call create_pdf_dataset with the output folder, corpus file
# and file count taken from the pipeline config.
if generate_pdfs:
    pdf_generation_args = {
        "pdfs_path": pdfs_dir,
        "corpus_path": corpus_path,
        "n_files": n_files,
    }
    create_pdf_dataset(**pdf_generation_args)
    

## Prepare train eval data

In [None]:
from glob import glob
# Split the PDFs in pdfs_dir into a train part and an eval part and convert
# each part with its own preparation function.
if prepare_train_data or prepare_eval_data:
    # os.path.join instead of a hand-written "\*.pdf" (an invalid escape sequence
    # that only works with the Windows separator).
    # sorted(): glob returns files in arbitrary order, so without sorting the
    # train/eval split could differ between runs and the two parts could overlap
    # when the stages are run separately.
    all_paths = sorted(glob(os.path.join(pdfs_dir, "*.pdf")))
    ratio_index = int(len(all_paths)*train_eval_ratio)
    train_paths = all_paths[:ratio_index]
    eval_paths = all_paths[ratio_index:]

    # Each stage runs only when its own flag is set.
    if prepare_train_data:
        for tp in train_paths:
            pdf_to_train_images(tp, train_data_dir, augment=augment) #dpi?
    if prepare_eval_data:
        for ep in eval_paths:
            process_pdf_to_images_and_data(ep, output_dir=eval_data_dir)

## train

Training is a bit finicky due to working with a Makefile - this is the best I managed.

In [None]:
if train_model:
    import shlex  # local import: only needed to build the shell commands below

    # Move the training data to the default ground-truth location inside tesstrain
    # (data/<model_name>-ground-truth); it is moved back after training.
    train_folder_location = os.path.join(path_to_tesstrain_code,'data',f"{model_name}-ground-truth")
    shutil.move(train_data_dir, train_folder_location)
    # The commands are executed through `bash -c`, so every interpolated value is
    # shell-quoted; otherwise a path containing spaces or shell metacharacters
    # would be split or interpreted by bash. Values made only of safe characters
    # are left unchanged by shlex.quote.
    tesstrain_dir_arg = shlex.quote(path_to_tesstrain_code)
    model_name_arg = shlex.quote(model_name)
    start_model_dir_arg = shlex.quote(relative_path_to_starting_model)
    # First command: run the training itself.
    make_command = f"make -C {tesstrain_dir_arg} training MODEL_NAME={model_name_arg} START_MODEL=heb_best TESSDATA={start_model_dir_arg} MAX_ITERATIONS=2000 LANG_TYPE=RTL --debug"
    # Second command: the `traineddata` make target for the same model.
    make_command_2 = f"make -C {tesstrain_dir_arg} traineddata MODEL_NAME={model_name_arg} --debug"


In [None]:
if train_model:
    # Show the exact commands so they can be copied into a bash shell by hand
    # if running them from the notebook fails.
    announcement = [
        "the next block will try to run these commands:",
        make_command,
        make_command_2,
        "if this doesnt work - please open bash and run it manually",
    ]
    for announcement_line in announcement:
        print(announcement_line)

In [None]:
if train_model:
    # Run both make commands in order. Previously only make_command was executed,
    # so the `traineddata` target (make_command_2) never ran even though the
    # later cell looks for the tessdata_best/*.traineddata files.
    for command in (make_command, make_command_2):
        # stderr is merged into stdout so make/tesseract errors show up in the
        # notebook output; the context manager closes the pipe and waits for exit.
        with subprocess.Popen([bash_path, "-c", command],
                              stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT,
                              shell=False) as process:
            for line in process.stdout:
                # each line already ends with a newline; errors='replace' keeps
                # the stream going on non-UTF-8 bytes in the tool output
                print(line.decode('utf-8', errors='replace'), end='')
        if process.returncode != 0:
            # Best effort: report the failure and stop instead of raising, the
            # commands can still be run manually from a bash shell.
            print(f"command failed with exit code {process.returncode}: {command}")
            print("please open bash and run it manually")
            break

In [None]:
if train_model:
    # Move the training data back from the tesstrain folder to the pipeline folder.
    shutil.move(train_folder_location, train_data_dir)
    # Move the folder tesstrain created for this model into model_dir.
    trained_model_folder = os.path.join(path_to_tesstrain_code,'data',f"{model_name}")
    trained_model_folder_new = os.path.join(model_dir,model_name)
    shutil.move(trained_model_folder,trained_model_folder_new)
    # Pick the best model and copy it into the tessdata folder used by Tesseract.
    models_path = glob(os.path.join(trained_model_folder_new,"tessdata_best","*.traineddata"))
    if not models_path:
        # Fail with an actionable message instead of the opaque
        # "attempt to get argmin of an empty sequence" from np.argmin.
        raise FileNotFoundError(
            f"no .traineddata files found in {os.path.join(trained_model_folder_new, 'tessdata_best')} - "
            "did both make commands finish successfully?"
        )
    # The score is read from the third-from-last "_"-separated field of the file
    # name (presumably the error rate written by tesstrain - lower is better).
    # basename: parse only the file name so underscores in folder names cannot
    # shift the fields.
    performence = np.array([float(os.path.basename(model_path).split("_")[-3]) for model_path in models_path])
    best_model_path = models_path[np.argmin(performence)]
    shutil.copy(best_model_path,os.path.join(tessdata_path,f"{model_name}.traineddata") )
    print("model name should be here:")
    print(pytesseract.get_languages())
    

## Eval model

In [None]:
from PIL import Image, ImageDraw, ImageFont
import cv2
import numpy as np
import matplotlib.pyplot as plt
#TODO: move to seperate file
def visualize_ocr_utf8(image, bboxes, words,wers, font_path="arial.ttf", font_size=20, right_box_color=(0, 255, 0), wrong_box_color=(255, 0, 0), text_color=(10, 10, 10), thickness=2):
    """
    Draws OCR results (boxes and recognized words) on an image, with support for UTF-8 characters.

    Parameters:
        image (numpy.ndarray): The image in BGR channel order (as read via OpenCV).
        bboxes (list of list of int): List of bounding boxes, where each box is [x1, y1, x2, y2].
            Empty/falsy boxes are skipped.
        words (list of str): List of words corresponding to each bounding box.
        wers (list): Per-word error value corresponding to each bounding box. A box is drawn
            with right_box_color only when its value is not None and <= 0.
        font_path (str): Path to a TrueType font file (supports UTF-8 characters).
        font_size (int): Font size for the text.
        right_box_color (tuple): Color of the bounding box if the prediction is right (default is green).
        wrong_box_color (tuple): Color of the bounding box if the prediction is wrong (default is red).
        text_color (tuple): Color of the text (default is near-black).
        thickness (int): Thickness of the bounding box outline.

    Returns:
        numpy.ndarray: The annotated image in RGB channel order (ready for matplotlib's imshow).
            Nothing is displayed by this function.

    Raises:
        IOError: If the font file cannot be loaded from font_path.
    """
    # Convert the image to a PIL image (PIL works in RGB, OpenCV in BGR)
    pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(pil_image)

    # Load the specified font
    try:
        font = ImageFont.truetype(font_path, font_size)
    except IOError as err:
        # chain the original error so the underlying cause stays visible
        raise IOError(f"Font not found at {font_path}. Please specify a valid TrueType font file.") from err

    # Draw bounding boxes and text; zip stops at the shortest of the three lists
    for bbox, word, wer in zip(bboxes, words, wers):
        if not bbox:
            continue
        x1, y1, x2, y2 = bbox  # Unpack the bounding box

        # A missing error value (None) is treated as a wrong prediction
        is_right = (wer is not None) and (wer <= 0)
        bbox_color = right_box_color if is_right else wrong_box_color
        # Draw the bounding box
        draw.rectangle([x1, y1, x2, y2], outline=bbox_color, width=thickness)

        # Add the text above the bounding box. The word is drawn reversed -
        # presumably so right-to-left text reads correctly when rendered left-to-right.
        draw.text((x1, y1 - font_size), word[::-1], fill=text_color, font=font)

    # Convert back to a numpy array (still RGB) for displaying with Matplotlib
    result_image = np.array(pil_image)
    return result_image


In [None]:
from glob import glob
import os.path as osp
import json
# Evaluation stage: run the trained model on every page listed in the
# *output_data.json files of eval_data_dir, match predictions to the ground
# truth, collect per-page metrics and print a summary.
if eval_model:
    results = []
    # Names of the matching statistics that were actually stored in results.
    # Collected explicitly so the summary does not depend on the loop variable
    # eval_stats, which is undefined when no page was processed successfully.
    eval_stat_keys = []
    verbose=True
    for p in glob(osp.join(eval_data_dir,'*output_data.json')):
        # Only the JSON load needs the file handle; close it before the slow OCR loop.
        with open(p, encoding="utf-8") as f:
            document_data = json.load(f)
        for page, page_data in document_data.items():
            image_data_pred = pytesseract.image_to_data(page_data['image_path'],output_type=pytesseract.Output.DICT,lang=model_name)
            image_data_pred = pd.DataFrame(image_data_pred)
            # (left, top, width, height) -> (x1, y1, x2, y2)
            image_data_pred['bbox'] = image_data_pred.apply(lambda r: (r['left'],r['top'],r['left'] + r['width'] ,r['top'] + r['height']),axis=1)
            # drop the rows whose confidence is -1
            image_data_pred  = image_data_pred[image_data_pred['conf']!=-1]
            gt_bbox = page_data['bboxes']
            gt_text = page_data['words']
            pred_bbox = list(image_data_pred['bbox'])
            pred_text = list(image_data_pred['text'])
            pred_conf = list(image_data_pred['conf'])

            try:
                pred_text, pred_bbox, gt_text, gt_bbox, pred_conf, eval_stats = match_predictions_to_gt(pred_text,
                                                                                                  pred_bbox,
                                                                                                  gt_text,
                                                                                                  gt_bbox,
                                                                                                  pred_conf,
                                                                                                  intersect_threshold=0.8)
                batch_results = evaluate_ocr_batch(pred_text, gt_text, pred_bbox, gt_bbox, pred_conf)
                for k,v in eval_stats.items():
                    batch_results[k] = v
                    if k not in eval_stat_keys:
                        eval_stat_keys.append(k)
            except Exception as e:
                # Best effort: a page that fails is reported and skipped so the
                # rest of the evaluation can continue.
                print(f"skipping page {page!r} of {p}: {e}")
                continue
            results.append(batch_results)
            if verbose:
                image = cv2.imread(page_data['image_path'])
                img_rgb = visualize_ocr_utf8(image, pred_bbox, pred_text,wers = batch_results['WERs'],font_size=40)
                # Display the image
                plt.figure(figsize=(24, 16))
                plt.imshow(img_rgb)
                plt.axis('off')
                plt.show()

    results = pd.DataFrame(results)
    if results.empty:
        # Nothing to summarize: without this guard the code below fails with a
        # KeyError on 'WERs'.
        print("no pages were evaluated - check eval_data_dir and the errors printed above")
    else:
        results['word_count'] = results['WERs'].apply(len)
        stats = results[[c for c in results.columns if "Avg" in c or c=='word_count']]
        # Average of the per-page "Avg" metrics, weighted by each page's word count
        stats = stats.multiply(stats['word_count'],axis=0).sum()/stats['word_count'].sum()
        for k in eval_stat_keys:
            stats[f"Total_{k}"] = results[k].sum()
        col_order = [c for c in stats.index if "Total" in c] + [c for c in stats.index if "Avg" in c]
        stats = stats[col_order]

        print(stats)