In [None]:
import os
import sys

from lxml import etree
from PIL import Image
from IPython.display import display

sys.path.append("..")

from kiebids.modules.evaluation import process_xml_files, load_image_from_url, draw_polygon_on_image
from kiebids import config

In [None]:
# Download XML files 
# output_path = "../data"

# process_xml_files(
#    os.path.join(config.shared_folder, 'hymdata_sample/20230511T160908__coll.mfn-berlin.de_u_78a081'),
#     output_path
# )

In [None]:

# from tqdm import tqdm
# from io import BytesIO
# from PIL import Image, ImageDraw, ImageFont
# import requests

In [None]:
# Folder holding the sample XML files. os.path.join (as already used for the
# download cell above) copes with a trailing slash or a path-like shared_folder.
path = os.path.join(config.shared_folder, 'hymdata_sample/20230511T160908__coll.mfn-berlin.de_u_78a081')
def get_root(folder_path, i):
    """Parse the i-th XML file of ``folder_path`` and return its root element.

    The file names are sorted before indexing: ``os.listdir`` returns entries
    in arbitrary order, so without sorting the same index could refer to
    different files on different machines or runs.

    :param folder_path: Directory containing the ``.xml`` files.
    :param i: Index into the name-sorted list of ``.xml`` files.
    :raises IndexError: If ``i`` is out of range for the XML files found.
    :return: Root element of the parsed document.
    """
    xml_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.xml'))

    file_path = os.path.join(folder_path, xml_files[i])
    return etree.parse(file_path).getroot()


def get_image(path, i, display_image=False):
    """Load the image referenced by the i-th XML file in ``path``.

    The image URL is taken from the ``imgUrl`` entry of the ``", "``-separated
    ``key=value`` list stored in the document's ``Metadata/Comments`` element.

    :param path: Directory containing the XML files.
    :param i: Index of the XML file, as interpreted by ``get_root``.
    :param display_image: If true, also render the loaded image in the notebook.
    :return: Whatever ``load_image_from_url`` returns for the URL, or ``None``
        when the document has no comments text or no ``imgUrl`` entry.
    """
    root = get_root(path, i=i)
    ns = {'ns': root.nsmap[None]} if None in root.nsmap else {}
    comments = root.find('.//ns:Metadata/ns:Comments' if ns else './/Metadata/Comments', namespaces=ns)

    # A document without a Comments element (or with an empty one) has no URL.
    if comments is None or not comments.text:
        return None

    # Keep only "key=value" items; fields without an assignment are skipped.
    fields = {}
    for item in comments.text.split(", "):
        key, separator, value = item.partition("=")
        if separator:
            fields[key] = value

    # loading from url
    image_url = fields.get('imgUrl')
    image = load_image_from_url(image_url) if image_url else None

    # Nothing to show when no image could be resolved.
    if display_image and image is not None:
        display(image)

    return image
    


In [None]:
def save_images(folder_path, output_path = "../data/images"):
    """Download the image referenced by every XML file and save it as a ``.jpg``.

    For each ``.xml`` file in ``folder_path`` the ``imgUrl`` entry of its
    ``Metadata/Comments`` element is loaded via ``load_image_from_url`` and
    written to ``output_path`` under the XML file's base name. Files without
    usable comments or without an ``imgUrl`` entry are skipped.

    :param folder_path: Directory containing the XML files.
    :param output_path: Directory the images are written to; created if missing.
    """
    os.makedirs(output_path, exist_ok=True)

    for file in os.listdir(folder_path):
        if not file.endswith('.xml'):
            continue

        root = etree.parse(os.path.join(folder_path, file)).getroot()
        ns = {'ns': root.nsmap[None]} if None in root.nsmap else {}
        comments = root.find('.//ns:Metadata/ns:Comments' if ns else './/Metadata/Comments', namespaces=ns)
        if comments is None or not comments.text:
            continue

        # excluding some fields without assignment
        fields = dict(item.split("=", 1) for item in comments.text.split(", ") if "=" in item)
        image_url = fields.get('imgUrl')
        if not image_url:
            # Previously the save ran regardless, raising NameError or re-saving
            # the previous file's image under this file's name.
            continue

        image = load_image_from_url(image_url)
        # Swap only the extension; str.replace would touch every ".xml" in the name.
        image.save(os.path.join(output_path, os.path.splitext(file)[0] + '.jpg'))


In [None]:
def ground_truth_text(path, image_num, display_image=False):
    """Print the ground-truth transcription of every text line of one document.

    Each ``TextLine`` element that has a ``Unicode`` descendant is printed as
    ``"<line number>: <text>"``, numbered from 1 in document order. With
    ``display_image`` set, the region of each line (the bounding box of its
    ``Coords`` polygon) is cropped from the document image and displayed
    before the text.

    :param path: Directory containing the XML files.
    :param image_num: Index of the XML file, as interpreted by ``get_root``.
    :param display_image: If true, also display the cropped line images.
    :return: None; the transcriptions are printed.
    """
    root = get_root(path, image_num)
    ns = {'ns': root.nsmap[None]} if None in root.nsmap else {}

    # The image is only needed for the crops, so the metadata is only read then.
    image = None
    if display_image:
        comments = root.find('.//ns:Metadata/ns:Comments' if ns else './/Metadata/Comments', namespaces=ns)
        if comments is not None and comments.text:
            # excluding some fields without assignment
            fields = dict(item.split("=", 1) for item in comments.text.split(", ") if "=" in item)
            image_url = fields.get('imgUrl')
            if image_url:
                image = load_image_from_url(image_url)

    textlines = root.xpath('//ns:TextLine' if ns else '//TextLine', namespaces=ns)

    # A dedicated counter is used so the `image_num` argument is not overwritten.
    for line_num, textline in enumerate(textlines, start=1):
        if image is not None:
            coords = textline.find('ns:Coords' if ns else 'Coords', namespaces=ns)
            points = coords.get('points') if coords is not None else None
            if points:
                # "x1,y1 x2,y2 ..." -> bounding box of the polygon
                x_values, y_values = zip(*[tuple(map(int, point.split(','))) for point in points.split()])
                cropped_image = image.crop((min(x_values), min(y_values), max(x_values), max(y_values)))
                display(cropped_image)

        unicode_elem = textline.find('.//ns:Unicode' if ns else './/Unicode', namespaces=ns)
        if unicode_elem is not None:
            print(f"{line_num}: {unicode_elem.text}")

# EasyOCR 

Initialize the easyOCR functions

In [None]:
import easyocr
import torch
import cv2

import numpy as np 

from PIL import ImageDraw 

# Let EasyOCR use the GPU only when CUDA is available.
gpu = torch.cuda.is_available()
# Reader for English ('en'); created once here and used by the EasyOCR helpers below.
easyocr_reader = easyocr.Reader(['en'], gpu=gpu)


def easyOCR(PIL_image):
    """Run EasyOCR on an image and return its detections as three parallel lists.

    :param PIL_image: Image to read; it is converted to an array and passed
        through ``cv2.COLOR_RGB2BGR`` before recognition.
    :return: ``(text, bounding_box, prob)`` - lists with one entry per
        detection, in the order returned by ``easyocr_reader.readtext``.
    """
    # Convert RGB to BGR
    bgr_array = cv2.cvtColor(np.array(PIL_image), cv2.COLOR_RGB2BGR)
    detections = easyocr_reader.readtext(bgr_array)

    boxes = [box for box, _, _ in detections]
    texts = [content for _, content, _ in detections]
    scores = [score for _, _, score in detections]
    return texts, boxes, scores

def easyOCR_bounding_boxes(PIL_image, display_image=False):
    """Run EasyOCR on an image and optionally display the detected boxes.

    :param PIL_image: Image to read; it is converted to an array and passed
        through ``cv2.COLOR_RGB2BGR`` before recognition.
    :param display_image: If true, draw every detected box as a closed green
        outline on a copy of the image and display that copy.
    :return: The unmodified ``readtext`` output (the full detections, not only
        the boxes).
    """
    detections = easyocr_reader.readtext(
        cv2.cvtColor(np.array(PIL_image), cv2.COLOR_RGB2BGR)
    )
    boxes = [box for box, _, _ in detections]

    if not display_image:
        return detections

    # Draw on a copy so the caller's image stays untouched.
    canvas = PIL_image.copy()
    pen = ImageDraw.Draw(canvas)
    for corners in boxes:
        corner_count = len(corners)
        for idx in range(corner_count):
            # The modulo wraps the last corner back to the first, closing the outline.
            start = tuple(corners[idx])
            end = tuple(corners[(idx + 1) % corner_count])
            pen.line([start, end], fill="green", width=0)
    display(canvas)

    return detections

def crop_image(PIL_image, coords, display_image=False):
    """Crop an image to the axis-aligned bounding box of a set of points.

    :param PIL_image: Object providing ``crop((left, upper, right, lower))``.
    :param coords: Iterable of ``(x, y)`` points.
    :param display_image: If true, also display the cropped result.
    :return: The result of ``PIL_image.crop`` for the bounding box.
    """
    xs, ys = zip(*coords)
    bounding_box = (min(xs), min(ys), max(xs), max(ys))

    region = PIL_image.crop(bounding_box)
    if display_image:
        display(region)
    return region


# TrOCR handwritten 
- Handwritten models:
  - TrOCR-Small-IAM
  - TrOCR-Base-IAM
  - TrOCR-Large-IAM (`microsoft/trocr-large-handwritten`)

- Printed models:
  - TrOCR-Small-SROIE
  - TrOCR-Base-SROIE
  - TrOCR-Large-SROIE





In [None]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# `is_available` has to be called: the bare function object is always truthy,
# which selected 'cuda:0' even on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Load the processor and model for the chosen checkpoint and move the model to `device`.
model_name = 'microsoft/trocr-large-handwritten'
processor = TrOCRProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(
    model_name
).to(device)

def TrOcr(image, processor, model):
    """
    Recognise the text in an image with a TrOCR processor/model pair.

    :param image: PIL Image.
    :param processor: Huggingface OCR processor.
    :param model: Huggingface OCR model.


    Returns:
        generated_text: the output of ``processor.batch_decode`` for the
        generated ids (the whole decoded batch, not only its first element).
    """
    # The cropped image is fed in directly; the pixel tensor is moved to the
    # module-level `device` before generation.
    encoded = processor(image, return_tensors='pt')
    token_ids = model.generate(encoded.pixel_values.to(device))
    return processor.batch_decode(token_ids, skip_special_tokens=True)


# Bounding Boxes 
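We use EasyOCR to detect the text bounding boxes. Each box is then cropped from the image and read by TrOCR, and both results are printed next to the ground-truth text.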

In [None]:
# Compare EasyOCR, TrOCR and the ground truth on one sample.
image_num = 25  # which image
image = get_image(path, image_num, display_image=False)

# EasyOCR supplies the regions (drawn on the displayed image) and its own reading.
easyOCR_output = easyOCR_bounding_boxes(image, display_image=True)

easyOCR_result = []
TrOcr_result = []
for bb, text, prob in easyOCR_output:
    # TrOCR reads each detected region cropped from the full image.
    cropped_image = crop_image(image, bb, display_image=False)
    easyOCR_result.append((text, prob))
    TrOcr_result.append(TrOcr(cropped_image, processor, model))

print("----- EasyOcr result -----")
for position, (ocr_text, ocr_prob) in enumerate(easyOCR_result, start=1):
    print(f"{position}: {ocr_text}         prob: {ocr_prob}")

print("------ Tr OCR Result --------")
for position, trocr_output in enumerate(TrOcr_result, start=1):
    print(f"{position}:  {trocr_output}")

print("----- Ground truth text ------")
ground_truth_text(path, image_num, display_image=False)


# Tesseract - Not yet implemented

In [None]:
import pytesseract 

import shutil

# Prefer whichever `tesseract` executable is on PATH so the notebook is not tied
# to one user's home directory; the conda package path used originally is kept
# as a fallback for the machine it was written on.
_TESSERACT_FALLBACK = r"/home/jupyter-lova/miniconda3/pkgs/tesseract-5.3.1-he1868e8_0/bin/tesseract"
pytesseract.pytesseract.tesseract_cmd = shutil.which("tesseract") or _TESSERACT_FALLBACK
# Other settings tried while setting this up:
# TESSDATA_PREFIX = "/home/jupyter-lova/miniconda3/pkgs/tesseract-5.3.1-he1868e8_0/bin/"
# r"/home/jupyter-lova/miniconda3/envs/app-kiebids/bin/pytesseract"



def test_tesseract(image):
    """Return the text that ``pytesseract.image_to_string`` extracts from ``image``.

    :param image: Image handed to pytesseract unchanged.
    :return: The string returned by ``pytesseract.image_to_string``.
    """
    recognised_text = pytesseract.image_to_string(image)
    return recognised_text

# Show the size of the most recently loaded `image`
# (presumably a PIL image, i.e. (width, height) - TODO confirm).
image.size

In [None]:
# Testing tesseract 
import os
# Walk the whole filesystem (this can take a long time) and print every file
# whose name contains "tesseract", to locate the installed executable.
# Dedicated loop names are used so the global `image_num` (the sample index
# used by the comparison cell) is not overwritten with a file name.
for dirpath, _dirnames, filenames in os.walk("/"):
    for filename in filenames:
        if "tesseract" in filename:
            print(os.path.join(dirpath, filename))

In [None]:
import os

# Walk the whole filesystem (this can take a long time) and print every entry
# whose name contains "tessdata". Directory names are checked as well as file
# names, because tessdata is normally a directory of language files.
# Dedicated loop names are used so the global `image_num` is not overwritten.
for dirpath, dirnames, filenames in os.walk("/"):
    for entry in dirnames + filenames:
        if "tessdata" in entry:
            print(os.path.join(dirpath, entry))

In [None]:
# NOTE(review): this path ends in `pytesseract`, not `tesseract`. `tesseract_cmd`
# is the command pytesseract runs as the Tesseract executable, so this looks
# like it points at the wrong program - confirm before relying on it.
pytesseract.pytesseract.tesseract_cmd = "/home/jupyter-lova/miniconda3/envs/app-kiebids/bin/pytesseract" 

# Test TrOCR (small printed model)

In [None]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel


In [None]:
	
# `is_available` has to be called: the bare function object is always truthy,
# which selected 'cuda:0' even on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
	
def ocr(image, processor, model):
    """
    Recognise the text in an image with a TrOCR processor/model pair.

    Defined before its first use so this cell also runs on a fresh kernel.

    :param image: PIL Image.
    :param processor: Huggingface OCR processor.
    :param model: Huggingface OCR model.


    Returns:
        generated_text: the OCR'd text string (first entry of the decoded batch).
    """
    # We can directly perform OCR on cropped images.
    pixel_values = processor(image, return_tensors='pt').pixel_values.to(device)
    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_text


# `is_available` has to be called: the bare function object is always truthy,
# which selected 'cuda:0' even on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Note: this rebinds the global `processor` and `model` to the small printed checkpoint.
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-small-printed')
model = VisionEncoderDecoderModel.from_pretrained(
    'microsoft/trocr-small-printed'
).to(device)

image = get_image(path, 5, display_image=True)
test = ocr(image, processor, model)

In [None]:
# Load the small printed-text TrOCR checkpoint; this rebinds the global
# `processor` and `model`, and the model is moved to `device`.
printed_checkpoint = 'microsoft/trocr-small-printed'
processor = TrOCRProcessor.from_pretrained(printed_checkpoint)
model = VisionEncoderDecoderModel.from_pretrained(printed_checkpoint).to(device)

In [None]:
# Load (and display) the sample at index 5, then run TrOCR on the whole image
# with the currently loaded `processor` and `model`.
image = get_image(path, 5, display_image=True)
test = ocr(image, processor, model)

In [None]:
# Display the value returned by `ocr` in the previous cell.
test