***Importing packages and libraries***

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import RandomBrightness, RandomContrast, RandomZoom
import pathlib as pl
import xml.etree.ElementTree as ET
from configs import modelConfigs

In [None]:
def pre_process_image(image, target_height):
    """Load (if needed), normalise and resize an image to a fixed height.

    Args:
        image: One of
            * a ``str`` file path, which is read from disk and decoded,
            * a string tensor / ``bytes`` holding an encoded image, which is
              decoded, or
            * an already-decoded numeric image tensor of shape
              ``(H, W, C)`` or ``(N, H, W, C)``, which is used as is.
        target_height: Height in pixels of the returned image.

    Returns:
        A ``tf.float32`` tensor resized to ``target_height`` rows, with the
        width scaled to keep the input aspect ratio (at least 1 pixel).
    """
    if isinstance(image, str):  # a file path: read the encoded bytes
        image = tf.io.read_file(image)
    image = tf.convert_to_tensor(image)

    # Only encoded bytes need decoding; a numeric tensor is already an image
    # and would make decode_image fail.
    if image.dtype == tf.string:
        # expand_animations=False guarantees a rank-3 result (first frame for
        # GIFs), which tf.image.resize requires.
        image = tf.image.decode_image(image, channels=1, expand_animations=False)

    # Integer inputs are scaled to [0, 1]; float inputs are left unchanged.
    image = tf.image.convert_image_dtype(image, tf.float32)

    # Index from the end so both (H, W, C) and (N, H, W, C) work, and avoid
    # .numpy() so the function is not limited to eager execution.
    shape = tf.shape(image)
    height = tf.cast(shape[-3], tf.float32)
    width = tf.cast(shape[-2], tf.float32)

    # Width that preserves the aspect ratio at the requested height.
    aspect_ratio = width / height
    new_width = tf.cast(tf.cast(target_height, tf.float32) * aspect_ratio, tf.int32)
    new_width = tf.maximum(new_width, 1)  # never ask resize for a 0-pixel width

    new_size = tf.stack([tf.cast(target_height, tf.int32), new_width])
    return tf.image.resize(image, new_size)

In [None]:
def forms_text_seporator(form_path, HW_bounding_box):
    """Split a form image into its handwritten and computer-written parts.

    The two parts are separated so each can be used as its own training
    example; the region below the handwriting is dropped because there is no
    training data for its text.

    Args:
        form_path: Path of the form image file.
        HW_bounding_box: Bounding box of the handwritten part as
            ``[y1, x1, y2, x2]`` in form pixel coordinates. It is not
            modified.

    Returns:
        ``(HW_cropped_image, CW_cropped_image)``: two float32 tensors as
        returned by ``tf.image.crop_and_resize`` (leading batch dimension of
        size 1), holding the handwritten crop and the computer-written crop.
    """
    # configs for dimensions
    # assumes form_height/form_width match the pixel size of the form image
    # — TODO confirm
    c = modelConfigs()

    # crop_and_resize needs a decoded, batched (N, H, W, C) image, not the raw
    # file bytes. Grayscale matches the decoding used by pre_process_image.
    encoded = tf.io.read_file(form_path)
    image = tf.io.decode_image(encoded, channels=1, expand_animations=False)
    image = tf.image.convert_image_dtype(image, tf.float32)  # values in [0, 1]
    batch = tf.expand_dims(image, axis=0)

    # Normalise the box to [0, 1] on a copy so the caller's list is untouched.
    y1, x1, y2, x2 = HW_bounding_box
    HW_box = [
        y1 / c.form_height,
        x1 / c.form_width,
        y2 / c.form_height,
        x2 / c.form_width,
    ]
    # NOTE(review): the computer-written crop now stops at the TOP of the
    # handwriting (y1). The previous box ended at y2 and therefore contained
    # the handwritten region as well — confirm against the form layout.
    CW_box = [0.0, 0.0, HW_box[0], 1.0]

    # Output sizes must be positive integers. Keep each crop at its original
    # pixel size: (y2 - y1) x (x2 - x1) for the handwriting, y1 x full width
    # for the printed text.
    HW_output_dimensions = [
        max(1, int(round(y2 - y1))),
        max(1, int(round(x2 - x1))),
    ]
    CW_output_dimensions = [
        max(1, int(round(y1))),
        max(1, int(round(c.form_width))),
    ]

    HW_cropped_image = tf.image.crop_and_resize(batch, [HW_box], [0], HW_output_dimensions)
    CW_cropped_image = tf.image.crop_and_resize(batch, [CW_box], [0], CW_output_dimensions)

    return HW_cropped_image, CW_cropped_image

In [None]:
# TODO add more severe augmentations
def build_augmentation_model():
    """Build the Keras pipeline used to augment training images.

    The pipeline applies, in order, a random brightness shift, a random
    contrast change and a random zoom whose uncovered area is filled with the
    constant 1.0. Brightness is configured for pixel values in [0, 1].

    Returns:
        A ``tf.keras.Sequential`` model containing the three layers.
    """
    brightness = RandomBrightness(factor=(-0.1, 0.1), value_range=(0, 1))
    contrast = RandomContrast(factor=0.05)
    zoom = RandomZoom(
        height_factor=(-0.2, 0.2),
        width_factor=(-0.2, 0.2),
        fill_mode='constant',
        fill_value=1.0,
    )
    return tf.keras.Sequential([brightness, contrast, zoom])

def random_pad(image, max_padding=50):
    """Pad an image with a random zero border on each of its four sides.

    Args:
        image: Either a ``str`` path to an image file, which is read and
            decoded to a single-channel image, or a rank-3 image tensor
            ``(H, W, C)``.
        max_padding: Exclusive upper bound for the number of pixels added to
            each side; every side draws its own value from
            ``[0, max_padding)``.

    Returns:
        The padded rank-3 image tensor.
    """
    if isinstance(image, str):
        # tf.pad needs pixel data: the encoded file contents are a scalar
        # string and cannot be padded, so decode them first. Grayscale
        # matches the decoding used by pre_process_image.
        encoded = tf.io.read_file(image)
        image = tf.io.decode_image(encoded, channels=1, expand_animations=False)

    # One independent draw per side: [top, bottom, left, right].
    padding = tf.random.uniform([4], minval=0, maxval=max_padding, dtype=tf.int32)
    # Rows are (before, after) for height, width and channels; channels are
    # never padded.
    paddings = tf.stack([[padding[0], padding[1]], [padding[2], padding[3]], [0, 0]])
    return tf.pad(image, paddings, mode='CONSTANT')

def form_pad_val_gen():
    """Return a random scalar int32 tensor drawn uniformly from [100, 250)."""
    lower, upper = 100, 250
    return tf.random.uniform(shape=[], minval=lower, maxval=upper, dtype=tf.int32)

def line_pad_val_gen():
    """Return a random scalar int32 tensor drawn uniformly from [20, 70)."""
    lower, upper = 20, 70
    return tf.random.uniform(shape=[], minval=lower, maxval=upper, dtype=tf.int32)


In [None]:
def _merge_coord(current, candidate, pick):
    """Fold ``candidate`` into ``current``; a ``current`` of 0 means 'not set yet'."""
    return candidate if current == 0 else pick(current, candidate)


def form_crop_bouding_box_updater(current_bounding_box, line, line_num, total_lin_num):
    """Grow the form cropping box so it covers the components of one line.

    Args:
        current_bounding_box: Box ``[y1, x1, y2, x2]`` accumulated so far,
            where 0 marks a coordinate that has not been set yet. It is
            updated in place.
        line: XML ``line`` element whose ``word`` children contain ``cmp``
            elements carrying integer ``x`` and ``y`` attributes.
        line_num: Zero-based index of ``line`` within the form.
        total_lin_num: Index of the last line of the form.

    Returns:
        ``current_bounding_box`` (the same list object) after the update.
    """
    # Evaluated independently: on a single-line form the line is both the
    # first and the last one, so it must set the top AND the bottom edge.
    is_first_line = line_num == 0
    is_last_line = line_num == total_lin_num

    for word in line.findall('word'):
        for char in word.findall('cmp'):
            left = int(char.get('x'))
            top = int(char.get('y'))
            # Extend the far edges by the component size when it is given
            # (presumably `width`/`height` are present on every cmp — verify
            # against the label files); without them the origin is used.
            right = left + int(char.get('width', 0))
            bottom = top + int(char.get('height', 0))

            # Horizontal extent is tracked over every line.
            current_bounding_box[1] = _merge_coord(current_bounding_box[1], left, min)
            current_bounding_box[3] = _merge_coord(current_bounding_box[3], right, max)

            # Vertical extent only depends on the first and the last line.
            if is_first_line:
                current_bounding_box[0] = _merge_coord(current_bounding_box[0], top, min)
            if is_last_line:
                current_bounding_box[2] = _merge_coord(current_bounding_box[2], bottom, max)
    return current_bounding_box


In [None]:
def data_preparator(X_image_paths, Y_image_path , data_length = 1000 , image_target_height = 512, augmentation_probability = 0.3):
    """Build training images and text labels from the XML label files.

    For every XML file each handwritten line yields one example (line image
    and line text). The form image is then split into its handwritten and
    computer-written parts, which yield two more examples labelled with the
    text of the whole form.

    Args:
        X_image_paths: Sequence whose first item is the directory of the form
            images and whose second item is the directory of the line images.
        Y_image_path: Directory containing the XML label files.
        data_length: Processing stops before the next form once at least this
            many examples were collected, so the result can overshoot.
        image_target_height: Height every image is resized to.
        augmentation_probability: Probability with which a line image, or
            the pair of form images, is augmented.

    Returns:
        ``(X, Y)``: a list of image tensors and the list of matching label
        strings. Lists are returned rather than arrays because the images
        have different widths.

    TODO figure how to stop inputs at exactly the right amount although it
    shouldn't matter too much as keras should handle extra data
    TODO turn the label strings into sequence labels
    """
    # directory containing labels for training data
    label_dir = pl.Path(Y_image_path)
    # forms and lines paths
    forms_path = X_image_paths[0]
    lines_path = X_image_paths[1]
    # training examples and labels
    X = []
    Y = []
    # number of examples added so far
    data_counter = 0
    # data augmentor
    base_augmentation_model = build_augmentation_model()

    # sorted() makes the order of the examples reproducible between runs
    for XML_path in sorted(label_dir.iterdir()):
        if data_counter >= data_length:
            print(data_counter)
            break
        if XML_path.suffix.lower() != '.xml':
            continue  # ignore anything that is not a label file

        root = ET.parse(str(XML_path)).getroot()
        handwritten_part = root.find('handwritten-part')
        if handwritten_part is None:
            continue  # nothing to learn from a form without handwriting
        lines = handwritten_part.findall('line')
        if not lines:
            continue

        # bounding box of the handwritten part, convention [y1, x1, y2, x2]
        form_crop_bounding_box = [0] * 4
        line_texts = []
        last_line_index = len(lines) - 1
        # sub folder of the form, which contains the images of its lines
        form_id = root.get('id')

        for line_index, line in enumerate(lines):
            line_text = line.get('text') or ''
            line_texts.append(line_text)

            line_id = line.get('id')
            # the top level folder name is the first 3 chars of the line id
            subf_name = line_id[:3]
            full_line_image_path = f'{lines_path}/{subf_name}/{form_id}/{line_id}.png'

            # random_pad is handed the path and loads the image itself
            line_image = random_pad(full_line_image_path)
            line_image = pre_process_image(line_image, image_target_height)
            # randomly, with the chosen probability, augment
            if np.random.rand() < augmentation_probability:
                # training=True: Keras random layers may be no-ops otherwise
                line_image = base_augmentation_model(line_image, training=True)
            X.append(line_image)
            Y.append(line_text)
            data_counter += 1

            form_crop_bounding_box = form_crop_bouding_box_updater(
                form_crop_bounding_box,
                line,
                line_index,
                last_line_index
            )

        # the form image shares its base name with the label file
        full_form_image_path = f'{forms_path}/{XML_path.stem}.png'
        # forms_text_seporator returns the handwritten crop first
        HW_cropped_form_image, CW_cropped_form_image = forms_text_seporator(
            full_form_image_path,
            form_crop_bounding_box
        )

        form_images = []
        for cropped in (HW_cropped_form_image, CW_cropped_form_image):
            # crop_and_resize output carries a batch dimension, but
            # random_pad builds paddings for a rank-3 image
            if cropped.shape.rank == 4:
                cropped = tf.squeeze(cropped, axis=0)
            # randomly pad all form images for richer training data, then
            # resize the PADDED image so the final height is the target
            padded = random_pad(cropped, form_pad_val_gen())
            form_images.append(pre_process_image(padded, image_target_height))

        # one draw decides for both parts of the form
        if np.random.rand() < augmentation_probability:
            form_images = [
                base_augmentation_model(form_image, training=True)
                for form_image in form_images
            ]

        # both parts of the form are labelled with the text of the whole form
        form_full_text = ' '.join(line_texts)
        for form_image in form_images:  # handwritten first, then printed
            X.append(form_image)
            Y.append(form_full_text)
        data_counter += 2

    return X, Y



In [None]:
# Small run of the pipeline: stop once at least two examples were collected.
c = modelConfigs()
data_preparator(
    c.image_paths,
    c.label_path,
    data_length=2,
)