In [1]:
import numpy as np
import torch
import tfrecord
import cv2
from time import process_time
import tensorflow as tf
from tfrecord.torch.dataset import TFRecordDataset
import os

In [2]:
def visibility_matrix(torch_df,num_words):
    '''Build the neighbourhood ("visibility") graph between word boxes.

    For each word, the closest word to its right (among words whose vertical
    extent overlaps it) and the closest word below it (among words whose
    horizontal extent overlaps it) are identified. Every such pair is
    connected in both directions.

    input:
        torch_df: torch tensor or array-like of shape (rows, >=4). Columns
            0..3 are read as [xmin, ymin, xmax, ymax]; any further column is
            ignored.
        num_words: number of leading rows that are real words. Rows from
            index num_words onwards are ignored.
    output:
        torch.long tensor of shape (2, num_edges). Row 0 holds the source
        indices and row 1 the target indices, ordered by source and then by
        target.
    '''

    # Accept tensors as well as plain arrays / nested lists.
    if isinstance(torch_df, torch.Tensor):
        npdf = torch_df.numpy()
    else:
        npdf = np.asarray(torch_df)

    # num_words may arrive as a 0-dim tensor or numpy integer.
    num_words = int(num_words)

    #Only create matrix of size matching number of words
    matrix = np.zeros((num_words, num_words))
    boxes = npdf[:num_words]

    for i, row1 in enumerate(boxes):
        min_down = 10**6
        min_right = 10**6
        min_down_idx = None
        min_right_idx = None

        for j, row2 in enumerate(boxes):
            if i == j:
                continue

            #Right neighbour: the y-ranges must overlap
            if (row1[1] <= row2[1] <= row1[3]
                    or row1[1] <= row2[3] <= row1[3]
                    or row2[1] <= row1[1] <= row2[3]
                    or row2[1] <= row1[3] <= row2[3]):
                gap_right = row2[0] - row1[2]
                # '<=' means a later candidate wins when gaps are equal.
                if 0 <= gap_right <= min_right:
                    min_right_idx, min_right = j, gap_right

            #Down neighbour: the x-ranges must overlap
            if (row1[0] <= row2[0] <= row1[2]
                    or row1[0] <= row2[2] <= row1[2]
                    or row2[0] <= row1[0] <= row2[2]
                    or row2[0] <= row1[2] <= row2[2]):
                gap_down = row2[1] - row1[3]
                if 0 <= gap_down <= min_down:
                    min_down_idx, min_down = j, gap_down

        # Edges are stored symmetrically.
        if min_right_idx is not None:
            matrix[i, min_right_idx] = 1
            matrix[min_right_idx, i] = 1
        if min_down_idx is not None:
            matrix[i, min_down_idx] = 1
            matrix[min_down_idx, i] = 1

    # np.nonzero walks the matrix in row-major order, which gives the same
    # (source, target) ordering as scanning rows and then columns.
    source, target = np.nonzero(matrix)
    edge_index = torch.tensor(np.stack([source, target]), dtype=torch.long)

    return edge_index

In [3]:
def tfrecord_transforms(elem,
                   max_height = 768,
                   max_width = 1366,
                   num_of_max_vertices = 250,
                   max_length_of_word = 30,
                   batch_size = 8):
    """
    Transform one batch loaded by the TFRecord dataloader.

    The size defaults are defined by the TIES data generation (they fix the
    size and complexity of the generated tables) and must match the data.
    DO NOT CHANGE them unless the generator settings changed.

    Parameters:
        elem: batch dict read with the keys 'image', 'global_features',
            'vertex_features', 'adjacency_matrix_cells',
            'adjacency_matrix_cols' and 'adjacency_matrix_rows'.
            Everything is stored flattened and is reshaped here.
        max_height, max_width: image size used for reshaping the image and
            for normalising the box coordinates.
        num_of_max_vertices: number of (padded) vertex rows per image.
        max_length_of_word: not used by this function.
        batch_size: kept for backward compatibility. The batch dimension is
            inferred from the data, so a final batch smaller than
            batch_size no longer makes the reshape fail.

    Returns:
        dict with
            'imgs': tensor of shape (B, 1, max_height, max_width); divided
                by 255 when the batch maximum exceeds 1,
            'num_words': column 2 of elem['global_features'],
            'vertex_features': list of B numpy arrays of shape
                (num_of_max_vertices, 5) whose columns 0..3 are divided by
                width / height / width / height,
            'edge_index': list of B edge index tensors from visibility_matrix,
            'adjacency_matrix_cells' / '_cols' / '_rows': lists of B numpy
                arrays of shape (num_of_max_vertices, num_of_max_vertices).
    """
    with torch.no_grad():
        data_dict =  {}

        #Torch dimensions: B x C x H x W
        #inputting grayscale, so only 1 channel
        #Images may be in range [0,255], need to be in [0,1]
        image = elem['image']
        if torch.max(image) > 1:
            image = image/255
        # -1 lets torch infer the real batch size (last batch may be short).
        data_dict['imgs'] = image.reshape(-1,1,max_height,max_width)

        #Extract number of words for each image:
        num_words = elem['global_features'][:,2]
        data_dict['num_words'] = num_words

        v = elem['vertex_features'].reshape(-1,num_of_max_vertices,5).float()
        #normalizing word coordinates to be invariant to image size
        v[:,:,0] = v[:,:,0]/max_width
        v[:,:,1] = v[:,:,1]/max_height
        v[:,:,2] = v[:,:,2]/max_width
        v[:,:,3] = v[:,:,3]/max_height

        data_dict['vertex_features'] = [vf.numpy() for vf in v]

        #Calculate visibility graph for each batch element
        data_dict['edge_index'] = [visibility_matrix(vex,num_words[idx])
                                   for idx,vex in enumerate(v)]

        #Un-flatten the three ground-truth adjacency matrices per element
        matrix_shape = (num_of_max_vertices,num_of_max_vertices)
        for key in ('adjacency_matrix_cells',
                    'adjacency_matrix_cols',
                    'adjacency_matrix_rows'):
            data_dict[key] = [elem[key][idx].reshape(*matrix_shape).numpy()
                              for idx in range(len(num_words))]

        return data_dict

In [4]:
#Settings for the tfrecord loader
batchsize = 8       #examples per DataLoader batch
index_path = None   #no index file is handed to TFRecordDataset

#Feature name -> stored type, used as the TFRecordDataset description
tfrecord_description = dict(
    image="float",
    global_features="int",
    vertex_features="int",
    adjacency_matrix_cells="int",
    adjacency_matrix_cols="int",
    adjacency_matrix_rows="int",
    vertex_text="int",
)

In [5]:
#Load list of tfRecords from folder:
#os.path.join keeps the path valid on every OS (a hard-coded backslash only works on Windows).
folder_path = os.path.join(os.getcwd(), 'tfrecords')

#load filenames of folder (sorted, because os.listdir returns them in arbitrary order):
tfrecord_files = sorted(os.listdir(folder_path))

In [7]:
#Folder that receives the processed records; joined portably instead of with a hard-coded backslash.
outtfpath = os.path.join(os.getcwd(), 'processed_tfrecords')

In [11]:
#Types of the features written to the processed records:
#imgs: float
#num_words: int
#vertex_features: float
#num_edges, edge_indexes: int
#adjacency matrices: int

options = tf.compat.v1.io.TFRecordOptions(tf.compat.v1.io.TFRecordCompressionType.GZIP)

#Make sure the output folder exists before any writer is opened.
os.makedirs(outtfpath, exist_ok=True)

for idx, record in enumerate(tfrecord_files):
    tfrecord_path = os.path.join(folder_path,record)
    dataset = TFRecordDataset(tfrecord_path, index_path, tfrecord_description)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batchsize)

    #One output file per input file. The writer stays open across the batch
    #loop so that every batch is stored as its own Example; writing after the
    #loop would keep only the last batch.
    with tf.io.TFRecordWriter(os.path.join(outtfpath,record+".gz"),options=options) as writer:
        for elem in loader:
            dd = tfrecord_transforms(elem,batch_size=batchsize)

            imgs = dd['imgs'].numpy()
            nw = dd['num_words'].numpy()
            vf = np.array(dd['vertex_features'])

            #Number of edges of every batch element
            edge_num = np.array([e.shape[1] for e in dd['edge_index']],dtype=int)

            #Edge lists differ in length, so zero-pad each one to the longest
            #in the batch; 'num_edges' records how many entries are real.
            maxed = np.max(edge_num)
            edge_indexes = []
            for i,ed in enumerate(dd['edge_index']):
                padded = np.zeros((2, maxed),dtype=int)
                padded[:, :edge_num[i]] = ed.numpy()
                edge_indexes.append(padded)

            out = dict()

            out['imgs'] = tf.train.Feature(float_list=tf.train.FloatList(value=imgs.flatten()))
            out['num_words'] = tf.train.Feature(int64_list=tf.train.Int64List(value=nw.flatten()))
            out['vertex_features'] = tf.train.Feature(float_list=tf.train.FloatList(value=vf.flatten()))
            out['num_edges'] =  tf.train.Feature(int64_list=tf.train.Int64List(value=edge_num.flatten()))
            out['edge_indexes'] =  tf.train.Feature(int64_list=tf.train.Int64List(value=np.array(edge_indexes).flatten()))

            out['adjacency_matrix_cells'] = tf.train.Feature(int64_list=tf.train.Int64List(value=np.array(dd['adjacency_matrix_cells']).flatten()))
            out['adjacency_matrix_cols'] = tf.train.Feature(int64_list=tf.train.Int64List(value=np.array(dd['adjacency_matrix_cols']).flatten()))
            out['adjacency_matrix_rows'] = tf.train.Feature(int64_list=tf.train.Int64List(value=np.array(dd['adjacency_matrix_rows']).flatten()))

            all_features = tf.train.Features(feature=out)
            seq_ex = tf.train.Example(features=all_features)
            writer.write(seq_ex.SerializeToString())

    #NOTE(review): this stops after the first input file, which looks like a
    #leftover from a trial run - remove it to convert every file in the folder.
    break