<a href="https://colab.research.google.com/github/GruAna/VU/blob/master/xml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import xml.etree.ElementTree as ET
import numpy as np

## Historical poster dataset

In [None]:
def read_gt_poster(xml_file):
    """
    Historical poster dataset (PAGE XML - labels parser).

    Parameters:
        xml_file: path or file object accepted by ``ET.parse``.

    Returns:
        A list of ``(coords, text)`` tuples, one per ``Word`` element found
        anywhere in the document. ``coords`` is the raw ``points`` attribute
        string of the word's ``Coords`` element (e.g.
        ``'242,1942 242,2077 595,2077 595,1942'``); ``text`` is the text of
        the first child of the word's ``TextEquiv`` element, or ``None`` when
        the word carries no ``TextEquiv`` content.
    """
    # All PAGE elements live in this namespace; ElementTree expects it in
    # "{uri}tag" form for every lookup.
    ns = '{http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15}'

    root = ET.parse(xml_file).getroot()

    labels = []
    for word in root.iter(ns + 'Word'):
        coords = word.find(ns + 'Coords').get('points')

        # A word without a transcription must not abort parsing of the whole
        # file; report its text as None instead.
        text_equiv = word.find(ns + 'TextEquiv')
        if text_equiv is not None and len(text_equiv) > 0:
            text = text_equiv[0].text
        else:
            text = None

        labels.append((coords, text))

    return labels

# Example: parse one poster annotation file and show the extracted labels.
# The parser defined in this notebook is read_gt_poster.
labels = read_gt_poster('P-21007.xml')
print(labels)

('242,1942 242,2077 595,2077 595,1942', 'durch')


## KAIST detection dataset

In [None]:
def read_gt_kaist(xml_file, scaling_ratio=1):
    """
    KAIST detection dataset (XML - labels parser).

    Parameters:
        xml_file: path or file object accepted by ``ET.parse``.
        scaling_ratio: factor applied to every coordinate; use it when the
            image was resized and the ground truth has to follow.

    Returns:
        A list of ``(label, bbox_coords)`` tuples, one per ``word`` element.
        ``label`` is the concatenation of the ``char`` attributes of the
        word's ``character`` children. ``bbox_coords`` is a 2x2 integer
        ``np.array`` ``[[x_left, y_top], [x_right, y_bottom]]``; each value is
        scaled and then truncated with ``int()``.
    """
    gt = []

    root = ET.parse(xml_file).getroot()

    # root[0][2] is the 'words' container (third child of the first child of
    # the root); only that container is read.
    for word in root[0][2].findall('word'):
        # Read the box by attribute name so that neither attribute order nor
        # additional attributes can shift the values.
        x = int(word.attrib['x'])
        y = int(word.attrib['y'])
        width = int(word.attrib['width'])
        height = int(word.attrib['height'])

        # Opposite corner of the bounding rectangle is x+width, y+height.
        x_left = int(x * scaling_ratio)
        y_top = int(y * scaling_ratio)
        x_right = int((x + width) * scaling_ratio)
        y_bottom = int((y + height) * scaling_ratio)

        bbox_coords = np.array([[x_left, y_top], [x_right, y_bottom]])

        # Build the label from this word's own characters rather than
        # re-indexing the container by position.
        label = "".join(char.get('char', '') for char in word.findall('character'))

        gt.append((label, bbox_coords))

    return gt

In [None]:
# Example: parse one KAIST annotation file and print the resulting list of
# (label, bounding box) tuples.
labels = read_gt_kaist('/content/DSC02423.xml')
print(labels)

<Element 'words' at 0x7f3b72997710>
0 <Element 'word' at 0x7f3b729ae350>
<Element 'character' at 0x7f3b729ae9b0>
<Element 'character' at 0x7f3b729aefb0>
<Element 'character' at 0x7f3b729ae530>
<Element 'character' at 0x7f3b729aeef0>
<Element 'character' at 0x7f3b729ae890>
<Element 'character' at 0x7f3b729ae830>
<Element 'character' at 0x7f3b729ae1d0>
<Element 'character' at 0x7f3b729aebf0>
<Element 'character' at 0x7f3b729aee90>
1 <Element 'word' at 0x7f3b729ae710>
<Element 'character' at 0x7f3b729aea70>
<Element 'character' at 0x7f3b729ae590>
<Element 'character' at 0x7f3b729ae7d0>
<Element 'character' at 0x7f3b729aed10>
<Element 'character' at 0x7f3b72a6e710>
<Element 'character' at 0x7f3b72a6e410>
<Element 'character' at 0x7f3b729be170>
<Element 'character' at 0x7f3b729be0b0>
<Element 'character' at 0x7f3b729be110>
<Element 'character' at 0x7f3b729be050>
[('MONOKRAFT', array([[  0, 148],
       [627, 290]])), ('HANDICRAFT', array([[137, 233],
       [514, 290]]))]


## CTW 1500

In [None]:
def read_gt_ctw_train(xml_file, scaling_ratio=1):
    """
    SCUT-CTW1500 dataset (XML - train labels parser).

    Parameters:
        xml_file: path or file object accepted by ``ET.parse``.
        scaling_ratio: factor applied to every coordinate; use it when the
            image was resized and the ground truth has to follow.

    Returns:
        A list of ``(label, bbox_coords)`` tuples, one per ``box`` element
        that is a direct child of the root's first child. ``label`` is the
        text of the box's ``label`` child (``None`` if that child is absent).
        ``bbox_coords`` is a 2x2 integer ``np.array``
        ``[[x_left, y_top], [x_right, y_bottom]]``; each value is scaled and
        then truncated with ``int()``.
    """
    gt = []

    root = ET.parse(xml_file).getroot()

    for box in root[0].findall('box'):
        # Read the box by attribute name so that neither attribute order nor
        # additional attributes can shift the values.
        left = int(box.attrib['left'])
        top = int(box.attrib['top'])
        width = int(box.attrib['width'])
        height = int(box.attrib['height'])

        # Opposite corner of the bounding rectangle is left+width, top+height.
        x_left = int(left * scaling_ratio)
        y_top = int(top * scaling_ratio)
        x_right = int((left + width) * scaling_ratio)
        y_bottom = int((top + height) * scaling_ratio)

        bbox_coords = np.array([[x_left, y_top], [x_right, y_bottom]])

        # Take the label from this very box; indexing the parent by the loop
        # counter picks the wrong element once a non-box sibling is present.
        label_element = box.find('label')
        label = label_element.text if label_element is not None else None

        gt.append((label, bbox_coords))

    return gt
    

In [None]:
def read_gt_ctw_test(data, scaling_ratio=1):
    """
    SCUT-CTW1500 dataset (test labels parser).

    Each non-empty line describes one bounding polygon: a comma-separated
    list of integer coordinates (x, y alternating), then ``####``, then the
    text inside the polygon. Two additional ``##`` after the separator mark
    a region with no recognized text; they are returned as part of the label.

    Parameters:
        data: path of the annotation text file.
        scaling_ratio: factor every coordinate is multiplied by; use it when
            the image was resized and the ground truth has to follow.

    Returns:
        A list of ``(label, bbox_coords)`` tuples. ``bbox_coords`` is a 2x2
        ``np.array`` ``[[min_x, min_y], [max_x, max_y]]`` holding the
        axis-aligned extremes of the polygon multiplied by ``scaling_ratio``
        (no integer truncation is applied here).
    """
    annotations = []
    # utf-8-sig reads plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise break int() on the first coordinate.
    with open(data, "r", encoding="utf-8-sig") as file:
        for line in file:
            line = line.rstrip('\n')
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no annotation.
                continue

            # Split on the first separator only so the label keeps any
            # later '#' characters intact.
            coords_part, _, label = line.partition("####")

            # Ignore empty tokens instead of blindly dropping the last one:
            # works with and without a trailing comma before the separator.
            c = [int(token) for token in coords_part.split(",") if token.strip()]

            xs = c[::2]
            ys = c[1::2]
            minX = min(xs) * scaling_ratio
            maxX = max(xs) * scaling_ratio
            minY = min(ys) * scaling_ratio
            maxY = max(ys) * scaling_ratio

            bbox_coords = np.array([[minX, minY], [maxX, maxY]])
            annotations.append((label, bbox_coords))

    return annotations