In [None]:
import os 
import shutil
import xml.etree.ElementTree as ET
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw, ImageFont
from sklearn.model_selection import train_test_split

# Notebook-wide plotting defaults: large figures and readable tick labels.
plt.rcParams.update({
    'figure.figsize': (16, 12),
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
})

In [None]:
# Where the raw dataset lives and where the prepared dataset is written.
base_dir = '../datasets/FinUI'   # dataset root, relative to the notebook
label_dir = 'xml'                # sub-folder of base_dir holding the XML annotations
image_dir = 'images'             # sub-folder of base_dir holding the images
# Output root: <current working directory>/datasets/100
target = os.path.join(os.getcwd(), 'datasets', '100')

In [None]:
# Read one XML annotation file into a plain dictionary
def parse_xml(xml_file):
    """Parse an XML annotation into a dict.

    Only the direct children of the root element are inspected:

    * ``filename`` -> stored as-is under ``'filename'``
    * ``file``     -> stored under ``'filename'`` with ``'.png'`` appended
    * ``size``     -> every child converted to int, stored as a tuple
                      under ``'image_size'`` (in document order)
    * ``object``   -> one dict per element, appended to ``'bboxes'``; its
                      ``name`` child becomes ``'class'`` and every child of
                      ``bndbox`` becomes an int keyed by its tag name

    Numeric text is truncated at the first ``'.'`` before conversion, so
    ``"12.0"`` and ``"12.7"`` both become ``12``.

    Parameters
    ----------
    xml_file : path or file object accepted by ``ET.parse``.

    Returns
    -------
    dict
        Always contains ``'bboxes'`` (possibly empty); ``'filename'`` and
        ``'image_size'`` are present only if the matching tags exist.
    """
    def _as_int(text):
        # Keep only the part before the decimal point, then convert
        return int(text.split('.')[0])

    annotation = {'bboxes': []}

    for node in ET.parse(xml_file).getroot():
        tag = node.tag

        if tag == 'filename':
            annotation['filename'] = node.text
        elif tag == 'file':
            # This variant stores the name without an extension
            annotation['filename'] = node.text + '.png'
        elif tag == 'size':
            annotation['image_size'] = tuple(_as_int(dim.text) for dim in node)
        elif tag == 'object':
            box = {}
            for field in node:
                if field.tag == 'name':
                    box['class'] = field.text
                elif field.tag == 'bndbox':
                    box.update((coord.tag, _as_int(coord.text)) for coord in field)
            annotation['bboxes'].append(box)

    return annotation

In [None]:
# Function to convert the annotations dict to YOLO format
def dic2yolo(a_dict):
    """Convert a parsed annotation dict into YOLO label lines.

    Parameters
    ----------
    a_dict : dict
        Must contain ``'bboxes'``, a list of dicts with the keys ``'class'``,
        ``'xmin'``, ``'ymin'``, ``'xmax'`` and ``'ymax'``. When at least one
        box is converted it must also contain ``'image_size'``, a sequence
        starting with ``(width, height)``; any further entries (e.g. a
        channel count) are ignored.

    Returns
    -------
    list of str
        One ``"class_id x_center y_center width height"`` line per box whose
        class is a key of the module-level ``class_name_to_id_mapping``.
        Coordinates are divided by the image width/height and written with
        three decimals. Boxes of other classes are skipped.
    """
    out = []

    # For each bounding box
    for b in a_dict['bboxes']:
        # Classes without an ID are not exported
        if b['class'] not in class_name_to_id_mapping:
            continue
        class_id = class_name_to_id_mapping[b['class']]

        # Only width and height are needed. Slicing (instead of a fixed
        # three-way unpack) also accepts annotations without a depth entry.
        image_w, image_h = a_dict['image_size'][:2]

        # Transform the bbox co-ordinates to YOLO format (centre + extent),
        # normalised by the dimensions of the image
        b_center_x = (b['xmin'] + b['xmax']) / 2 / image_w
        b_center_y = (b['ymin'] + b['ymax']) / 2 / image_h
        b_width = (b['xmax'] - b['xmin']) / image_w
        b_height = (b['ymax'] - b['ymin']) / image_h

        # format the output
        out.append("{} {:.3f} {:.3f} {:.3f} {:.3f}".format(class_id, b_center_x, b_center_y, b_width, b_height))

    return out

## Getting all classes and plotting frequency histogram

In [None]:
classes = []

data_dir = os.path.join(base_dir, label_dir)

# Collect the class name of every annotated box across all XML files
for fn in os.listdir(data_dir):
    # splitext copes with names that contain no dot or several dots, where
    # indexing fn.split('.')[1] would raise or look at the wrong part
    if os.path.splitext(fn)[1] != '.xml':
        continue
    classes += [d['class'] for d in parse_xml(os.path.join(data_dir, fn))['bboxes']]

# Relative frequency of each class, most frequent first
c = pd.Series(classes).value_counts(normalize=True)

# select only classes with decent stats
c = c[c>0.01]

plt.bar(c.index, c.values)
plt.ylabel('Probability of Occurence', labelpad=10, fontsize=18)
plt.xticks(rotation=90)
plt.show()

# maps class names to IDs
# Only these two classes are exported. A mapping used to be derived from the
# histogram above, but it was overwritten by this literal straight away, so
# the dead computation has been removed.
class_name_to_id_mapping = {
    'chart': 0,
    'table': 1
}
print(class_name_to_id_mapping)

## Converting all annotations from XML to YOLO txt

In [None]:
data_dir = os.path.join(base_dir, label_dir)
annotations = sorted(
    os.path.join(data_dir, x) for x in os.listdir(data_dir) if x.endswith('.xml')
)

# Output folder for the YOLO label files. Built with os.path.join so the
# separator is correct on every platform, and created up front so the first
# write cannot fail on a missing directory.
# NOTE(review): the partitioning cell reads labels from
# os.path.join(base_dir, 'yolo'), not from this folder - confirm which
# location is intended.
yolo_dir = os.path.join('100', 'yolo')
os.makedirs(yolo_dir, exist_ok=True)

for fn in annotations:
    a_dict = parse_xml(fn)

    # Keep only the boxes of the classes being exported
    bb = [e for e in a_dict['bboxes'] if e['class'] in ['chart', 'table']]
    if not bb:
        continue
    a_dict['bboxes'] = bb

    out = dic2yolo(a_dict)
    if out:
        # Swap only the extension; str.replace('png', 'txt') would also
        # rewrite "png" inside the name and leave other extensions untouched
        label_name = os.path.splitext(a_dict['filename'])[0] + '.txt'
        out_fn = os.path.join(yolo_dir, label_name)
        # Context manager so every label file is flushed and closed
        with open(out_fn, 'w') as label_file:
            label_file.write('\n'.join(out) + '\n')

## Dataset Partitioning

In [None]:
im_dir = os.path.join(base_dir, image_dir)
ann_dir = os.path.join(base_dir, 'yolo')

# Pair every label file with the image that has the same file stem.
# os.listdir returns entries in arbitrary order, so pairing by list position
# (and removing one image by a hard-coded index, as was done before) only
# works by accident. Matching on the stem keeps exactly the images that have
# a label, and sorting makes the split reproducible.
image_by_stem = {
    os.path.splitext(x)[0]: os.path.join(im_dir, x) for x in sorted(os.listdir(im_dir))
}
images, annotations = [], []
for x in sorted(os.listdir(ann_dir)):
    stem, ext = os.path.splitext(x)
    if ext == '.txt' and stem in image_by_stem:
        images.append(image_by_stem[stem])
        annotations.append(os.path.join(ann_dir, x))

assert images, f'no image/label pairs found in {im_dir} and {ann_dir}'

train_images, val_images, train_labels, val_labels = train_test_split(
    images, annotations, test_size = 0.3, random_state = 1
)

# Explicit mapping instead of looking the lists up by name through locals()
split_files = {
    ('train', 'images'): train_images,
    ('train', 'labels'): train_labels,
    ('val', 'images'): val_images,
    ('val', 'labels'): val_labels,
}

for (data_split, data_type), files in split_files.items():
    # Built directly from `target`; the global `base_dir` is deliberately
    # left untouched so the earlier cells still work when re-run
    destination = os.path.join(target, data_split, data_type)

    # Start from an empty folder so files from a previous run do not linger
    if os.path.exists(destination):
        shutil.rmtree(destination, ignore_errors=True)
    os.makedirs(destination, exist_ok=True)

    for fn in files:
        try:
            shutil.copy(fn, destination)
        except OSError:
            # Show which file failed, then abort with the original error
            print(fn)
            raise

In [None]:
# create yaml
# Dataset description: image folders for each split, number of classes and
# the class names in ID order (dict insertion order of the mapping).
out = [
    'train: datasets/100/train/images',
    'val: datasets/100/val/images',
    f'nc: {len(class_name_to_id_mapping)}',
    f'names: {list(class_name_to_id_mapping)}'
]
# Context manager so the file is flushed and closed deterministically
with open('datasets/100.yaml', 'w') as yaml_file:
    yaml_file.write('\n'.join(out) + '\n')