# Imports

In [2]:
import json
import os
import xml.etree.ElementTree as ET
from xml.dom import minidom

import numpy as np
import pandas as pd

# Start transformations

We need transformations because the model cannot consume our data directly: the annotations must first be converted to the right format, specifically the Pascal VOC (XML) format.

In [3]:
# Load the training table and preview its first five rows
train = pd.read_csv("data/train.csv")
train.head()

Unnamed: 0,video_id,sequence,video_frame,sequence_frame,image_id,annotations
0,0,40258,0,0,0-0,[]
1,0,40258,1,1,0-1,[]
2,0,40258,2,2,0-2,[]
3,0,40258,3,3,0-3,[]
4,0,40258,4,4,0-4,[]


In [9]:
# Look at a single record to see what iterrows() yields for each row
for i, row in train.head(1).iterrows():
    print(row)
    print(row.video_id)

video_id              0
sequence          40258
video_frame           0
sequence_frame        0
image_id            0-0
annotations          []
Name: 0, dtype: object
0


In [60]:
# Raw annotation string of row 50, a row that contains bounding boxes
annotations = train["annotations"].iloc[50]
annotations

"[{'x': 539, 'y': 205, 'width': 105, 'height': 98}, {'x': 630, 'y': 351, 'width': 95, 'height': 56}]"

In [61]:
# The string uses single quotes; swap them for double quotes so it parses as JSON
annotations = annotations.replace("'", '"')
annotations_json = json.loads(annotations)
annotations_json

[{'x': 539, 'y': 205, 'width': 105, 'height': 98},
 {'x': 630, 'y': 351, 'width': 95, 'height': 56}]

# XML tests

In [64]:
# hello = ET.Element("hello")
# ET.SubElement(hello, "world")
# ET.ElementTree(hello).write("hello.xml")

# VOC format

In [None]:
def create_xml_file(df, image_dir, train_dir, out_dir, display=False,
                    image_size=None, class_name="object"):
    """ Converts every row's annotations to Pascal VOC (XML) and writes them to disk

    One file named ``<image_id>.xml`` is written to ``out_dir`` for each row
    of ``df``. Bounding boxes are rescaled to the 1280x1280 target size used
    by ``resize`` and the ``size`` element records that target size.

    Args:
        df (pd.DataFrame): Must provide an ``image_id`` column and an
            ``annotations`` column. Each ``annotations`` value is a list of
            ``{'x', 'y', 'width', 'height'}`` dicts, or the string form of
            such a list (``"[]"`` when the image has no boxes).
        image_dir (str): Currently unused; kept so that existing positional
            calls keep working.
        train_dir (str): Path to the directory containing the training
            image files. Used to build the ``path`` element and, when
            ``image_size`` is None, to open each image.
        out_dir (str): Path to the directory to save the XML files
        display (bool, optional): Whether to pretty-print each XML tree
            prior to writing it to file.
        image_size (tuple, optional): ``(width, height)`` or
            ``(width, height, depth)`` of the source images in pixels.
            When given, no image file is opened. When None (default) the
            size is read from each image's DICOM header, which requires
            the third-party ``pydicom`` package.
        class_name (str, optional): Label written to every ``object/name``
            element; the annotation dicts carry no class of their own.

    Returns:
        None;   It writes the Pascal VOC converted information to file
                as XML files in the directed output directory.
    """
    def get_json_annotation(annotation):
        """ Return the list of bounding-box dicts held in an annotations cell """
        if isinstance(annotation, str):
            # The text uses single quotes (Python repr); JSON needs double quotes.
            return json.loads(annotation.replace("\'", "\""))
        if isinstance(annotation, (list, tuple)):
            return list(annotation)
        # Anything else (e.g. NaN for a missing cell) is treated as "no boxes".
        return []

    def _get_image_size(path):
        """ Return (width, height, depth) as strings for the image at path """
        if image_size is not None:
            width, height, *rest = image_size
            # Depth defaults to 1, the value the DICOM branch reports.
            depth = rest[0] if rest else 1
            return str(int(width)), str(int(height)), str(int(depth))
        # Imported lazily: only this fallback needs the third-party package.
        import pydicom
        # Rows/Columns live in the header, so the pixel data is not needed.
        meta = pydicom.dcmread(path, stop_before_pixels=True)
        return str(int(meta.Columns)), str(int(meta.Rows)), "1"

    def resize(old_img_w, old_img_h, new_img_w=1280, new_img_h=1280):
        """ Return the target size and the per-axis scale factors to reach it """
        w_ratio = new_img_w/int(old_img_w)
        h_ratio = new_img_h/int(old_img_h)
        return new_img_w, new_img_h, w_ratio, h_ratio

    def _create_object_subtree(annotation, obj, w_ratio, h_ratio):
        """ Create the sub-tree related to a given object and update the root """
        # Annotations store top-left corner + size; VOC wants both corners.
        x_min, y_min = obj["x"], obj["y"]
        x_max, y_max = x_min + obj["width"], y_min + obj["height"]

        _object = ET.SubElement(annotation, "object")
        ET.SubElement(_object, "name").text = class_name.lower().replace(" ", "_")
        ET.SubElement(_object, "pose").text = "Unspecified"
        ET.SubElement(_object, "truncated").text = "0"
        ET.SubElement(_object, "difficult").text = "0"

        _bndbox = ET.SubElement(_object, "bndbox")
        ET.SubElement(_bndbox, "xmin").text = str(int(x_min*w_ratio))
        ET.SubElement(_bndbox, "ymin").text = str(int(y_min*h_ratio))
        ET.SubElement(_bndbox, "xmax").text = str(int(x_max*w_ratio))
        ET.SubElement(_bndbox, "ymax").text = str(int(y_max*h_ratio))
        return annotation

    for _, row in df.iterrows():
        image_id = str(row["image_id"])
        # NOTE(review): the ".dicom" extension is hard-coded; confirm it
        # matches the files actually stored in train_dir.
        image_name = image_id+".dicom"
        image_path = os.path.join(train_dir, image_name)

        # Initalize and create the objects list for this image
        objects = get_json_annotation(row["annotations"])
        annotation = ET.Element('annotation')
        img_w, img_h, img_d = _get_image_size(image_path)
        img_w, img_h, w_ratio, h_ratio = resize(img_w, img_h)

        # ##### Beginning of XML #####

        # File Sub-Elements
        ET.SubElement(annotation, "folder").text="train"
        ET.SubElement(annotation, "filename").text=image_name
        ET.SubElement(annotation, "path").text=image_path

        # SRC Sub-Element
        _src = ET.SubElement(annotation, "src")
        ET.SubElement(_src, "database").text="train"

        # Size Sub-Element
        _size = ET.SubElement(annotation, "size")
        ET.SubElement(_size, "width").text=str(img_w)
        ET.SubElement(_size, "height").text=str(img_h)
        ET.SubElement(_size, "depth").text=str(img_d)

        # Segmented Sub-Element
        ET.SubElement(annotation, "segmented").text="0"

        # Loop over every object and create the annotation for that bounding box
        for obj in objects:
            annotation = _create_object_subtree(annotation, obj, w_ratio, h_ratio)

        # Display if requested
        if display:
            print(minidom.parseString(ET.tostring(annotation)).toprettyxml(indent="  "))

        # Save to output directory
        ET.ElementTree(annotation).write(os.path.join(out_dir, image_id+'.xml'))
    
# DEMO_IMG_ID = "9a5094b2563a1ef3ff50dc5c7ff71345"
# DEMO_PATH = os.path.join(TRAIN_DIR, DEMO_IMG_ID+".dicom")

# create_xml_file(train_df, DEMO_IMG_ID, TRAIN_DIR, out_dir="/tmp", display=True)

# print("Look into the tmp folder to see that the xml was created...")
# for file in [f for f in os.listdir("/tmp") if f.endswith(".xml")]: print("\t– /tmp/"+file)