Import module


In [4]:
import xml.etree.ElementTree as ET
import re

In [5]:
xml_str = """<?xml version="1.0"?>
<data>
    <country name="Liechtenstein">
        <rank>1</rank>
        <year>2008</year>
        <gdppc>141100</gdppc>
        <neighbor name="Austria" direction="E"/>
        <neighbor name="Switzerland" direction="W"/>
    </country>
    <country name="Singapore">
        <rank>4</rank>
        <year>2011</year>
        <gdppc>59900</gdppc>
        <neighbor name="Malaysia" direction="N"/>
    </country>
    <country name="Panama">
        <rank>68</rank>
        <year>2011</year>
        <gdppc>13600</gdppc>
        <neighbor name="Costa Rica" direction="W"/>
        <neighbor name="Colombia" direction="E"/>
    </country>
</data>"""

```fromstring()``` parses XML from a string directly into an ``Element``, which is the root element of the parsed tree. Other parsing functions may create an ``ElementTree``.

In [6]:
tree = ET.parse('sample.txt')
root = tree.getroot()
# root = ET.fromstring(xml_str) # return Element object


FileNotFoundError: [Errno 2] No such file or directory: 'sample.txt'

You can read from file via ```root = ET.parse('sample.xml')```

Root has a tag and a dictionary of attributes

In [None]:
root.tag

'data'

In [None]:
root.attrib

{}

Show all children nodes

In [None]:
for child in root:
  print(child.tag)
  print(child.attrib)

country
{'name': 'Liechtenstein'}
country
{'name': 'Singapore'}
country
{'name': 'Panama'}


In [None]:
root[0][1].text

'2008'

Finding elements

In [None]:
for neighbor in root.iter('neighbor'):
  print(neighbor.attrib)

{'name': 'Austria', 'direction': 'E'}
{'name': 'Switzerland', 'direction': 'W'}
{'name': 'Malaysia', 'direction': 'N'}
{'name': 'Costa Rica', 'direction': 'W'}
{'name': 'Colombia', 'direction': 'E'}


In [None]:
for country in root.findall('country'):
  rank = country.find('rank').text
  name = country.get('name')
  print(name, rank)
  print(type(country))

Liechtenstein 1
<class 'xml.etree.ElementTree.Element'>
Singapore 4
<class 'xml.etree.ElementTree.Element'>
Panama 68
<class 'xml.etree.ElementTree.Element'>


Modifying an XML file

In [None]:
for rank in tree.iter('rank'):
  new_rank = int(rank.text)+1
  rank.text = str(new_rank)
  rank.set('updated', 'yes')

In [None]:
tree.write('output.txt')

In [None]:
for country in root.findall('country'):
  rank = int(country.find('rank').text)
  if rank > 50:
      root.remove(country)

In [None]:
print(ET.tostring(root, encoding='utf8').decode('utf8'))


<?xml version='1.0' encoding='utf8'?>
<data>
    <country name="Liechtenstein">
        <rank updated="yes">2</rank>
        <year>2008</year>
        <gdppc>141100</gdppc>
        <neighbor direction="E" name="Austria" />
        <neighbor direction="W" name="Switzerland" />
    </country>
    <country name="Singapore">
        <rank updated="yes">5</rank>
        <year>2011</year>
        <gdppc>59900</gdppc>
        <neighbor direction="N" name="Malaysia" />
    </country>
    </data>


**Building XML documents**

In [None]:
a = ET.Element('a')
b = ET.SubElement(a, 'b')
b.text = 'fpt1'
c = ET.SubElement(a, 'c')
d = ET.SubElement(c, 'd')
d.text = 'fpt2'
print(ET.tostring(a, encoding='utf8').decode('utf8'))

<?xml version='1.0' encoding='utf8'?>
<a><b>fpt1</b><c><d>fpt2</d></c></a>


**XPath support**

In [None]:
import xml.etree.ElementTree as ET

root = ET.fromstring(xml_str)

# Top-level elements
elememts = root.findall(".")
for el in elememts:
  print(el.attrib)


{}


In [None]:
# All 'neighbor' grand-children of 'country' children of the top-level
# elements
elememts = root.findall("country")
for el in elememts:
  print(el.attrib, el.tag)


{'name': 'Liechtenstein'} country
{'name': 'Singapore'} country
{'name': 'Panama'} country


In [None]:
# Nodes with name='Singapore' that have a 'year' child
elememts = root.findall(".//year/..[@name='Singapore']")
for el in elememts:
  print(el.attrib)


{'name': 'Singapore'}


In [None]:
# 'year' nodes that are children of nodes with name='Singapore'
elememts = root.findall(".//*[@name='Singapore']/year")
for el in elememts:
  print(el.text)


2011


In [None]:
# All 'neighbor' nodes that are the second child of their parent
elememts = root.findall(".//neighbor[2]")
for el in elememts:
  print(el.attrib)

{'name': 'Switzerland', 'direction': 'W'}
{'name': 'Colombia', 'direction': 'E'}


**Python Json quick start**

In [None]:
import json

Python JSON to dict

In [None]:
person = '{"name": "Bob", "languages": ["English", "Fench"]}'
person_dict = json.loads(person)
# Output: {'name': 'Bob', 'languages': ['English', 'Fench']}
print( person_dict)

# Output: ['English', 'French']
print(person_dict['languages'])

{'name': 'Bob', 'languages': ['English', 'Fench']}
['English', 'Fench']


Python read JSON file

In [None]:
data_jon = """{"name": "Bob", 
"languages": ["English", "Fench"]
}"""

In [None]:
with open('person.txt') as f:
  data = json.load(f)

# Output: {'name': 'Bob', 'languages': ['English', 'Fench']}
print(data)

FileNotFoundError: ignored

Convert dict to JSON

In [None]:
person_dict = {'name': 'Bob',
'age': 12,
'children': None
}
person_json = json.dumps(person_dict)

# Output: {"name": "Bob", "age": 12, "children": null}
print(person_json)

Writing JSON to a file

In [None]:
person_dict = {"name": "Bob",
"languages": ["English", "Fench"],
"married": True,
"age": 32
}

with open('person_2.txt', 'w') as json_file:
  json.dump(person_dict, json_file)

Python pretty print JSON

In [None]:
person_string = '{"name": "Bob", "languages": "English", "numbers": [2, 1.6, null]}'

# Getting dictionary
person_dict = json.loads(person_string)

# Pretty Printing JSON string back
print(json.dumps(person_dict, indent = 4, sort_keys=True))

**Quick test**

In [None]:
import xml.etree.ElementTree as ET
import json

In [None]:
xml_in = """
<case>
    <number>1</number>
    <age></age>
    <sex></sex>
    <composition></composition>
    <echogenicity></echogenicity>
    <margins></margins>
    <calcifications></calcifications>
    <tirads></tirads>
    <reportbacaf></reportbacaf>
    <reporteco></reporteco>
    <mark>
        <image>1</image>
        <svg>[{"points": [{"x": 385, "y": 182}, {"x": 398, "y": 179}, {"x": 404, "y": 176}, {"x": 409, "y": 173}, {"x":
            415, "y": 159}, {"x": 417, "y": 153}, {"x": 421, "y": 143}, {"x": 421, "y": 136}, {"x": 419, "y": 76}, {"x":
            412, "y": 73}, {"x": 406, "y": 69}, {"x": 393, "y": 61}, {"x": 390, "y": 57}, {"x": 383, "y": 53}, {"x":
            378, "y": 50}, {"x": 369, "y": 44}, {"x": 360, "y": 42}, {"x": 352, "y": 42}, {"x": 348, "y": 42}, {"x":
            340, "y": 41}, {"x": 331, "y": 40}, {"x": 321, "y": 40}, {"x": 311, "y": 39}, {"x": 304, "y": 39}, {"x":
            294, "y": 42}, {"x": 270, "y": 43}, {"x": 264, "y": 42}, {"x": 253, "y": 41}, {"x": 242, "y": 39}, {"x":
            231, "y": 37}, {"x": 221, "y": 36}, {"x": 209, "y": 35}, {"x": 190, "y": 39}, {"x": 164, "y": 49}, {"x":
            158, "y": 51}, {"x": 146, "y": 69}, {"x": 137, "y": 80}, {"x": 133, "y": 89}, {"x": 136, "y": 236}, {"x":
            142, "y": 244}, {"x": 149, "y": 251}, {"x": 161, "y": 261}, {"x": 210, "y": 271}, {"x": 217, "y": 274},
            {"x": 223, "y": 279}, {"x": 228, "y": 284}, {"x": 235, "y": 290}, {"x": 261, "y": 294}, {"x": 301, "y":
            296}, {"x": 309, "y": 294}, {"x": 320, "y": 287}, {"x": 326, "y": 283}, {"x": 327, "y": 281}, {"x": 332,
            "y": 271}, {"x": 338, "y": 263}, {"x": 347, "y": 253}, {"x": 348, "y": 240}, {"x": 349, "y": 222}, {"x":
            360, "y": 212}, {"x": 369, "y": 207}, {"x": 372, "y": 198}, {"x": 382, "y": 183}, {"x": 383, "y": 181},
            {"x": 386, "y": 181}], "annotation": {}, "regionType": "freehand"}]
        </svg>
    </mark>
    <mark>
        <image>2</image>
        <svg>[{"points": [{"x": 417, "y": 155}, {"x": 418, "y": 76}, {"x": 413, "y": 73}, {"x": 406, "y": 70}, {"x":
            401, "y": 62}, {"x": 393, "y": 56}, {"x": 388, "y": 52}, {"x": 383, "y": 42}, {"x": 374, "y": 41}, {"x":
            365, "y": 41}, {"x": 355, "y": 41}, {"x": 347, "y": 40}, {"x": 340, "y": 40}, {"x": 330, "y": 40}, {"x":
            325, "y": 39}, {"x": 318, "y": 39}, {"x": 313, "y": 39}, {"x": 301, "y": 39}, {"x": 277, "y": 39}, {"x":
            264, "y": 39}, {"x": 249, "y": 41}, {"x": 241, "y": 41}, {"x": 236, "y": 37}, {"x": 220, "y": 35}, {"x":
            207, "y": 35}, {"x": 194, "y": 42}, {"x": 183, "y": 46}, {"x": 172, "y": 52}, {"x": 160, "y": 57}, {"x":
            137, "y": 88}, {"x": 136, "y": 104}, {"x": 144, "y": 241}, {"x": 148, "y": 242}, {"x": 159, "y": 243}, {"x":
            173, "y": 244}, {"x": 181, "y": 250}, {"x": 188, "y": 258}, {"x": 192, "y": 264}, {"x": 200, "y": 274},
            {"x": 208, "y": 277}, {"x": 232, "y": 285}, {"x": 241, "y": 288}, {"x": 288, "y": 296}, {"x": 310, "y":
            296}, {"x": 322, "y": 290}, {"x": 327, "y": 284}, {"x": 332, "y": 274}, {"x": 338, "y": 264}, {"x": 344,
            "y": 258}, {"x": 348, "y": 251}, {"x": 350, "y": 242}, {"x": 354, "y": 226}, {"x": 356, "y": 217}, {"x":
            367, "y": 206}, {"x": 390, "y": 204}, {"x": 395, "y": 204}, {"x": 404, "y": 203}, {"x": 410, "y": 200},
            {"x": 417, "y": 193}, {"x": 419, "y": 190}, {"x": 417, "y": 153}], "annotation": {}, "regionType":
            "freehand"}]
        </svg>
    </mark>
</case>"""

In [None]:
xml_out = """
<annotations>
    <folder>images</folder>
    <filename>1_1.jpg</filename>
    <folder>VOC</folder>
    <size>
        <width>560</width>
        <height>360</height>
        <depth>3</depth>
    </size>
    <object>
        <name>thyroid_cancer</name>
        <pose>Unspecified</pose>
        <truncated>0</truncated>
        <difficult>0</difficult>
        <bndbox>
            <xmin>142</xmin>
            <ymin>161</ymin>
            <xmax>430</xmax>
            <ymax>287</ymax>
        </bndbox>
    </object>
</annotations>"""

In [None]:
tree_in = ET.parse('input.txt')


root_in = tree_in.getroot()


In [None]:
def get_box(points):
    xmin = min(points, key = lambda item: item[0])[0]
    ymin = min(points, key = lambda item: item[1])[1]
    xmax = max(points, key = lambda item: item[0])[0]
    ymax = max(points, key = lambda item: item[1])[1]
    return (xmin, ymin, xmax, ymax)

In [None]:
marks = root_in.findall('.//mark')
number = root_in.find('number').text

In [None]:
def get_bbox_from_segment(svg):
    data = json.loads(svg)
    # print(data)
    points = data[0]['points']
    points_coord = []
    for point in points:
        points_coord.append((int(point['x']), int(point['y'])))
    bbox = get_box(points_coord)
    return bbox

In [None]:
bbox_element = """
<object>
        <name>thyroid_cancer</name>
        <pose>Unspecified</pose>
        <truncated>0</truncated>
        <difficult>0</difficult>
        <bndbox>
            <xmin>142</xmin>
            <ymin>161</ymin>
            <xmax>430</xmax>
            <ymax>287</ymax>
        </bndbox>
</object>
    """
# print(ET.tostring(root_out, encoding='utf8').decode('utf8'))
for mark in marks:
  tree_out = ET.parse('pascal.txt')
  root_out = tree_out.getroot()
  bbox_el = ET.fromstring(bbox_element)
  image = mark.find('image').text
  image_file = '{}_{}'.format(number, image)
  print(image_file)
  svg = mark.find('svg').text
  root_out.find('filename').text = '{}.jpg'.format(image_file)
  root_out.find('.//width').text = "560"
  root_out.find('.//height').text = "360"
  bbox = get_bbox_from_segment(svg)
  print(bbox)
  bbox_el.find('.//name').text = 'thyroid_cancer'
  bbox_el.find('.//xmin').text = str(bbox[0])
  bbox_el.find('.//ymin').text = str(bbox[1])
  bbox_el.find('.//xmax').text = str(bbox[2])
  bbox_el.find('.//ymax').text = str(bbox[3])
  # print(ET.tostring(bbox_el, encoding='utf8').decode('utf8'))
  root_out.insert(1, bbox_el)
  # print(svg)
  object_xml = root_out.findall('.')
  # print(ET.tostring(object_xml[0], encoding='utf8').decode('utf8'))
  tree_out.write('{}.txt'.format(image_file))

1_1
(133, 35, 421, 296)
1_2
(136, 35, 419, 296)
